# upbeatBytes/goodnews/paywall.py

"""Domain-level paywall hints.

We never fetch article pages, so a paywall can only be inferred from the host.
This is a curated, conservative list of hard/soft paywalls — enough to label a
card "subscription may be required" and to prefer readable stories for the lead
and for replacements. It will never be perfect; it's an honest hint, not a gate.
"""

from __future__ import annotations

import re
from urllib.parse import urlsplit

# Registrable domains treated as paywalled. Matching is by host suffix, so any
# subdomain is covered as well (news.nature.com → nature.com). Entries are kept
# alphabetical to make additions easy to review; only membership matters.
PAYWALL_DOMAINS = {
    "bloomberg.com",
    "economist.com",
    "foreignpolicy.com",
    "ft.com",
    "harpers.org",
    "hbr.org",
    "nationalgeographic.com",
    "nature.com",
    "newscientist.com",
    "newyorker.com",
    "nytimes.com",
    "scientificamerican.com",
    "technologyreview.com",
    "telegraph.co.uk",
    "theatlantic.com",
    "thetimes.co.uk",
    "washingtonpost.com",
    "wired.com",
    "wsj.com",
}


def is_paywalled(url: str | None) -> bool:
    """Low-level DOMAIN rule: does the URL's host fall under PAYWALL_DOMAINS?

    Keep this distinct from the source-aware decision so callers can tell
    'domain says paywalled' from 'this source is overridden'.

    Args:
        url: Article URL. None, empty, host-less or unparseable values carry no
            paywall signal and yield False.

    Returns:
        True when the host equals a listed domain or is a subdomain of one.
    """
    try:
        # .hostname (unlike .netloc) drops userinfo and the port and is already
        # lower-cased, so "https://www.nytimes.com:443/..." still matches.
        host = urlsplit(url or "").hostname or ""
    except ValueError:
        # urlsplit rejects e.g. unbalanced IPv6 brackets. This is a hint, not a
        # gate, so a URL we cannot parse is simply "not known to be paywalled".
        return False
    # A fully-qualified host may carry a trailing root dot ("nytimes.com.").
    host = host.rstrip(".")
    if host.startswith("www."):
        host = host[4:]
    if not host:
        return False
    return any(host == d or host.endswith("." + d) for d in PAYWALL_DOMAINS)


def is_paywalled_for_source(url: str | None, override: str | None = None) -> bool:
    """Return the EFFECTIVE paywall verdict used for ranking, lead and badges.

    A per-source override (set in admin after inspecting the articles) takes
    precedence over the domain rule: 'free' clears a false positive (e.g. NY
    Times Learning) and 'paywalled' flags a false negative. Any other value,
    including NULL/None, defers to ``is_paywalled(url)``.
    """
    if override not in ("free", "paywalled"):
        # No recognised override — the domain rule decides.
        return is_paywalled(url)
    return override == "paywalled"


# --- Content-level accessibility (deep-preview only; the live pipeline still never
#     fetches article pages) -----------------------------------------------------

# Wall phrases that appear in the rendered, walled state. Kept specific so a footer
# "subscribe to our newsletter" doesn't read as a paywall. All lower-case, because
# they are compared against lower-cased page text.
_WALL_MARKERS = (
    "subscribe to continue", "subscribe to keep reading", "subscribe to read",
    "to continue reading", "already a subscriber", "subscribers only",
    "this article is for subscribers", "this content is for subscribers",
    "create a free account to continue", "create an account to keep reading",
    "unlock this article", "register to continue reading",
)
# "isAccessibleForFree": false / "false" (any letter case) — and the true variant.
_ACCESS_FALSE = re.compile(r'"isaccessibleforfree"\s*:\s*("?)(false)\1', re.I)
_ACCESS_TRUE = re.compile(r'"isaccessibleforfree"\s*:\s*("?)(true)\1', re.I)
# An attribute value ending in content-tier / content_tier followed, inside the
# same tag, by content="locked…".
_CONTENT_LOCKED = re.compile(r'content[_-]tier"[^>]*content="locked', re.I)
# Elements whose contents are not article text; removed whole before measuring.
_STRIP_BLOCKS = re.compile(r"(?is)<(script|style|noscript|template)[^>]*>.*?</\1>")
_STRIP_TAGS = re.compile(r"(?s)<[^>]+>")
_WS = re.compile(r"\s+")


def check_article_access(url: str, fetcher, timeout: int = 8) -> str:
    """Best-effort readability of ONE article URL, for the deep-preview accessibility
    sample. Returns 'readable' | 'paywalled' | 'blocked' | 'unknown'.

    Conservative + evidence-led: an explicit signal (schema.org isAccessibleForFree,
    content-tier=locked, or a clear wall phrase) marks 'paywalled'; otherwise a page
    with substantial body text reads as 'readable'; thin/ambiguous pages stay
    'unknown'. A fetch error is 'blocked'. Heuristic by nature — it informs the
    verdict, it never auto-rejects (domain rules already proved they can lie).

    Args:
        url: Article URL handed to ``fetcher``.
        fetcher: Callable invoked as ``fetcher(url, timeout=timeout)``. It may
            return the page as bytes (decoded as UTF-8, undecodable bytes
            dropped) or as already-decoded str. Any exception it raises is
            reported as 'blocked'.
        timeout: Passed through to ``fetcher`` unchanged.

    Returns:
        One of 'readable', 'paywalled', 'blocked', 'unknown'. A payload that is
        neither str nor decodable (e.g. None) yields 'unknown'.
    """
    try:
        raw = fetcher(url, timeout=timeout)
    except Exception:  # noqa: BLE001 — any fetch failure = can't read it right now
        return "blocked"
    if isinstance(raw, str):
        # Fetcher already decoded the body; use it as-is rather than discarding it.
        html = raw
    else:
        try:
            html = raw.decode("utf-8", "ignore")
        except Exception:  # noqa: BLE001 — not a bytes-like payload
            return "unknown"
    if _ACCESS_FALSE.search(html) or _CONTENT_LOCKED.search(html):
        return "paywalled"
    # Visible text: drop script/style/etc. blocks, then tags, then collapse whitespace.
    text = _WS.sub(" ", _STRIP_TAGS.sub(" ", _STRIP_BLOCKS.sub(" ", html))).strip()
    # Look for wall phrases in the raw markup AND in the normalised text: the raw
    # pass keeps phrases that only exist inside scripts/attributes, the text pass
    # catches phrases wrapped across source lines or split by inline tags.
    low_raw = html.lower()
    low_text = text.lower()
    if any(m in low_raw or m in low_text for m in _WALL_MARKERS):
        return "paywalled"
    # No wall signal — judge by how much real article text is present.
    if _ACCESS_TRUE.search(html) and len(text) >= 600:
        return "readable"
    if len(text) >= 1500:
        return "readable"
    return "unknown"