upbeatBytes/tests/test_access_check.py

"""Deep-preview accessibility check — content-level readable/paywalled/blocked/unknown,
and the layered verdict (domain rule + sampled access, evidence over domain alone)."""
import time

from goodnews import feeds
from goodnews.paywall import check_article_access

# Canned raw page bodies (bytes), used as fetcher responses in the tests below.

# A long <article>: the same paragraph repeated 80 times.
READABLE = b"".join([
    b"<html><body><article>",
    b"<p>Real article text here. </p>" * 80,
    b"</article></body></html>",
])

# A one-line teaser whose JSON-LD block declares isAccessibleForFree: false.
WALLED_SCHEMA = (
    b'<html><head><script type="application/ld+json">'
    b'{"isAccessibleForFree": false}'
    b"</script></head><body><p>teaser</p></body></html>"
)

# A page whose only text is an explicit "subscribe to continue" wall phrase.
WALLED_PHRASE = (
    b"<html><body>"
    b"<p>Subscribe to continue reading this story.</p>"
    b"</body></html>"
)

# A page with almost no body text at all.
THIN = b"<html><body>" b"<p>hi</p>" b"</body></html>"


def _fetcher(mapping):
    """Return a stub fetcher that serves responses out of ``mapping``.

    The returned callable takes ``(url, timeout=8)``; ``timeout`` is accepted
    but ignored. A URL mapped to the sentinel string ``"ERR"`` raises
    ``RuntimeError("boom")``; any other mapped URL returns its stored value.
    An unmapped URL raises ``KeyError``.
    """
    def fetch(url, timeout=8):
        canned = mapping.get(url)
        if canned == "ERR":
            raise RuntimeError("boom")
        # Index rather than reuse ``canned`` so an unmapped URL raises KeyError.
        return mapping[url]

    return fetch


def test_classifies_each_access_state():
    """Each canned page is classified into its expected access state."""
    pages = {"r": READABLE, "s": WALLED_SCHEMA, "p": WALLED_PHRASE, "t": THIN, "b": "ERR"}
    fetch = _fetcher(pages)
    expectations = [
        ("r", "readable"),
        ("s", "paywalled"),   # schema.org isAccessibleForFree:false
        ("p", "paywalled"),   # explicit wall phrase
        ("t", "unknown"),     # too thin to tell
        ("b", "blocked"),     # fetch failed
    ]
    for url, state in expectations:
        assert check_article_access(url, fetch) == state


def test_does_not_falseflag_a_readable_page():
    """A footer "subscribe to our newsletter" link must not flag a long article."""
    opening = b"<html><body><article>"
    paragraphs = b"<p>Lots of real content. </p>" * 100
    closing = b"<footer>Subscribe to our newsletter</footer></article></body></html>"
    page = opening + paragraphs + closing

    fetch = _fetcher({"x": page})
    verdict = check_article_access("x", fetch)
    assert verdict == "readable"


def _items(urls):
    """Build one ``feeds.FeedItem`` per URL, titled ``T0``, ``T1``, ... in input order.

    Every item gets the description ``"d"`` and no publication date.
    """
    items = []
    for index, link in enumerate(urls):
        item = feeds.FeedItem(title=f"T{index}", url=link, description="d", published_at=None)
        items.append(item)
    return items


def test_preview_verdict_layers_domain_and_sample(monkeypatch):
    """A non-paywall-domain feed whose sampled articles all read fine gets verdict "fine"."""
    article_urls = [f"https://good.example/a{n}" for n in (1, 2, 3)]
    # Bypass real feed parsing: whatever is fetched, the feed "contains" these items.
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(article_urls))

    class FakeClient:
        """Classifier stub that accepts every article with low risk scores."""
        model = "test"

        def classify(self, art):
            verdict = {"accepted": True, "topic": "science", "flavor": "discovery",
                       "cortisol_score": 1, "ragebait_score": 1, "pr_risk_score": 2}
            return verdict

    def always_readable(url, timeout=10):
        # every sampled article reads fine
        return READABLE

    out = feeds.preview_feed(
        "https://good.example/feed",
        sample=8,
        client=FakeClient(),
        fetcher=always_readable,
    )

    access = out["access"]
    assert out["paywall_rule"] is False
    assert access["readable"] >= 1
    assert access["paywalled"] == 0
    assert out["access_verdict"] == "fine"


def test_mostly_blocked_is_review_not_fine(monkeypatch):
    """A mostly bot-blocked sample lands in "review".

    Sites that read fine in a browser but block our fetcher must not be judged
    'fine' off a single readable sample, nor 'reject-ready'.
    """
    article_urls = ["https://blocky.example/a%d" % n for n in range(6)]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(article_urls))

    class FakeClient:
        """Classifier stub that accepts every article with low risk scores."""
        model = "test"

        def classify(self, art):
            verdict = {"accepted": True, "topic": "science", "flavor": "discovery",
                       "cortisol_score": 1, "ragebait_score": 1, "pr_risk_score": 2}
            return verdict

    def mostly_blocked(url, timeout=10):
        # Only the feed fetch and one article succeed; the rest are bot-blocked.
        if not url.endswith(("/feed", "a0")):
            raise RuntimeError("403 blocked")
        return READABLE

    out = feeds.preview_feed(
        "https://blocky.example/feed",
        sample=8,
        client=FakeClient(),
        fetcher=mostly_blocked,
    )

    access = out["access"]
    assert access["blocked"] >= 4
    assert access["readable"] == 1
    # thin assessable evidence → not 'fine', not 'reject-ready'
    assert out["access_verdict"] == "review"


def test_source_preview_endpoint_handles_null_rate(tmp_path, monkeypatch):
    """The legacy /api/source-preview endpoint returns 200 when acceptance_rate is None.

    An all-held (non-English) sample yields ``acceptance_rate=None``; the
    endpoint must serialize that instead of answering 500
    (SourcePreview.acceptance_rate is nullable).
    """
    db_path = tmp_path / "t.sqlite3"
    monkeypatch.setenv("GOODNEWS_DB", str(db_path))
    monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "http://testserver")

    # Reload the API module only after the environment variables are set.
    import importlib
    import goodnews.api as api
    importlib.reload(api)

    # Create the schema in the temporary database.
    from goodnews.db import connect, init_db
    conn = connect(str(db_path))
    init_db(conn)
    conn.commit()
    conn.close()

    all_held = {
        "url": "http://x/feed", "sampled": 4, "classified": True, "accepted": 0,
        "non_english": 4, "acceptance_rate": None, "avg_cortisol": 0.0, "avg_ragebait": 0.0,
        "avg_pr_risk": 0.0, "newest_published": None, "recent_7d": 0,
        "topic_mix": {}, "flavor_mix": {}, "examples_accepted": [], "examples_rejected": [],
    }
    # Stub the preview so the endpoint just has to serialize this canned result.
    monkeypatch.setattr(feeds, "preview_feed", lambda *a, **k: all_held)

    from fastapi.testclient import TestClient
    http = TestClient(api.create_app())
    response = http.get("/api/source-preview?url=http://x/feed")

    assert response.status_code == 200                  # was 500: None rejected by float field
    assert response.json()["acceptance_rate"] is None


def test_one_hung_fetch_does_not_stall_the_preview(monkeypatch):
    """One article fetch that hangs far past the deadline must not pin Deep Preview.

    The preview has to come back around the (shrunk) access deadline, with the
    hung article reported as 'unknown' and the remaining articles readable.
    """
    import threading

    monkeypatch.setattr(feeds, "_ACCESS_DEADLINE_S", 0.5)   # shrink the cap for the test
    urls = [f"https://mixed.example/a{i}" for i in range(6)]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(urls))

    class FakeClient:
        """Classifier stub that accepts every article with low risk scores."""
        model = "test"

        def classify(self, art):
            return {"accepted": True, "topic": "science", "flavor": "discovery",
                    "cortisol_score": 1, "ragebait_score": 1, "pr_risk_score": 2}

    # The "hang" is a wait on an event rather than an unconditional sleep, so the
    # blocked fetch can be released as soon as the test is done instead of
    # lingering for the full 5s (presumably on a worker thread inside
    # preview_feed — not visible from this file).
    release = threading.Event()

    def fetcher(url, timeout=10):
        if url.endswith("a0"):
            release.wait(5)        # one ugly site hangs far past the 0.5s cap
        return READABLE

    try:
        start = time.monotonic()
        out = feeds.preview_feed("https://mixed.example/feed", sample=8,
                                 client=FakeClient(), fetcher=fetcher)
        elapsed = time.monotonic() - start

        assert elapsed < 2.5          # returned at the cap (~0.5s), NOT after the 5s hang
        # the hung one is 'unknown' (unverified), the rest read fine
        slow = next((e for e in out["access"]["examples"] if e["url"].endswith("a0")), None)
        assert slow is not None, "hung article is missing from access examples"
        assert slow["access"] == "unknown"
        assert out["access"]["readable"] >= 4
    finally:
        # Released only after the assertions, so a late-finishing fetch cannot
        # race with them; it just stops outliving the test.
        release.set()