"""Deep-preview accessibility check — content-level readable/paywalled/blocked/unknown,
and the layered verdict (domain rule + sampled access, evidence over domain alone)."""
import time

from goodnews import feeds
from goodnews.paywall import check_article_access

# Canned article bodies handed to check_article_access by the fake fetchers.
# NOTE(review): as visible in this file the literals hold only text and newlines.
# The per-case comments in the tests (schema.org isAccessibleForFree:false, a
# footer newsletter link) suggest the originals carried HTML / JSON-LD markup
# that is not present here — confirm the exact bytes against version control.
READABLE = b"\n" + (b"\n\nReal article text here.\n\n" * 80) + b"\n"
WALLED_SCHEMA = b'\n\nteaser\n\n'
WALLED_PHRASE = b"\n\nSubscribe to continue reading this story.\n\n"
THIN = b"\n\nhi\n\n"


def _fetcher(mapping):
    """Build a fake fetcher backed by *mapping* (url -> body).

    The returned callable takes ``(url, timeout=8)`` and returns ``mapping[url]``.
    A url mapped to the sentinel string ``"ERR"`` raises ``RuntimeError("boom")``
    instead, simulating a failed fetch. An unmapped url raises ``KeyError``.
    """
    def f(url, timeout=8):
        if mapping.get(url) == "ERR":
            raise RuntimeError("boom")
        return mapping[url]
    return f


def _items(urls):
    """Return one ``feeds.FeedItem`` per url, titled ``T0``, ``T1``, ... in order."""
    return [
        feeds.FeedItem(title=f"T{i}", url=u, description="d", published_at=None)
        for i, u in enumerate(urls)
    ]


class _AcceptingClient:
    """Classifier stub shared by the preview tests: accepts every article.

    Each call returns a fresh dict with the same fixed topic, flavor and scores,
    so the tests exercise only the access sampling, not classification.
    """

    model = "test"

    def classify(self, art):
        return {"accepted": True, "topic": "science", "flavor": "discovery",
                "cortisol_score": 1, "ragebait_score": 1, "pr_risk_score": 2}


def test_classifies_each_access_state():
    """Each canned body (or a raising fetch) maps to its expected access label."""
    f = _fetcher({"r": READABLE, "s": WALLED_SCHEMA, "p": WALLED_PHRASE, "t": THIN, "b": "ERR"})
    assert check_article_access("r", f) == "readable"
    assert check_article_access("s", f) == "paywalled"   # schema.org isAccessibleForFree:false
    assert check_article_access("p", f) == "paywalled"   # explicit wall phrase
    assert check_article_access("t", f) == "unknown"     # too thin to tell
    assert check_article_access("b", f) == "blocked"     # fetch failed


def test_does_not_falseflag_a_readable_page():
    """A long page must still classify as readable."""
    # a long article that merely links "subscribe to our newsletter" in the footer
    html = b"\n" + (b"\n\nLots of real content.\n\n" * 100) + b"\n"
    assert check_article_access("x", _fetcher({"x": html})) == "readable"


def test_preview_verdict_layers_domain_and_sample(monkeypatch):
    """All sampled articles readable on a non-paywall domain -> verdict 'fine'."""
    # a non-paywall-domain feed whose sampled articles mostly read fine -> "fine"
    urls = ["https://good.example/a1", "https://good.example/a2", "https://good.example/a3"]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(urls))

    def fetcher(url, timeout=10):
        return READABLE  # every sampled article reads fine

    out = feeds.preview_feed("https://good.example/feed", sample=8,
                             client=_AcceptingClient(), fetcher=fetcher)
    assert out["paywall_rule"] is False
    assert out["access"]["readable"] >= 1 and out["access"]["paywalled"] == 0
    assert out["access_verdict"] == "fine"


def test_mostly_blocked_is_review_not_fine(monkeypatch):
    """One readable article plus many blocked fetches -> verdict 'review'."""
    # bot-blocked sites (readable in a browser, blocked to our fetcher) must NOT read
    # as 'fine' off one sample, nor as 'reject-ready' — they land in 'review'.
    urls = [f"https://blocky.example/a{i}" for i in range(6)]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(urls))

    def fetcher(url, timeout=10):
        if url.endswith(("/feed", "a0")):
            return READABLE  # the feed fetch + one readable article
        raise RuntimeError("403 blocked")  # the rest block (bot-blocked)

    out = feeds.preview_feed("https://blocky.example/feed", sample=8,
                             client=_AcceptingClient(), fetcher=fetcher)
    assert out["access"]["blocked"] >= 4 and out["access"]["readable"] == 1
    # thin assessable evidence → not 'fine', not 'reject-ready'
    assert out["access_verdict"] == "review"


def test_source_preview_endpoint_handles_null_rate(tmp_path, monkeypatch):
    """GET /api/source-preview returns 200 with a null acceptance_rate."""
    # All-held (non-English) sample → acceptance_rate is None; the legacy
    # /api/source-preview must not 500 on it (SourcePreview.acceptance_rate is nullable).
    db = tmp_path / "t.sqlite3"
    monkeypatch.setenv("GOODNEWS_DB", str(db))
    monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "http://testserver")

    # Imported and reloaded only after the env vars are set — presumably because
    # goodnews.api reads them at import time (confirm against goodnews/api.py).
    import importlib
    import goodnews.api as api
    importlib.reload(api)
    from goodnews.db import connect, init_db

    c = connect(str(db))
    try:
        init_db(c)
        c.commit()
    finally:
        c.close()  # release the handle even if schema creation fails

    all_held = {
        "url": "http://x/feed", "sampled": 4, "classified": True, "accepted": 0,
        "non_english": 4, "acceptance_rate": None,
        "avg_cortisol": 0.0, "avg_ragebait": 0.0, "avg_pr_risk": 0.0,
        "newest_published": None, "recent_7d": 0,
        "topic_mix": {}, "flavor_mix": {},
        "examples_accepted": [], "examples_rejected": [],
    }
    monkeypatch.setattr(feeds, "preview_feed", lambda *a, **k: all_held)

    from fastapi.testclient import TestClient
    r = TestClient(api.create_app()).get("/api/source-preview?url=http://x/feed")
    assert r.status_code == 200  # was 500: None rejected by float field
    assert r.json()["acceptance_rate"] is None


def test_one_hung_fetch_does_not_stall_the_preview(monkeypatch):
    """A single fetch sleeping past the access deadline must not delay the result."""
    # Codex's wall-clock audit: one article that sleeps WAY past the deadline must
    # not pin Deep Preview — it returns at the cap, with the slow one left 'unknown'.
    monkeypatch.setattr(feeds, "_ACCESS_DEADLINE_S", 0.5)  # shrink the cap for the test
    urls = [f"https://mixed.example/a{i}" for i in range(6)]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(urls))

    def fetcher(url, timeout=10):
        if url.endswith("a0"):
            time.sleep(5)  # one ugly site hangs far past the 0.5s cap
        return READABLE

    start = time.monotonic()
    out = feeds.preview_feed("https://mixed.example/feed", sample=8,
                             client=_AcceptingClient(), fetcher=fetcher)
    elapsed = time.monotonic() - start
    assert elapsed < 2.5  # returned at the cap (~0.5s), NOT after the 5s sleep

    # the hung one is 'unknown' (unverified), the rest read fine
    slow = next(e for e in out["access"]["examples"] if e["url"].endswith("a0"))
    assert slow["access"] == "unknown"
    assert out["access"]["readable"] >= 4