"""Deep-preview accessibility check — content-level readable/paywalled/blocked/unknown,
and the layered verdict (domain rule + sampled access, evidence over domain alone)."""
import time
from goodnews import feeds
from goodnews.paywall import check_article_access
# Canned article bodies, one per access state the checker must tell apart.
#
# NOTE(review): these literals had lost their HTML markup (the byte strings were
# split across raw newlines and no longer parsed). The tags below are a minimal
# reconstruction from the surviving text and from the per-case comments in
# test_classifies_each_access_state — confirm them against the markers that
# goodnews.paywall.check_article_access actually looks for.

# A long page: the same paragraph of real text repeated 80 times.
READABLE = b"<html><body>" + (b"<p>Real article text here.</p>" * 80) + b"</body></html>"

# A short teaser whose schema.org JSON-LD declares isAccessibleForFree false.
WALLED_SCHEMA = (
    b'<html><head><script type="application/ld+json">'
    b'{"@type": "NewsArticle", "isAccessibleForFree": false}'
    b"</script></head><body><p>teaser</p></body></html>"
)

# A page whose visible text is an explicit paywall phrase.
WALLED_PHRASE = b"<html><body><p>Subscribe to continue reading this story.</p></body></html>"

# A page with almost no content at all.
THIN = b"<html><body><p>hi</p></body></html>"
def _fetcher(mapping):
def f(url, timeout=8):
if mapping.get(url) == "ERR":
raise RuntimeError("boom")
return mapping[url]
return f
def test_classifies_each_access_state():
    """Each canned page, and a failing fetch, maps to its expected access state."""
    fetch = _fetcher({"r": READABLE, "s": WALLED_SCHEMA, "p": WALLED_PHRASE, "t": THIN, "b": "ERR"})
    expectations = (
        ("r", "readable"),
        ("s", "paywalled"),  # schema.org isAccessibleForFree:false
        ("p", "paywalled"),  # explicit wall phrase
        ("t", "unknown"),    # too thin to tell
        ("b", "blocked"),    # fetch failed
    )
    for url, expected in expectations:
        assert check_article_access(url, fetch) == expected
def test_does_not_falseflag_a_readable_page():
    """A long article with a newsletter link in its footer must stay 'readable'."""
    # a long article that merely links "subscribe to our newsletter" in the footer
    # NOTE(review): the footer markup was missing from this fixture (the literal
    # was empty), so the false-positive case was not being exercised at all; the
    # tags here are reconstructed from the comment above — confirm the wording.
    body = b"<p>Lots of real content.</p>" * 100
    footer = b'<footer><a href="/newsletter">Subscribe to our newsletter</a></footer>'
    html = b"<html><body>" + body + footer + b"</body></html>"
    assert check_article_access("x", _fetcher({"x": html})) == "readable"
def _items(urls):
    """Wrap each URL in a ``feeds.FeedItem`` titled T0, T1, ... in input order.

    Every item gets the description ``"d"`` and no publication date.
    """
    items = []
    for index, link in enumerate(urls):
        items.append(
            feeds.FeedItem(title=f"T{index}", url=link, description="d", published_at=None)
        )
    return items
def test_preview_verdict_layers_domain_and_sample(monkeypatch):
    """A feed with no paywall rule whose sampled articles all read fine is 'fine'."""
    # a non-paywall-domain feed whose sampled articles mostly read fine -> "fine"
    urls = ["https://good.example/a1", "https://good.example/a2", "https://good.example/a3"]

    def fake_parse_feed(raw):
        # The raw feed body is irrelevant: always hand back the three canned items.
        return _items(urls)

    monkeypatch.setattr(feeds, "parse_feed", fake_parse_feed)

    class FakeClient:
        # Stub classifier: accepts every article with the same low scores.
        model = "test"

        def classify(self, art):
            verdict = {"accepted": True, "topic": "science", "flavor": "discovery"}
            verdict.update({"cortisol_score": 1, "ragebait_score": 1, "pr_risk_score": 2})
            return verdict

    def fetcher(url, timeout=10):
        # every sampled article reads fine
        return READABLE

    out = feeds.preview_feed(
        "https://good.example/feed", sample=8, client=FakeClient(), fetcher=fetcher
    )
    access = out["access"]
    assert out["paywall_rule"] is False
    assert access["readable"] >= 1
    assert access["paywalled"] == 0
    assert out["access_verdict"] == "fine"
def test_mostly_blocked_is_review_not_fine(monkeypatch):
    """One readable article among mostly blocked fetches yields the 'review' verdict."""
    # bot-blocked sites (readable in a browser, blocked to our fetcher) must NOT read
    # as 'fine' off one sample, nor as 'reject-ready' — they land in 'review'.
    urls = [f"https://blocky.example/a{i}" for i in range(6)]

    def fake_parse_feed(raw):
        # Ignore the fetched feed body and return the six canned items.
        return _items(urls)

    monkeypatch.setattr(feeds, "parse_feed", fake_parse_feed)

    class FakeClient:
        # Stub classifier: accepts every article with the same low scores.
        model = "test"

        def classify(self, art):
            verdict = {"accepted": True, "topic": "science", "flavor": "discovery"}
            verdict.update({"cortisol_score": 1, "ragebait_score": 1, "pr_risk_score": 2})
            return verdict

    def fetcher(url, timeout=10):
        # the feed fetch + one readable article get through; the rest block (bot-blocked)
        if not url.endswith(("/feed", "a0")):
            raise RuntimeError("403 blocked")
        return READABLE

    out = feeds.preview_feed(
        "https://blocky.example/feed", sample=8, client=FakeClient(), fetcher=fetcher
    )
    access = out["access"]
    assert access["blocked"] >= 4
    assert access["readable"] == 1
    # thin assessable evidence → not 'fine', not 'reject-ready'
    assert out["access_verdict"] == "review"
def test_source_preview_endpoint_handles_null_rate(tmp_path, monkeypatch):
    """The legacy /api/source-preview endpoint returns 200 when acceptance_rate is None."""
    # All-held (non-English) sample → acceptance_rate is None; the legacy
    # /api/source-preview must not 500 on it (SourcePreview.acceptance_rate is nullable).
    db = tmp_path / "t.sqlite3"
    monkeypatch.setenv("GOODNEWS_DB", str(db))
    monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "http://testserver")

    import importlib

    import goodnews.api as api
    # NOTE(review): reloaded after the env vars are set, presumably because the
    # module reads them at import time — confirm.
    importlib.reload(api)
    from goodnews.db import connect, init_db

    # Initialise the schema; close the connection even if init or commit fails,
    # so a failure here does not leave the temp database file held open.
    conn = connect(str(db))
    try:
        init_db(conn)
        conn.commit()
    finally:
        conn.close()

    all_held = {
        "url": "http://x/feed", "sampled": 4, "classified": True, "accepted": 0,
        "non_english": 4, "acceptance_rate": None, "avg_cortisol": 0.0, "avg_ragebait": 0.0,
        "avg_pr_risk": 0.0, "newest_published": None, "recent_7d": 0,
        "topic_mix": {}, "flavor_mix": {}, "examples_accepted": [], "examples_rejected": [],
    }
    # Replace the real preview with the canned all-held result.
    monkeypatch.setattr(feeds, "preview_feed", lambda *a, **k: all_held)

    from fastapi.testclient import TestClient
    r = TestClient(api.create_app()).get("/api/source-preview?url=http://x/feed")
    assert r.status_code == 200  # was 500: None rejected by float field
    assert r.json()["acceptance_rate"] is None
def test_one_hung_fetch_does_not_stall_the_preview(monkeypatch):
    """One article fetch that hangs past the deadline must not hold up the preview."""
    import threading

    # Codex's wall-clock audit: one article that sleeps WAY past the deadline must
    # not pin Deep Preview — it returns at the cap, with the slow one left 'unknown'.
    monkeypatch.setattr(feeds, "_ACCESS_DEADLINE_S", 0.5)  # shrink the cap for the test
    urls = [f"https://mixed.example/a{i}" for i in range(6)]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(urls))

    class FakeClient:
        # Stub classifier: accepts every article with the same low scores.
        model = "test"

        def classify(self, art):
            return {"accepted": True, "topic": "science", "flavor": "discovery",
                    "cortisol_score": 1, "ragebait_score": 1, "pr_risk_score": 2}

    # Set once the preview has returned, so the hung fetch can finish right away
    # instead of sleeping out its full 5 seconds after the test is over.
    release = threading.Event()

    def fetcher(url, timeout=10):
        if url.endswith("a0"):
            release.wait(5)  # one ugly site hangs far past the 0.5s cap
        return READABLE

    start = time.monotonic()
    try:
        out = feeds.preview_feed(
            "https://mixed.example/feed", sample=8, client=FakeClient(), fetcher=fetcher
        )
        elapsed = time.monotonic() - start
    finally:
        release.set()  # unblock the hung fetch whether or not the preview succeeded

    assert elapsed < 2.5  # returned at the cap (~0.5s), NOT after the 5s hang
    # the hung one is 'unknown' (unverified), the rest read fine
    slow = next(e for e in out["access"]["examples"] if e["url"].endswith("a0"))
    assert slow["access"] == "unknown"
    assert out["access"]["readable"] >= 4