89c0fbe1f6
The deploy pipeline runs from the working tree, so a wave of shipped features
had never been committed. This snapshots git to what's actually running.
SEO impression recovery (live + verified):
- Duplicate /a/{id} pages now 301-redirect to their canonical twin instead of returning 404
(a hard 404 silently dropped already-indexed URLs and tanked impressions).
- Dedup representative selection reworked: accepted/serveable -> established
rep (URL stability) -> quality score, so an accepted page never retires to a
rejected rep and an indexed canonical doesn't churn when a newer twin arrives.
- HEAD /a/{id} returns the same status as GET (api_route GET+HEAD) instead of
falling through to the static mount and 404ing.
- `dedup --force-recluster`: cycle-locked, model-free re-cluster to re-apply the
policy to the existing corpus (shared cycle_lock context manager).
- CLI honors GOODNEWS_DB for its default --db (was silently ignored).
Publishing Desk (admin tool to post highlights to X via Web Intents):
- publishing.py queue/rank/handle-resolution; admin UI; full searchable emoji
picker (bundled data, no CDN) for the blurb editor.
Play games + site:
- Bloom (word-wheel), Memory Match, daily ritual set, Zen Den (dev-gated).
- English-only language gate; source prospecting; paywall + dedup hardening.
Tests: full suite green (349). Ignores tightened (node_modules, data/*.db).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
135 lines
6.4 KiB
Python
135 lines
6.4 KiB
Python
"""Deep-preview accessibility check — content-level readable/paywalled/blocked/unknown,
|
|
and the layered verdict (domain rule + sampled access, evidence over domain alone)."""
|
|
import time
|
|
|
|
from goodnews import feeds
|
|
from goodnews.paywall import check_article_access
|
|
|
|
# Canned article bodies used as fetcher responses by the tests in this module.

# An <article> wrapping 80 copies of a short paragraph.
READABLE = b"".join((
    b"<html><body><article>",
    b"<p>Real article text here. </p>" * 80,
    b"</article></body></html>",
))

# A teaser page whose JSON-LD block carries isAccessibleForFree: false.
WALLED_SCHEMA = (
    b'<html><head><script type="application/ld+json">'
    b'{"isAccessibleForFree": false}'
    b"</script></head><body><p>teaser</p></body></html>"
)

# A page whose only content is an explicit subscribe-to-continue phrase.
WALLED_PHRASE = (
    b"<html><body>"
    b"<p>Subscribe to continue reading this story.</p>"
    b"</body></html>"
)

# A page with a single two-letter paragraph.
THIN = b"<html><body>" + b"<p>hi</p>" + b"</body></html>"
|
|
|
|
|
|
def _fetcher(mapping):
|
|
def f(url, timeout=8):
|
|
if mapping.get(url) == "ERR":
|
|
raise RuntimeError("boom")
|
|
return mapping[url]
|
|
return f
|
|
|
|
|
|
def test_classifies_each_access_state():
    """Each canned page (and a failing fetch) maps to its expected access state."""
    fetch = _fetcher({"r": READABLE, "s": WALLED_SCHEMA, "p": WALLED_PHRASE, "t": THIN, "b": "ERR"})
    expected_by_url = {
        "r": "readable",
        "s": "paywalled",  # schema.org isAccessibleForFree:false
        "p": "paywalled",  # explicit wall phrase
        "t": "unknown",    # too thin to tell
        "b": "blocked",    # fetch failed
    }
    # dict order is insertion order, so URLs are checked r, s, p, t, b
    for url, expected in expected_by_url.items():
        assert check_article_access(url, fetch) == expected
|
|
|
|
|
|
def test_does_not_falseflag_a_readable_page():
    """A footer newsletter link on a long article must still classify as readable."""
    # a long article that merely links "subscribe to our newsletter" in the footer
    paragraphs = b"<p>Lots of real content. </p>" * 100
    footer = b"<footer>Subscribe to our newsletter</footer>"
    page = b"<html><body><article>" + paragraphs + footer + b"</article></body></html>"

    fetch = _fetcher({"x": page})
    assert check_article_access("x", fetch) == "readable"
|
|
|
|
|
|
def _items(urls):
    """Wrap each URL in a ``feeds.FeedItem`` titled ``T0``, ``T1``, ... in input order.

    Every item gets the description ``"d"`` and no publication date.
    """
    items = []
    for index, link in enumerate(urls):
        items.append(
            feeds.FeedItem(title=f"T{index}", url=link, description="d", published_at=None)
        )
    return items
|
|
|
|
|
|
def test_preview_verdict_layers_domain_and_sample(monkeypatch):
    """A feed with no paywall rule and readable samples gets the 'fine' verdict."""
    # a non-paywall-domain feed whose sampled articles mostly read fine -> "fine"
    urls = [f"https://good.example/a{n}" for n in range(1, 4)]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(urls))

    class FakeClient:
        """Classifier stub: accepts every article with fixed low scores."""

        model = "test"

        def classify(self, art):
            return {
                "accepted": True,
                "topic": "science",
                "flavor": "discovery",
                "cortisol_score": 1,
                "ragebait_score": 1,
                "pr_risk_score": 2,
            }

    def fetcher(url, timeout=10):
        # every sampled article reads fine
        return READABLE

    out = feeds.preview_feed(
        "https://good.example/feed",
        sample=8,
        client=FakeClient(),
        fetcher=fetcher,
    )

    assert out["paywall_rule"] is False
    access = out["access"]
    assert access["readable"] >= 1
    assert access["paywalled"] == 0
    assert out["access_verdict"] == "fine"
|
|
|
|
|
|
def test_mostly_blocked_is_review_not_fine(monkeypatch):
    """One readable sample among mostly blocked fetches yields the 'review' verdict."""
    # bot-blocked sites (readable in a browser, blocked to our fetcher) must NOT read
    # as 'fine' off one sample, nor as 'reject-ready' — they land in 'review'.
    urls = [f"https://blocky.example/a{i}" for i in range(6)]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(urls))

    class FakeClient:
        """Classifier stub: accepts every article with fixed low scores."""

        model = "test"

        def classify(self, art):
            return {
                "accepted": True,
                "topic": "science",
                "flavor": "discovery",
                "cortisol_score": 1,
                "ragebait_score": 1,
                "pr_risk_score": 2,
            }

    def fetcher(url, timeout=10):
        # the feed fetch + one readable article succeed...
        if url.endswith(("/feed", "a0")):
            return READABLE
        # ...the rest block (bot-blocked)
        raise RuntimeError("403 blocked")

    out = feeds.preview_feed(
        "https://blocky.example/feed",
        sample=8,
        client=FakeClient(),
        fetcher=fetcher,
    )

    access = out["access"]
    assert access["blocked"] >= 4
    assert access["readable"] == 1
    # thin assessable evidence → not 'fine', not 'reject-ready'
    assert out["access_verdict"] == "review"
|
|
|
|
|
|
def test_source_preview_endpoint_handles_null_rate(tmp_path, monkeypatch):
    """GET /api/source-preview returns 200 and a null ``acceptance_rate``.

    ``feeds.preview_feed`` is stubbed to return a canned all-held result whose
    ``acceptance_rate`` is ``None``; the endpoint must serialise it rather than
    fail.
    """
    # All-held (non-English) sample → acceptance_rate is None; the legacy
    # /api/source-preview must not 500 on it (SourcePreview.acceptance_rate is nullable).
    db = tmp_path / "t.sqlite3"
    monkeypatch.setenv("GOODNEWS_DB", str(db))
    monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "http://testserver")

    # Imported and reloaded only after the env vars are patched — presumably so
    # goodnews.api re-reads them at import time (TODO confirm against the module).
    import importlib
    import goodnews.api as api
    importlib.reload(api)

    from goodnews.db import connect, init_db

    # Create the schema in the temp DB; always release the handle, even when
    # init_db or commit raises, so a failure here does not leak the connection.
    conn = connect(str(db))
    try:
        init_db(conn)
        conn.commit()
    finally:
        conn.close()

    all_held = {
        "url": "http://x/feed", "sampled": 4, "classified": True, "accepted": 0,
        "non_english": 4, "acceptance_rate": None, "avg_cortisol": 0.0, "avg_ragebait": 0.0,
        "avg_pr_risk": 0.0, "newest_published": None, "recent_7d": 0,
        "topic_mix": {}, "flavor_mix": {}, "examples_accepted": [], "examples_rejected": [],
    }
    monkeypatch.setattr(feeds, "preview_feed", lambda *a, **k: all_held)

    from fastapi.testclient import TestClient

    r = TestClient(api.create_app()).get("/api/source-preview?url=http://x/feed")
    assert r.status_code == 200  # was 500: None rejected by float field
    assert r.json()["acceptance_rate"] is None
|
|
|
|
|
|
def test_one_hung_fetch_does_not_stall_the_preview(monkeypatch):
    """A single fetch sleeping past the access deadline must not hold up the preview."""
    # Codex's wall-clock audit: one article that sleeps WAY past the deadline must
    # not pin Deep Preview — it returns at the cap, with the slow one left 'unknown'.
    monkeypatch.setattr(feeds, "_ACCESS_DEADLINE_S", 0.5)  # shrink the cap for the test
    urls = [f"https://mixed.example/a{i}" for i in range(6)]
    monkeypatch.setattr(feeds, "parse_feed", lambda raw: _items(urls))

    class FakeClient:
        """Classifier stub: accepts every article with fixed low scores."""

        model = "test"

        def classify(self, art):
            return {
                "accepted": True,
                "topic": "science",
                "flavor": "discovery",
                "cortisol_score": 1,
                "ragebait_score": 1,
                "pr_risk_score": 2,
            }

    def fetcher(url, timeout=10):
        is_hung_site = url.endswith("a0")
        if is_hung_site:
            time.sleep(5)  # one ugly site hangs far past the 0.5s cap
        return READABLE

    started = time.monotonic()
    out = feeds.preview_feed(
        "https://mixed.example/feed",
        sample=8,
        client=FakeClient(),
        fetcher=fetcher,
    )
    elapsed = time.monotonic() - started

    # returned at the cap (~0.5s), NOT after the 5s sleep
    assert elapsed < 2.5

    # the hung one is 'unknown' (unverified), the rest read fine
    access = out["access"]
    slow = next(filter(lambda e: e["url"].endswith("a0"), access["examples"]))
    assert slow["access"] == "unknown"
    assert access["readable"] >= 4
|