news: close the remaining no-paywall bypass paths (Codex audit)
queries.feed was the main chokepoint, but several discovery paths have their own SQL. Apply the shared source exclusion to all of them so "no paywalls" is truly site-wide:
- briefs.build_daily_brief: EXCLUDE paywalled candidates (was: demote) — never stored in a new brief.
- queries.brief: stored-brief retrieval (covers /today + /api/brief) filters out paywalled sources.
- digest.digest_items + followed_digest_items: the morning email + "from what you follow" omit paywalled sources.
- sitemap(): paywalled article pages excluded from the sitemap.

All reuse queries.paywalled_source_ids (admin override still wins).

Regression tests (test_paywall_exclusion.py): never stored in a new brief; /today + digest omit it; followed-source email omits it; Saved retains it; 'free' override restores eligibility. 423 backend tests green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+5
-2
@@ -1698,11 +1698,14 @@ def create_app() -> FastAPI:
|
|||||||
@app.get("/sitemap.xml")
|
@app.get("/sitemap.xml")
|
||||||
def sitemap() -> Response:
|
def sitemap() -> Response:
|
||||||
with get_conn() as conn:
|
with get_conn() as conn:
|
||||||
|
pwx = queries.paywalled_source_ids(conn)
|
||||||
|
pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
"SELECT a.id, COALESCE(a.published_at, a.discovered_at) AS lm "
|
"SELECT a.id, COALESCE(a.published_at, a.discovered_at) AS lm "
|
||||||
"FROM articles a JOIN article_scores s ON s.article_id = a.id "
|
"FROM articles a JOIN article_scores s ON s.article_id = a.id "
|
||||||
"WHERE s.accepted = 1 AND a.duplicate_of IS NULL "
|
"WHERE s.accepted = 1 AND a.duplicate_of IS NULL" + pw_clause + " "
|
||||||
"ORDER BY lm DESC LIMIT 5000"
|
"ORDER BY lm DESC LIMIT 5000",
|
||||||
|
pwx,
|
||||||
).fetchall()
|
).fetchall()
|
||||||
base = PUBLIC_BASE_URL
|
base = PUBLIC_BASE_URL
|
||||||
urls = [
|
urls = [
|
||||||
|
|||||||
+4
-4
@@ -15,11 +15,11 @@ def build_daily_brief(
|
|||||||
) -> int:
|
) -> int:
|
||||||
target_date = brief_date or local_today()
|
target_date = brief_date or local_today()
|
||||||
|
|
||||||
# Compose the selection first so we can tell whether anything actually
|
# Compose the selection first so we can tell whether anything actually changed.
|
||||||
# changed. A calm daily brief shouldn't repeatedly hand the reader a locked
|
# A calm daily brief never hands the reader a locked door: paywalled-source
|
||||||
# door: push paywalled candidates below readable ones (stable sort) first.
|
# candidates are excluded outright (no unreadable news), not just demoted.
|
||||||
rows = _candidate_articles(conn, target_date, window_days)
|
rows = _candidate_articles(conn, target_date, window_days)
|
||||||
rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"]))
|
rows = [r for r in rows if not is_paywalled_for_source(r["canonical_url"], r["paywall_override"])]
|
||||||
selected = _select_diverse(rows, limit)
|
selected = _select_diverse(rows, limit)
|
||||||
selected_ids = [row["id"] for row in selected]
|
selected_ids = [row["id"] for row in selected]
|
||||||
|
|
||||||
|
|||||||
+10
-5
@@ -17,6 +17,7 @@ from html import escape
|
|||||||
from . import email_send
|
from . import email_send
|
||||||
from .localtime import local_now, local_today
|
from .localtime import local_now, local_today
|
||||||
from .paywall import is_paywalled, is_paywalled_for_source
|
from .paywall import is_paywalled, is_paywalled_for_source
|
||||||
|
from .queries import paywalled_source_ids
|
||||||
|
|
||||||
DIGEST_HOUR = int(os.environ.get("GOODNEWS_DIGEST_HOUR", "7"))
|
DIGEST_HOUR = int(os.environ.get("GOODNEWS_DIGEST_HOUR", "7"))
|
||||||
DIGEST_WINDOW_HOURS = 4 # send between DIGEST_HOUR and +4h, site-local
|
DIGEST_WINDOW_HOURS = 4 # send between DIGEST_HOUR and +4h, site-local
|
||||||
@@ -29,8 +30,10 @@ def _base_url() -> str:
|
|||||||
|
|
||||||
def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> list[dict]:
|
def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> list[dict]:
|
||||||
"""The brief's items with the bits a calm email needs (visible sources only)."""
|
"""The brief's items with the bits a calm email needs (visible sources only)."""
|
||||||
|
pwx = paywalled_source_ids(conn)
|
||||||
|
pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
"""
|
f"""
|
||||||
SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, sc.reason_text,
|
SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, sc.reason_text,
|
||||||
(SELECT summary FROM article_summaries WHERE article_id = a.id) AS summary
|
(SELECT summary FROM article_summaries WHERE article_id = a.id) AS summary
|
||||||
FROM daily_briefs b
|
FROM daily_briefs b
|
||||||
@@ -38,11 +41,11 @@ def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> l
|
|||||||
JOIN articles a ON a.id = bi.article_id
|
JOIN articles a ON a.id = bi.article_id
|
||||||
JOIN sources s ON s.id = a.source_id
|
JOIN sources s ON s.id = a.source_id
|
||||||
LEFT JOIN article_scores sc ON sc.article_id = a.id
|
LEFT JOIN article_scores sc ON sc.article_id = a.id
|
||||||
WHERE b.brief_date = ? AND s.content_visible = 1
|
WHERE b.brief_date = ? AND s.content_visible = 1{pw_clause}
|
||||||
ORDER BY bi.rank
|
ORDER BY bi.rank
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
""",
|
""",
|
||||||
(brief_date, limit),
|
(brief_date, *pwx, limit),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
items = []
|
items = []
|
||||||
for r in rows:
|
for r in rows:
|
||||||
@@ -72,6 +75,8 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l
|
|||||||
f"AND at.tag IN ({','.join('?' * len(ftags))}))"
|
f"AND at.tag IN ({','.join('?' * len(ftags))}))"
|
||||||
)
|
)
|
||||||
params += ftags
|
params += ftags
|
||||||
|
pwx = paywalled_source_ids(conn)
|
||||||
|
pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
f"""
|
f"""
|
||||||
SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, a.source_id, sc.reason_text,
|
SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, a.source_id, sc.reason_text,
|
||||||
@@ -79,12 +84,12 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l
|
|||||||
FROM articles a
|
FROM articles a
|
||||||
JOIN sources s ON s.id = a.source_id
|
JOIN sources s ON s.id = a.source_id
|
||||||
JOIN article_scores sc ON sc.article_id = a.id
|
JOIN article_scores sc ON sc.article_id = a.id
|
||||||
WHERE sc.accepted = 1 AND a.duplicate_of IS NULL AND s.content_visible = 1
|
WHERE sc.accepted = 1 AND a.duplicate_of IS NULL AND s.content_visible = 1{pw_clause}
|
||||||
AND ({' OR '.join(ors)})
|
AND ({' OR '.join(ors)})
|
||||||
ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
|
ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
|
||||||
LIMIT 30
|
LIMIT 30
|
||||||
""",
|
""",
|
||||||
params,
|
[*pwx, *params],
|
||||||
).fetchall()
|
).fetchall()
|
||||||
exclude, per_source, out = set(exclude_ids), {}, []
|
exclude, per_source, out = set(exclude_ids), {}, []
|
||||||
for r in rows:
|
for r in rows:
|
||||||
|
|||||||
+4
-2
@@ -374,6 +374,8 @@ def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int =
|
|||||||
if not header:
|
if not header:
|
||||||
return {"brief_date": target_date, "title": None, "created_at": None, "items": []}
|
return {"brief_date": target_date, "title": None, "created_at": None, "items": []}
|
||||||
|
|
||||||
|
pwx = paywalled_source_ids(conn)
|
||||||
|
pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
f"""
|
f"""
|
||||||
SELECT bi.rank, bi.selection_reason, {_ARTICLE_COLUMNS},
|
SELECT bi.rank, bi.selection_reason, {_ARTICLE_COLUMNS},
|
||||||
@@ -383,11 +385,11 @@ def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int =
|
|||||||
JOIN articles a ON a.id = bi.article_id
|
JOIN articles a ON a.id = bi.article_id
|
||||||
JOIN sources src ON src.id = a.source_id
|
JOIN sources src ON src.id = a.source_id
|
||||||
LEFT JOIN article_scores s ON s.article_id = a.id
|
LEFT JOIN article_scores s ON s.article_id = a.id
|
||||||
WHERE b.brief_date = ? AND src.content_visible = 1
|
WHERE b.brief_date = ? AND src.content_visible = 1{pw_clause}
|
||||||
ORDER BY bi.rank
|
ORDER BY bi.rank
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
""",
|
""",
|
||||||
(target_date, limit),
|
(target_date, *pwx, limit),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
return {
|
return {
|
||||||
"brief_date": header["brief_date"],
|
"brief_date": header["brief_date"],
|
||||||
|
|||||||
@@ -0,0 +1,76 @@
|
|||||||
|
"""The no-paywall promise across every public discovery path: paywalled sources are
|
||||||
|
excluded from brief generation, stored-brief retrieval (/today + /api/brief), and both
|
||||||
|
digest queries — while Saved keeps anything the reader saved, and a 'free' override
|
||||||
|
restores eligibility."""
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
from goodnews.db import connect, init_db
|
||||||
|
from goodnews import briefs, digest, queries
|
||||||
|
|
||||||
|
|
||||||
|
def _setup(c, pay_override="paywalled"):
    """Seed two visible sources, each with one accepted, summarised article.

    Source 1 ("Pay") gets ``pay_override`` as its ``paywall_override``, so the
    tests control paywall status without depending on the domain list.
    Source 2 ("Free") leaves that column at its default. Article N belongs to
    source N and is published today at 12:00 UTC.

    Returns today's date as an ISO string (``date.today()``).
    """
    today = date.today().isoformat()
    published_at = today + "T12:00:00+00:00"

    c.execute(
        "INSERT INTO sources (id,name,feed_url,content_visible,paywall_override) VALUES (1,'Pay','http://p/f',1,?)",
        (pay_override,),
    )
    c.execute("INSERT INTO sources (id,name,feed_url,content_visible) VALUES (2,'Free','http://f/f',1)")

    # Article id and source id coincide: article 1 -> source 1, article 2 -> source 2.
    for n in (1, 2):
        c.execute(
            "INSERT INTO articles (id,source_id,canonical_url,title,published_at,url_hash) VALUES (?,?,?,?,?,?)",
            (n, n, f"http://x/{n}", f"t{n}", published_at, f"h{n}"),
        )
        c.execute(
            "INSERT INTO article_scores (article_id,accepted,topic,flavor) VALUES (?,1,'science','discovery')",
            (n,),
        )
        c.execute(
            "INSERT INTO article_summaries (article_id,summary) VALUES (?,?)",
            (n, f"s{n}"),
        )

    c.commit()
    return today
|
||||||
|
|
||||||
|
|
||||||
|
def _store_brief(c, today, ids=(1, 2)):
    """Write a brief for ``today`` directly, bypassing brief generation.

    Inserts one ``daily_briefs`` row and one ``daily_brief_items`` row per
    article id in ``ids``, ranked 1..n in the order given, then commits.
    """
    header = c.execute("INSERT INTO daily_briefs (brief_date,title) VALUES (?,'t')", (today,))
    brief_id = header.lastrowid
    for position, article_id in enumerate(ids, 1):
        c.execute(
            "INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (?,?,?)",
            (brief_id, article_id, position),
        )
    c.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def test_paywalled_never_stored_in_a_new_brief():
    """Brief generation must not write an article from a paywalled source."""
    conn = connect(":memory:")
    init_db(conn)
    brief_day = _setup(conn)

    briefs.build_daily_brief(conn, brief_date=brief_day, limit=5, replace=True)

    cursor = conn.execute("SELECT article_id FROM daily_brief_items")
    written = [row["article_id"] for row in cursor]
    # Only the free source's article survives; the paywalled candidate is never written.
    assert written == [2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_stored_brief_retrieval_and_digest_omit_paywalled():
    """A paywalled item already in a stored brief is filtered on the way out.

    Both articles are written into the brief directly (bypassing generation),
    so the retrieval query and the digest query each have to exclude the
    paywalled source on their own.
    """
    c = connect(":memory:")
    init_db(c)
    today = _setup(c)
    _store_brief(c, today)  # both stored directly → retrieval/digest must still filter

    # Pin the date explicitly. The brief was stored under date.today(), while
    # queries.brief resolves its own default date (presumably site-local, as
    # build_daily_brief does with local_today()); the two can disagree around
    # midnight, which would make this test fail for reasons unrelated to paywalls.
    stored = queries.brief(c, brief_date=today)
    assert [r["id"] for r in stored["items"]] == [2]  # /today + /api/brief
    assert [d["id"] for d in digest.digest_items(c, today)] == [2]  # morning email
|
||||||
|
|
||||||
|
|
||||||
|
def test_followed_source_email_omits_paywalled():
    """Following a paywalled source does not bring it into the follow email."""
    conn = connect(":memory:")
    init_db(conn)
    _setup(conn)

    conn.execute("INSERT INTO users (id,email) VALUES (1,'r@x.com')")
    # The reader follows both sources, including the paywalled one (source 1).
    conn.execute("INSERT INTO user_follows (user_id,kind,value) VALUES (1,'source','1')")
    conn.execute("INSERT INTO user_follows (user_id,kind,value) VALUES (1,'source','2')")
    conn.commit()

    followed = digest.followed_digest_items(conn, 1, exclude_ids=[])
    # Even a followed paywalled source is left out of the email.
    assert [item["id"] for item in followed] == [2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_saved_retains_paywalled():
    """Saved is exempt from the exclusion: a saved paywalled article stays listed."""
    conn = connect(":memory:")
    init_db(conn)
    _setup(conn)

    conn.execute("INSERT INTO users (id,email) VALUES (1,'r@x.com')")
    conn.execute("INSERT INTO saved_articles (user_id,article_id,saved_at) VALUES (1,1,'2026-06-28T00:00:00')")
    conn.commit()

    saved_ids = [row["id"] for row in queries.saved(conn, 1)]
    # You keep what you saved, paywalled or not.
    assert 1 in saved_ids
|
||||||
|
|
||||||
|
|
||||||
|
def test_free_override_restores_eligibility():
    """With a 'free' override, source 1 is eligible again everywhere.

    Same fixture as the paywalled tests, but source 1 is marked free, so both
    articles should be written by brief generation and returned by retrieval.
    """
    c = connect(":memory:")
    init_db(c)
    today = _setup(c, pay_override="free")  # same domain-less source, but marked free

    briefs.build_daily_brief(c, brief_date=today, limit=5, replace=True)
    stored = [r["article_id"] for r in c.execute("SELECT article_id FROM daily_brief_items")]
    assert 1 in stored and 2 in stored

    # Read back with the same date the brief was built for. Relying on
    # queries.brief's default date (presumably site-local) can miss a brief
    # stored under date.today() when the two dates differ around midnight.
    retrieved = queries.brief(c, brief_date=today)
    assert {r["id"] for r in retrieved["items"]} == {1, 2}
|
||||||
Reference in New Issue
Block a user