news: harden paywall exclusion at the candidate query + add the missing regressions

Codex's two non-blocking hardening items, folded in before cutover:
- _candidate_articles() now excludes paywalled sources IN-QUERY (before LIMIT 50),
  so flagged stories can't consume candidate slots and leave a full brief thin.
  Dropped the now-redundant post-fetch filter in build_daily_brief.
- Regressions: history retains a viewed paywalled article; sitemap omits a
  paywalled source AND restores it under override="free".
- Aligned test_brief_paywall to the source-level model (paywalled sources carry a
  paywalled homepage, as in production) — it had relied on article-URL detection.

425 backend tests green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-28 18:54:53 -04:00
parent c600145ba5
commit 1c1ecefde8
4 changed files with 50 additions and 9 deletions
+9 -5
View File
@@ -4,6 +4,7 @@ import sqlite3
from .localtime import local_today
from .paywall import is_paywalled, is_paywalled_for_source
from .queries import paywalled_source_ids
def build_daily_brief(
@@ -17,9 +18,8 @@ def build_daily_brief(
# Compose the selection first so we can tell whether anything actually changed.
# A calm daily brief never hands the reader a locked door: paywalled-source
# candidates are excluded outright (no unreadable news), not just demoted.
# candidates are excluded in _candidate_articles (before LIMIT) — no unreadable news.
rows = _candidate_articles(conn, target_date, window_days)
rows = [r for r in rows if not is_paywalled_for_source(r["canonical_url"], r["paywall_override"])]
selected = _select_diverse(rows, limit)
selected_ids = [row["id"] for row in selected]
@@ -107,10 +107,13 @@ def _candidate_articles(
`window_days` so the brief still fills on slow news days. Anything already
featured in a brief within the last 7 days (other than this same date, which
is being rebuilt) is excluded so backfilled stories cannot linger across
consecutive days.
consecutive days. Paywalled sources are excluded here (before LIMIT) so they
can't consume candidate slots and leave an otherwise-full brief thin.
"""
pwx = paywalled_source_ids(conn)
pw_clause = f"AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
return conn.execute(
"""
f"""
SELECT
a.id,
a.title,
@@ -152,6 +155,7 @@ def _candidate_articles(
AND b.brief_date <= date(?)
AND b.brief_date > date(?, '-7 days')
)
{pw_clause}
ORDER BY
is_today DESC,
(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
@@ -159,7 +163,7 @@ def _candidate_articles(
COALESCE(a.published_at, a.discovered_at) DESC
LIMIT 50
""",
(target_date, target_date, target_date, window_days, target_date, target_date, target_date),
(target_date, target_date, target_date, window_days, target_date, target_date, target_date, *pwx),
).fetchall()
+7 -4
View File
@@ -8,9 +8,12 @@ from goodnews.paywall import is_paywalled
def test_brief_prefers_readable_over_higher_scored_paywalled():
c = connect(":memory:"); init_db(c)
today = date.today().isoformat()
# Sources 1-2 are paywalled by their homepage domain (matches production: a paywalled
# source has a paywalled site); 3-7 are free.
homes = {1: "https://www.newscientist.com/", 2: "https://www.nature.com/"}
for sid in range(1, 8):
c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (?,?,?,5)",
(sid, f"S{sid}", f"http://s{sid}/f"))
c.execute("INSERT INTO sources (id,name,feed_url,homepage_url,trust_score) VALUES (?,?,?,?,5)",
(sid, f"S{sid}", f"http://s{sid}/f", homes.get(sid)))
def add(aid, sid, url, score):
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,published_at,url_hash) "
@@ -19,7 +22,7 @@ def test_brief_prefers_readable_over_higher_scored_paywalled():
"cortisol_score,ragebait_score,pr_risk_score,accepted,topic,flavor) "
"VALUES (?,?,?,?,0,0,2,1,'science','discovery')", (aid, score, score, score))
# Paywalled ones are scored HIGHER — readability must still win for the five.
# Paywalled sources are scored HIGHER — they must still be EXCLUDED, leaving the five readable.
add(1, 1, "https://www.newscientist.com/a", 9)
add(2, 2, "https://www.nature.com/b", 9)
add(3, 3, "https://phys.org/c", 4)
@@ -33,4 +36,4 @@ def test_brief_prefers_readable_over_higher_scored_paywalled():
urls = [r["canonical_url"] for r in show_brief(c, brief_date=today, limit=10)]
c.close()
assert len(urls) == 5
assert not any(is_paywalled(u) for u in urls) # five readable chosen over paywalled
assert not any(is_paywalled(u) for u in urls) # paywalled sources excluded; five readable fill
+9
View File
@@ -67,6 +67,15 @@ def test_saved_retains_paywalled():
assert 1 in [r["id"] for r in queries.saved(c, 1)] # you keep what you saved
def test_history_retains_paywalled():
c = connect(":memory:"); init_db(c)
_setup(c)
c.execute("INSERT INTO users (id,email) VALUES (1,'r@x.com')")
c.execute("INSERT INTO user_history (user_id,article_id,at) VALUES (1,1,'2026-06-28T00:00:00')")
c.commit()
assert 1 in [r["id"] for r in queries.history(c, 1)] # a viewed paywalled article stays in history
def test_free_override_restores_eligibility():
c = connect(":memory:"); init_db(c)
today = _setup(c, pay_override="free") # same domain-less source, but marked free
+25
View File
@@ -43,3 +43,28 @@ def test_sitemap(client):
assert "https://upbeatbytes.com/a/1" in xml
assert "https://upbeatbytes.com/today" in xml
assert "<lastmod>2026-06-05</lastmod>" in xml
def test_sitemap_excludes_paywalled_and_restores_on_free(tmp_path, monkeypatch):
db = tmp_path / "sm.sqlite3"
monkeypatch.setenv("GOODNEWS_DB", str(db))
monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "https://upbeatbytes.com")
import importlib
import goodnews.api as api
importlib.reload(api)
from goodnews.db import connect, init_db
c = connect(str(db)); init_db(c)
c.execute("INSERT INTO sources (id,name,feed_url,trust_score,content_visible) VALUES (1,'Free','http://f',5,1)")
c.execute("INSERT INTO sources (id,name,feed_url,trust_score,content_visible,paywall_override) "
"VALUES (2,'Pay','http://p',5,1,'paywalled')")
for aid, sid in [(1, 1), (2, 2)]:
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,published_at) VALUES (?,?,?,?,?,?)",
(aid, sid, f"http://x/{aid}", f"t{aid}", f"h{aid}", "2026-06-05T08:00:00"))
c.execute("INSERT INTO article_scores (article_id,accepted) VALUES (?,1)", (aid,))
c.commit()
tc = TestClient(api.create_app())
xml = tc.get("/sitemap.xml").text
assert "/a/1</loc>" in xml and "/a/2</loc>" not in xml # paywalled source omitted
c.execute("UPDATE sources SET paywall_override='free' WHERE id=2"); c.commit()
assert "/a/2</loc>" in tc.get("/sitemap.xml").text # restored under 'free'
c.close()