news: harden paywall exclusion at the candidate query + add the missing regressions
Codex's two non-blocking hardening items, folded in before cutover: - _candidate_articles() now excludes paywalled sources IN-QUERY (before LIMIT 50), so flagged stories can't consume candidate slots and leave a full brief thin. Dropped the now-redundant post-fetch filter in build_daily_brief. - Regressions: history retains a viewed paywalled article; sitemap omits a paywalled source AND restores it under override="free". - Aligned test_brief_paywall to the source-level model (paywalled sources carry a paywalled homepage, as in production) — it had relied on article-URL detection. 425 backend tests green. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+9
-5
@@ -4,6 +4,7 @@ import sqlite3
|
|||||||
|
|
||||||
from .localtime import local_today
|
from .localtime import local_today
|
||||||
from .paywall import is_paywalled, is_paywalled_for_source
|
from .paywall import is_paywalled, is_paywalled_for_source
|
||||||
|
from .queries import paywalled_source_ids
|
||||||
|
|
||||||
|
|
||||||
def build_daily_brief(
|
def build_daily_brief(
|
||||||
@@ -17,9 +18,8 @@ def build_daily_brief(
|
|||||||
|
|
||||||
# Compose the selection first so we can tell whether anything actually changed.
|
# Compose the selection first so we can tell whether anything actually changed.
|
||||||
# A calm daily brief never hands the reader a locked door: paywalled-source
|
# A calm daily brief never hands the reader a locked door: paywalled-source
|
||||||
# candidates are excluded outright (no unreadable news), not just demoted.
|
# candidates are excluded in _candidate_articles (before LIMIT) — no unreadable news.
|
||||||
rows = _candidate_articles(conn, target_date, window_days)
|
rows = _candidate_articles(conn, target_date, window_days)
|
||||||
rows = [r for r in rows if not is_paywalled_for_source(r["canonical_url"], r["paywall_override"])]
|
|
||||||
selected = _select_diverse(rows, limit)
|
selected = _select_diverse(rows, limit)
|
||||||
selected_ids = [row["id"] for row in selected]
|
selected_ids = [row["id"] for row in selected]
|
||||||
|
|
||||||
@@ -107,10 +107,13 @@ def _candidate_articles(
|
|||||||
`window_days` so the brief still fills on slow news days. Anything already
|
`window_days` so the brief still fills on slow news days. Anything already
|
||||||
featured in a brief within the last 7 days (other than this same date, which
|
featured in a brief within the last 7 days (other than this same date, which
|
||||||
is being rebuilt) is excluded so backfilled stories cannot linger across
|
is being rebuilt) is excluded so backfilled stories cannot linger across
|
||||||
consecutive days.
|
consecutive days. Paywalled sources are excluded here (before LIMIT) so they
|
||||||
|
can't consume candidate slots and leave an otherwise-full brief thin.
|
||||||
"""
|
"""
|
||||||
|
pwx = paywalled_source_ids(conn)
|
||||||
|
pw_clause = f"AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
|
||||||
return conn.execute(
|
return conn.execute(
|
||||||
"""
|
f"""
|
||||||
SELECT
|
SELECT
|
||||||
a.id,
|
a.id,
|
||||||
a.title,
|
a.title,
|
||||||
@@ -152,6 +155,7 @@ def _candidate_articles(
|
|||||||
AND b.brief_date <= date(?)
|
AND b.brief_date <= date(?)
|
||||||
AND b.brief_date > date(?, '-7 days')
|
AND b.brief_date > date(?, '-7 days')
|
||||||
)
|
)
|
||||||
|
{pw_clause}
|
||||||
ORDER BY
|
ORDER BY
|
||||||
is_today DESC,
|
is_today DESC,
|
||||||
(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
|
(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
|
||||||
@@ -159,7 +163,7 @@ def _candidate_articles(
|
|||||||
COALESCE(a.published_at, a.discovered_at) DESC
|
COALESCE(a.published_at, a.discovered_at) DESC
|
||||||
LIMIT 50
|
LIMIT 50
|
||||||
""",
|
""",
|
||||||
(target_date, target_date, target_date, window_days, target_date, target_date, target_date),
|
(target_date, target_date, target_date, window_days, target_date, target_date, target_date, *pwx),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -8,9 +8,12 @@ from goodnews.paywall import is_paywalled
|
|||||||
def test_brief_prefers_readable_over_higher_scored_paywalled():
|
def test_brief_prefers_readable_over_higher_scored_paywalled():
|
||||||
c = connect(":memory:"); init_db(c)
|
c = connect(":memory:"); init_db(c)
|
||||||
today = date.today().isoformat()
|
today = date.today().isoformat()
|
||||||
|
# Sources 1-2 are paywalled by their homepage domain (matches production: a paywalled
|
||||||
|
# source has a paywalled site); 3-7 are free.
|
||||||
|
homes = {1: "https://www.newscientist.com/", 2: "https://www.nature.com/"}
|
||||||
for sid in range(1, 8):
|
for sid in range(1, 8):
|
||||||
c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (?,?,?,5)",
|
c.execute("INSERT INTO sources (id,name,feed_url,homepage_url,trust_score) VALUES (?,?,?,?,5)",
|
||||||
(sid, f"S{sid}", f"http://s{sid}/f"))
|
(sid, f"S{sid}", f"http://s{sid}/f", homes.get(sid)))
|
||||||
|
|
||||||
def add(aid, sid, url, score):
|
def add(aid, sid, url, score):
|
||||||
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,published_at,url_hash) "
|
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,published_at,url_hash) "
|
||||||
@@ -19,7 +22,7 @@ def test_brief_prefers_readable_over_higher_scored_paywalled():
|
|||||||
"cortisol_score,ragebait_score,pr_risk_score,accepted,topic,flavor) "
|
"cortisol_score,ragebait_score,pr_risk_score,accepted,topic,flavor) "
|
||||||
"VALUES (?,?,?,?,0,0,2,1,'science','discovery')", (aid, score, score, score))
|
"VALUES (?,?,?,?,0,0,2,1,'science','discovery')", (aid, score, score, score))
|
||||||
|
|
||||||
# Paywalled ones are scored HIGHER — readability must still win for the five.
|
# Paywalled sources are scored HIGHER — they must still be EXCLUDED, leaving the five readable.
|
||||||
add(1, 1, "https://www.newscientist.com/a", 9)
|
add(1, 1, "https://www.newscientist.com/a", 9)
|
||||||
add(2, 2, "https://www.nature.com/b", 9)
|
add(2, 2, "https://www.nature.com/b", 9)
|
||||||
add(3, 3, "https://phys.org/c", 4)
|
add(3, 3, "https://phys.org/c", 4)
|
||||||
@@ -33,4 +36,4 @@ def test_brief_prefers_readable_over_higher_scored_paywalled():
|
|||||||
urls = [r["canonical_url"] for r in show_brief(c, brief_date=today, limit=10)]
|
urls = [r["canonical_url"] for r in show_brief(c, brief_date=today, limit=10)]
|
||||||
c.close()
|
c.close()
|
||||||
assert len(urls) == 5
|
assert len(urls) == 5
|
||||||
assert not any(is_paywalled(u) for u in urls) # five readable chosen over paywalled
|
assert not any(is_paywalled(u) for u in urls) # paywalled sources excluded; five readable fill
|
||||||
|
|||||||
@@ -67,6 +67,15 @@ def test_saved_retains_paywalled():
|
|||||||
assert 1 in [r["id"] for r in queries.saved(c, 1)] # you keep what you saved
|
assert 1 in [r["id"] for r in queries.saved(c, 1)] # you keep what you saved
|
||||||
|
|
||||||
|
|
||||||
|
def test_history_retains_paywalled():
|
||||||
|
c = connect(":memory:"); init_db(c)
|
||||||
|
_setup(c)
|
||||||
|
c.execute("INSERT INTO users (id,email) VALUES (1,'r@x.com')")
|
||||||
|
c.execute("INSERT INTO user_history (user_id,article_id,at) VALUES (1,1,'2026-06-28T00:00:00')")
|
||||||
|
c.commit()
|
||||||
|
assert 1 in [r["id"] for r in queries.history(c, 1)] # a viewed paywalled article stays in history
|
||||||
|
|
||||||
|
|
||||||
def test_free_override_restores_eligibility():
|
def test_free_override_restores_eligibility():
|
||||||
c = connect(":memory:"); init_db(c)
|
c = connect(":memory:"); init_db(c)
|
||||||
today = _setup(c, pay_override="free") # same domain-less source, but marked free
|
today = _setup(c, pay_override="free") # same domain-less source, but marked free
|
||||||
|
|||||||
@@ -43,3 +43,28 @@ def test_sitemap(client):
|
|||||||
assert "https://upbeatbytes.com/a/1" in xml
|
assert "https://upbeatbytes.com/a/1" in xml
|
||||||
assert "https://upbeatbytes.com/today" in xml
|
assert "https://upbeatbytes.com/today" in xml
|
||||||
assert "<lastmod>2026-06-05</lastmod>" in xml
|
assert "<lastmod>2026-06-05</lastmod>" in xml
|
||||||
|
|
||||||
|
|
||||||
|
def test_sitemap_excludes_paywalled_and_restores_on_free(tmp_path, monkeypatch):
|
||||||
|
db = tmp_path / "sm.sqlite3"
|
||||||
|
monkeypatch.setenv("GOODNEWS_DB", str(db))
|
||||||
|
monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "https://upbeatbytes.com")
|
||||||
|
import importlib
|
||||||
|
import goodnews.api as api
|
||||||
|
importlib.reload(api)
|
||||||
|
from goodnews.db import connect, init_db
|
||||||
|
c = connect(str(db)); init_db(c)
|
||||||
|
c.execute("INSERT INTO sources (id,name,feed_url,trust_score,content_visible) VALUES (1,'Free','http://f',5,1)")
|
||||||
|
c.execute("INSERT INTO sources (id,name,feed_url,trust_score,content_visible,paywall_override) "
|
||||||
|
"VALUES (2,'Pay','http://p',5,1,'paywalled')")
|
||||||
|
for aid, sid in [(1, 1), (2, 2)]:
|
||||||
|
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,published_at) VALUES (?,?,?,?,?,?)",
|
||||||
|
(aid, sid, f"http://x/{aid}", f"t{aid}", f"h{aid}", "2026-06-05T08:00:00"))
|
||||||
|
c.execute("INSERT INTO article_scores (article_id,accepted) VALUES (?,1)", (aid,))
|
||||||
|
c.commit()
|
||||||
|
tc = TestClient(api.create_app())
|
||||||
|
xml = tc.get("/sitemap.xml").text
|
||||||
|
assert "/a/1</loc>" in xml and "/a/2</loc>" not in xml # paywalled source omitted
|
||||||
|
c.execute("UPDATE sources SET paywall_override='free' WHERE id=2"); c.commit()
|
||||||
|
assert "/a/2</loc>" in tc.get("/sitemap.xml").text # restored under 'free'
|
||||||
|
c.close()
|
||||||
|
|||||||
Reference in New Issue
Block a user