From 1c1ecefde8270d34511186d057c023b968eb11a6 Mon Sep 17 00:00:00 2001 From: jay Date: Sun, 28 Jun 2026 18:54:53 -0400 Subject: [PATCH] news: harden paywall exclusion at the candidate query + add the missing regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex's two non-blocking hardening items, folded in before cutover: - _candidate_articles() now excludes paywalled sources IN-QUERY (before LIMIT 50), so flagged stories can't consume candidate slots and leave a full brief thin. Dropped the now-redundant post-fetch filter in build_daily_brief. - Regressions: history retains a viewed paywalled article; sitemap omits a paywalled source AND restores it under override="free". - Aligned test_brief_paywall to the source-level model (paywalled sources carry a paywalled homepage, as in production) — it had relied on article-URL detection. 425 backend tests green. Co-Authored-By: Claude Opus 4.8 --- goodnews/briefs.py | 14 +++++++++----- tests/test_brief_paywall.py | 11 +++++++---- tests/test_paywall_exclusion.py | 9 +++++++++ tests/test_seo.py | 25 +++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 9 deletions(-) diff --git a/goodnews/briefs.py b/goodnews/briefs.py index 84edbb0..b0d7284 100644 --- a/goodnews/briefs.py +++ b/goodnews/briefs.py @@ -4,6 +4,7 @@ import sqlite3 from .localtime import local_today from .paywall import is_paywalled, is_paywalled_for_source +from .queries import paywalled_source_ids def build_daily_brief( @@ -17,9 +18,8 @@ def build_daily_brief( # Compose the selection first so we can tell whether anything actually changed. # A calm daily brief never hands the reader a locked door: paywalled-source - # candidates are excluded outright (no unreadable news), not just demoted. + # candidates are excluded in _candidate_articles (before LIMIT) — no unreadable news. rows = _candidate_articles(conn, target_date, window_days) - rows = [r for r in rows if not is_paywalled_for_source(r["canonical_url"], r["paywall_override"])] selected = _select_diverse(rows, limit) selected_ids = [row["id"] for row in selected] @@ -107,10 +107,13 @@ def _candidate_articles( `window_days` so the brief still fills on slow news days. Anything already featured in a brief within the last 7 days (other than this same date, which is being rebuilt) is excluded so backfilled stories cannot linger across - consecutive days. + consecutive days. Paywalled sources are excluded here (before LIMIT) so they + can't consume candidate slots and leave an otherwise-full brief thin. """ + pwx = paywalled_source_ids(conn) + pw_clause = f"AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else "" return conn.execute( - """ + f""" SELECT a.id, a.title, @@ -152,6 +155,7 @@ def _candidate_articles( AND b.brief_date <= date(?) AND b.brief_date > date(?, '-7 days') ) + {pw_clause} ORDER BY is_today DESC, (s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score @@ -159,7 +163,7 @@ def _candidate_articles( COALESCE(a.published_at, a.discovered_at) DESC LIMIT 50 """, - (target_date, target_date, target_date, window_days, target_date, target_date, target_date), + (target_date, target_date, target_date, window_days, target_date, target_date, target_date, *pwx), ).fetchall() diff --git a/tests/test_brief_paywall.py b/tests/test_brief_paywall.py index 74075e8..000d3e6 100644 --- a/tests/test_brief_paywall.py +++ b/tests/test_brief_paywall.py @@ -8,9 +8,12 @@ from goodnews.paywall import is_paywalled def test_brief_prefers_readable_over_higher_scored_paywalled(): c = connect(":memory:"); init_db(c) today = date.today().isoformat() + # Sources 1-2 are paywalled by their homepage domain (matches production: a paywalled + # source has a paywalled site); 3-7 are free. + homes = {1: "https://www.newscientist.com/", 2: "https://www.nature.com/"} for sid in range(1, 8): - c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (?,?,?,5)", - (sid, f"S{sid}", f"http://s{sid}/f")) + c.execute("INSERT INTO sources (id,name,feed_url,homepage_url,trust_score) VALUES (?,?,?,?,5)", + (sid, f"S{sid}", f"http://s{sid}/f", homes.get(sid))) def add(aid, sid, url, score): c.execute("INSERT INTO articles (id,source_id,canonical_url,title,published_at,url_hash) " @@ -19,7 +22,7 @@ def test_brief_prefers_readable_over_higher_scored_paywalled(): "cortisol_score,ragebait_score,pr_risk_score,accepted,topic,flavor) " "VALUES (?,?,?,?,0,0,2,1,'science','discovery')", (aid, score, score, score)) - # Paywalled ones are scored HIGHER — readability must still win for the five. + # Paywalled sources are scored HIGHER — they must still be EXCLUDED, leaving the five readable. add(1, 1, "https://www.newscientist.com/a", 9) add(2, 2, "https://www.nature.com/b", 9) add(3, 3, "https://phys.org/c", 4) @@ -33,4 +36,4 @@ def test_brief_prefers_readable_over_higher_scored_paywalled(): urls = [r["canonical_url"] for r in show_brief(c, brief_date=today, limit=10)] c.close() assert len(urls) == 5 - assert not any(is_paywalled(u) for u in urls) # five readable chosen over paywalled + assert not any(is_paywalled(u) for u in urls) # paywalled sources excluded; five readable fill diff --git a/tests/test_paywall_exclusion.py b/tests/test_paywall_exclusion.py index f4f4f7c..245e5a8 100644 --- a/tests/test_paywall_exclusion.py +++ b/tests/test_paywall_exclusion.py @@ -67,6 +67,15 @@ def test_saved_retains_paywalled(): assert 1 in [r["id"] for r in queries.saved(c, 1)] # you keep what you saved +def test_history_retains_paywalled(): + c = connect(":memory:"); init_db(c) + _setup(c) + c.execute("INSERT INTO users (id,email) VALUES (1,'r@x.com')") + c.execute("INSERT INTO user_history (user_id,article_id,at) VALUES (1,1,'2026-06-28T00:00:00')") + c.commit() + assert 1 in [r["id"] for r in queries.history(c, 1)] # a viewed paywalled article stays in history + + def test_free_override_restores_eligibility(): c = connect(":memory:"); init_db(c) today = _setup(c, pay_override="free") # same domain-less source, but marked free diff --git a/tests/test_seo.py b/tests/test_seo.py index 9069af4..164235a 100644 --- a/tests/test_seo.py +++ b/tests/test_seo.py @@ -43,3 +43,28 @@ def test_sitemap(client): assert "https://upbeatbytes.com/a/1" in xml assert "https://upbeatbytes.com/today" in xml assert "2026-06-05" in xml + + +def test_sitemap_excludes_paywalled_and_restores_on_free(tmp_path, monkeypatch): + db = tmp_path / "sm.sqlite3" + monkeypatch.setenv("GOODNEWS_DB", str(db)) + monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "https://upbeatbytes.com") + import importlib + import goodnews.api as api + importlib.reload(api) + from goodnews.db import connect, init_db + c = connect(str(db)); init_db(c) + c.execute("INSERT INTO sources (id,name,feed_url,trust_score,content_visible) VALUES (1,'Free','http://f',5,1)") + c.execute("INSERT INTO sources (id,name,feed_url,trust_score,content_visible,paywall_override) " + "VALUES (2,'Pay','http://p',5,1,'paywalled')") + for aid, sid in [(1, 1), (2, 2)]: + c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,published_at) VALUES (?,?,?,?,?,?)", + (aid, sid, f"http://x/{aid}", f"t{aid}", f"h{aid}", "2026-06-05T08:00:00")) + c.execute("INSERT INTO article_scores (article_id,accepted) VALUES (?,1)", (aid,)) + c.commit() + tc = TestClient(api.create_app()) + xml = tc.get("/sitemap.xml").text + assert "/a/1" in xml and "/a/2" not in xml # paywalled source omitted + c.execute("UPDATE sources SET paywall_override='free' WHERE id=2"); c.commit() + assert "/a/2" in tc.get("/sitemap.xml").text # restored under 'free' + c.close()