diff --git a/goodnews/api.py b/goodnews/api.py index fd76447..bd2cda3 100644 --- a/goodnews/api.py +++ b/goodnews/api.py @@ -1698,11 +1698,14 @@ def create_app() -> FastAPI: @app.get("/sitemap.xml") def sitemap() -> Response: with get_conn() as conn: + pwx = queries.paywalled_source_ids(conn) + pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else "" rows = conn.execute( "SELECT a.id, COALESCE(a.published_at, a.discovered_at) AS lm " "FROM articles a JOIN article_scores s ON s.article_id = a.id " - "WHERE s.accepted = 1 AND a.duplicate_of IS NULL " - "ORDER BY lm DESC LIMIT 5000" + "WHERE s.accepted = 1 AND a.duplicate_of IS NULL" + pw_clause + " " + "ORDER BY lm DESC LIMIT 5000", + pwx, ).fetchall() base = PUBLIC_BASE_URL urls = [ diff --git a/goodnews/briefs.py b/goodnews/briefs.py index 66a326c..84edbb0 100644 --- a/goodnews/briefs.py +++ b/goodnews/briefs.py @@ -15,11 +15,11 @@ def build_daily_brief( ) -> int: target_date = brief_date or local_today() - # Compose the selection first so we can tell whether anything actually - # changed. A calm daily brief shouldn't repeatedly hand the reader a locked - # door: push paywalled candidates below readable ones (stable sort) first. + # Compose the selection first so we can tell whether anything actually changed. + # A calm daily brief never hands the reader a locked door: paywalled-source + # candidates are excluded outright (no unreadable news), not just demoted. rows = _candidate_articles(conn, target_date, window_days) - rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"])) + rows = [r for r in rows if not is_paywalled_for_source(r["canonical_url"], r["paywall_override"])] selected = _select_diverse(rows, limit) selected_ids = [row["id"] for row in selected] diff --git a/goodnews/digest.py b/goodnews/digest.py index 1a96d1d..64f1772 100644 --- a/goodnews/digest.py +++ b/goodnews/digest.py @@ -17,6 +17,7 @@ from html import escape from . import email_send from .localtime import local_now, local_today from .paywall import is_paywalled, is_paywalled_for_source +from .queries import paywalled_source_ids DIGEST_HOUR = int(os.environ.get("GOODNEWS_DIGEST_HOUR", "7")) DIGEST_WINDOW_HOURS = 4 # send between DIGEST_HOUR and +4h, site-local @@ -29,8 +30,10 @@ def _base_url() -> str: def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> list[dict]: """The brief's items with the bits a calm email needs (visible sources only).""" + pwx = paywalled_source_ids(conn) + pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else "" rows = conn.execute( - """ + f""" SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, sc.reason_text, (SELECT summary FROM article_summaries WHERE article_id = a.id) AS summary FROM daily_briefs b @@ -38,11 +41,11 @@ def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> l JOIN articles a ON a.id = bi.article_id JOIN sources s ON s.id = a.source_id LEFT JOIN article_scores sc ON sc.article_id = a.id - WHERE b.brief_date = ? AND s.content_visible = 1 + WHERE b.brief_date = ? AND s.content_visible = 1{pw_clause} ORDER BY bi.rank LIMIT ? """, - (brief_date, limit), + (brief_date, *pwx, limit), ).fetchall() items = [] for r in rows: @@ -72,6 +75,8 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l f"AND at.tag IN ({','.join('?' * len(ftags))}))" ) params += ftags + pwx = paywalled_source_ids(conn) + pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else "" rows = conn.execute( f""" SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, a.source_id, sc.reason_text, @@ -79,12 +84,12 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l FROM articles a JOIN sources s ON s.id = a.source_id JOIN article_scores sc ON sc.article_id = a.id - WHERE sc.accepted = 1 AND a.duplicate_of IS NULL AND s.content_visible = 1 + WHERE sc.accepted = 1 AND a.duplicate_of IS NULL AND s.content_visible = 1{pw_clause} AND ({' OR '.join(ors)}) ORDER BY COALESCE(a.published_at, a.discovered_at) DESC LIMIT 30 """, - params, + [*pwx, *params], ).fetchall() exclude, per_source, out = set(exclude_ids), {}, [] for r in rows: diff --git a/goodnews/queries.py b/goodnews/queries.py index 6819ad1..cca3f48 100644 --- a/goodnews/queries.py +++ b/goodnews/queries.py @@ -374,6 +374,8 @@ def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int = if not header: return {"brief_date": target_date, "title": None, "created_at": None, "items": []} + pwx = paywalled_source_ids(conn) + pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else "" rows = conn.execute( f""" SELECT bi.rank, bi.selection_reason, {_ARTICLE_COLUMNS}, @@ -383,11 +385,11 @@ def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int = JOIN articles a ON a.id = bi.article_id JOIN sources src ON src.id = a.source_id LEFT JOIN article_scores s ON s.article_id = a.id - WHERE b.brief_date = ? AND src.content_visible = 1 + WHERE b.brief_date = ? AND src.content_visible = 1{pw_clause} ORDER BY bi.rank LIMIT ? """, - (target_date, limit), + (target_date, *pwx, limit), ).fetchall() return { "brief_date": header["brief_date"], diff --git a/tests/test_paywall_exclusion.py b/tests/test_paywall_exclusion.py new file mode 100644 index 0000000..f4f4f7c --- /dev/null +++ b/tests/test_paywall_exclusion.py @@ -0,0 +1,76 @@ +"""The no-paywall promise across every public discovery path: paywalled sources are +excluded from brief generation, stored-brief retrieval (/today + /api/brief), and both +digest queries — while Saved keeps anything the reader saved, and a 'free' override +restores eligibility.""" +from datetime import date + +from goodnews.db import connect, init_db +from goodnews import briefs, digest, queries + + +def _setup(c, pay_override="paywalled"): + """Source 1 = paywalled (via override, so the test doesn't depend on the domain list), + source 2 = free. One recent accepted article each, both in a stored brief.""" + today = date.today().isoformat() + c.execute("INSERT INTO sources (id,name,feed_url,content_visible,paywall_override) VALUES (1,'Pay','http://p/f',1,?)", + (pay_override,)) + c.execute("INSERT INTO sources (id,name,feed_url,content_visible) VALUES (2,'Free','http://f/f',1)") + for aid, sid in [(1, 1), (2, 2)]: + c.execute("INSERT INTO articles (id,source_id,canonical_url,title,published_at,url_hash) VALUES (?,?,?,?,?,?)", + (aid, sid, f"http://x/{aid}", f"t{aid}", today + "T12:00:00+00:00", f"h{aid}")) + c.execute("INSERT INTO article_scores (article_id,accepted,topic,flavor) VALUES (?,1,'science','discovery')", (aid,)) + c.execute("INSERT INTO article_summaries (article_id,summary) VALUES (?,?)", (aid, f"s{aid}")) + c.commit() + return today + + +def _store_brief(c, today, ids=(1, 2)): + bid = c.execute("INSERT INTO daily_briefs (brief_date,title) VALUES (?,'t')", (today,)).lastrowid + for rank, aid in enumerate(ids, start=1): + c.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (?,?,?)", (bid, aid, rank)) + c.commit() + + +def test_paywalled_never_stored_in_a_new_brief(): + c = connect(":memory:"); init_db(c) + today = _setup(c) + briefs.build_daily_brief(c, brief_date=today, limit=5, replace=True) + stored = [r["article_id"] for r in c.execute("SELECT article_id FROM daily_brief_items")] + assert stored == [2] # paywalled candidate excluded, never written + + +def test_stored_brief_retrieval_and_digest_omit_paywalled(): + c = connect(":memory:"); init_db(c) + today = _setup(c) + _store_brief(c, today) # both stored directly → retrieval/digest must still filter + assert [r["id"] for r in queries.brief(c)["items"]] == [2] # /today + /api/brief + assert [d["id"] for d in digest.digest_items(c, today)] == [2] # morning email + + +def test_followed_source_email_omits_paywalled(): + c = connect(":memory:"); init_db(c) + today = _setup(c) + c.execute("INSERT INTO users (id,email) VALUES (1,'r@x.com')") + c.execute("INSERT INTO user_follows (user_id,kind,value) VALUES (1,'source','1')") # follow the paywalled one + c.execute("INSERT INTO user_follows (user_id,kind,value) VALUES (1,'source','2')") + c.commit() + ids = [d["id"] for d in digest.followed_digest_items(c, 1, exclude_ids=[])] + assert ids == [2] # even a followed paywalled source is omitted from the email + + +def test_saved_retains_paywalled(): + c = connect(":memory:"); init_db(c) + _setup(c) + c.execute("INSERT INTO users (id,email) VALUES (1,'r@x.com')") + c.execute("INSERT INTO saved_articles (user_id,article_id,saved_at) VALUES (1,1,'2026-06-28T00:00:00')") + c.commit() + assert 1 in [r["id"] for r in queries.saved(c, 1)] # you keep what you saved + + +def test_free_override_restores_eligibility(): + c = connect(":memory:"); init_db(c) + today = _setup(c, pay_override="free") # same domain-less source, but marked free + briefs.build_daily_brief(c, brief_date=today, limit=5, replace=True) + stored = [r["article_id"] for r in c.execute("SELECT article_id FROM daily_brief_items")] + assert 1 in stored and 2 in stored + assert {r["id"] for r in queries.brief(c)["items"]} == {1, 2}