news: close the remaining no-paywall bypass paths (Codex audit)

queries.feed was the main chokepoint, but several discovery paths run their own SQL. Apply the paywalled-source exclusion to all of them so that "no paywalls" is truly site-wide:

- briefs.build_daily_brief: EXCLUDE paywalled candidates (was: demote), so they are never stored in a new brief.
- queries.brief: stored-brief retrieval (covers /today + /api/brief) filters out paywalled sources.
- digest.digest_items + followed_digest_items: the morning email and "from what you follow" omit paywalled sources.
- sitemap(): paywalled article pages are excluded from the sitemap.

The SQL paths (queries.brief, both digest queries, sitemap) reuse queries.paywalled_source_ids; build_daily_brief filters its candidates with is_paywalled_for_source. The admin override still wins.

Regression tests (test_paywall_exclusion.py): a paywalled-source article is never stored in a new brief; /today and the digest omit it; the followed-source email omits it; Saved retains it; a 'free' override restores its eligibility. 423 backend tests green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 17:22:52 -04:00
parent 0d21231597
commit c600145ba5
5 changed files with 99 additions and 13 deletions
@@ -1698,11 +1698,14 @@ def create_app() -> FastAPI:
    @app.get("/sitemap.xml")
    def sitemap() -> Response:
        with get_conn() as conn:
+            pwx = queries.paywalled_source_ids(conn)
+            pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
            rows = conn.execute(
                "SELECT a.id, COALESCE(a.published_at, a.discovered_at) AS lm "
                "FROM articles a JOIN article_scores s ON s.article_id = a.id "
-                "WHERE s.accepted = 1 AND a.duplicate_of IS NULL "
-                "ORDER BY lm DESC LIMIT 5000"
+                "WHERE s.accepted = 1 AND a.duplicate_of IS NULL" + pw_clause + " "
+                "ORDER BY lm DESC LIMIT 5000",
+                pwx,
            ).fetchall()
        base = PUBLIC_BASE_URL
        urls = [
@@ -15,11 +15,11 @@ def build_daily_brief(
 ) -> int:
    target_date = brief_date or local_today()

-    # Compose the selection first so we can tell whether anything actually
-    # changed. A calm daily brief shouldn't repeatedly hand the reader a locked
-    # door: push paywalled candidates below readable ones (stable sort) first.
+    # Compose the selection first so we can tell whether anything actually changed.
+    # A calm daily brief never hands the reader a locked door: paywalled-source
+    # candidates are excluded outright (no unreadable news), not just demoted.
    rows = _candidate_articles(conn, target_date, window_days)
-    rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"]))
+    rows = [r for r in rows if not is_paywalled_for_source(r["canonical_url"], r["paywall_override"])]
    selected = _select_diverse(rows, limit)
    selected_ids = [row["id"] for row in selected]

@@ -17,6 +17,7 @@ from html import escape
 from . import email_send
 from .localtime import local_now, local_today
 from .paywall import is_paywalled, is_paywalled_for_source
+from .queries import paywalled_source_ids

 DIGEST_HOUR = int(os.environ.get("GOODNEWS_DIGEST_HOUR", "7"))
 DIGEST_WINDOW_HOURS = 4  # send between DIGEST_HOUR and +4h, site-local
@@ -29,8 +30,10 @@ def _base_url() -> str:

 def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> list[dict]:
    """The brief's items with the bits a calm email needs (visible sources only)."""
+    pwx = paywalled_source_ids(conn)
+    pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
    rows = conn.execute(
-        """
+        f"""
        SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, sc.reason_text,
               (SELECT summary FROM article_summaries WHERE article_id = a.id) AS summary
        FROM daily_briefs b
@@ -38,11 +41,11 @@ def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> l
        JOIN articles a ON a.id = bi.article_id
        JOIN sources s ON s.id = a.source_id
        LEFT JOIN article_scores sc ON sc.article_id = a.id
-        WHERE b.brief_date = ? AND s.content_visible = 1
+        WHERE b.brief_date = ? AND s.content_visible = 1{pw_clause}
        ORDER BY bi.rank
        LIMIT ?
        """,
-        (brief_date, limit),
+        (brief_date, *pwx, limit),
    ).fetchall()
    items = []
    for r in rows:
@@ -72,6 +75,8 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l
            f"AND at.tag IN ({','.join('?' * len(ftags))}))"
        )
        params += ftags
+    pwx = paywalled_source_ids(conn)
+    pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
    rows = conn.execute(
        f"""
        SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, a.source_id, sc.reason_text,
@@ -79,12 +84,12 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l
        FROM articles a
        JOIN sources s ON s.id = a.source_id
        JOIN article_scores sc ON sc.article_id = a.id
-        WHERE sc.accepted = 1 AND a.duplicate_of IS NULL AND s.content_visible = 1
+        WHERE sc.accepted = 1 AND a.duplicate_of IS NULL AND s.content_visible = 1{pw_clause}
          AND ({' OR '.join(ors)})
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT 30
        """,
-        params,
+        [*pwx, *params],
    ).fetchall()
    exclude, per_source, out = set(exclude_ids), {}, []
    for r in rows:
@@ -374,6 +374,8 @@ def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int =
    if not header:
        return {"brief_date": target_date, "title": None, "created_at": None, "items": []}

+    pwx = paywalled_source_ids(conn)
+    pw_clause = f" AND a.source_id NOT IN ({','.join('?' * len(pwx))})" if pwx else ""
    rows = conn.execute(
        f"""
        SELECT bi.rank, bi.selection_reason, {_ARTICLE_COLUMNS},
@@ -383,11 +385,11 @@ def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int =
        JOIN articles a ON a.id = bi.article_id
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
-        WHERE b.brief_date = ? AND src.content_visible = 1
+        WHERE b.brief_date = ? AND src.content_visible = 1{pw_clause}
        ORDER BY bi.rank
        LIMIT ?
        """,
-        (target_date, limit),
+        (target_date, *pwx, limit),
    ).fetchall()
    return {
        "brief_date": header["brief_date"],