Sources: per-source paywall override (3-state) — fix domain-rule mis-flags

The Articles inspector revealed paywall is domain-coarse: nytimes.com is flagged,
so NY Times Learning's free Word-of-the-Day inherits 🔒 — and that flag isn't
cosmetic, it deprioritizes the content in feed sort + lead selection. Add a
per-source override so admins can correct it after inspecting.

- sources.paywall_override: NULL (domain rule) | 'free' | 'paywalled'.
- paywall.py: keep the low-level is_paywalled(url) (domain rule only); add
  is_paywalled_for_source(url, override) for the EFFECTIVE decision. The domain
  helper is deliberately not patched globally (per Codex), so "domain says X" stays
  distinguishable from "overridden".
- Threaded everywhere ranking/UI touches paywall, via src.paywall_override on the
  shared _ARTICLE_COLUMNS + the source-aware helper: feed sort, /api/since, replace,
  lead selection, Article badge, brief composition (briefs.py), digest, source_health
  (table 🔒), the Articles inspector, and the review/attention check — so ranking and
  UI always agree.
- Endpoint POST /api/admin/sources/{id}/paywall {override}; admin UI: a select in the
  inspector header (Use domain rule / Treat as free / Treat as paywalled) + the basis
  ("ON (domain)" / "OFF (override)"), optimistic so the panel stays open.

Test: domain rule → paywalled in table+inspector+feed badge; 'free' → off in all
three; validation 422 + 404. 242 pytest + 11 vitest.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-12 22:10:44 -04:00
parent 7279b18fdc
commit 2dbe73430c
9 changed files with 130 additions and 28 deletions
+26 -6
View File
@@ -42,7 +42,7 @@ from .hero import safe_to_lead
from .llm import LocalModelClient
from .moods import MOODS, mood_filter
from .lanes import build_lane_pool
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
from .taxonomy import FAMILIES, FLAVORS, TOPICS
# Edge-cache directives for GLOBAL endpoints — responses that depend only on the
@@ -216,7 +216,7 @@ def _pick_lead(items: list[dict]) -> list[dict]:
still appear in the set — they just don't lead.
"""
def gentle(a: dict) -> bool:
return safe_to_lead(a) and not is_paywalled(a.get("canonical_url"))
return safe_to_lead(a) and not is_paywalled_for_source(a.get("canonical_url"), a.get("paywall_override"))
for ok in (
lambda a: gentle(a) and bool(a.get("image_url")),
@@ -290,7 +290,7 @@ class Article(BaseModel):
reason_text=row.get("reason_text"),
model_name=row.get("model_name"),
rank=row.get("rank"),
paywalled=is_paywalled(row.get("canonical_url")),
paywalled=is_paywalled_for_source(row.get("canonical_url"), row.get("paywall_override")),
tags=[t for t in (raw_tags.split(",") if raw_tags else []) if t],
)
@@ -448,6 +448,10 @@ class SourceVisibilityBody(BaseModel):
visible: bool = True
# Request body for POST /api/admin/sources/{sid}/paywall (per-source paywall override).
class SourcePaywallBody(BaseModel):
    # Declared as a free-form optional string; the allowed values are enforced
    # by the endpoint handler, not by this model.
    override: str | None = None  # None = use domain rule · 'free' · 'paywalled'
class CandidateSuggestBody(BaseModel):
feed_url: str = ""
name: str | None = None
@@ -1164,6 +1168,22 @@ def create_app() -> FastAPI:
"has_more": len(arts) == limit,
}
@app.post("/api/admin/sources/{sid}/paywall")
def admin_source_paywall(sid: int, body: SourcePaywallBody, request: Request) -> dict:
    # Admin-only: set or clear the per-source paywall override, which corrects
    # domain-rule false positives (NY Times Learning is free) and false
    # negatives. The stored value feeds feed/lead/brief ranking and badges
    # through is_paywalled_for_source.
    #
    # Any falsy value (None, "") means "no override, use the domain rule".
    choice = body.override if body.override else None
    if choice is not None and choice not in ("free", "paywalled"):
        raise HTTPException(status_code=422, detail="override must be null, 'free', or 'paywalled'")
    with get_conn() as conn:
        _require_admin(conn, request)
        updated = conn.execute(
            "UPDATE sources SET paywall_override = ? WHERE id = ?",
            (choice, sid),
        ).rowcount
        # No row touched means the source id does not exist; nothing to commit.
        if updated == 0:
            raise HTTPException(status_code=404, detail="source not found")
        conn.commit()
        return {"ok": True, "override": choice}
# --- Source candidates (supervised add-a-source pipeline) ----------------
def _candidate_dict(row) -> dict:
@@ -1565,7 +1585,7 @@ def create_app() -> FastAPI:
)
# Keep the top of a browse view readable: stable-sort paywalled items
# below readable ones (composite order preserved within each group).
rows = sorted(rows, key=lambda r: is_paywalled(r["canonical_url"]))
rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"]))
return FeedResponse(
topic=topic,
flavor=flavor,
@@ -1739,7 +1759,7 @@ def create_app() -> FastAPI:
rows = queries.feed(conn, sort="latest", since=since, limit=60, **kw)
if fp.avoid_terms:
rows = filter_articles(rows, fp, now)
rows = sorted(rows, key=lambda r: is_paywalled(r["canonical_url"]))
rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"]))
return FeedResponse(topic=None, flavor=None, count=len(rows), items=[Article.from_row(r) for r in rows[:8]])
@app.get("/api/brief", response_model=BriefResponse)
@@ -1820,7 +1840,7 @@ def create_app() -> FastAPI:
for r in filter_articles(rows, fp, now):
if r["id"] in excl:
continue
if avoid_paywall and is_paywalled(r["canonical_url"]):
if avoid_paywall and is_paywalled_for_source(r["canonical_url"], r["paywall_override"]):
continue
if gentle and not safe_to_lead(r):
continue
+3 -2
View File
@@ -3,7 +3,7 @@ from __future__ import annotations
import sqlite3
from .localtime import local_today
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
def build_daily_brief(
@@ -19,7 +19,7 @@ def build_daily_brief(
# changed. A calm daily brief shouldn't repeatedly hand the reader a locked
# door: push paywalled candidates below readable ones (stable sort) first.
rows = _candidate_articles(conn, target_date, window_days)
rows = sorted(rows, key=lambda r: is_paywalled(r["canonical_url"]))
rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"]))
selected = _select_diverse(rows, limit)
selected_ids = [row["id"] for row in selected]
@@ -121,6 +121,7 @@ def _candidate_articles(
src.name AS source_name,
src.default_category,
src.trust_score,
src.paywall_override AS paywall_override,
s.constructive_score,
s.cortisol_score,
s.ragebait_score,
+1
View File
@@ -392,6 +392,7 @@ def _migrate(conn: sqlite3.Connection) -> None:
"consecutive_failures": "INTEGER NOT NULL DEFAULT 0",
"review_flag": "INTEGER NOT NULL DEFAULT 0",
"review_reason": "TEXT",
"paywall_override": "TEXT", # NULL = use domain rule · 'free' · 'paywalled'
}
for column, decl in health_columns.items():
if column not in source_cols:
+5 -5
View File
@@ -16,7 +16,7 @@ from html import escape
from . import email_send
from .localtime import local_now, local_today
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
DIGEST_HOUR = int(os.environ.get("GOODNEWS_DIGEST_HOUR", "7"))
DIGEST_WINDOW_HOURS = 4 # send between DIGEST_HOUR and +4h, site-local
@@ -31,7 +31,7 @@ def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> l
"""The brief's items with the bits a calm email needs (visible sources only)."""
rows = conn.execute(
"""
SELECT a.id, a.title, a.canonical_url, s.name AS source, sc.reason_text,
SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, sc.reason_text,
(SELECT summary FROM article_summaries WHERE article_id = a.id) AS summary
FROM daily_briefs b
JOIN daily_brief_items bi ON bi.brief_id = b.id
@@ -47,7 +47,7 @@ def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> l
items = []
for r in rows:
d = dict(r)
d["paywalled"] = is_paywalled(d["canonical_url"])
d["paywalled"] = is_paywalled_for_source(d["canonical_url"], d.get("paywall_override"))
items.append(d)
return items
@@ -74,7 +74,7 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l
params += ftags
rows = conn.execute(
f"""
SELECT a.id, a.title, a.canonical_url, s.name AS source, a.source_id, sc.reason_text,
SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, a.source_id, sc.reason_text,
(SELECT summary FROM article_summaries WHERE article_id = a.id) AS summary
FROM articles a
JOIN sources s ON s.id = a.source_id
@@ -92,7 +92,7 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l
if d["id"] in exclude or per_source.get(d["source_id"], 0) >= 1:
continue
per_source[d["source_id"]] = 1
d["paywalled"] = is_paywalled(d["canonical_url"])
d["paywalled"] = is_paywalled_for_source(d["canonical_url"], d.get("paywall_override"))
out.append(d)
if len(out) >= limit:
break
+14
View File
@@ -35,7 +35,21 @@ PAYWALL_DOMAINS = {
def is_paywalled(url: str | None) -> bool:
"""Low-level DOMAIN rule. Keep this distinct from the source-aware decision so
callers can tell 'domain says paywalled' from 'this source is overridden'."""
host = urlsplit(url or "").netloc.lower()
if host.startswith("www."):
host = host[4:]
return any(host == d or host.endswith("." + d) for d in PAYWALL_DOMAINS)
def is_paywalled_for_source(url: str | None, override: str | None = None) -> bool:
"""The EFFECTIVE paywall decision used for ranking/lead/badges: a per-source
override (set in admin after inspecting the articles) wins over the domain
rule — 'free' clears a false positive (e.g. NY Times Learning), 'paywalled'
flags a false negative. NULL falls back to the domain rule."""
if override == "free":
return False
if override == "paywalled":
return True
return is_paywalled(url)
+14 -7
View File
@@ -11,7 +11,7 @@ import sqlite3
from datetime import UTC, datetime, timedelta
from .feeds import MAX_BACKOFF_MINUTES
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
# UA substrings that mark automated clients. Crawlers run JS on a throttled
# budget and trip the boot-failure beacon routinely — without this filter they
@@ -53,6 +53,7 @@ _ARTICLE_COLUMNS = f"""
s.reason_code,
s.reason_text,
s.model_name,
src.paywall_override AS paywall_override,
(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
{RANK_SCORE_SQL} AS rank_score
"""
@@ -335,7 +336,7 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
SELECT
s.id, s.name, s.feed_url, s.homepage_url, s.default_category AS category, s.active,
s.status, s.content_visible, s.retry_after_at,
s.consecutive_failures AS failures, s.review_flag, s.review_reason,
s.consecutive_failures AS failures, s.review_flag, s.review_reason, s.paywall_override,
s.poll_interval_minutes AS interval_minutes,
s.last_success_at, s.last_error_at, substr(s.last_error, 1, 160) AS last_error,
(SELECT MAX(r.finished_at) FROM ingest_runs r
@@ -370,8 +371,8 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
# duplicate of content already served (accepted_total - served = accepted dupes).
d["accepted_dup_rate"] = round(100 * (accepted - d["served"]) / accepted) if accepted else None
d["image_coverage"] = round(100 * (d["images"] or 0) / d["served"]) if d["served"] else None
# Paywall is a domain-level hint, so it's a per-source flag (not a rate).
d["paywalled"] = is_paywalled(d.get("homepage_url") or d.get("feed_url"))
# Paywall is a domain-level hint + a per-source override; show the EFFECTIVE flag.
d["paywalled"] = is_paywalled_for_source(d.get("homepage_url") or d.get("feed_url"), d.get("paywall_override"))
# Match the REAL scheduler gate: due = the later of the streak-backoff time
# and any retry_after_at rest (UTC strings sort chronologically).
due_times = [t for t in (d["next_due_at"], d["retry_after_at"]) if t]
@@ -468,6 +469,8 @@ def source_articles(conn: sqlite3.Connection, source_id: int, filter: str = "all
limit: int = 25, offset: int = 0) -> list[dict]:
"""The actual ingested articles for a source, newest first — so admins can
verify the metric (paywall/image/acceptance) against real evidence."""
ov = conn.execute("SELECT paywall_override FROM sources WHERE id = ?", (source_id,)).fetchone()
override = ov["paywall_override"] if ov else None
where = _SRC_ART_FILTERS.get(filter, "")
rows = conn.execute(
f"""
@@ -492,7 +495,7 @@ def source_articles(conn: sqlite3.Connection, source_id: int, filter: str = "all
"reason": r["reason_text"] or r["reason_code"], # the "why" behind accept/reject
"topic": r["topic"],
"flavor": r["flavor"],
"paywalled": is_paywalled(r["canonical_url"]), # domain rule — same for the source
"paywalled": is_paywalled_for_source(r["canonical_url"], override), # effective (domain rule + override)
"has_image": bool(r["image_url"]),
"duplicate": r["duplicate_of"] is not None,
}
@@ -515,11 +518,15 @@ def source_articles_summary(conn: sqlite3.Connection, source_id: int) -> dict:
""",
(source_id,),
).fetchone()
one = conn.execute("SELECT canonical_url FROM articles WHERE source_id = ? LIMIT 1", (source_id,)).fetchone()
srow = conn.execute("SELECT homepage_url, feed_url, paywall_override FROM sources WHERE id = ?", (source_id,)).fetchone()
override = srow["paywall_override"] if srow else None
url = (srow["homepage_url"] or srow["feed_url"]) if srow else None
return {
"total": agg["total"], "accepted": agg["accepted"], "rejected": agg["rejected"],
"no_image": agg["no_image"], "duplicates": agg["duplicates"],
"paywalled": is_paywalled(one["canonical_url"]) if one else False,
"paywalled": is_paywalled_for_source(url, override), # effective
"paywall_domain": is_paywalled(url), # what the domain rule alone says
"paywall_override": override, # null | 'free' | 'paywalled' — the basis
}
+3 -3
View File
@@ -7,7 +7,7 @@ from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlsplit
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
def load_sources(path: Path | str) -> list[dict]:
@@ -244,7 +244,7 @@ def review_sources(
now = datetime.now(timezone.utc)
flagged = []
sources = conn.execute(
"SELECT id, name, consecutive_failures FROM sources WHERE active = 1"
"SELECT id, name, consecutive_failures, paywall_override FROM sources WHERE active = 1"
).fetchall()
for s in sources:
@@ -292,7 +292,7 @@ def review_sources(
avg_rage = sum(r["ragebait_score"] or 0 for r in recent) / n
if avg_rage > 3:
reasons.append(f"high ragebait (avg {avg_rage:.1f})")
paywalled = sum(1 for r in recent if is_paywalled(r["canonical_url"])) / n
paywalled = sum(1 for r in recent if is_paywalled_for_source(r["canonical_url"], s["paywall_override"])) / n
if paywalled > 0.5:
reasons.append(f"paywall-heavy ({paywalled * 100:.0f}%)")