Sources: per-source paywall override (3-state) — fix domain-rule mis-flags

The Articles inspector revealed that the paywall flag is domain-coarse: nytimes.com
is flagged, so NY Times Learning's free Word-of-the-Day inherits 🔒 — and that flag
isn't cosmetic: it deprioritizes the content in feed sort + lead selection. Add a
per-source override so admins can correct it after inspecting.

- sources.paywall_override: NULL (domain rule) | 'free' | 'paywalled'.
- paywall.py: keep the low-level is_paywalled(url) (domain rule); add
  is_paywalled_for_source(url, override) for the EFFECTIVE decision — the domain helper
  is deliberately not patched globally (per Codex), so "domain says X" stays
  distinguishable from "overridden".
- Threaded everywhere ranking/UI touches paywall, via src.paywall_override on the
  shared _ARTICLE_COLUMNS + the source-aware helper: feed sort, /api/since, replace,
  lead selection, Article badge, brief composition (briefs.py), digest, source_health
  (table 🔒), the Articles inspector, and the review/attention check — so ranking and
  UI always agree.
- Endpoint POST /api/admin/sources/{id}/paywall {override}; admin UI: a select in the
  inspector header (Use domain rule / Treat as free / Treat as paywalled) + the basis
  ("ON (domain)" / "OFF (override)"), optimistic so the panel stays open.

Test: domain rule → paywalled in table+inspector+feed badge; 'free' → off in all
three; validation 422 + 404. 242 pytest + 11 vitest.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-12 22:10:44 -04:00
parent 7279b18fdc
commit 2dbe73430c
9 changed files with 130 additions and 28 deletions
+31 -5
View File
@@ -283,6 +283,24 @@
finally { s._artBusy = false; }
}
const ART_FILTERS = [['all', 'All'], ['accepted', 'Accepted'], ['rejected', 'Rejected'], ['no_image', 'No image'], ['duplicates', 'Duplicates']];
/**
 * Label describing why a source's paywall flag is on or off.
 *
 * @param {object|null|undefined} sum - inspector summary; reads
 *   `paywall_override` and `paywalled`.
 * @returns {string} '' when there is no summary, an "(override)" label when a
 *   per-source override is set, otherwise the domain-rule label.
 */
function pwBasis(sum) {
  if (!sum) {
    return '';
  }
  switch (sum.paywall_override) {
    case 'free':
      return 'OFF (override)';
    case 'paywalled':
      return 'ON (override)';
    default:
      // No recognised override: the effective flag is whatever the domain rule says.
      return sum.paywalled ? 'ON (domain)' : 'off';
  }
}
// Per-source paywall override: lets an admin correct a domain-rule false
// positive/negative. The choice is POSTed first; once the server accepts it,
// the source row and any loaded inspector summary are updated in place, so the
// table's 🔒 and the open panel reflect the new effective flag.
// `override` is '' / null for "use domain rule", or 'free' / 'paywalled'.
async function setPaywall(s, override) {
  const choice = override || null;
  try {
    await postJSON(`/api/admin/sources/${s.id}/paywall`, { override: choice });

    let effective;
    if (choice === 'free') {
      effective = false;
    } else if (choice === 'paywalled') {
      effective = true;
    } else {
      // Back to the domain rule: prefer the summary's domain verdict, falling
      // back to the row's current flag when no summary is loaded.
      effective = s._artSummary?.paywall_domain ?? s.paywalled;
    }

    // Row-level fields drive the Media-column 🔒.
    s.paywall_override = choice;
    s.paywalled = effective;

    if (s._artSummary) {
      s._artSummary.paywall_override = choice;
      s._artSummary.paywalled = effective;
    }
  } catch (e) {
    s._artErr = e?.message || 'Could not set the override.';
  }
}
// --- Source candidates: supervised "add a source" pipeline ---
let candidates = $state([]);
@@ -726,9 +744,15 @@
{#if s._artSummary}
<div class="artsum">
<strong>{s._artSummary.total}</strong> ingested · {s._artSummary.accepted} accepted ·
{s._artSummary.rejected} rejected · {s._artSummary.no_image} no image ·
{s._artSummary.duplicates} dup ·
paywall rule: <span class="pwrule" class:on={s._artSummary.paywalled}>{s._artSummary.paywalled ? 'ON (domain)' : 'off'}</span>
{s._artSummary.rejected} rejected · {s._artSummary.no_image} no image · {s._artSummary.duplicates} dup
</div>
<div class="pwctl">
<span>Paywall: <span class="pwrule" class:on={s._artSummary.paywalled}>{pwBasis(s._artSummary)}</span></span>
<select class="pwsel" onchange={(e) => setPaywall(s, e.currentTarget.value)}>
<option value="" selected={!s._artSummary.paywall_override}>Use domain rule ({s._artSummary.paywall_domain ? 'on' : 'off'})</option>
<option value="free" selected={s._artSummary.paywall_override === 'free'}>Treat as free</option>
<option value="paywalled" selected={s._artSummary.paywall_override === 'paywalled'}>Treat as paywalled</option>
</select>
</div>
{/if}
<div class="artfilters">
@@ -1259,8 +1283,10 @@
/* Source article inspector */
.srctable tr.artrow td { background: var(--bg); font-size: 0.84rem; padding: 10px 12px; }
.artsum { color: var(--ink); margin-bottom: 8px; }
.artsum .pwrule { color: var(--muted); font-weight: 600; }
.artsum .pwrule.on { color: #9a3b3b; }
.pwrule { color: var(--muted); font-weight: 600; }
.pwrule.on { color: #9a3b3b; }
.pwctl { display: flex; align-items: center; gap: 10px; flex-wrap: wrap; margin-bottom: 8px; font-size: 0.8rem; color: var(--muted); }
.pwsel { font: inherit; font-size: 0.78rem; padding: 3px 8px; border: 1px solid var(--line); border-radius: 8px; background: var(--surface); color: var(--ink); }
.artfilters { display: flex; gap: 6px; flex-wrap: wrap; align-items: center; margin-bottom: 8px; }
.chip.sm { font-size: 0.74rem; padding: 3px 10px; }
.artlist { list-style: none; margin: 0; padding: 0; display: flex; flex-direction: column; gap: 7px; max-height: 360px; overflow-y: auto; }
+26 -6
View File
@@ -42,7 +42,7 @@ from .hero import safe_to_lead
from .llm import LocalModelClient
from .moods import MOODS, mood_filter
from .lanes import build_lane_pool
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
from .taxonomy import FAMILIES, FLAVORS, TOPICS
# Edge-cache directives for GLOBAL endpoints — responses that depend only on the
@@ -216,7 +216,7 @@ def _pick_lead(items: list[dict]) -> list[dict]:
still appear in the set they just don't lead.
"""
def gentle(a: dict) -> bool:
return safe_to_lead(a) and not is_paywalled(a.get("canonical_url"))
return safe_to_lead(a) and not is_paywalled_for_source(a.get("canonical_url"), a.get("paywall_override"))
for ok in (
lambda a: gentle(a) and bool(a.get("image_url")),
@@ -290,7 +290,7 @@ class Article(BaseModel):
reason_text=row.get("reason_text"),
model_name=row.get("model_name"),
rank=row.get("rank"),
paywalled=is_paywalled(row.get("canonical_url")),
paywalled=is_paywalled_for_source(row.get("canonical_url"), row.get("paywall_override")),
tags=[t for t in (raw_tags.split(",") if raw_tags else []) if t],
)
@@ -448,6 +448,10 @@ class SourceVisibilityBody(BaseModel):
visible: bool = True
class SourcePaywallBody(BaseModel):
    """Request body for POST /api/admin/sources/{sid}/paywall."""
    # The handler treats any falsy value as None and rejects anything other
    # than None, 'free' or 'paywalled' with a 422.
    override: str | None = None # None = use domain rule · 'free' · 'paywalled'
class CandidateSuggestBody(BaseModel):
feed_url: str = ""
name: str | None = None
@@ -1164,6 +1168,22 @@ def create_app() -> FastAPI:
"has_more": len(arts) == limit,
}
@app.post("/api/admin/sources/{sid}/paywall")
def admin_source_paywall(sid: int, body: SourcePaywallBody, request: Request) -> dict:
    """Set or clear the per-source paywall override for source ``sid``.

    The override corrects domain-rule false positives (NY Times Learning is
    free) and false negatives; ranking and badges pick it up through
    is_paywalled_for_source.

    Raises HTTPException 422 when the override is not null/'free'/'paywalled',
    and 404 when no source row matches ``sid``.
    """
    # Falsy values (None, "") all mean "fall back to the domain rule".
    chosen = body.override if body.override else None
    allowed = (None, "free", "paywalled")
    if chosen not in allowed:
        raise HTTPException(status_code=422, detail="override must be null, 'free', or 'paywalled'")
    with get_conn() as conn:
        _require_admin(conn, request)
        result = conn.execute(
            "UPDATE sources SET paywall_override = ? WHERE id = ?",
            (chosen, sid),
        )
        # Zero affected rows means the id does not exist.
        if result.rowcount == 0:
            raise HTTPException(status_code=404, detail="source not found")
        conn.commit()
        return {"ok": True, "override": chosen}
# --- Source candidates (supervised add-a-source pipeline) ----------------
def _candidate_dict(row) -> dict:
@@ -1565,7 +1585,7 @@ def create_app() -> FastAPI:
)
# Keep the top of a browse view readable: stable-sort paywalled items
# below readable ones (composite order preserved within each group).
rows = sorted(rows, key=lambda r: is_paywalled(r["canonical_url"]))
rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"]))
return FeedResponse(
topic=topic,
flavor=flavor,
@@ -1739,7 +1759,7 @@ def create_app() -> FastAPI:
rows = queries.feed(conn, sort="latest", since=since, limit=60, **kw)
if fp.avoid_terms:
rows = filter_articles(rows, fp, now)
rows = sorted(rows, key=lambda r: is_paywalled(r["canonical_url"]))
rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"]))
return FeedResponse(topic=None, flavor=None, count=len(rows), items=[Article.from_row(r) for r in rows[:8]])
@app.get("/api/brief", response_model=BriefResponse)
@@ -1820,7 +1840,7 @@ def create_app() -> FastAPI:
for r in filter_articles(rows, fp, now):
if r["id"] in excl:
continue
if avoid_paywall and is_paywalled(r["canonical_url"]):
if avoid_paywall and is_paywalled_for_source(r["canonical_url"], r["paywall_override"]):
continue
if gentle and not safe_to_lead(r):
continue
+3 -2
View File
@@ -3,7 +3,7 @@ from __future__ import annotations
import sqlite3
from .localtime import local_today
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
def build_daily_brief(
@@ -19,7 +19,7 @@ def build_daily_brief(
# changed. A calm daily brief shouldn't repeatedly hand the reader a locked
# door: push paywalled candidates below readable ones (stable sort) first.
rows = _candidate_articles(conn, target_date, window_days)
rows = sorted(rows, key=lambda r: is_paywalled(r["canonical_url"]))
rows = sorted(rows, key=lambda r: is_paywalled_for_source(r["canonical_url"], r["paywall_override"]))
selected = _select_diverse(rows, limit)
selected_ids = [row["id"] for row in selected]
@@ -121,6 +121,7 @@ def _candidate_articles(
src.name AS source_name,
src.default_category,
src.trust_score,
src.paywall_override AS paywall_override,
s.constructive_score,
s.cortisol_score,
s.ragebait_score,
+1
View File
@@ -392,6 +392,7 @@ def _migrate(conn: sqlite3.Connection) -> None:
"consecutive_failures": "INTEGER NOT NULL DEFAULT 0",
"review_flag": "INTEGER NOT NULL DEFAULT 0",
"review_reason": "TEXT",
"paywall_override": "TEXT", # NULL = use domain rule · 'free' · 'paywalled'
}
for column, decl in health_columns.items():
if column not in source_cols:
+5 -5
View File
@@ -16,7 +16,7 @@ from html import escape
from . import email_send
from .localtime import local_now, local_today
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
DIGEST_HOUR = int(os.environ.get("GOODNEWS_DIGEST_HOUR", "7"))
DIGEST_WINDOW_HOURS = 4 # send between DIGEST_HOUR and +4h, site-local
@@ -31,7 +31,7 @@ def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> l
"""The brief's items with the bits a calm email needs (visible sources only)."""
rows = conn.execute(
"""
SELECT a.id, a.title, a.canonical_url, s.name AS source, sc.reason_text,
SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, sc.reason_text,
(SELECT summary FROM article_summaries WHERE article_id = a.id) AS summary
FROM daily_briefs b
JOIN daily_brief_items bi ON bi.brief_id = b.id
@@ -47,7 +47,7 @@ def digest_items(conn: sqlite3.Connection, brief_date: str, limit: int = 7) -> l
items = []
for r in rows:
d = dict(r)
d["paywalled"] = is_paywalled(d["canonical_url"])
d["paywalled"] = is_paywalled_for_source(d["canonical_url"], d.get("paywall_override"))
items.append(d)
return items
@@ -74,7 +74,7 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l
params += ftags
rows = conn.execute(
f"""
SELECT a.id, a.title, a.canonical_url, s.name AS source, a.source_id, sc.reason_text,
SELECT a.id, a.title, a.canonical_url, s.name AS source, s.paywall_override, a.source_id, sc.reason_text,
(SELECT summary FROM article_summaries WHERE article_id = a.id) AS summary
FROM articles a
JOIN sources s ON s.id = a.source_id
@@ -92,7 +92,7 @@ def followed_digest_items(conn: sqlite3.Connection, user_id: int, exclude_ids, l
if d["id"] in exclude or per_source.get(d["source_id"], 0) >= 1:
continue
per_source[d["source_id"]] = 1
d["paywalled"] = is_paywalled(d["canonical_url"])
d["paywalled"] = is_paywalled_for_source(d["canonical_url"], d.get("paywall_override"))
out.append(d)
if len(out) >= limit:
break
+14
View File
@@ -35,7 +35,21 @@ PAYWALL_DOMAINS = {
def is_paywalled(url: str | None) -> bool:
"""Low-level DOMAIN rule. Keep this distinct from the source-aware decision so
callers can tell 'domain says paywalled' from 'this source is overridden'."""
host = urlsplit(url or "").netloc.lower()
if host.startswith("www."):
host = host[4:]
return any(host == d or host.endswith("." + d) for d in PAYWALL_DOMAINS)
def is_paywalled_for_source(url: str | None, override: str | None = None) -> bool:
"""The EFFECTIVE paywall decision used for ranking/lead/badges: a per-source
override (set in admin after inspecting the articles) wins over the domain
rule 'free' clears a false positive (e.g. NY Times Learning), 'paywalled'
flags a false negative. NULL falls back to the domain rule."""
if override == "free":
return False
if override == "paywalled":
return True
return is_paywalled(url)
+14 -7
View File
@@ -11,7 +11,7 @@ import sqlite3
from datetime import UTC, datetime, timedelta
from .feeds import MAX_BACKOFF_MINUTES
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
# UA substrings that mark automated clients. Crawlers run JS on a throttled
# budget and trip the boot-failure beacon routinely — without this filter they
@@ -53,6 +53,7 @@ _ARTICLE_COLUMNS = f"""
s.reason_code,
s.reason_text,
s.model_name,
src.paywall_override AS paywall_override,
(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
{RANK_SCORE_SQL} AS rank_score
"""
@@ -335,7 +336,7 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
SELECT
s.id, s.name, s.feed_url, s.homepage_url, s.default_category AS category, s.active,
s.status, s.content_visible, s.retry_after_at,
s.consecutive_failures AS failures, s.review_flag, s.review_reason,
s.consecutive_failures AS failures, s.review_flag, s.review_reason, s.paywall_override,
s.poll_interval_minutes AS interval_minutes,
s.last_success_at, s.last_error_at, substr(s.last_error, 1, 160) AS last_error,
(SELECT MAX(r.finished_at) FROM ingest_runs r
@@ -370,8 +371,8 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
# duplicate of content already served (accepted_total served = accepted dupes).
d["accepted_dup_rate"] = round(100 * (accepted - d["served"]) / accepted) if accepted else None
d["image_coverage"] = round(100 * (d["images"] or 0) / d["served"]) if d["served"] else None
# Paywall is a domain-level hint, so it's a per-source flag (not a rate).
d["paywalled"] = is_paywalled(d.get("homepage_url") or d.get("feed_url"))
# Paywall is a domain-level hint + a per-source override; show the EFFECTIVE flag.
d["paywalled"] = is_paywalled_for_source(d.get("homepage_url") or d.get("feed_url"), d.get("paywall_override"))
# Match the REAL scheduler gate: due = the later of the streak-backoff time
# and any retry_after_at rest (UTC strings sort chronologically).
due_times = [t for t in (d["next_due_at"], d["retry_after_at"]) if t]
@@ -468,6 +469,8 @@ def source_articles(conn: sqlite3.Connection, source_id: int, filter: str = "all
limit: int = 25, offset: int = 0) -> list[dict]:
"""The actual ingested articles for a source, newest first — so admins can
verify the metric (paywall/image/acceptance) against real evidence."""
ov = conn.execute("SELECT paywall_override FROM sources WHERE id = ?", (source_id,)).fetchone()
override = ov["paywall_override"] if ov else None
where = _SRC_ART_FILTERS.get(filter, "")
rows = conn.execute(
f"""
@@ -492,7 +495,7 @@ def source_articles(conn: sqlite3.Connection, source_id: int, filter: str = "all
"reason": r["reason_text"] or r["reason_code"], # the "why" behind accept/reject
"topic": r["topic"],
"flavor": r["flavor"],
"paywalled": is_paywalled(r["canonical_url"]), # domain rule — same for the source
"paywalled": is_paywalled_for_source(r["canonical_url"], override), # effective (domain rule + override)
"has_image": bool(r["image_url"]),
"duplicate": r["duplicate_of"] is not None,
}
@@ -515,11 +518,15 @@ def source_articles_summary(conn: sqlite3.Connection, source_id: int) -> dict:
""",
(source_id,),
).fetchone()
one = conn.execute("SELECT canonical_url FROM articles WHERE source_id = ? LIMIT 1", (source_id,)).fetchone()
srow = conn.execute("SELECT homepage_url, feed_url, paywall_override FROM sources WHERE id = ?", (source_id,)).fetchone()
override = srow["paywall_override"] if srow else None
url = (srow["homepage_url"] or srow["feed_url"]) if srow else None
return {
"total": agg["total"], "accepted": agg["accepted"], "rejected": agg["rejected"],
"no_image": agg["no_image"], "duplicates": agg["duplicates"],
"paywalled": is_paywalled(one["canonical_url"]) if one else False,
"paywalled": is_paywalled_for_source(url, override), # effective
"paywall_domain": is_paywalled(url), # what the domain rule alone says
"paywall_override": override, # null | 'free' | 'paywalled' — the basis
}
+3 -3
View File
@@ -7,7 +7,7 @@ from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlsplit
from .paywall import is_paywalled
from .paywall import is_paywalled, is_paywalled_for_source
def load_sources(path: Path | str) -> list[dict]:
@@ -244,7 +244,7 @@ def review_sources(
now = datetime.now(timezone.utc)
flagged = []
sources = conn.execute(
"SELECT id, name, consecutive_failures FROM sources WHERE active = 1"
"SELECT id, name, consecutive_failures, paywall_override FROM sources WHERE active = 1"
).fetchall()
for s in sources:
@@ -292,7 +292,7 @@ def review_sources(
avg_rage = sum(r["ragebait_score"] or 0 for r in recent) / n
if avg_rage > 3:
reasons.append(f"high ragebait (avg {avg_rage:.1f})")
paywalled = sum(1 for r in recent if is_paywalled(r["canonical_url"])) / n
paywalled = sum(1 for r in recent if is_paywalled_for_source(r["canonical_url"], s["paywall_override"])) / n
if paywalled > 0.5:
reasons.append(f"paywall-heavy ({paywalled * 100:.0f}%)")
+33
View File
@@ -518,3 +518,36 @@ def test_source_articles_inspector(tmp_path, monkeypatch):
assert tc.get("/api/admin/sources/1/articles?filter=rejected").json()["articles"] == []
assert len(tc.get("/api/admin/sources/1/articles?filter=no_image").json()["articles"]) == 1
assert tc.get("/api/admin/sources/999/articles").status_code == 404 # unknown source
def test_source_paywall_override(tmp_path, monkeypatch):
    """Per-source paywall override: domain rule, 'free', reset, 'paywalled', validation.

    Source 2 sits on a paywalled domain (nytimes.com) and exercises the
    false-positive fix; source 3 sits on a non-listed domain and exercises the
    'paywalled' override, which the earlier version of this test mentioned in a
    comment but never actually posted.
    """
    import os
    import sqlite3
    app, api = _make(tmp_path, monkeypatch, admin_email="boss@x.com")
    c = sqlite3.connect(os.environ["GOODNEWS_DB"])
    c.execute("INSERT INTO sources (id,name,feed_url,homepage_url,trust_score,content_visible) "
              "VALUES (2,'NYT Learning','http://x/f','https://www.nytimes.com/section/learning',5,1)")
    c.execute("INSERT INTO sources (id,name,feed_url,homepage_url,trust_score,content_visible) "
              "VALUES (3,'Free Blog','http://x/free','https://example.org/',5,1)")
    c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) "
              "VALUES (2,2,'https://www.nytimes.com/learning/word-of-the-day','WOTD','h2')")
    c.execute("INSERT INTO article_scores (article_id,accepted,topic) VALUES (2,1,'culture')")
    c.commit()
    c.close()
    tc = _signin(app, api, "boss@x.com")

    def feed_badge():
        return next(a for a in tc.get("/api/feed?source_id=2").json()["items"] if a["id"] == 2)["paywalled"]

    # domain rule: nytimes.com → paywalled in table, inspector, AND feed badge (all agree)
    assert _src(tc, 2)["paywalled"] is True
    assert tc.get("/api/admin/sources/2/articles").json()["summary"]["paywalled"] is True
    assert feed_badge() is True
    # override 'free' (the NYT Learning fix) → effective OFF everywhere
    assert tc.post("/api/admin/sources/2/paywall", json={"override": "free"}).json()["override"] == "free"
    assert _src(tc, 2)["paywalled"] is False
    summ = tc.get("/api/admin/sources/2/articles").json()["summary"]
    assert summ["paywalled"] is False and summ["paywall_domain"] is True and summ["paywall_override"] == "free"
    assert feed_badge() is False  # ranking/badge now agree it's free
    # back to the domain rule
    assert tc.post("/api/admin/sources/2/paywall", json={"override": None}).json()["override"] is None
    assert _src(tc, 2)["paywalled"] is True
    assert feed_badge() is True
    # 'paywalled' override flags a false negative: example.org is not on the domain list
    assert _src(tc, 3)["paywalled"] is False
    assert tc.post("/api/admin/sources/3/paywall", json={"override": "paywalled"}).json()["override"] == "paywalled"
    assert _src(tc, 3)["paywalled"] is True
    summ3 = tc.get("/api/admin/sources/3/articles").json()["summary"]
    assert summ3["paywalled"] is True and summ3["paywall_domain"] is False and summ3["paywall_override"] == "paywalled"
    # validation + 404
    assert tc.post("/api/admin/sources/2/paywall", json={"override": "bogus"}).status_code == 422
    assert tc.post("/api/admin/sources/999/paywall", json={"override": "free"}).status_code == 404