Files
upbeatBytes/goodnews/sources.py
T
thejayman77 ba801d90f6 Make paywalls systemic + fix ArticleCard reactivity
- ArticleCard: derive safeHref from article.url and reset image-failure state
  when the article changes, so in-place replacements re-evaluate correctly
  (clears the Svelte capture warning; build is warning-free again).
- Downweight paywalled stories below readable ones (stable sort) when composing
  the daily five and in feed results — the brief now leads readable and rarely
  hands over a locked door.
- review_sources gains a 'paywall-heavy' advisory flag (Nature, New Scientist
  flag at 100%); never auto-deactivates.
- New Scientist/Nature kept active but no longer reach the daily five; they
  remain browsable with the label + Replace.
- Tests: brief readability preference + paywall-heavy flag (79 total).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-31 01:36:53 +00:00

241 lines
8.7 KiB
Python

from __future__ import annotations
import json
import sqlite3
import tomllib
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlsplit
from .paywall import is_paywalled
def load_sources(path: Path | str) -> list[dict]:
data = tomllib.loads(Path(path).read_text(encoding="utf-8"))
sources = data.get("sources", [])
if not isinstance(sources, list):
raise ValueError("sources.toml must contain [[sources]] entries")
return sources
def upsert_sources(conn: sqlite3.Connection, source_defs: list[dict]) -> int:
    """Insert or refresh one ``sources`` row per definition, keyed on ``feed_url``.

    ``name`` and ``feed_url`` are required keys (a missing one raises
    ``KeyError``); every other field falls back to a default. When a row with
    the same ``feed_url`` already exists, all of the listed columns are
    overwritten with the incoming values and ``updated_at`` is refreshed.

    Commits once after every definition has been written and returns the
    number of definitions processed.
    """
    upsert_sql = """
        INSERT INTO sources (
            name, homepage_url, feed_url, source_type, default_category,
            trust_score, pr_risk_score, active, poll_interval_minutes, notes,
            updated_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(feed_url) DO UPDATE SET
            name = excluded.name,
            homepage_url = excluded.homepage_url,
            source_type = excluded.source_type,
            default_category = excluded.default_category,
            trust_score = excluded.trust_score,
            pr_risk_score = excluded.pr_risk_score,
            active = excluded.active,
            poll_interval_minutes = excluded.poll_interval_minutes,
            notes = excluded.notes,
            updated_at = CURRENT_TIMESTAMP
        """
    written = 0
    for definition in source_defs:
        params = (
            definition["name"],
            definition.get("homepage_url"),
            definition["feed_url"],
            definition.get("source_type", "rss"),
            definition.get("default_category"),
            int(definition.get("trust_score", 5)),
            int(definition.get("pr_risk_score", 3)),
            # Stored as an integer flag: 1 = active, 0 = inactive.
            int(bool(definition.get("active", True))),
            int(definition.get("poll_interval_minutes", 60)),
            definition.get("notes"),
        )
        conn.execute(upsert_sql, params)
        written += 1
    conn.commit()
    return written
# --- Supervised source candidates (staging before the real sources table) ----
def save_candidate(
conn: sqlite3.Connection,
feed_url: str,
preview: dict | None = None,
name: str | None = None,
homepage_url: str | None = None,
status: str = "quarantined",
notes: str | None = None,
) -> sqlite3.Row:
"""Stage a suggested feed (with an optional preview snapshot) for review.
Re-previewing an existing candidate refreshes its snapshot but never changes
a status a curator already set (e.g. a rejected feed stays rejected).
"""
conn.execute(
"""
INSERT INTO source_candidates (
feed_url, homepage_url, name, status, preview_json, notes, last_previewed_at, updated_at
)
VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
ON CONFLICT(feed_url) DO UPDATE SET
preview_json = excluded.preview_json,
name = COALESCE(excluded.name, source_candidates.name),
homepage_url = COALESCE(excluded.homepage_url, source_candidates.homepage_url),
notes = COALESCE(excluded.notes, source_candidates.notes),
last_previewed_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
""",
(feed_url, homepage_url, name, status, json.dumps(preview) if preview else None, notes),
)
conn.commit()
return conn.execute("SELECT * FROM source_candidates WHERE feed_url = ?", (feed_url,)).fetchone()
def list_candidates(conn: sqlite3.Connection, status: str | None = None) -> list[sqlite3.Row]:
if status:
return conn.execute(
"SELECT * FROM source_candidates WHERE status = ? ORDER BY updated_at DESC", (status,)
).fetchall()
return conn.execute("SELECT * FROM source_candidates ORDER BY updated_at DESC").fetchall()
def reject_candidate(conn: sqlite3.Connection, candidate_id: int) -> bool:
    """Mark the candidate with the given id as rejected and commit.

    Returns True when a row was updated, False when no candidate has that id.
    """
    affected = conn.execute(
        "UPDATE source_candidates SET status = 'rejected', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
        (candidate_id,),
    ).rowcount
    conn.commit()
    return affected > 0
def promote_candidate(
    conn: sqlite3.Connection,
    candidate_id: int,
    active: bool = False,
    default_category: str | None = None,
    trust_score: int = 5,
    pr_risk_score: int = 3,
    poll_interval_minutes: int = 180,
) -> int:
    """Copy a reviewed candidate into the real sources table.

    The candidate is written through ``upsert_sources`` (keyed on its
    ``feed_url``), then its own status is set to ``'promoted'``. The source is
    inactive by default (active-on-approval): a promoted feed is wired up but
    left with ``active = 0`` unless ``active=True`` is passed. Never called
    automatically.

    Rows are read by column name, so the connection needs a row factory that
    supports that (e.g. ``sqlite3.Row``).

    Returns:
        The ``id`` of the ``sources`` row for the candidate's feed URL.

    Raises:
        ValueError: if no candidate has ``candidate_id``.
    """
    candidate = conn.execute(
        "SELECT * FROM source_candidates WHERE id = ?", (candidate_id,)
    ).fetchone()
    if candidate is None:
        raise ValueError(f"no candidate with id {candidate_id}")

    feed_url = candidate["feed_url"]
    # Fall back to the feed's host, then to the raw URL, when the candidate is unnamed.
    display_name = candidate["name"]
    if not display_name:
        display_name = urlsplit(feed_url).netloc or feed_url

    source_def = {
        "name": display_name,
        "feed_url": feed_url,
        "homepage_url": candidate["homepage_url"],
        "default_category": default_category,
        "trust_score": trust_score,
        "pr_risk_score": pr_risk_score,
        "active": active,
        "poll_interval_minutes": poll_interval_minutes,
        "notes": f"promoted from candidate {candidate_id}",
    }
    upsert_sources(conn, [source_def])

    conn.execute(
        "UPDATE source_candidates SET status = 'promoted', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
        (candidate_id,),
    )
    conn.commit()

    source_row = conn.execute(
        "SELECT id FROM sources WHERE feed_url = ?", (feed_url,)
    ).fetchone()
    return int(source_row["id"])
# --- Advisory source health: flag for review, never auto-deactivate -----------
def review_sources(
    conn: sqlite3.Connection,
    stale_days: int = 14,
    min_recent: int = 15,
    recent_window: int = 40,
) -> list[dict]:
    """Recompute advisory review flags for every active source.

    For each source with ``active = 1`` the newest ``recent_window`` scored
    articles (articles joined to ``article_scores``) are inspected, and the
    source's ``review_flag`` / ``review_reason`` are rewritten: a source with
    no findings gets flag 0 and a NULL reason. ``active`` is NEVER changed —
    the human stays in the loop. Inactive sources are not touched.

    Findings, in the order they appear in the ``"; "``-joined reason:

    * ``failing`` — 3 or more consecutive failures;
    * ``no articles yet`` — the window is empty;
    * ``stale`` — the newest article is more than ``stale_days`` days old;
    * ``low acceptance`` (< 10%), ``duplicate-heavy`` (> 50%),
      ``high cortisol`` (avg > 5), ``high ragebait`` (avg > 3) and
      ``paywall-heavy`` (> 50%) — only evaluated when the window holds at
      least ``min_recent`` articles.

    Rows are read by column name, so the connection needs a row factory that
    supports that (e.g. ``sqlite3.Row``). All updates are committed once at
    the end.

    Returns:
        One ``{"id", "name", "reason"}`` dict for every active source that is
        flagged by this run (not only sources whose flag changed).
    """
    now = datetime.now(timezone.utc)
    flagged: list[dict] = []
    sources = conn.execute(
        "SELECT id, name, consecutive_failures FROM sources WHERE active = 1"
    ).fetchall()

    for source in sources:
        reasons: list[str] = []

        failures = source["consecutive_failures"] or 0
        if failures >= 3:
            reasons.append(f"failing ({failures} consecutive)")

        recent = conn.execute(
            """
            SELECT sc.accepted, sc.cortisol_score, sc.ragebait_score, a.duplicate_of,
                   a.canonical_url, COALESCE(a.published_at, a.discovered_at) AS dt
            FROM articles a
            JOIN article_scores sc ON sc.article_id = a.id
            WHERE a.source_id = ?
            ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
            LIMIT ?
            """,
            (source["id"], recent_window),
        ).fetchall()
        total = len(recent)

        if total == 0:
            reasons.append("no articles yet")
        else:
            try:
                newest = datetime.fromisoformat(recent[0]["dt"])
                # Naive timestamps are treated as UTC so they can be compared with `now`.
                if newest.tzinfo is None:
                    newest = newest.replace(tzinfo=timezone.utc)
                age = (now - newest).days
                if age > stale_days:
                    reasons.append(f"stale (newest {age}d ago)")
            except (ValueError, TypeError):
                # Best effort: a missing or unparseable date only skips the
                # staleness check; the other checks still run.
                pass

        # `total > 0` keeps the divisions below safe even if a caller passes
        # min_recent <= 0 for a source that has no articles.
        if total > 0 and total >= min_recent:
            acceptance = sum(row["accepted"] or 0 for row in recent) / total
            if acceptance < 0.10:
                reasons.append(f"low acceptance ({acceptance * 100:.0f}%)")

            duplicates = sum(1 for row in recent if row["duplicate_of"] is not None) / total
            if duplicates > 0.5:
                reasons.append(f"duplicate-heavy ({duplicates * 100:.0f}%)")

            avg_cortisol = sum(row["cortisol_score"] or 0 for row in recent) / total
            if avg_cortisol > 5:
                reasons.append(f"high cortisol (avg {avg_cortisol:.1f})")

            avg_ragebait = sum(row["ragebait_score"] or 0 for row in recent) / total
            if avg_ragebait > 3:
                reasons.append(f"high ragebait (avg {avg_ragebait:.1f})")

            paywalled = sum(1 for row in recent if is_paywalled(row["canonical_url"])) / total
            if paywalled > 0.5:
                reasons.append(f"paywall-heavy ({paywalled * 100:.0f}%)")

        flag = 1 if reasons else 0
        reason = "; ".join(reasons) if reasons else None
        conn.execute(
            "UPDATE sources SET review_flag = ?, review_reason = ? WHERE id = ?",
            (flag, reason, source["id"]),
        )
        if flag:
            flagged.append({"id": source["id"], "name": source["name"], "reason": reason})

    conn.commit()
    return flagged