ba801d90f6
- ArticleCard: derive safeHref from article.url and reset image-failure state when the article changes, so in-place replacements re-evaluate correctly (clears the Svelte capture warning; build is warning-free again). - Downweight paywalled stories below readable ones (stable sort) when composing the daily five and in feed results — the brief now leads readable and rarely hands over a locked door. - review_sources gains a 'paywall-heavy' advisory flag (Nature, New Scientist flag at 100%); never auto-deactivates. - New Scientist/Nature kept active but no longer reach the daily five; they remain browsable with the label + Replace. - Tests: brief readability preference + paywall-heavy flag (79 total). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
241 lines
8.7 KiB
Python
241 lines
8.7 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import sqlite3
|
|
import tomllib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from urllib.parse import urlsplit
|
|
|
|
from .paywall import is_paywalled
|
|
|
|
|
|
def load_sources(path: Path | str) -> list[dict]:
|
|
data = tomllib.loads(Path(path).read_text(encoding="utf-8"))
|
|
sources = data.get("sources", [])
|
|
if not isinstance(sources, list):
|
|
raise ValueError("sources.toml must contain [[sources]] entries")
|
|
return sources
|
|
|
|
|
|
def upsert_sources(conn: sqlite3.Connection, source_defs: list[dict]) -> int:
    """Insert or refresh rows in ``sources``, keyed on ``feed_url``.

    Each definition must provide ``name`` and ``feed_url``. Optional keys and
    their fallbacks: ``homepage_url`` (NULL), ``source_type`` ("rss"),
    ``default_category`` (NULL), ``trust_score`` (5), ``pr_risk_score`` (3),
    ``active`` (True, stored as 1/0), ``poll_interval_minutes`` (60) and
    ``notes`` (NULL). When a row with the same ``feed_url`` already exists,
    every one of those columns is overwritten and ``updated_at`` is bumped.

    The batch is validated before anything is written: a definition that is
    missing a required key or carries a non-integer score raises before the
    first INSERT, so a bad entry can no longer leave the earlier entries
    sitting half-applied in the connection's open transaction.

    Returns:
        The number of definitions processed.

    Raises:
        KeyError: a definition lacks ``name`` or ``feed_url``.
        ValueError / TypeError: a numeric field cannot be converted by ``int``.
    """
    upsert_sql = """
        INSERT INTO sources (
            name, homepage_url, feed_url, source_type, default_category,
            trust_score, pr_risk_score, active, poll_interval_minutes, notes,
            updated_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(feed_url) DO UPDATE SET
            name = excluded.name,
            homepage_url = excluded.homepage_url,
            source_type = excluded.source_type,
            default_category = excluded.default_category,
            trust_score = excluded.trust_score,
            pr_risk_score = excluded.pr_risk_score,
            active = excluded.active,
            poll_interval_minutes = excluded.poll_interval_minutes,
            notes = excluded.notes,
            updated_at = CURRENT_TIMESTAMP
    """
    # Materialise all parameter tuples first so malformed input fails fast,
    # before any row has been written.
    rows = [
        (
            source["name"],
            source.get("homepage_url"),
            source["feed_url"],
            source.get("source_type", "rss"),
            source.get("default_category"),
            int(source.get("trust_score", 5)),
            int(source.get("pr_risk_score", 3)),
            1 if source.get("active", True) else 0,
            int(source.get("poll_interval_minutes", 60)),
            source.get("notes"),
        )
        for source in source_defs
    ]
    conn.executemany(upsert_sql, rows)
    conn.commit()
    return len(rows)
|
|
|
|
|
|
# --- Supervised source candidates (staging before the real sources table) ----
|
|
|
|
|
|
def save_candidate(
|
|
conn: sqlite3.Connection,
|
|
feed_url: str,
|
|
preview: dict | None = None,
|
|
name: str | None = None,
|
|
homepage_url: str | None = None,
|
|
status: str = "quarantined",
|
|
notes: str | None = None,
|
|
) -> sqlite3.Row:
|
|
"""Stage a suggested feed (with an optional preview snapshot) for review.
|
|
|
|
Re-previewing an existing candidate refreshes its snapshot but never changes
|
|
a status a curator already set (e.g. a rejected feed stays rejected).
|
|
"""
|
|
conn.execute(
|
|
"""
|
|
INSERT INTO source_candidates (
|
|
feed_url, homepage_url, name, status, preview_json, notes, last_previewed_at, updated_at
|
|
)
|
|
VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
|
|
ON CONFLICT(feed_url) DO UPDATE SET
|
|
preview_json = excluded.preview_json,
|
|
name = COALESCE(excluded.name, source_candidates.name),
|
|
homepage_url = COALESCE(excluded.homepage_url, source_candidates.homepage_url),
|
|
notes = COALESCE(excluded.notes, source_candidates.notes),
|
|
last_previewed_at = CURRENT_TIMESTAMP,
|
|
updated_at = CURRENT_TIMESTAMP
|
|
""",
|
|
(feed_url, homepage_url, name, status, json.dumps(preview) if preview else None, notes),
|
|
)
|
|
conn.commit()
|
|
return conn.execute("SELECT * FROM source_candidates WHERE feed_url = ?", (feed_url,)).fetchone()
|
|
|
|
|
|
def list_candidates(conn: sqlite3.Connection, status: str | None = None) -> list[sqlite3.Row]:
|
|
if status:
|
|
return conn.execute(
|
|
"SELECT * FROM source_candidates WHERE status = ? ORDER BY updated_at DESC", (status,)
|
|
).fetchall()
|
|
return conn.execute("SELECT * FROM source_candidates ORDER BY updated_at DESC").fetchall()
|
|
|
|
|
|
def reject_candidate(conn: sqlite3.Connection, candidate_id: int) -> bool:
    """Set a candidate's status to ``'rejected'`` and commit.

    Returns ``True`` when a row with ``candidate_id`` existed and was updated,
    ``False`` when no such candidate was found.
    """
    update_sql = (
        "UPDATE source_candidates SET status = 'rejected', updated_at = CURRENT_TIMESTAMP WHERE id = ?"
    )
    cursor = conn.execute(update_sql, (candidate_id,))
    conn.commit()
    touched = cursor.rowcount
    return touched > 0
|
|
|
|
|
|
def promote_candidate(
|
|
conn: sqlite3.Connection,
|
|
candidate_id: int,
|
|
active: bool = False,
|
|
default_category: str | None = None,
|
|
trust_score: int = 5,
|
|
pr_risk_score: int = 3,
|
|
poll_interval_minutes: int = 180,
|
|
) -> int:
|
|
"""Copy a reviewed candidate into the real sources table.
|
|
|
|
Inactive by default (active-on-approval): a promoted feed is wired up but
|
|
won't be polled until explicitly activated. Never called automatically.
|
|
"""
|
|
cand = conn.execute("SELECT * FROM source_candidates WHERE id = ?", (candidate_id,)).fetchone()
|
|
if cand is None:
|
|
raise ValueError(f"no candidate with id {candidate_id}")
|
|
|
|
name = cand["name"] or urlsplit(cand["feed_url"]).netloc or cand["feed_url"]
|
|
upsert_sources(
|
|
conn,
|
|
[
|
|
{
|
|
"name": name,
|
|
"feed_url": cand["feed_url"],
|
|
"homepage_url": cand["homepage_url"],
|
|
"default_category": default_category,
|
|
"trust_score": trust_score,
|
|
"pr_risk_score": pr_risk_score,
|
|
"active": active,
|
|
"poll_interval_minutes": poll_interval_minutes,
|
|
"notes": f"promoted from candidate {candidate_id}",
|
|
}
|
|
],
|
|
)
|
|
conn.execute(
|
|
"UPDATE source_candidates SET status = 'promoted', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
|
|
(candidate_id,),
|
|
)
|
|
conn.commit()
|
|
row = conn.execute("SELECT id FROM sources WHERE feed_url = ?", (cand["feed_url"],)).fetchone()
|
|
return int(row["id"])
|
|
|
|
|
|
# --- Advisory source health: flag for review, never auto-deactivate -----------
|
|
|
|
|
|
def review_sources(
    conn: sqlite3.Connection,
    stale_days: int = 14,
    min_recent: int = 15,
    recent_window: int = 40,
) -> list[dict]:
    """Recompute advisory review flags for every active source.

    For each source with ``active = 1`` the newest ``recent_window`` scored
    articles are inspected and a list of reasons is built:

    * three or more consecutive failures;
    * no scored articles at all, or the newest one older than ``stale_days``;
    * once at least ``min_recent`` articles are available: acceptance below
      10%, more than half duplicates, average cortisol above 5, average
      ragebait above 3, or more than half paywalled.

    ``review_flag`` / ``review_reason`` are rewritten on every active source
    (cleared when there is nothing to report). The ``active`` column is never
    modified here - deactivation stays a human decision.

    Returns:
        One ``{"id", "name", "reason"}`` dict per active source that is
        flagged after this run (not only sources whose flag changed).
    """
    checked_at = datetime.now(timezone.utc)
    active_sources = conn.execute(
        "SELECT id, name, consecutive_failures FROM sources WHERE active = 1"
    ).fetchall()
    flagged_sources = []

    for src in active_sources:
        source_id = src["id"]
        findings: list[str] = []

        failures = src["consecutive_failures"] or 0
        if failures >= 3:
            findings.append(f"failing ({failures} consecutive)")

        # Only articles that have a score row take part (inner join).
        window = conn.execute(
            """
            SELECT sc.accepted, sc.cortisol_score, sc.ragebait_score, a.duplicate_of,
                   a.canonical_url, COALESCE(a.published_at, a.discovered_at) AS dt
            FROM articles a
            JOIN article_scores sc ON sc.article_id = a.id
            WHERE a.source_id = ?
            ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
            LIMIT ?
            """,
            (source_id, recent_window),
        ).fetchall()
        total = len(window)

        if not window:
            findings.append("no articles yet")
        else:
            # Staleness is best-effort: an unparseable or missing timestamp
            # simply skips the check.
            try:
                latest = datetime.fromisoformat(window[0]["dt"])
                if latest.tzinfo is None:
                    # Naive timestamps are treated as UTC.
                    latest = latest.replace(tzinfo=timezone.utc)
                age_days = (checked_at - latest).days
                if age_days > stale_days:
                    findings.append(f"stale (newest {age_days}d ago)")
            except (ValueError, TypeError):
                pass

            # Ratio-based checks need a minimum sample to be meaningful.
            if total >= min_recent:
                accept_rate = sum(row["accepted"] or 0 for row in window) / total
                dup_rate = sum(1 for row in window if row["duplicate_of"] is not None) / total
                mean_cortisol = sum(row["cortisol_score"] or 0 for row in window) / total
                mean_ragebait = sum(row["ragebait_score"] or 0 for row in window) / total
                paywall_rate = sum(1 for row in window if is_paywalled(row["canonical_url"])) / total
                checks = (
                    (accept_rate < 0.10, f"low acceptance ({accept_rate * 100:.0f}%)"),
                    (dup_rate > 0.5, f"duplicate-heavy ({dup_rate * 100:.0f}%)"),
                    (mean_cortisol > 5, f"high cortisol (avg {mean_cortisol:.1f})"),
                    (mean_ragebait > 3, f"high ragebait (avg {mean_ragebait:.1f})"),
                    (paywall_rate > 0.5, f"paywall-heavy ({paywall_rate * 100:.0f}%)"),
                )
                findings.extend(message for triggered, message in checks if triggered)

        reason = "; ".join(findings) if findings else None
        conn.execute(
            "UPDATE sources SET review_flag = ?, review_reason = ? WHERE id = ?",
            (1 if findings else 0, reason, source_id),
        )
        if findings:
            flagged_sources.append({"id": source_id, "name": src["name"], "reason": reason})

    conn.commit()
    return flagged_sources
|
|
|