Supervised source candidates: stage, list, promote, reject

- New source_candidates staging table (status suggested/quarantined/rejected/
  promoted, preview_json snapshot) so untrusted/suggested feeds stay out of the
  real ingestion path until reviewed.
- sources.py: save_candidate (re-preview never revives a curator's rejection),
  list_candidates, reject_candidate, promote_candidate (copies into sources,
  inactive by default — active on approval; never automatic).
- CLI: suggest-source / list-candidates / promote-candidate / reject-candidate.
- API: read-only GET /api/candidates (writes stay CLI-only — no unauthenticated
  public write surface yet).
- Fix deprecated ElementTree truth-value test in _parse_rss.
- Tests: candidate lifecycle (save/list/promote/reject, status preservation,
  name derivation) — 51 total.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-05-30 19:52:40 +00:00
parent 95195daff8
commit aa4125ddec
7 changed files with 282 additions and 7 deletions
+12 -5
View File
@@ -18,6 +18,10 @@ python3 -m goodnews classify --limit 10 --base-url http://127.0.0.1:1234/v1 --mo
python3 -m goodnews dedup --base-url http://127.0.0.1:1234/v1
python3 -m goodnews check-feeds
python3 -m goodnews preview-source https://example.com/feed/ --classify
python3 -m goodnews suggest-source https://example.com/feed/ --name "Example" --classify
python3 -m goodnews list-candidates
python3 -m goodnews promote-candidate 1 # copies into sources (inactive by default)
python3 -m goodnews reject-candidate 1
python3 -m goodnews build-brief --date 2026-05-27 --replace
python3 -m goodnews show-brief
python3 -m goodnews list-recent --limit 10
@@ -101,6 +105,7 @@ Endpoints:
- `GET /api/brief?date=&limit=` — a daily brief (latest if no date)
- `GET /api/brief-dates` — available brief dates
- `GET /api/source-preview?url=&classify=` — read-only scored sample of a feed (vet before adding)
- `GET /api/candidates?status=` — staged source candidates (read-only; curation is CLI-only for now)
- `GET /docs` — interactive OpenAPI documentation
The ingestion CLI stays pure-stdlib; only the `web` extra pulls in FastAPI/uvicorn,
@@ -186,11 +191,13 @@ and site, scheduled `cycle` via systemd, a pytest suite, and device-local Calm F
Still ahead:
1. **Supervised source pipeline** — read-only preview is done (`preview-source` /
`/api/source-preview`: freshness, acceptance rate, topic/flavor mix,
cortisol/ragebait/PR averages, example items). Still ahead: add a previewed
source to *quarantine*, and auto-degrade stale/rejecting feeds (advisory, never
auto-blocking).
1. **Supervised source pipeline** — preview + staging are done: `suggest-source`
previews a feed and stages it in the `source_candidates` table (status
suggested/quarantined/rejected/promoted); `promote-candidate` copies it into
`sources` (inactive by default — active on approval); promotion is never
automatic. Still ahead: advisory auto-degrade of stale/rejecting feeds (flag
for review, never auto-block), and an authenticated POST surface so the website
can accept public suggestions once accounts exist.
2. **Learned "Less like this" weighting** — replace the interim flavor-pause with
real preference down-ranking.
3. **Corpus rebalancing** — add calm/feelgood sources (currently science-heavy).
+28
View File
@@ -13,6 +13,7 @@ so the API and CLI always read the same file.
from __future__ import annotations
import json
import os
import re
import sqlite3
@@ -125,6 +126,19 @@ class RejectedExample(BaseModel):
reason: str
# API shape of one staged source candidate, as returned by GET /api/candidates.
# The fields mirror the columns of the source_candidates table, except that the
# stored preview_json text is exposed already decoded as `preview`.
class Candidate(BaseModel):
    id: int
    feed_url: str
    homepage_url: str | None = None
    name: str | None = None
    # Review state; the CLI help lists suggested|quarantined|rejected|promoted.
    status: str
    # Decoded preview snapshot, or None when no snapshot was stored.
    preview: dict | None = None
    notes: str | None = None
    # Timestamps are passed through as the text values stored in the database.
    last_previewed_at: str | None = None
    created_at: str | None = None
    updated_at: str | None = None
class SourcePreview(BaseModel):
url: str
sampled: int
@@ -251,6 +265,20 @@ def create_app() -> FastAPI:
with get_conn() as conn:
return queries.available_dates(conn, limit=limit)
@app.get("/api/candidates", response_model=list[Candidate])
def candidates(status: str | None = Query(None)) -> list[Candidate]:
    """Return the staged source candidates, optionally filtered by status.

    Read-only: each database row is converted into a ``Candidate`` with its
    ``preview_json`` text decoded into the ``preview`` field.
    """
    from .sources import list_candidates

    def _as_model(row) -> Candidate:
        # Swap the raw JSON column for its decoded form before validation.
        fields = dict(row)
        raw_preview = fields.pop("preview_json", None)
        fields["preview"] = json.loads(raw_preview) if raw_preview else None
        return Candidate(**fields)

    with get_conn() as conn:
        staged = list_candidates(conn, status=status)
    return [_as_model(row) for row in staged]
@app.get("/api/source-preview", response_model=SourcePreview)
def source_preview(
url: str = Query(..., max_length=2048),
+65 -1
View File
@@ -19,7 +19,14 @@ from .feeds import (
)
from .llm import LocalModelClient, classify_articles
from .scoring import score_article
from .sources import load_sources, upsert_sources
from .sources import (
list_candidates,
load_sources,
promote_candidate,
reject_candidate,
save_candidate,
upsert_sources,
)
ROOT = Path(__file__).resolve().parents[1]
@@ -66,6 +73,28 @@ def main() -> None:
preview_parser.add_argument("--base-url", help="OpenAI-compatible base URL (with --classify)")
preview_parser.add_argument("--model", help="Local model name (with --classify)")
suggest_parser = subparsers.add_parser("suggest-source", help="Preview a feed and stage it as a candidate")
suggest_parser.add_argument("url", help="Feed URL to suggest")
suggest_parser.add_argument("--name", help="Display name for the source")
suggest_parser.add_argument("--homepage", help="Homepage URL")
suggest_parser.add_argument("--sample", type=int, default=25)
suggest_parser.add_argument("--classify", action="store_true", help="Classify the sample with the local model")
suggest_parser.add_argument("--base-url")
suggest_parser.add_argument("--model")
cand_parser = subparsers.add_parser("list-candidates", help="List staged source candidates")
cand_parser.add_argument("--status", help="Filter by status: suggested|quarantined|rejected|promoted")
promote_parser = subparsers.add_parser("promote-candidate", help="Copy a candidate into the real sources")
promote_parser.add_argument("id", type=int)
promote_parser.add_argument("--active", action="store_true", help="Activate immediately (default: inactive)")
promote_parser.add_argument("--category", help="default_category for the new source")
promote_parser.add_argument("--trust", type=int, default=5)
promote_parser.add_argument("--pr-risk", type=int, default=3)
reject_parser = subparsers.add_parser("reject-candidate", help="Mark a candidate as rejected")
reject_parser.add_argument("id", type=int)
runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
runs_parser.add_argument("--limit", type=int, default=20)
@@ -154,6 +183,41 @@ def main() -> None:
client = llm_client_from_args(args) if args.classify else None
preview = preview_feed(args.url, sample=args.sample, client=client)
print_preview(preview)
elif args.command == "suggest-source":
init_db(conn)
client = llm_client_from_args(args) if args.classify else None
preview = preview_feed(args.url, sample=args.sample, client=client)
print_preview(preview)
cand = save_candidate(conn, args.url, preview=preview, name=args.name, homepage_url=args.homepage)
print(f"Saved as candidate #{cand['id']} (status {cand['status']}). Review with list-candidates.")
elif args.command == "list-candidates":
init_db(conn)
rows = list_candidates(conn, status=args.status)
if not rows:
print("No candidates.")
for r in rows:
line = f"[{r['id']}] {r['status']} | {r['name'] or '(unnamed)'} | {r['feed_url']}"
if r["preview_json"]:
import json as _json
p = _json.loads(r["preview_json"])
line += f" (accept {round(p.get('acceptance_rate', 0) * 100)}%, sampled {p.get('sampled', 0)})"
print(line)
elif args.command == "promote-candidate":
init_db(conn)
try:
source_id = promote_candidate(
conn, args.id, active=args.active, default_category=args.category,
trust_score=args.trust, pr_risk_score=args.pr_risk,
)
except ValueError as exc:
raise SystemExit(str(exc))
state = "active" if args.active else "inactive"
print(f"Promoted candidate #{args.id} -> source #{source_id} ({state}).")
elif args.command == "reject-candidate":
init_db(conn)
ok = reject_candidate(conn, args.id)
print(f"Rejected candidate #{args.id}." if ok else f"No candidate #{args.id}.")
elif args.command == "list-runs":
list_runs(conn, limit=args.limit)
elif args.command == "rescore":
+13
View File
@@ -83,6 +83,19 @@ CREATE TABLE IF NOT EXISTS ingest_runs (
error TEXT
);
CREATE TABLE IF NOT EXISTS source_candidates (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feed_url TEXT NOT NULL UNIQUE,
homepage_url TEXT,
name TEXT,
status TEXT NOT NULL DEFAULT 'suggested',
preview_json TEXT,
notes TEXT,
last_previewed_at TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS daily_briefs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
brief_date TEXT NOT NULL UNIQUE,
+3 -1
View File
@@ -321,7 +321,9 @@ def insert_article(conn: sqlite3.Connection, source: sqlite3.Row, item: FeedItem
def _parse_rss(root: ET.Element) -> list[FeedItem]:
channel = _first_child(root, "channel") or root
channel = _first_child(root, "channel")
if channel is None:
channel = root
language = _first_text(channel, "language")
items = [element for element in root.iter() if _local_name(element.tag) == "item"]
parsed = []
+100
View File
@@ -1,8 +1,10 @@
from __future__ import annotations
import json
import sqlite3
import tomllib
from pathlib import Path
from urllib.parse import urlsplit
def load_sources(path: Path | str) -> list[dict]:
@@ -53,3 +55,101 @@ def upsert_sources(conn: sqlite3.Connection, source_defs: list[dict]) -> int:
conn.commit()
return count
# --- Supervised source candidates (staging before the real sources table) ----
def save_candidate(
conn: sqlite3.Connection,
feed_url: str,
preview: dict | None = None,
name: str | None = None,
homepage_url: str | None = None,
status: str = "quarantined",
notes: str | None = None,
) -> sqlite3.Row:
"""Stage a suggested feed (with an optional preview snapshot) for review.
Re-previewing an existing candidate refreshes its snapshot but never changes
a status a curator already set (e.g. a rejected feed stays rejected).
"""
conn.execute(
"""
INSERT INTO source_candidates (
feed_url, homepage_url, name, status, preview_json, notes, last_previewed_at, updated_at
)
VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
ON CONFLICT(feed_url) DO UPDATE SET
preview_json = excluded.preview_json,
name = COALESCE(excluded.name, source_candidates.name),
homepage_url = COALESCE(excluded.homepage_url, source_candidates.homepage_url),
notes = COALESCE(excluded.notes, source_candidates.notes),
last_previewed_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
""",
(feed_url, homepage_url, name, status, json.dumps(preview) if preview else None, notes),
)
conn.commit()
return conn.execute("SELECT * FROM source_candidates WHERE feed_url = ?", (feed_url,)).fetchone()
def list_candidates(conn: sqlite3.Connection, status: str | None = None) -> list[sqlite3.Row]:
if status:
return conn.execute(
"SELECT * FROM source_candidates WHERE status = ? ORDER BY updated_at DESC", (status,)
).fetchall()
return conn.execute("SELECT * FROM source_candidates ORDER BY updated_at DESC").fetchall()
def reject_candidate(conn: sqlite3.Connection, candidate_id: int) -> bool:
    """Mark the candidate with the given id as rejected and commit.

    Returns True when a row was updated, False when no candidate has that id.
    """
    affected = conn.execute(
        "UPDATE source_candidates SET status = 'rejected', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
        (candidate_id,),
    ).rowcount
    conn.commit()
    return affected > 0
def promote_candidate(
    conn: sqlite3.Connection,
    candidate_id: int,
    active: bool = False,
    default_category: str | None = None,
    trust_score: int = 5,
    pr_risk_score: int = 3,
    poll_interval_minutes: int = 180,
) -> int:
    """Copy a reviewed candidate into the real sources table.

    The candidate is written to ``sources`` through ``upsert_sources`` and is
    then marked ``promoted``. Promotion is inactive unless ``active`` is
    passed explicitly, and nothing calls this automatically.

    Returns:
        The id of the ``sources`` row holding the candidate's feed URL.

    Raises:
        ValueError: If no candidate has the given id.
    """
    candidate = conn.execute(
        "SELECT * FROM source_candidates WHERE id = ?", (candidate_id,)
    ).fetchone()
    if candidate is None:
        raise ValueError(f"no candidate with id {candidate_id}")

    feed_url = candidate["feed_url"]
    # Fall back to the feed's host, then the raw URL, when no name was given.
    display_name = candidate["name"] or urlsplit(feed_url).netloc or feed_url

    source_def = {
        "name": display_name,
        "feed_url": feed_url,
        "homepage_url": candidate["homepage_url"],
        "default_category": default_category,
        "trust_score": trust_score,
        "pr_risk_score": pr_risk_score,
        "active": active,
        "poll_interval_minutes": poll_interval_minutes,
        "notes": f"promoted from candidate {candidate_id}",
    }
    upsert_sources(conn, [source_def])

    conn.execute(
        "UPDATE source_candidates SET status = 'promoted', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
        (candidate_id,),
    )
    conn.commit()

    source_row = conn.execute("SELECT id FROM sources WHERE feed_url = ?", (feed_url,)).fetchone()
    return int(source_row["id"])
+61
View File
@@ -0,0 +1,61 @@
import pytest
from goodnews.db import connect, init_db
from goodnews.sources import (
list_candidates,
promote_candidate,
reject_candidate,
save_candidate,
)
@pytest.fixture
def conn():
    """Provide an initialised in-memory database, closed once the test is done."""
    database = connect(":memory:")
    init_db(database)
    yield database
    database.close()
def test_save_and_list_candidate(conn):
    """A newly saved candidate is quarantined and appears in the listing."""
    snapshot = {"acceptance_rate": 0.8, "sampled": 10}
    saved = save_candidate(conn, "http://x/feed", preview=snapshot, name="X")
    assert saved["status"] == "quarantined"

    listed = list_candidates(conn)
    assert len(listed) == 1
    assert listed[0]["feed_url"] == "http://x/feed"
def test_re_preview_preserves_curator_status(conn):
    """Saving a rejected feed again must leave it rejected."""
    feed = "http://x/feed"
    save_candidate(conn, feed)
    staged = list_candidates(conn)[0]
    reject_candidate(conn, staged["id"])

    # A fresh preview refreshes the snapshot only; it must not revive the feed.
    save_candidate(conn, feed, preview={"acceptance_rate": 0.9})

    current = list_candidates(conn)[0]
    assert current["status"] == "rejected"
def test_promote_creates_inactive_source_and_marks_promoted(conn):
    """Default promotion creates an inactive source and flags the candidate."""
    staged = save_candidate(conn, "http://x/feed", name="Lovely Feed")
    new_source_id = promote_candidate(conn, staged["id"])  # inactive by default

    source = conn.execute(
        "SELECT name, active FROM sources WHERE id = ?", (new_source_id,)
    ).fetchone()
    assert source["name"] == "Lovely Feed"
    # Active-on-approval: the feed is not polled until someone activates it.
    assert source["active"] == 0

    candidate = conn.execute(
        "SELECT status FROM source_candidates WHERE id = ?", (staged["id"],)
    ).fetchone()
    assert candidate["status"] == "promoted"
def test_promote_active_flag(conn):
    """Passing active=True promotes straight to an active source."""
    staged = save_candidate(conn, "http://y/feed", name="Y")
    new_source_id = promote_candidate(conn, staged["id"], active=True)

    source = conn.execute("SELECT active FROM sources WHERE id = ?", (new_source_id,)).fetchone()
    assert source["active"] == 1
def test_promote_unknown_raises(conn):
    """Promoting an id that does not exist raises ValueError."""
    missing_id = 999
    with pytest.raises(ValueError):
        promote_candidate(conn, missing_id)
def test_name_derived_from_url_when_missing(conn):
    """An unnamed candidate is promoted under its feed URL's host name."""
    staged = save_candidate(conn, "https://news.example.org/rss")
    new_source_id = promote_candidate(conn, staged["id"])

    source = conn.execute("SELECT name FROM sources WHERE id = ?", (new_source_id,)).fetchone()
    assert source["name"] == "news.example.org"