Track 3: read-only source preview (vet a feed before adding)

- feeds.preview_feed(): fetch + score a sample WITHOUT persisting; returns
  freshness, acceptance rate, cortisol/ragebait/PR averages, and example
  accepted/rejected items. With an LLM client it also returns topic/flavor mix
  and the model's (accurate) acceptance view.
- CLI 'preview-source URL [--sample] [--classify]'.
- API 'GET /api/source-preview?url=&sample=&classify=' with an http(s)-only
  guard (SSRF note left for go-public hardening).
- Site 'Suggest a source' panel with Quick check (heuristic, instant) and Deep
  check (model, accurate), rendered DOM-safely.
- Tests: network-free preview_feed tests via monkeypatched fetch (45 total).
- README documents the command, endpoint, and updated roadmap.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-05-30 19:37:34 +00:00
parent cabe0b6049
commit 95195daff8
6 changed files with 304 additions and 5 deletions
+7 -3
View File
@@ -17,6 +17,7 @@ python3 -m goodnews check-llm --base-url http://127.0.0.1:1234/v1 --model gpt-os
python3 -m goodnews classify --limit 10 --base-url http://127.0.0.1:1234/v1 --model gpt-oss
python3 -m goodnews dedup --base-url http://127.0.0.1:1234/v1
python3 -m goodnews check-feeds
python3 -m goodnews preview-source https://example.com/feed/ --classify
python3 -m goodnews build-brief --date 2026-05-27 --replace
python3 -m goodnews show-brief
python3 -m goodnews list-recent --limit 10
@@ -99,6 +100,7 @@ Endpoints:
- `GET /api/feed?topic=&flavor=&limit=&offset=` — ranked, filtered articles
- `GET /api/brief?date=&limit=` — a daily brief (latest if no date)
- `GET /api/brief-dates` — available brief dates
- `GET /api/source-preview?url=&sample=&classify=` — read-only scored sample of a feed (vet before adding)
- `GET /docs` — interactive OpenAPI documentation
The ingestion CLI stays pure-stdlib; only the `web` extra pulls in FastAPI/uvicorn,
@@ -184,9 +186,11 @@ and site, scheduled `cycle` via systemd, a pytest suite, and device-local Calm F
Still ahead:
1. **Supervised source pipeline** — paste a feed URL, preview a scored sample
(freshness, acceptance rate, topic/flavor mix, cortisol/ragebait/PR averages,
example items), then add to quarantine before it can reach the main feed.
1. **Supervised source pipeline** — read-only preview is done (`preview-source` /
`/api/source-preview`: freshness, acceptance rate, topic/flavor mix,
cortisol/ragebait/PR averages, example items). Still ahead: add a previewed
source to *quarantine*, and auto-degrade stale/rejecting feeds (advisory, never
auto-blocking).
2. **Learned "Less like this" weighting** — replace the interim flavor-pause with
real preference down-ranking.
3. **Corpus rebalancing** — add calm/feelgood sources (currently science-heavy).
+43 -1
View File
@@ -14,6 +14,7 @@ so the API and CLI always read the same file.
from __future__ import annotations
import os
import re
import sqlite3
from collections import Counter
from contextlib import contextmanager
@@ -25,9 +26,10 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from . import queries
from . import feeds, queries
from .db import connect, init_db
from .filters import filter_articles, prefs_from_json
from .llm import LocalModelClient
from .taxonomy import FLAVORS, TOPICS
ROOT = Path(__file__).resolve().parents[1]
@@ -118,6 +120,28 @@ class BriefResponse(BaseModel):
items: list[Article]
# One sampled item that would be skipped, paired with the code explaining why.
# feeds.preview_feed builds these as {"title": ..., "reason": <reason_code>}.
class RejectedExample(BaseModel):
    title: str  # item title
    reason: str  # the item's reason code
# Response body for GET /api/source-preview. It is built with
# SourcePreview(**data) from the dict returned by feeds.preview_feed, so the
# field names here must match that dict's keys.
class SourcePreview(BaseModel):
    url: str  # feed URL that was previewed
    sampled: int  # number of items that were scored
    classified: bool  # whether model classification was used on top of the heuristic
    accepted: int  # how many sampled items would be accepted
    acceptance_rate: float  # accepted / sampled (0.0 for an empty sample)
    avg_cortisol: float  # mean cortisol score across the sample
    avg_ragebait: float  # mean ragebait score across the sample
    avg_pr_risk: float  # mean PR-risk score across the sample
    newest_published: str | None  # ISO timestamp of the newest dated item, if any
    recent_7d: int  # count of dated items from about the last week
    topic_mix: dict[str, int]  # topic -> item count (only filled by the model)
    flavor_mix: dict[str, int]  # flavor -> item count (only filled by the model)
    examples_accepted: list[str]  # up to five titles that would be accepted
    examples_rejected: list[RejectedExample]  # up to five items that would be skipped
# --- App --------------------------------------------------------------------
@@ -227,6 +251,24 @@ def create_app() -> FastAPI:
with get_conn() as conn:
return queries.available_dates(conn, limit=limit)
@app.get("/api/source-preview", response_model=SourcePreview)
def source_preview(
    url: str = Query(..., max_length=2048),
    sample: int = Query(25, ge=1, le=50),
    classify: bool = Query(False, description="Also classify with the local model (accurate but slower)"),
) -> SourcePreview:
    # Read-only sample scoring; nothing is persisted. Only http(s) is allowed.
    #
    # Parameters:
    #   url      - feed URL to sample; must start with http:// or https://
    #   sample   - how many leading feed items to score (1..50)
    #   classify - when true, build a LocalModelClient from the environment so
    #              the preview also carries the model's view
    # Responses:
    #   400 - url does not use the http(s) scheme
    #   502 - the feed could not be fetched, parsed or scored
    #
    # NOTE: fetching a user-supplied URL is an SSRF surface — before exposing
    # this publicly, also block private/loopback/link-local address ranges.
    if not re.match(r"^https?://", url, re.IGNORECASE):
        raise HTTPException(400, "url must start with http:// or https://")
    client = LocalModelClient.from_env() if classify else None
    try:
        data = feeds.preview_feed(url, sample=sample, client=client)
    except Exception as exc:
        # Chain the cause so server-side tracebacks still show what actually
        # failed (network error, XML parse error, ...) behind the 502.
        raise HTTPException(502, f"could not preview feed: {exc}") from exc
    return SourcePreview(**data)
# Static site last, mounted at root, so /api/* and /healthz win.
if STATIC_DIR.is_dir():
app.mount("/", StaticFiles(directory=str(STATIC_DIR), html=True), name="site")
+38 -1
View File
@@ -9,7 +9,14 @@ from pathlib import Path
from .briefs import build_daily_brief, show_brief
from .db import connect, init_db
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source
from .feeds import (
fetch_feed,
parse_feed,
poll_all_sources,
poll_due_sources,
poll_source,
preview_feed,
)
from .llm import LocalModelClient, classify_articles
from .scoring import score_article
from .sources import load_sources, upsert_sources
@@ -52,6 +59,13 @@ def main() -> None:
check_feeds_parser = subparsers.add_parser("check-feeds", help="Fetch and parse each feed, reporting health")
check_feeds_parser.add_argument("--all", action="store_true", help="Include inactive sources")
preview_parser = subparsers.add_parser("preview-source", help="Score a sample of a feed without adding it")
preview_parser.add_argument("url", help="Feed URL to preview")
preview_parser.add_argument("--sample", type=int, default=25)
preview_parser.add_argument("--classify", action="store_true", help="Also classify with the local model (slower)")
preview_parser.add_argument("--base-url", help="OpenAI-compatible base URL (with --classify)")
preview_parser.add_argument("--model", help="Local model name (with --classify)")
runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
runs_parser.add_argument("--limit", type=int, default=20)
@@ -136,6 +150,10 @@ def main() -> None:
source_report(conn)
elif args.command == "check-feeds":
check_feeds(conn, include_inactive=args.all)
elif args.command == "preview-source":
client = llm_client_from_args(args) if args.classify else None
preview = preview_feed(args.url, sample=args.sample, client=client)
print_preview(preview)
elif args.command == "list-runs":
list_runs(conn, limit=args.limit)
elif args.command == "rescore":
@@ -243,6 +261,25 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
print(f" {row['canonical_url']}")
def print_preview(p: dict) -> None:
    """Print a plain-text report of a feed preview dict to stdout.

    ``p`` is expected to carry the keys read below (``url``, ``classified``,
    ``sampled``, ``accepted``, ``acceptance_rate``, ``newest_published``,
    ``recent_7d``, the three ``avg_*`` values, ``topic_mix``, ``flavor_mix``,
    ``examples_accepted`` and ``examples_rejected``).
    """
    if p["classified"]:
        scoring_mode = "model"
    else:
        scoring_mode = "heuristic"
    print(f"Preview of {p['url']} ({scoring_mode})")
    print(f" sampled={p['sampled']} accepted={p['accepted']} ({p['acceptance_rate']*100:.0f}%)")

    newest = p["newest_published"] or "unknown"
    print(f" freshness: newest={newest} in_last_7d={p['recent_7d']}")
    print(f" averages: cortisol={p['avg_cortisol']} ragebait={p['avg_ragebait']} pr_risk={p['avg_pr_risk']}")

    # The topic/flavor breakdown is only shown when there is a topic mix.
    topics = p["topic_mix"]
    if topics:
        print(f" topics: {topics}")
        print(f" flavors: {p['flavor_mix']}")

    kept_titles = p["examples_accepted"]
    if kept_titles:
        print(" would accept:")
        for title in kept_titles:
            # Long titles are cut so each example stays on one line.
            print(f" + {title[:80]}")

    skipped = p["examples_rejected"]
    if skipped:
        print(" would skip:")
        for entry in skipped:
            print(f" - {entry['title'][:70]} ({entry['reason']})")
def check_feeds(conn: sqlite3.Connection, include_inactive: bool = False) -> None:
where = "" if include_inactive else "WHERE active = 1"
rows = conn.execute(f"SELECT name, feed_url FROM sources {where} ORDER BY name").fetchall()
+101
View File
@@ -5,6 +5,7 @@ import sqlite3
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from collections import Counter
from dataclasses import dataclass
from datetime import UTC, datetime
@@ -133,6 +134,106 @@ def poll_source(conn: sqlite3.Connection, source: sqlite3.Row) -> dict:
}
def preview_feed(url: str, sample: int = 25, pr_risk_default: int = 3, client=None) -> dict:
    """Fetch and score a sample of a feed WITHOUT persisting anything.

    Read-only: lets a user vet a candidate source before it is ever added. By
    default it uses the fast heuristic; pass an LLM client to also get the
    topic/flavor mix and the model's acceptance view (slower).

    Args:
        url: Feed URL, handed to ``fetch_feed`` unchanged.
        sample: Maximum number of leading feed items to score. Items without a
            title are skipped, so fewer may be reported. Negative values are
            treated as zero.
        pr_risk_default: PR-risk score given to the heuristic and, when a
            client is supplied, sent to the model as the source's PR risk.
        client: Optional object exposing ``classify(payload)`` and a ``model``
            attribute. When omitted, only the heuristic runs.

    Returns:
        A dict with the keys ``url``, ``sampled``, ``classified``, ``accepted``,
        ``acceptance_rate``, ``avg_cortisol``, ``avg_ragebait``,
        ``avg_pr_risk``, ``newest_published``, ``recent_7d``, ``topic_mix``,
        ``flavor_mix``, ``examples_accepted`` and ``examples_rejected``.
        ``classified`` is True only if the model scored at least one item.

    Raises:
        Any exception raised by ``fetch_feed`` or ``parse_feed`` propagates;
        per-item model failures are swallowed (best effort).
    """
    # A negative slice bound would silently drop items from the END of the
    # feed instead of limiting the sample, so clamp it.
    limit = max(sample, 0)
    items = parse_feed(fetch_feed(url))
    rows = []
    for item in items[:limit]:
        title = clean_text(item.title, max_len=500)
        if not title:
            continue
        description = clean_text(item.description, max_len=1000)
        s = score_article(title, description, pr_risk_default)
        rows.append(
            {
                "title": title,
                "description": description,
                "url": canonicalize_url(item.url),
                "published_at": item.published_at,
                "accepted": bool(s["accepted"]),
                "cortisol": s["cortisol_score"],
                "ragebait": s["ragebait_score"],
                "pr_risk": s["pr_risk_score"],
                "reason_code": s["reason_code"],
                "topic": None,
                "flavor": None,
            }
        )

    model_scored = 0
    if client and rows:
        # Imported lazily so the heuristic-only path does not need the LLM module.
        from .llm import normalize_scores

        for r in rows:
            try:
                raw = client.classify(
                    {
                        "source_name": "preview",
                        "default_category": None,
                        "source_trust_score": 5,
                        "source_pr_risk_score": pr_risk_default,
                        "published_at": r["published_at"],
                        "title": r["title"],
                        "description": r["description"] or "",
                        "canonical_url": r["url"],
                    }
                )
                ns = normalize_scores(raw, model_name=client.model)
                # NOTE(review): reason_code still comes from the heuristic, so
                # it may not explain a rejection decided by the model — confirm
                # whether normalize_scores exposes a reason to use instead.
                r.update(
                    accepted=bool(ns["accepted"]),
                    topic=ns["topic"],
                    flavor=ns["flavor"],
                    cortisol=ns["cortisol_score"],
                    ragebait=ns["ragebait_score"],
                    pr_risk=ns["pr_risk_score"],
                )
            except Exception:
                # One bad item shouldn't sink the whole preview; the row keeps
                # its heuristic scores.
                continue
            model_scored += 1
    # Only report model mode when the model really scored something; if every
    # call failed the numbers below are purely heuristic.
    classified = model_scored > 0

    total = len(rows)
    accepted = sum(1 for r in rows if r["accepted"])

    def _avg(key: str) -> float:
        return round(sum(r[key] for r in rows) / total, 1) if total else 0.0

    # Freshness: newest item and how many landed in the last week.
    now = datetime.now(UTC)
    dates = []
    for r in rows:
        if not r["published_at"]:
            continue
        try:
            parsed = datetime.fromisoformat(r["published_at"])
        except ValueError:
            continue
        if parsed.tzinfo is None:
            # Naive and aware datetimes cannot be compared or subtracted.
            # Assumes zone-less feed timestamps are UTC — TODO confirm.
            parsed = parsed.replace(tzinfo=UTC)
        dates.append(parsed)
    newest = max(dates).isoformat() if dates else None
    # Compare the full age, not timedelta.days, which truncates and would let
    # items almost eight days old count as "last 7 days".
    week_seconds = 7 * 24 * 60 * 60
    recent_7d = sum(1 for d in dates if (now - d).total_seconds() <= week_seconds)

    return {
        "url": url,
        "sampled": total,
        "classified": classified,
        "accepted": accepted,
        "acceptance_rate": round(accepted / total, 2) if total else 0.0,
        "avg_cortisol": _avg("cortisol"),
        "avg_ragebait": _avg("ragebait"),
        "avg_pr_risk": _avg("pr_risk"),
        "newest_published": newest,
        "recent_7d": recent_7d,
        "topic_mix": dict(Counter(r["topic"] for r in rows if r["topic"])),
        "flavor_mix": dict(Counter(r["flavor"] for r in rows if r["flavor"])),
        "examples_accepted": [r["title"] for r in rows if r["accepted"]][:5],
        "examples_rejected": [
            {"title": r["title"], "reason": r["reason_code"]} for r in rows if not r["accepted"]
        ][:5],
    }
def fetch_feed(url: str, timeout: int = 20) -> bytes:
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
try:
+79
View File
@@ -88,6 +88,19 @@
.panel .reset { margin-top: 6px; background: none; border: none; color: var(--muted);
cursor: pointer; font-size: 0.8rem; text-decoration: underline; }
.calm-note { color: var(--muted); font-size: 0.8rem; margin: -8px 0 14px; }
.addsource { background: var(--card); border: 1px solid var(--line); border-radius: 12px;
padding: 16px 18px; margin-bottom: 18px; }
.addsource .hint { color: var(--muted); font-size: 0.82rem; margin: 0 0 10px; }
.addsource-row { display: flex; gap: 8px; flex-wrap: wrap; }
.addsource input { flex: 1 1 240px; border: 1px solid var(--line); border-radius: 8px;
padding: 7px 11px; font-size: 0.9rem; }
.addbtn { border: 1px solid var(--accent); background: var(--accent); color: #fff;
border-radius: 8px; padding: 7px 14px; cursor: pointer; font-size: 0.85rem; }
.addbtn.ghost { background: none; color: var(--accent); }
.preview-metrics { margin-top: 14px; }
.preview-metrics .stat { font-size: 0.9rem; margin: 2px 0; }
.preview-metrics .label { color: var(--muted); }
.preview-metrics ul { margin: 4px 0 10px; padding-left: 18px; font-size: 0.86rem; }
footer { text-align: center; color: var(--muted); font-size: 0.78rem; padding: 20px; }
footer a { color: var(--accent); }
</style>
@@ -130,6 +143,17 @@
<div id="topic-chips" class="chips"></div>
<div id="flavor-chips" class="chips"></div>
<div id="feed"></div>
<div class="section-title">Suggest a source</div>
<div class="addsource">
<p class="hint">Paste a feed URL to see how calm it is before anyone adds it. Nothing is saved — this just samples and scores recent items.</p>
<div class="addsource-row">
<input id="src-url" type="text" placeholder="https://example.com/feed/" />
<button id="src-quick" class="addbtn">Quick check</button>
<button id="src-deep" class="addbtn ghost">Deep check (uses model)</button>
</div>
<div id="src-result"></div>
</div>
</main>
<footer>
goodNews · metadata &amp; links only, no stored articles ·
@@ -354,6 +378,11 @@
fc.appendChild(chip("all", true, () => setFlavor(null)));
cats.flavors.forEach(f => fc.appendChild(chip(f.key, false, () => setFlavor(f.key))));
// source preview controls
el("src-quick").onclick = () => previewSource(false);
el("src-deep").onclick = () => previewSource(true);
el("src-url").addEventListener("keydown", (e) => { if (e.key === "Enter") previewSource(false); });
// panel controls
el("calm-toggle").onclick = () => el("panel").classList.toggle("open");
el("term-add").onclick = addTerm;
@@ -363,6 +392,56 @@
refreshAll();
}
// Run a read-only preview of the URL typed into #src-url and show the outcome
// in #src-result. `deep` asks the API to classify with the model as well.
async function previewSource(deep) {
  const feedUrl = el("src-url").value.trim();
  const target = el("src-result");
  const notice = (text) => node("div", "empty", text);

  target.replaceChildren();
  const isHttp = /^https?:\/\//i.test(feedUrl);
  if (!isHttp) {
    target.append(notice("Enter a URL starting with http:// or https://"));
    return;
  }

  let waitingText = "Checking…";
  if (deep) {
    waitingText = "Deep checking with the model — this can take a moment…";
  }
  target.append(notice(waitingText));

  try {
    // Built inside the try so an encoding failure shows the same error notice.
    const endpoint = `/api/source-preview?url=${encodeURIComponent(feedUrl)}&classify=${deep}`;
    const preview = await getJSON(endpoint);
    renderPreview(target, preview);
  } catch (err) {
    target.replaceChildren(notice("Could not read that feed."));
  }
}
// Render a /api/source-preview payload `p` into the container `out`.
// Everything is added through node()/createTextNode, so feed-supplied titles
// are never interpreted as HTML.
function renderPreview(out, p) {
  out.replaceChildren();
  const box = node("div", "preview-metrics");

  // One "label value" line.
  const stat = (label, value) => {
    const d = node("div", "stat");
    d.append(node("span", "label", label + " "));
    d.append(document.createTextNode(String(value)));
    box.append(d);
  };

  // A headed bullet list; omitted entirely when there is nothing to show.
  const list = (heading, texts) => {
    if (!texts.length) return;
    box.append(node("div", "stat label", heading));
    const ul = node("ul");
    texts.forEach(t => ul.append(node("li", null, t)));
    box.append(ul);
  };

  stat("Mode:", p.classified ? "model (accurate)" : "heuristic (quick, conservative)");
  stat("Acceptance:", `${Math.round(p.acceptance_rate * 100)}% (${p.accepted}/${p.sampled})`);
  stat("Freshness:", `${p.recent_7d}/${p.sampled} in last 7 days · newest ${(p.newest_published||"unknown").slice(0,10)}`);
  stat("Calm averages:", `cortisol ${p.avg_cortisol} · ragebait ${p.avg_ragebait} · PR ${p.avg_pr_risk}`);

  const mix = (m) => Object.entries(m).map(([k, v]) => `${k} ${v}`).join(" · ") || "—";
  if (p.classified) {
    // Topic/flavor counts only exist when the model classified the sample.
    stat("Topics:", mix(p.topic_mix));
    stat("Flavors:", mix(p.flavor_mix));
  }

  list("Would surface:", p.examples_accepted);
  // Show why an item would be skipped, as the CLI preview does; the API sends
  // a reason with every rejected example.
  list("Would skip:", p.examples_rejected.map(e => (e.reason ? `${e.title} (${e.reason})` : e.title)));

  out.append(box);
}
function addTerm() {
const v = el("term-input").value.trim();
if (v && !prefs.avoid_terms.includes(v)) { prefs.avoid_terms.push(v); savePrefs(); }
+36
View File
@@ -0,0 +1,36 @@
import goodnews.feeds as feeds
# Canned two-item RSS document returned by the patched fetch_feed in the tests
# below, so no network access is needed: one restore/rescue story and one
# routine notice.
RSS = b"""<?xml version="1.0"?>
<rss><channel>
<item>
<title>Volunteers restore creek and rescue stranded wildlife</title>
<description>A hopeful recovery effort</description>
<link>http://example.com/1</link>
<pubDate>Sat, 30 May 2026 10:00:00 GMT</pubDate>
</item>
<item>
<title>Quarterly tax filing deadline reminder</title>
<description>routine notice</description>
<link>http://example.com/2</link>
<pubDate>Sat, 30 May 2026 09:00:00 GMT</pubDate>
</item>
</channel></rss>"""
def test_preview_feed_scores_sample_without_network(monkeypatch):
    """Heuristic-only preview of the canned feed works with fetch_feed stubbed out."""

    def canned_fetch(url, **kw):
        return RSS

    monkeypatch.setattr(feeds, "fetch_feed", canned_fetch)
    preview = feeds.preview_feed("http://example.com/feed")

    assert preview["sampled"] == 2
    assert preview["classified"] is False
    rate = preview["acceptance_rate"]
    assert 0.0 <= rate <= 1.0
    assert preview["accepted"] >= 1  # the restore/rescue story should pass the heuristic
    assert preview["newest_published"] is not None
    topic_mix = preview["topic_mix"]
    assert isinstance(topic_mix, dict) and topic_mix == {}  # empty without a model
    for example in preview["examples_rejected"]:
        assert "reason" in example and "title" in example
def test_preview_feed_respects_sample_cap(monkeypatch):
    """With sample=1 only the first item of the two-item feed is scored."""

    def canned_fetch(url, **kw):
        return RSS

    monkeypatch.setattr(feeds, "fetch_feed", canned_fetch)
    preview = feeds.preview_feed("http://example.com/feed", sample=1)
    assert preview["sampled"] == 1