Track 3: read-only source preview (vet a feed before adding)

- feeds.preview_feed(): fetch + score a sample WITHOUT persisting; returns freshness, acceptance rate, cortisol/ragebait/PR averages, and example accepted/rejected items. With an LLM client it also returns topic/flavor mix and the model's (accurate) acceptance view. - CLI 'preview-source URL [--sample] [--classify]'. - API 'GET /api/source-preview?url=&sample=&classify=' with an http(s)-only guard (SSRF note left for go-public hardening). - Site 'Suggest a source' panel with Quick check (heuristic, instant) and Deep check (model, accurate), rendered DOM-safely. - Tests: network-free preview_feed tests via monkeypatched fetch (45 total). - README documents the command, endpoint, and updated roadmap. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 19:37:34 +00:00
parent cabe0b6049
commit 95195daff8
6 changed files with 304 additions and 5 deletions
@@ -17,6 +17,7 @@ python3 -m goodnews check-llm --base-url http://127.0.0.1:1234/v1 --model gpt-os
 python3 -m goodnews classify --limit 10 --base-url http://127.0.0.1:1234/v1 --model gpt-oss
 python3 -m goodnews dedup --base-url http://127.0.0.1:1234/v1
 python3 -m goodnews check-feeds
+python3 -m goodnews preview-source https://example.com/feed/ --classify
 python3 -m goodnews build-brief --date 2026-05-27 --replace
 python3 -m goodnews show-brief
 python3 -m goodnews list-recent --limit 10
@@ -99,6 +100,7 @@ Endpoints:
 - `GET /api/feed?topic=&flavor=&limit=&offset=` — ranked, filtered articles
 - `GET /api/brief?date=&limit=` — a daily brief (latest if no date)
 - `GET /api/brief-dates` — available brief dates
+- `GET /api/source-preview?url=&sample=&classify=` — read-only scored sample of a feed (vet before adding)
 - `GET /docs` — interactive OpenAPI documentation

 The ingestion CLI stays pure-stdlib; only the `web` extra pulls in FastAPI/uvicorn,
@@ -184,9 +186,11 @@ and site, scheduled `cycle` via systemd, a pytest suite, and device-local Calm F

 Still ahead:

-1. **Supervised source pipeline** — paste a feed URL, preview a scored sample
-   (freshness, acceptance rate, topic/flavor mix, cortisol/ragebait/PR averages,
-   example items), then add to quarantine before it can reach the main feed.
+1. **Supervised source pipeline** — read-only preview is done (`preview-source` /
+   `/api/source-preview`: freshness, acceptance rate, topic/flavor mix,
+   cortisol/ragebait/PR averages, example items). Still ahead: add a previewed
+   source to *quarantine*, and auto-degrade stale/rejecting feeds (advisory, never
+   auto-blocking).
 2. **Learned "Less like this" weighting** — replace the interim flavor-pause with
   real preference down-ranking.
 3. **Corpus rebalancing** — add calm/feelgood sources (currently science-heavy).
@@ -14,6 +14,7 @@ so the API and CLI always read the same file.
 from __future__ import annotations

 import os
+import re
 import sqlite3
 from collections import Counter
 from contextlib import contextmanager
@@ -25,9 +26,10 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel

-from . import queries
+from . import feeds, queries
 from .db import connect, init_db
 from .filters import filter_articles, prefs_from_json
+from .llm import LocalModelClient
 from .taxonomy import FLAVORS, TOPICS

 ROOT = Path(__file__).resolve().parents[1]
@@ -118,6 +120,28 @@ class BriefResponse(BaseModel):
    items: list[Article]


+# One sampled item that would be skipped: its title plus the reason code that
+# feeds.preview_feed() attaches to the row.
+class RejectedExample(BaseModel):
+    title: str
+    reason: str
+
+
+# Response body of GET /api/source-preview. It is built directly from the dict
+# returned by feeds.preview_feed(), so the field names must match its keys.
+class SourcePreview(BaseModel):
+    url: str  # the feed URL that was previewed
+    sampled: int  # number of sampled items that had a title and were scored
+    classified: bool  # whether a model client was used for this preview
+    accepted: int  # how many sampled items would be accepted
+    acceptance_rate: float  # accepted / sampled as a 0..1 fraction (0.0 if nothing sampled)
+    avg_cortisol: float  # mean cortisol score over the sample, one decimal
+    avg_ragebait: float  # mean ragebait score over the sample, one decimal
+    avg_pr_risk: float  # mean PR-risk score over the sample, one decimal
+    newest_published: str | None  # ISO timestamp of the newest dated item, if any parsed
+    recent_7d: int  # count of dated items published within roughly the last week
+    topic_mix: dict[str, int]  # topic -> item count; stays empty without a model
+    flavor_mix: dict[str, int]  # flavor -> item count; stays empty without a model
+    examples_accepted: list[str]  # up to five titles that would be accepted
+    examples_rejected: list[RejectedExample]  # up to five items that would be skipped
+
+
 # --- App --------------------------------------------------------------------


@@ -227,6 +251,24 @@ def create_app() -> FastAPI:
        with get_conn() as conn:
            return queries.available_dates(conn, limit=limit)

+    @app.get("/api/source-preview", response_model=SourcePreview)
+    def source_preview(
+        url: str = Query(..., max_length=2048),
+        sample: int = Query(25, ge=1, le=50),
+        classify: bool = Query(False, description="Also classify with the local model (accurate but slower)"),
+    ) -> SourcePreview:
+        # Read-only sample scoring; nothing is persisted. Only http(s) is allowed.
+        # NOTE: fetching a user-supplied URL is an SSRF surface — before exposing
+        # this publicly, also block private/loopback/link-local address ranges.
+        if not re.match(r"^https?://", url, re.IGNORECASE):
+            raise HTTPException(400, "url must start with http:// or https://")
+        client = LocalModelClient.from_env() if classify else None
+        try:
+            data = feeds.preview_feed(url, sample=sample, client=client)
+        except Exception as exc:
+            raise HTTPException(502, f"could not preview feed: {exc}")
+        return SourcePreview(**data)
+
    # Static site last, mounted at root, so /api/* and /healthz win.
    if STATIC_DIR.is_dir():
        app.mount("/", StaticFiles(directory=str(STATIC_DIR), html=True), name="site")
@@ -9,7 +9,14 @@ from pathlib import Path
 from .briefs import build_daily_brief, show_brief
 from .db import connect, init_db
 from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
-from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source
+from .feeds import (
+    fetch_feed,
+    parse_feed,
+    poll_all_sources,
+    poll_due_sources,
+    poll_source,
+    preview_feed,
+)
 from .llm import LocalModelClient, classify_articles
 from .scoring import score_article
 from .sources import load_sources, upsert_sources
@@ -52,6 +59,13 @@ def main() -> None:
    check_feeds_parser = subparsers.add_parser("check-feeds", help="Fetch and parse each feed, reporting health")
    check_feeds_parser.add_argument("--all", action="store_true", help="Include inactive sources")

+    preview_parser = subparsers.add_parser("preview-source", help="Score a sample of a feed without adding it")
+    preview_parser.add_argument("url", help="Feed URL to preview")
+    preview_parser.add_argument("--sample", type=int, default=25)
+    preview_parser.add_argument("--classify", action="store_true", help="Also classify with the local model (slower)")
+    preview_parser.add_argument("--base-url", help="OpenAI-compatible base URL (with --classify)")
+    preview_parser.add_argument("--model", help="Local model name (with --classify)")
+
    runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
    runs_parser.add_argument("--limit", type=int, default=20)

@@ -136,6 +150,10 @@ def main() -> None:
        source_report(conn)
    elif args.command == "check-feeds":
        check_feeds(conn, include_inactive=args.all)
+    elif args.command == "preview-source":
+        client = llm_client_from_args(args) if args.classify else None
+        preview = preview_feed(args.url, sample=args.sample, client=client)
+        print_preview(preview)
    elif args.command == "list-runs":
        list_runs(conn, limit=args.limit)
    elif args.command == "rescore":
@@ -243,6 +261,25 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
        print(f"  {row['canonical_url']}")


+def print_preview(p: dict) -> None:
+    mode = "model" if p["classified"] else "heuristic"
+    print(f"Preview of {p['url']}  ({mode})")
+    print(f"  sampled={p['sampled']} accepted={p['accepted']} ({p['acceptance_rate']*100:.0f}%)")
+    print(f"  freshness: newest={p['newest_published'] or 'unknown'} in_last_7d={p['recent_7d']}")
+    print(f"  averages: cortisol={p['avg_cortisol']} ragebait={p['avg_ragebait']} pr_risk={p['avg_pr_risk']}")
+    if p["topic_mix"]:
+        print(f"  topics: {p['topic_mix']}")
+        print(f"  flavors: {p['flavor_mix']}")
+    if p["examples_accepted"]:
+        print("  would accept:")
+        for t in p["examples_accepted"]:
+            print(f"    + {t[:80]}")
+    if p["examples_rejected"]:
+        print("  would skip:")
+        for ex in p["examples_rejected"]:
+            print(f"    - {ex['title'][:70]} ({ex['reason']})")
+
+
 def check_feeds(conn: sqlite3.Connection, include_inactive: bool = False) -> None:
    where = "" if include_inactive else "WHERE active = 1"
    rows = conn.execute(f"SELECT name, feed_url FROM sources {where} ORDER BY name").fetchall()
@@ -5,6 +5,7 @@ import sqlite3
 import urllib.error
 import urllib.request
 import xml.etree.ElementTree as ET
+from collections import Counter
 from dataclasses import dataclass
 from datetime import UTC, datetime

@@ -133,6 +134,106 @@ def poll_source(conn: sqlite3.Connection, source: sqlite3.Row) -> dict:
        }


+def preview_feed(url: str, sample: int = 25, pr_risk_default: int = 3, client=None) -> dict:
+    """Fetch and score a sample of a feed WITHOUT persisting anything.
+
+    Read-only: lets a user vet a candidate source before it is ever added. By
+    default it uses the fast heuristic; pass an LLM client to also get the
+    topic/flavor mix and the model's acceptance view (slower).
+    """
+    items = parse_feed(fetch_feed(url))
+    rows = []
+    for item in items[:sample]:
+        title = clean_text(item.title, max_len=500)
+        if not title:
+            continue
+        description = clean_text(item.description, max_len=1000)
+        s = score_article(title, description, pr_risk_default)
+        rows.append(
+            {
+                "title": title,
+                "description": description,
+                "url": canonicalize_url(item.url),
+                "published_at": item.published_at,
+                "accepted": bool(s["accepted"]),
+                "cortisol": s["cortisol_score"],
+                "ragebait": s["ragebait_score"],
+                "pr_risk": s["pr_risk_score"],
+                "reason_code": s["reason_code"],
+                "topic": None,
+                "flavor": None,
+            }
+        )
+
+    classified = False
+    if client and rows:
+        from .llm import normalize_scores
+
+        classified = True
+        for r in rows:
+            try:
+                raw = client.classify(
+                    {
+                        "source_name": "preview",
+                        "default_category": None,
+                        "source_trust_score": 5,
+                        "source_pr_risk_score": pr_risk_default,
+                        "published_at": r["published_at"],
+                        "title": r["title"],
+                        "description": r["description"] or "",
+                        "canonical_url": r["url"],
+                    }
+                )
+                ns = normalize_scores(raw, model_name=client.model)
+                r.update(
+                    accepted=bool(ns["accepted"]),
+                    topic=ns["topic"],
+                    flavor=ns["flavor"],
+                    cortisol=ns["cortisol_score"],
+                    ragebait=ns["ragebait_score"],
+                    pr_risk=ns["pr_risk_score"],
+                )
+            except Exception:
+                pass  # one bad item shouldn't sink the whole preview
+
+    total = len(rows)
+    accepted = sum(1 for r in rows if r["accepted"])
+
+    def _avg(key: str) -> float:
+        return round(sum(r[key] for r in rows) / total, 1) if total else 0.0
+
+    # Freshness: newest item and how many landed in the last week.
+    now = datetime.now(UTC)
+    dates = []
+    for r in rows:
+        if r["published_at"]:
+            try:
+                dates.append(datetime.fromisoformat(r["published_at"]))
+            except ValueError:
+                pass
+    newest = max(dates).isoformat() if dates else None
+    recent_7d = sum(1 for d in dates if (now - d).days <= 7)
+
+    return {
+        "url": url,
+        "sampled": total,
+        "classified": classified,
+        "accepted": accepted,
+        "acceptance_rate": round(accepted / total, 2) if total else 0.0,
+        "avg_cortisol": _avg("cortisol"),
+        "avg_ragebait": _avg("ragebait"),
+        "avg_pr_risk": _avg("pr_risk"),
+        "newest_published": newest,
+        "recent_7d": recent_7d,
+        "topic_mix": dict(Counter(r["topic"] for r in rows if r["topic"])),
+        "flavor_mix": dict(Counter(r["flavor"] for r in rows if r["flavor"])),
+        "examples_accepted": [r["title"] for r in rows if r["accepted"]][:5],
+        "examples_rejected": [
+            {"title": r["title"], "reason": r["reason_code"]} for r in rows if not r["accepted"]
+        ][:5],
+    }
+
+
 def fetch_feed(url: str, timeout: int = 20) -> bytes:
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
@@ -88,6 +88,19 @@
    .panel .reset { margin-top: 6px; background: none; border: none; color: var(--muted);
      cursor: pointer; font-size: 0.8rem; text-decoration: underline; }
    .calm-note { color: var(--muted); font-size: 0.8rem; margin: -8px 0 14px; }
+    .addsource { background: var(--card); border: 1px solid var(--line); border-radius: 12px;
+      padding: 16px 18px; margin-bottom: 18px; }
+    .addsource .hint { color: var(--muted); font-size: 0.82rem; margin: 0 0 10px; }
+    .addsource-row { display: flex; gap: 8px; flex-wrap: wrap; }
+    .addsource input { flex: 1 1 240px; border: 1px solid var(--line); border-radius: 8px;
+      padding: 7px 11px; font-size: 0.9rem; }
+    .addbtn { border: 1px solid var(--accent); background: var(--accent); color: #fff;
+      border-radius: 8px; padding: 7px 14px; cursor: pointer; font-size: 0.85rem; }
+    .addbtn.ghost { background: none; color: var(--accent); }
+    .preview-metrics { margin-top: 14px; }
+    .preview-metrics .stat { font-size: 0.9rem; margin: 2px 0; }
+    .preview-metrics .label { color: var(--muted); }
+    .preview-metrics ul { margin: 4px 0 10px; padding-left: 18px; font-size: 0.86rem; }
    footer { text-align: center; color: var(--muted); font-size: 0.78rem; padding: 20px; }
    footer a { color: var(--accent); }
  </style>
@@ -130,6 +143,17 @@
    <div id="topic-chips" class="chips"></div>
    <div id="flavor-chips" class="chips"></div>
    <div id="feed"></div>
+
+    <div class="section-title">Suggest a source</div>
+    <div class="addsource">
+      <p class="hint">Paste a feed URL to see how calm it is before anyone adds it. Nothing is saved — this just samples and scores recent items.</p>
+      <div class="addsource-row">
+        <input id="src-url" type="text" placeholder="https://example.com/feed/" />
+        <button id="src-quick" class="addbtn">Quick check</button>
+        <button id="src-deep" class="addbtn ghost">Deep check (uses model)</button>
+      </div>
+      <div id="src-result"></div>
+    </div>
  </main>
  <footer>
    goodNews · metadata &amp; links only, no stored articles ·
@@ -354,6 +378,11 @@
      fc.appendChild(chip("all", true, () => setFlavor(null)));
      cats.flavors.forEach(f => fc.appendChild(chip(f.key, false, () => setFlavor(f.key))));

+      // source preview controls
+      el("src-quick").onclick = () => previewSource(false);
+      el("src-deep").onclick = () => previewSource(true);
+      el("src-url").addEventListener("keydown", (e) => { if (e.key === "Enter") previewSource(false); });
+
      // panel controls
      el("calm-toggle").onclick = () => el("panel").classList.toggle("open");
      el("term-add").onclick = addTerm;
@@ -363,6 +392,56 @@
      refreshAll();
    }

+    async function previewSource(deep) {
+      const url = el("src-url").value.trim();
+      const out = el("src-result");
+      out.replaceChildren();
+      if (!/^https?:\/\//i.test(url)) {
+        out.append(node("div", "empty", "Enter a URL starting with http:// or https://"));
+        return;
+      }
+      out.append(node("div", "empty", deep ? "Deep checking with the model — this can take a moment…" : "Checking…"));
+      try {
+        const p = await getJSON(`/api/source-preview?url=${encodeURIComponent(url)}&classify=${deep}`);
+        renderPreview(out, p);
+      } catch (e) {
+        out.replaceChildren(node("div", "empty", "Could not read that feed."));
+      }
+    }
+
+    function renderPreview(out, p) {
+      out.replaceChildren();
+      const box = node("div", "preview-metrics");
+      const stat = (label, value) => {
+        const d = node("div", "stat");
+        d.append(node("span", "label", label + " "));
+        d.append(document.createTextNode(String(value)));
+        box.append(d);
+      };
+      stat("Mode:", p.classified ? "model (accurate)" : "heuristic (quick, conservative)");
+      stat("Acceptance:", `${Math.round(p.acceptance_rate * 100)}% (${p.accepted}/${p.sampled})`);
+      stat("Freshness:", `${p.recent_7d}/${p.sampled} in last 7 days · newest ${(p.newest_published||"unknown").slice(0,10)}`);
+      stat("Calm averages:", `cortisol ${p.avg_cortisol} · ragebait ${p.avg_ragebait} · PR ${p.avg_pr_risk}`);
+      const mix = (m) => Object.entries(m).map(([k, v]) => `${k} ${v}`).join(" · ") || "—";
+      if (p.classified) {
+        stat("Topics:", mix(p.topic_mix));
+        stat("Flavors:", mix(p.flavor_mix));
+      }
+      if (p.examples_accepted.length) {
+        box.append(node("div", "stat label", "Would surface:"));
+        const ul = node("ul");
+        p.examples_accepted.forEach(t => ul.append(node("li", null, t)));
+        box.append(ul);
+      }
+      if (p.examples_rejected.length) {
+        box.append(node("div", "stat label", "Would skip:"));
+        const ul = node("ul");
+        p.examples_rejected.forEach(e => ul.append(node("li", null, e.title)));
+        box.append(ul);
+      }
+      out.append(box);
+    }
+
    function addTerm() {
      const v = el("term-input").value.trim();
      if (v && !prefs.avoid_terms.includes(v)) { prefs.avoid_terms.push(v); savePrefs(); }
@@ -0,0 +1,36 @@
+import goodnews.feeds as feeds
+
+RSS = b"""<?xml version="1.0"?>
+<rss><channel>
+  <item>
+    <title>Volunteers restore creek and rescue stranded wildlife</title>
+    <description>A hopeful recovery effort</description>
+    <link>http://example.com/1</link>
+    <pubDate>Sat, 30 May 2026 10:00:00 GMT</pubDate>
+  </item>
+  <item>
+    <title>Quarterly tax filing deadline reminder</title>
+    <description>routine notice</description>
+    <link>http://example.com/2</link>
+    <pubDate>Sat, 30 May 2026 09:00:00 GMT</pubDate>
+  </item>
+</channel></rss>"""
+
+
+def test_preview_feed_scores_sample_without_network(monkeypatch):
+    monkeypatch.setattr(feeds, "fetch_feed", lambda url, **kw: RSS)
+    p = feeds.preview_feed("http://example.com/feed")
+
+    assert p["sampled"] == 2
+    assert p["classified"] is False
+    assert 0.0 <= p["acceptance_rate"] <= 1.0
+    assert p["accepted"] >= 1  # the restore/rescue story should pass the heuristic
+    assert p["newest_published"] is not None
+    assert isinstance(p["topic_mix"], dict) and p["topic_mix"] == {}  # empty without a model
+    assert all("reason" in ex and "title" in ex for ex in p["examples_rejected"])
+
+
+def test_preview_feed_respects_sample_cap(monkeypatch):
+    monkeypatch.setattr(feeds, "fetch_feed", lambda url, **kw: RSS)
+    p = feeds.preview_feed("http://example.com/feed", sample=1)
+    assert p["sampled"] == 1