Track 3: read-only source preview (vet a feed before adding)

- feeds.preview_feed(): fetch + score a sample WITHOUT persisting; returns freshness, acceptance rate, cortisol/ragebait/PR averages, and example accepted/rejected items. With an LLM client it also returns topic/flavor mix and the model's (accurate) acceptance view. - CLI 'preview-source URL [--sample] [--classify]'. - API 'GET /api/source-preview?url=&sample=&classify=' with an http(s)-only guard (SSRF note left for go-public hardening). - Site 'Suggest a source' panel with Quick check (heuristic, instant) and Deep check (model, accurate), rendered DOM-safely. - Tests: network-free preview_feed tests via monkeypatched fetch (45 total). - README documents the command, endpoint, and updated roadmap. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 19:37:34 +00:00
parent cabe0b6049
commit 95195daff8
6 changed files with 304 additions and 5 deletions
@@ -17,6 +17,7 @@ python3 -m goodnews check-llm --base-url http://127.0.0.1:1234/v1 --model gpt-os
 python3 -m goodnews classify --limit 10 --base-url http://127.0.0.1:1234/v1 --model gpt-oss
 python3 -m goodnews dedup --base-url http://127.0.0.1:1234/v1
 python3 -m goodnews check-feeds
 python3 -m goodnews preview-source https://example.com/feed/ --classify
 python3 -m goodnews build-brief --date 2026-05-27 --replace
 python3 -m goodnews show-brief
 python3 -m goodnews list-recent --limit 10
@@ -99,6 +100,7 @@ Endpoints:
 - `GET /api/feed?topic=&flavor=&limit=&offset=` — ranked, filtered articles
 - `GET /api/brief?date=&limit=` — a daily brief (latest if no date)
 - `GET /api/brief-dates` — available brief dates
 - `GET /api/source-preview?url=&sample=&classify=` — read-only scored sample of a feed (vet before adding)
 - `GET /docs` — interactive OpenAPI documentation
 The ingestion CLI stays pure-stdlib; only the `web` extra pulls in FastAPI/uvicorn,
@@ -184,9 +186,11 @@ and site, scheduled `cycle` via systemd, a pytest suite, and device-local Calm F
 Still ahead:
-1. **Supervised source pipeline** — paste a feed URL, preview a scored sample
+1. **Supervised source pipeline** — read-only preview is done (`preview-source` /
-   (freshness, acceptance rate, topic/flavor mix, cortisol/ragebait/PR averages,
+   `/api/source-preview`: freshness, acceptance rate, topic/flavor mix,
-   example items), then add to quarantine before it can reach the main feed.
+   cortisol/ragebait/PR averages, example items). Still ahead: add a previewed
   source to *quarantine*, and auto-degrade stale/rejecting feeds (advisory, never
   auto-blocking).
 2. **Learned "Less like this" weighting** — replace the interim flavor-pause with
   real preference down-ranking.
 3. **Corpus rebalancing** — add calm/feelgood sources (currently science-heavy).
@@ -14,6 +14,7 @@ so the API and CLI always read the same file.
 from __future__ import annotations
 import os
 import re
 import sqlite3
 from collections import Counter
 from contextlib import contextmanager
@@ -25,9 +26,10 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
-from . import queries
+from . import feeds, queries
 from .db import connect, init_db
 from .filters import filter_articles, prefs_from_json
 from .llm import LocalModelClient
 from .taxonomy import FLAVORS, TOPICS
 ROOT = Path(__file__).resolve().parents[1]
@@ -118,6 +120,28 @@ class BriefResponse(BaseModel):
    items: list[Article]
 # One sampled feed item that the preview would skip, as carried in
 # SourcePreview.examples_rejected.
 class RejectedExample(BaseModel):
    # Title of the skipped item.
    title: str
    # Reason code attached to the item (preview_feed fills it from the
    # scorer's "reason_code").
    reason: str
 # Response body of GET /api/source-preview: the summary dict produced by
 # feeds.preview_feed for one candidate feed. Nothing in it is persisted.
 class SourcePreview(BaseModel):
    # Feed URL that was previewed.
    url: str
    # Number of feed items that were actually scored.
    sampled: int
    # Whether the local model (rather than only the heuristic) was used.
    classified: bool
    # Count of scored items that would be accepted.
    accepted: int
    # accepted / sampled, rounded to two decimals (0.0 when nothing was sampled).
    acceptance_rate: float
    # Mean cortisol / ragebait / PR-risk scores over the sampled items,
    # rounded to one decimal.
    avg_cortisol: float
    avg_ragebait: float
    avg_pr_risk: float
    # ISO-8601 timestamp of the newest sampled item; None when no item had a
    # parseable publication date.
    newest_published: str | None
    # How many sampled items were published within roughly the last week.
    recent_7d: int
    # Item counts keyed by topic / flavor; empty when no model labelled the items.
    topic_mix: dict[str, int]
    flavor_mix: dict[str, int]
    # Up to five titles of accepted items.
    examples_accepted: list[str]
    # Up to five rejected items, each with its reason.
    examples_rejected: list[RejectedExample]
 # --- App --------------------------------------------------------------------
@@ -227,6 +251,24 @@ def create_app() -> FastAPI:
        with get_conn() as conn:
            return queries.available_dates(conn, limit=limit)
    @app.get("/api/source-preview", response_model=SourcePreview)
    def source_preview(
        url: str = Query(..., max_length=2048),
        sample: int = Query(25, ge=1, le=50),
        classify: bool = Query(False, description="Also classify with the local model (accurate but slower)"),
    ) -> SourcePreview:
        # Read-only sample scoring; nothing is persisted. Only http(s) is allowed.
        #
        # Parameters:
        #   url      - feed URL to sample (at most 2048 characters).
        #   sample   - how many feed items to score, 1..50 (default 25).
        #   classify - when true, build a LocalModelClient from the environment
        #              and let feeds.preview_feed score with the model as well.
        # Responses:
        #   200 - SourcePreview built from the preview_feed result.
        #   400 - url does not start with http:// or https://.
        #   502 - preview_feed raised (fetch, parse or scoring failure).
        #
        # NOTE: fetching a user-supplied URL is an SSRF surface — before exposing
        # this publicly, also block private/loopback/link-local address ranges.
        if not re.match(r"^https?://", url, re.IGNORECASE):
            raise HTTPException(400, "url must start with http:// or https://")
        client = LocalModelClient.from_env() if classify else None
        try:
            data = feeds.preview_feed(url, sample=sample, client=client)
        except Exception as exc:
            # Chain the original exception so tracebacks and logs keep the real
            # cause instead of reporting it as an error raised while handling
            # another error.
            raise HTTPException(502, f"could not preview feed: {exc}") from exc
        return SourcePreview(**data)
    # Static site last, mounted at root, so /api/* and /healthz win.
    if STATIC_DIR.is_dir():
        app.mount("/", StaticFiles(directory=str(STATIC_DIR), html=True), name="site")
@@ -9,7 +9,14 @@ from pathlib import Path
 from .briefs import build_daily_brief, show_brief
 from .db import connect, init_db
 from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
-from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source
+from .feeds import (
    fetch_feed,
    parse_feed,
    poll_all_sources,
    poll_due_sources,
    poll_source,
    preview_feed,
 )
 from .llm import LocalModelClient, classify_articles
 from .scoring import score_article
 from .sources import load_sources, upsert_sources
@@ -52,6 +59,13 @@ def main() -> None:
    check_feeds_parser = subparsers.add_parser("check-feeds", help="Fetch and parse each feed, reporting health")
    check_feeds_parser.add_argument("--all", action="store_true", help="Include inactive sources")
    preview_parser = subparsers.add_parser("preview-source", help="Score a sample of a feed without adding it")
    preview_parser.add_argument("url", help="Feed URL to preview")
    preview_parser.add_argument("--sample", type=int, default=25)
    preview_parser.add_argument("--classify", action="store_true", help="Also classify with the local model (slower)")
    preview_parser.add_argument("--base-url", help="OpenAI-compatible base URL (with --classify)")
    preview_parser.add_argument("--model", help="Local model name (with --classify)")
    runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
    runs_parser.add_argument("--limit", type=int, default=20)
@@ -136,6 +150,10 @@ def main() -> None:
        source_report(conn)
    elif args.command == "check-feeds":
        check_feeds(conn, include_inactive=args.all)
    elif args.command == "preview-source":
        client = llm_client_from_args(args) if args.classify else None
        preview = preview_feed(args.url, sample=args.sample, client=client)
        print_preview(preview)
    elif args.command == "list-runs":
        list_runs(conn, limit=args.limit)
    elif args.command == "rescore":
@@ -243,6 +261,25 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
        print(f"  {row['canonical_url']}")
 def print_preview(p: dict) -> None:
    """Write a plain-text report of a feed preview to stdout.

    ``p`` is the summary dict returned by ``preview_feed``. The header,
    counts, freshness and averages are always printed; the topic/flavor line
    pair and the two example sections are printed only when non-empty.
    """
    if p["classified"]:
        mode = "model"
    else:
        mode = "heuristic"

    print(f"Preview of {p['url']}  ({mode})")
    print(f"  sampled={p['sampled']} accepted={p['accepted']} ({p['acceptance_rate']*100:.0f}%)")
    print(f"  freshness: newest={p['newest_published'] or 'unknown'} in_last_7d={p['recent_7d']}")
    print(f"  averages: cortisol={p['avg_cortisol']} ragebait={p['avg_ragebait']} pr_risk={p['avg_pr_risk']}")

    # The flavor line rides along with the topic line: both come from the model.
    if p["topic_mix"]:
        print(f"  topics: {p['topic_mix']}")
        print(f"  flavors: {p['flavor_mix']}")

    accepted_titles = p["examples_accepted"]
    if accepted_titles:
        print("  would accept:")
        for title in accepted_titles:
            print(f"    + {title[:80]}")

    skipped = p["examples_rejected"]
    if skipped:
        print("  would skip:")
        for entry in skipped:
            print(f"    - {entry['title'][:70]} ({entry['reason']})")
 def check_feeds(conn: sqlite3.Connection, include_inactive: bool = False) -> None:
    where = "" if include_inactive else "WHERE active = 1"
    rows = conn.execute(f"SELECT name, feed_url FROM sources {where} ORDER BY name").fetchall()
@@ -5,6 +5,7 @@ import sqlite3
 import urllib.error
 import urllib.request
 import xml.etree.ElementTree as ET
 from collections import Counter
 from dataclasses import dataclass
 from datetime import UTC, datetime
@@ -133,6 +134,106 @@ def poll_source(conn: sqlite3.Connection, source: sqlite3.Row) -> dict:
        }
 def preview_feed(url: str, sample: int = 25, pr_risk_default: int = 3, client=None) -> dict:
    """Fetch and score a sample of a feed WITHOUT persisting anything.

    Read-only: lets a user vet a candidate source before it is ever added. By
    default it uses the fast heuristic; pass an LLM client to also get the
    topic/flavor mix and the model's acceptance view (slower).

    Args:
        url: Feed URL, passed to ``fetch_feed`` and echoed back in the result.
        sample: Maximum number of leading feed items to consider. Negative
            values are treated as 0 rather than slicing from the end of the
            feed.
        pr_risk_default: PR-risk value handed to the heuristic scorer and sent
            to the model as ``source_pr_risk_score``.
        client: Optional model client exposing ``classify(payload)`` and a
            ``model`` attribute. When omitted only the heuristic runs.

    Returns:
        A dict with the keys ``url``, ``sampled``, ``classified``,
        ``accepted``, ``acceptance_rate``, ``avg_cortisol``, ``avg_ragebait``,
        ``avg_pr_risk``, ``newest_published``, ``recent_7d``, ``topic_mix``,
        ``flavor_mix``, ``examples_accepted`` and ``examples_rejected``.
        ``classified`` is True only when the model scored at least one item.

    Raises:
        Whatever ``fetch_feed`` or ``parse_feed`` raise. A model failure on a
        single item is logged and that item keeps its heuristic scores.
    """
    items = parse_feed(fetch_feed(url))
    rows = []
    # max(..., 0): a negative slice bound would silently drop items from the
    # END of the feed instead of yielding an empty sample.
    for item in items[: max(sample, 0)]:
        title = clean_text(item.title, max_len=500)
        if not title:
            continue  # untitled items are not scored and do not count as sampled
        description = clean_text(item.description, max_len=1000)
        s = score_article(title, description, pr_risk_default)
        rows.append(
            {
                "title": title,
                "description": description,
                "url": canonicalize_url(item.url),
                "published_at": item.published_at,
                "accepted": bool(s["accepted"]),
                "cortisol": s["cortisol_score"],
                "ragebait": s["ragebait_score"],
                "pr_risk": s["pr_risk_score"],
                "reason_code": s["reason_code"],
                "topic": None,
                "flavor": None,
            }
        )

    model_scored = 0
    if client and rows:
        import logging

        from .llm import normalize_scores

        log = logging.getLogger(__name__)
        for r in rows:
            try:
                raw = client.classify(
                    {
                        "source_name": "preview",
                        "default_category": None,
                        "source_trust_score": 5,
                        "source_pr_risk_score": pr_risk_default,
                        "published_at": r["published_at"],
                        "title": r["title"],
                        "description": r["description"] or "",
                        "canonical_url": r["url"],
                    }
                )
                ns = normalize_scores(raw, model_name=client.model)
                r.update(
                    accepted=bool(ns["accepted"]),
                    topic=ns["topic"],
                    flavor=ns["flavor"],
                    cortisol=ns["cortisol_score"],
                    ragebait=ns["ragebait_score"],
                    pr_risk=ns["pr_risk_score"],
                )
            except Exception as exc:
                # One bad item shouldn't sink the whole preview: keep its
                # heuristic scores, but leave a trace instead of failing silently.
                log.warning("preview: model classification failed for %r: %s", r["title"], exc)
            else:
                model_scored += 1
    # Only claim a model-based preview when the model really scored something;
    # otherwise every number below is still the heuristic's.
    classified = model_scored > 0

    total = len(rows)
    accepted = sum(1 for r in rows if r["accepted"])

    def _avg(key: str) -> float:
        return round(sum(r[key] for r in rows) / total, 1) if total else 0.0

    # Freshness: newest item and how many landed in the last week.
    now = datetime.now(UTC)
    dates = []
    for r in rows:
        published = r["published_at"]
        if not published:
            continue
        try:
            parsed = datetime.fromisoformat(published)
        except (TypeError, ValueError):
            continue  # unparseable or non-string timestamp: ignore for freshness
        if parsed.tzinfo is None:
            # Naive timestamps cannot be compared with or subtracted from the
            # aware `now` (TypeError); treat them as UTC.
            parsed = parsed.replace(tzinfo=UTC)
        dates.append(parsed)
    newest = max(dates).isoformat() if dates else None
    # Compare exact elapsed time: `.days <= 7` would also admit items that are
    # up to almost eight days old.
    week_seconds = 7 * 24 * 60 * 60
    recent_7d = sum(1 for d in dates if (now - d).total_seconds() <= week_seconds)

    # NOTE(review): "reason" below is always the heuristic's reason_code, even
    # for items whose acceptance was overridden by the model — confirm whether
    # normalize_scores exposes a model-side reason that should be shown instead.
    return {
        "url": url,
        "sampled": total,
        "classified": classified,
        "accepted": accepted,
        "acceptance_rate": round(accepted / total, 2) if total else 0.0,
        "avg_cortisol": _avg("cortisol"),
        "avg_ragebait": _avg("ragebait"),
        "avg_pr_risk": _avg("pr_risk"),
        "newest_published": newest,
        "recent_7d": recent_7d,
        "topic_mix": dict(Counter(r["topic"] for r in rows if r["topic"])),
        "flavor_mix": dict(Counter(r["flavor"] for r in rows if r["flavor"])),
        "examples_accepted": [r["title"] for r in rows if r["accepted"]][:5],
        "examples_rejected": [
            {"title": r["title"], "reason": r["reason_code"]} for r in rows if not r["accepted"]
        ][:5],
    }
 def fetch_feed(url: str, timeout: int = 20) -> bytes:
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
@@ -88,6 +88,19 @@
    .panel .reset { margin-top: 6px; background: none; border: none; color: var(--muted);
      cursor: pointer; font-size: 0.8rem; text-decoration: underline; }
    .calm-note { color: var(--muted); font-size: 0.8rem; margin: -8px 0 14px; }
    .addsource { background: var(--card); border: 1px solid var(--line); border-radius: 12px;
      padding: 16px 18px; margin-bottom: 18px; }
    .addsource .hint { color: var(--muted); font-size: 0.82rem; margin: 0 0 10px; }
    .addsource-row { display: flex; gap: 8px; flex-wrap: wrap; }
    .addsource input { flex: 1 1 240px; border: 1px solid var(--line); border-radius: 8px;
      padding: 7px 11px; font-size: 0.9rem; }
    .addbtn { border: 1px solid var(--accent); background: var(--accent); color: #fff;
      border-radius: 8px; padding: 7px 14px; cursor: pointer; font-size: 0.85rem; }
    .addbtn.ghost { background: none; color: var(--accent); }
    .preview-metrics { margin-top: 14px; }
    .preview-metrics .stat { font-size: 0.9rem; margin: 2px 0; }
    .preview-metrics .label { color: var(--muted); }
    .preview-metrics ul { margin: 4px 0 10px; padding-left: 18px; font-size: 0.86rem; }
    footer { text-align: center; color: var(--muted); font-size: 0.78rem; padding: 20px; }
    footer a { color: var(--accent); }
  </style>
@@ -130,6 +143,17 @@
    <div id="topic-chips" class="chips"></div>
    <div id="flavor-chips" class="chips"></div>
    <div id="feed"></div>
    <div class="section-title">Suggest a source</div>
    <div class="addsource">
      <p class="hint">Paste a feed URL to see how calm it is before anyone adds it. Nothing is saved — this just samples and scores recent items.</p>
      <div class="addsource-row">
        <input id="src-url" type="text" placeholder="https://example.com/feed/" />
        <button id="src-quick" class="addbtn">Quick check</button>
        <button id="src-deep" class="addbtn ghost">Deep check (uses model)</button>
      </div>
      <div id="src-result"></div>
    </div>
  </main>
  <footer>
    goodNews · metadata &amp; links only, no stored articles ·
@@ -354,6 +378,11 @@
      fc.appendChild(chip("all", true, () => setFlavor(null)));
      cats.flavors.forEach(f => fc.appendChild(chip(f.key, false, () => setFlavor(f.key))));
      // source preview controls
      el("src-quick").onclick = () => previewSource(false);
      el("src-deep").onclick = () => previewSource(true);
      el("src-url").addEventListener("keydown", (e) => { if (e.key === "Enter") previewSource(false); });
      // panel controls
      el("calm-toggle").onclick = () => el("panel").classList.toggle("open");
      el("term-add").onclick = addTerm;
@@ -363,6 +392,56 @@
      refreshAll();
    }
    async function previewSource(deep) {
      // Read the pasted feed URL, ask /api/source-preview for a scored sample
      // and show it in #src-result. `deep` selects the model-backed check
      // (classify=true); otherwise the heuristic-only check runs.
      const feedUrl = el("src-url").value.trim();
      const resultBox = el("src-result");
      resultBox.replaceChildren();

      // Same http(s)-only rule the API enforces; fail fast without a request.
      const isHttpUrl = /^https?:\/\//i.test(feedUrl);
      if (!isHttpUrl) {
        resultBox.append(node("div", "empty", "Enter a URL starting with http:// or https://"));
        return;
      }

      let waitingText = "Checking…";
      if (deep) {
        waitingText = "Deep checking with the model — this can take a moment…";
      }
      resultBox.append(node("div", "empty", waitingText));

      try {
        const endpoint = `/api/source-preview?url=${encodeURIComponent(feedUrl)}&classify=${deep}`;
        const preview = await getJSON(endpoint);
        renderPreview(resultBox, preview);
      } catch (err) {
        // Any failure (request or rendering) collapses to one friendly message.
        resultBox.replaceChildren(node("div", "empty", "Could not read that feed."));
      }
    }
    function renderPreview(out, p) {
      // Render a /api/source-preview payload `p` into the container `out`,
      // replacing whatever it showed before. All text goes through node() /
      // createTextNode, never innerHTML (assumes node(tag, className, text)
      // sets text content — confirm against its definition).
      out.replaceChildren();
      const box = node("div", "preview-metrics");

      // One "label value" line.
      const stat = (label, value) => {
        const d = node("div", "stat");
        d.append(node("span", "label", label + " "));
        d.append(document.createTextNode(String(value)));
        box.append(d);
      };
      // A heading followed by a bullet list; skipped entirely when empty.
      const list = (heading, lines) => {
        if (!lines.length) return;
        box.append(node("div", "stat label", heading));
        const ul = node("ul");
        lines.forEach(text => ul.append(node("li", null, text)));
        box.append(ul);
      };

      stat("Mode:", p.classified ? "model (accurate)" : "heuristic (quick, conservative)");
      stat("Acceptance:", `${Math.round(p.acceptance_rate * 100)}% (${p.accepted}/${p.sampled})`);
      stat("Freshness:", `${p.recent_7d}/${p.sampled} in last 7 days · newest ${(p.newest_published||"unknown").slice(0,10)}`);
      stat("Calm averages:", `cortisol ${p.avg_cortisol} · ragebait ${p.avg_ragebait} · PR ${p.avg_pr_risk}`);

      // Topic/flavor counts only exist when the model labelled the items.
      const mix = (m) => Object.entries(m).map(([k, v]) => `${k} ${v}`).join(" · ") || "—";
      if (p.classified) {
        stat("Topics:", mix(p.topic_mix));
        stat("Flavors:", mix(p.flavor_mix));
      }

      list("Would surface:", p.examples_accepted);
      // Show why an item would be skipped, as the CLI preview does; the API
      // already sends the reason with every rejected example.
      list("Would skip:", p.examples_rejected.map(e => (e.reason ? `${e.title} (${e.reason})` : e.title)));

      out.append(box);
    }
    function addTerm() {
      const v = el("term-input").value.trim();
      if (v && !prefs.avoid_terms.includes(v)) { prefs.avoid_terms.push(v); savePrefs(); }
@@ -0,0 +1,36 @@
 import goodnews.feeds as feeds
 RSS = b"""<?xml version="1.0"?>
 <rss><channel>
  <item>
    <title>Volunteers restore creek and rescue stranded wildlife</title>
    <description>A hopeful recovery effort</description>
    <link>http://example.com/1</link>
    <pubDate>Sat, 30 May 2026 10:00:00 GMT</pubDate>
  </item>
  <item>
    <title>Quarterly tax filing deadline reminder</title>
    <description>routine notice</description>
    <link>http://example.com/2</link>
    <pubDate>Sat, 30 May 2026 09:00:00 GMT</pubDate>
  </item>
 </channel></rss>"""
 def test_preview_feed_scores_sample_without_network(monkeypatch):
    """Heuristic-only preview of the canned RSS payload, with the fetch stubbed out."""

    def canned_fetch(url, **kw):
        return RSS

    monkeypatch.setattr(feeds, "fetch_feed", canned_fetch)
    preview = feeds.preview_feed("http://example.com/feed")

    assert preview["sampled"] == 2
    assert preview["classified"] is False
    assert 0.0 <= preview["acceptance_rate"] <= 1.0
    assert preview["accepted"] >= 1  # the restore/rescue story should pass the heuristic
    assert preview["newest_published"] is not None

    # empty without a model
    topic_mix = preview["topic_mix"]
    assert isinstance(topic_mix, dict)
    assert topic_mix == {}

    for example in preview["examples_rejected"]:
        assert "reason" in example and "title" in example
 def test_preview_feed_respects_sample_cap(monkeypatch):
    """With sample=1 only the first of the two canned items is scored."""

    def canned_fetch(url, **kw):
        return RSS

    monkeypatch.setattr(feeds, "fetch_feed", canned_fetch)
    preview = feeds.preview_feed("http://example.com/feed", sample=1)
    assert preview["sampled"] == 1