From 95195daff848216c207f607531538443e08452be Mon Sep 17 00:00:00 2001 From: jay Date: Sat, 30 May 2026 19:37:34 +0000 Subject: [PATCH] Track 3: read-only source preview (vet a feed before adding) - feeds.preview_feed(): fetch + score a sample WITHOUT persisting; returns freshness, acceptance rate, cortisol/ragebait/PR averages, and example accepted/rejected items. With an LLM client it also returns topic/flavor mix and the model's (accurate) acceptance view. - CLI 'preview-source URL [--sample] [--classify]'. - API 'GET /api/source-preview?url=&sample=&classify=' with an http(s)-only guard (SSRF note left for go-public hardening). - Site 'Suggest a source' panel with Quick check (heuristic, instant) and Deep check (model, accurate), rendered DOM-safely. - Tests: network-free preview_feed tests via monkeypatched fetch (45 total). - README documents the command, endpoint, and updated roadmap. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 10 ++-- goodnews/api.py | 44 +++++++++++++++- goodnews/cli.py | 39 +++++++++++++- goodnews/feeds.py | 101 +++++++++++++++++++++++++++++++++++++ goodnews/static/index.html | 79 +++++++++++++++++++++++++++++ tests/test_preview.py | 36 +++++++++++++ 6 files changed, 304 insertions(+), 5 deletions(-) create mode 100644 tests/test_preview.py diff --git a/README.md b/README.md index fac1ef9..76677f5 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ python3 -m goodnews check-llm --base-url http://127.0.0.1:1234/v1 --model gpt-os python3 -m goodnews classify --limit 10 --base-url http://127.0.0.1:1234/v1 --model gpt-oss python3 -m goodnews dedup --base-url http://127.0.0.1:1234/v1 python3 -m goodnews check-feeds +python3 -m goodnews preview-source https://example.com/feed/ --classify python3 -m goodnews build-brief --date 2026-05-27 --replace python3 -m goodnews show-brief python3 -m goodnews list-recent --limit 10 @@ -99,6 +100,7 @@ Endpoints: - `GET /api/feed?topic=&flavor=&limit=&offset=` — ranked, filtered articles 
- `GET /api/brief?date=&limit=` — a daily brief (latest if no date) - `GET /api/brief-dates` — available brief dates +- `GET /api/source-preview?url=&sample=&classify=` — read-only scored sample of a feed (vet before adding) - `GET /docs` — interactive OpenAPI documentation The ingestion CLI stays pure-stdlib; only the `web` extra pulls in FastAPI/uvicorn, @@ -184,9 +186,11 @@ and site, scheduled `cycle` via systemd, a pytest suite, and device-local Calm F Still ahead: -1. **Supervised source pipeline** — paste a feed URL, preview a scored sample - (freshness, acceptance rate, topic/flavor mix, cortisol/ragebait/PR averages, - example items), then add to quarantine before it can reach the main feed. +1. **Supervised source pipeline** — read-only preview is done (`preview-source` / + `/api/source-preview`: freshness, acceptance rate, topic/flavor mix, + cortisol/ragebait/PR averages, example items). Still ahead: add a previewed + source to *quarantine*, and auto-degrade stale/rejecting feeds (advisory, never + auto-blocking). 2. **Learned "Less like this" weighting** — replace the interim flavor-pause with real preference down-ranking. 3. **Corpus rebalancing** — add calm/feelgood sources (currently science-heavy). diff --git a/goodnews/api.py b/goodnews/api.py index 3e0d709..14b2a6b 100644 --- a/goodnews/api.py +++ b/goodnews/api.py @@ -14,6 +14,7 @@ so the API and CLI always read the same file. from __future__ import annotations import os +import re import sqlite3 from collections import Counter from contextlib import contextmanager @@ -25,9 +26,10 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles from pydantic import BaseModel -from . import queries +from . 
import feeds, queries from .db import connect, init_db from .filters import filter_articles, prefs_from_json +from .llm import LocalModelClient from .taxonomy import FLAVORS, TOPICS ROOT = Path(__file__).resolve().parents[1] @@ -118,6 +120,28 @@ class BriefResponse(BaseModel): items: list[Article] +class RejectedExample(BaseModel): + title: str + reason: str + + +class SourcePreview(BaseModel): + url: str + sampled: int + classified: bool + accepted: int + acceptance_rate: float + avg_cortisol: float + avg_ragebait: float + avg_pr_risk: float + newest_published: str | None + recent_7d: int + topic_mix: dict[str, int] + flavor_mix: dict[str, int] + examples_accepted: list[str] + examples_rejected: list[RejectedExample] + + # --- App -------------------------------------------------------------------- @@ -227,6 +251,24 @@ def create_app() -> FastAPI: with get_conn() as conn: return queries.available_dates(conn, limit=limit) + @app.get("/api/source-preview", response_model=SourcePreview) + def source_preview( + url: str = Query(..., max_length=2048), + sample: int = Query(25, ge=1, le=50), + classify: bool = Query(False, description="Also classify with the local model (accurate but slower)"), + ) -> SourcePreview: + # Read-only sample scoring; nothing is persisted. Only http(s) is allowed. + # NOTE: fetching a user-supplied URL is an SSRF surface — before exposing + # this publicly, also block private/loopback/link-local address ranges. + if not re.match(r"^https?://", url, re.IGNORECASE): + raise HTTPException(400, "url must start with http:// or https://") + client = LocalModelClient.from_env() if classify else None + try: + data = feeds.preview_feed(url, sample=sample, client=client) + except Exception as exc: + raise HTTPException(502, f"could not preview feed: {exc}") + return SourcePreview(**data) + # Static site last, mounted at root, so /api/* and /healthz win. 
if STATIC_DIR.is_dir(): app.mount("/", StaticFiles(directory=str(STATIC_DIR), html=True), name="site") diff --git a/goodnews/cli.py b/goodnews/cli.py index 26edf52..9b05f96 100644 --- a/goodnews/cli.py +++ b/goodnews/cli.py @@ -9,7 +9,14 @@ from pathlib import Path from .briefs import build_daily_brief, show_brief from .db import connect, init_db from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup -from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source +from .feeds import ( + fetch_feed, + parse_feed, + poll_all_sources, + poll_due_sources, + poll_source, + preview_feed, +) from .llm import LocalModelClient, classify_articles from .scoring import score_article from .sources import load_sources, upsert_sources @@ -52,6 +59,13 @@ def main() -> None: check_feeds_parser = subparsers.add_parser("check-feeds", help="Fetch and parse each feed, reporting health") check_feeds_parser.add_argument("--all", action="store_true", help="Include inactive sources") + preview_parser = subparsers.add_parser("preview-source", help="Score a sample of a feed without adding it") + preview_parser.add_argument("url", help="Feed URL to preview") + preview_parser.add_argument("--sample", type=int, default=25) + preview_parser.add_argument("--classify", action="store_true", help="Also classify with the local model (slower)") + preview_parser.add_argument("--base-url", help="OpenAI-compatible base URL (with --classify)") + preview_parser.add_argument("--model", help="Local model name (with --classify)") + runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs") runs_parser.add_argument("--limit", type=int, default=20) @@ -136,6 +150,10 @@ def main() -> None: source_report(conn) elif args.command == "check-feeds": check_feeds(conn, include_inactive=args.all) + elif args.command == "preview-source": + client = llm_client_from_args(args) if args.classify else None + preview = preview_feed(args.url, 
sample=args.sample, client=client) + print_preview(preview) elif args.command == "list-runs": list_runs(conn, limit=args.limit) elif args.command == "rescore": @@ -243,6 +261,25 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No print(f" {row['canonical_url']}") +def print_preview(p: dict) -> None: + mode = "model" if p["classified"] else "heuristic" + print(f"Preview of {p['url']} ({mode})") + print(f" sampled={p['sampled']} accepted={p['accepted']} ({p['acceptance_rate']*100:.0f}%)") + print(f" freshness: newest={p['newest_published'] or 'unknown'} in_last_7d={p['recent_7d']}") + print(f" averages: cortisol={p['avg_cortisol']} ragebait={p['avg_ragebait']} pr_risk={p['avg_pr_risk']}") + if p["topic_mix"]: + print(f" topics: {p['topic_mix']}") + print(f" flavors: {p['flavor_mix']}") + if p["examples_accepted"]: + print(" would accept:") + for t in p["examples_accepted"]: + print(f" + {t[:80]}") + if p["examples_rejected"]: + print(" would skip:") + for ex in p["examples_rejected"]: + print(f" - {ex['title'][:70]} ({ex['reason']})") + + def check_feeds(conn: sqlite3.Connection, include_inactive: bool = False) -> None: where = "" if include_inactive else "WHERE active = 1" rows = conn.execute(f"SELECT name, feed_url FROM sources {where} ORDER BY name").fetchall() diff --git a/goodnews/feeds.py b/goodnews/feeds.py index 0a58e1b..423efb1 100644 --- a/goodnews/feeds.py +++ b/goodnews/feeds.py @@ -5,6 +5,7 @@ import sqlite3 import urllib.error import urllib.request import xml.etree.ElementTree as ET +from collections import Counter from dataclasses import dataclass from datetime import UTC, datetime @@ -133,6 +134,106 @@ def poll_source(conn: sqlite3.Connection, source: sqlite3.Row) -> dict: } +def preview_feed(url: str, sample: int = 25, pr_risk_default: int = 3, client=None) -> dict: + """Fetch and score a sample of a feed WITHOUT persisting anything. + + Read-only: lets a user vet a candidate source before it is ever added. 
By + default it uses the fast heuristic; pass an LLM client to also get the + topic/flavor mix and the model's acceptance view (slower). + """ + items = parse_feed(fetch_feed(url)) + rows = [] + for item in items[:sample]: + title = clean_text(item.title, max_len=500) + if not title: + continue + description = clean_text(item.description, max_len=1000) + s = score_article(title, description, pr_risk_default) + rows.append( + { + "title": title, + "description": description, + "url": canonicalize_url(item.url), + "published_at": item.published_at, + "accepted": bool(s["accepted"]), + "cortisol": s["cortisol_score"], + "ragebait": s["ragebait_score"], + "pr_risk": s["pr_risk_score"], + "reason_code": s["reason_code"], + "topic": None, + "flavor": None, + } + ) + + classified = False + if client and rows: + from .llm import normalize_scores + + classified = True + for r in rows: + try: + raw = client.classify( + { + "source_name": "preview", + "default_category": None, + "source_trust_score": 5, + "source_pr_risk_score": pr_risk_default, + "published_at": r["published_at"], + "title": r["title"], + "description": r["description"] or "", + "canonical_url": r["url"], + } + ) + ns = normalize_scores(raw, model_name=client.model) + r.update( + accepted=bool(ns["accepted"]), + topic=ns["topic"], + flavor=ns["flavor"], + cortisol=ns["cortisol_score"], + ragebait=ns["ragebait_score"], + pr_risk=ns["pr_risk_score"], + ) + except Exception: + pass # one bad item shouldn't sink the whole preview + + total = len(rows) + accepted = sum(1 for r in rows if r["accepted"]) + + def _avg(key: str) -> float: + return round(sum(r[key] for r in rows) / total, 1) if total else 0.0 + + # Freshness: newest item and how many landed in the last week. 
+ now = datetime.now(UTC) + dates = [] + for r in rows: + if r["published_at"]: + try: + dates.append(datetime.fromisoformat(r["published_at"])) + except ValueError: + pass + newest = max(dates).isoformat() if dates else None + recent_7d = sum(1 for d in dates if (now - d).days <= 7) + + return { + "url": url, + "sampled": total, + "classified": classified, + "accepted": accepted, + "acceptance_rate": round(accepted / total, 2) if total else 0.0, + "avg_cortisol": _avg("cortisol"), + "avg_ragebait": _avg("ragebait"), + "avg_pr_risk": _avg("pr_risk"), + "newest_published": newest, + "recent_7d": recent_7d, + "topic_mix": dict(Counter(r["topic"] for r in rows if r["topic"])), + "flavor_mix": dict(Counter(r["flavor"] for r in rows if r["flavor"])), + "examples_accepted": [r["title"] for r in rows if r["accepted"]][:5], + "examples_rejected": [ + {"title": r["title"], "reason": r["reason_code"]} for r in rows if not r["accepted"] + ][:5], + } + + def fetch_feed(url: str, timeout: int = 20) -> bytes: request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) try: diff --git a/goodnews/static/index.html b/goodnews/static/index.html index e3f0a08..40280b1 100644 --- a/goodnews/static/index.html +++ b/goodnews/static/index.html @@ -88,6 +88,19 @@ .panel .reset { margin-top: 6px; background: none; border: none; color: var(--muted); cursor: pointer; font-size: 0.8rem; text-decoration: underline; } .calm-note { color: var(--muted); font-size: 0.8rem; margin: -8px 0 14px; } + .addsource { background: var(--card); border: 1px solid var(--line); border-radius: 12px; + padding: 16px 18px; margin-bottom: 18px; } + .addsource .hint { color: var(--muted); font-size: 0.82rem; margin: 0 0 10px; } + .addsource-row { display: flex; gap: 8px; flex-wrap: wrap; } + .addsource input { flex: 1 1 240px; border: 1px solid var(--line); border-radius: 8px; + padding: 7px 11px; font-size: 0.9rem; } + .addbtn { border: 1px solid var(--accent); background: var(--accent); color: #fff; 
+ border-radius: 8px; padding: 7px 14px; cursor: pointer; font-size: 0.85rem; } + .addbtn.ghost { background: none; color: var(--accent); } + .preview-metrics { margin-top: 14px; } + .preview-metrics .stat { font-size: 0.9rem; margin: 2px 0; } + .preview-metrics .label { color: var(--muted); } + .preview-metrics ul { margin: 4px 0 10px; padding-left: 18px; font-size: 0.86rem; } footer { text-align: center; color: var(--muted); font-size: 0.78rem; padding: 20px; } footer a { color: var(--accent); } @@ -130,6 +143,17 @@
+ +
Suggest a source
+
+

Paste a feed URL to see how calm it is before anyone adds it. Nothing is saved — this just samples and scores recent items.

+
+ + + +
+
+