Sources: LLM deep-preview, source search, duplicate-add guard

Three admin Sources upgrades:
- Deep preview: a per-candidate "🔬 Deep preview" button runs the REAL
  classifier on an 8-item sample (the same model that judges live articles),
  versus the fast keyword heuristic the add/Re-preview path uses. Preview now
  carries `classified`, surfaced as a "model-checked" vs "quick estimate"
  badge — so the acceptance % is no longer ambiguously heuristic. The DB
  connection is released during the ~30-60s model pass; postJSON has no
  client timeout.
- Search: free-text box over the sources table (name / category / feed URL /
  homepage), folded into the existing status filter, with a live match count
  and empty state. Makes "is this already added?" a glance.
- Duplicate-add guard: sources.find_existing_feed() + feed_key() normalize
  scheme/www/trailing-slash/case, so re-adding a feed that's already a live
  source or a queued candidate is refused with a 409 naming where it lives
  (the DB already enforced exact-URL uniqueness; this catches the near-miss
  variants and the overwrite-on-promote footgun).

Tests: test_candidate_deep_preview_and_dedup (deep flag wires the model +
uses the small sample; exact/www/slash/case variants all 409). 224 pytest +
11 vitest green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-11 21:19:15 -04:00
parent ba1a29d12a
commit e1ac19351e
4 changed files with 126 additions and 11 deletions
+25 -5
View File
@@ -1143,10 +1143,20 @@ def create_app() -> FastAPI:
rows = sources.list_candidates(conn)
return [_candidate_dict(r) for r in rows]
def _preview_or_502(url: str) -> dict:
# SSRF-safe fetch (admin-pasted URL is untrusted); heuristic-only (fast).
def _preview_or_502(url: str, deep: bool = False) -> dict:
# SSRF-safe fetch (admin-pasted URL is untrusted). Default is the fast
# heuristic; deep=True also runs the real LLM classifier on a small sample
# (slower, ~5-7s/item — the true acceptance view, not an estimate).
client = None
if deep:
try:
client = LocalModelClient.from_env()
except Exception: # noqa: BLE001 — fall back to heuristic if the model is down
client = None
try:
return feeds.preview_feed(url, sample=20, fetcher=feeds.safe_fetch_feed)
return feeds.preview_feed(
url, sample=(8 if deep else 20), fetcher=feeds.safe_fetch_feed, client=client
)
except Exception as exc: # noqa: BLE001 — surface a readable reason
raise HTTPException(status_code=502, detail=f"Couldn't preview that feed: {exc}")
@@ -1157,6 +1167,16 @@ def create_app() -> FastAPI:
raise HTTPException(status_code=422, detail="feed_url is required")
with get_conn() as conn: # gate BEFORE the outbound fetch
_require_admin(conn, request)
# Don't re-add a feed that's already live or already queued (catches
# http/https · www · trailing-slash variants, not just exact dups).
existing = sources.find_existing_feed(conn, url)
if existing:
where = "already a source" if existing["kind"] == "source" else "already in the candidate queue"
raise HTTPException(
status_code=409,
detail=f"{existing['name']}” is {where} ({existing['status']}). "
"Find it below — Re-preview it there if you want a fresh read.",
)
preview = _preview_or_502(url) # no DB connection held during network I/O
with get_conn() as conn:
_require_admin(conn, request)
@@ -1164,14 +1184,14 @@ def create_app() -> FastAPI:
return _candidate_dict(row)
@app.post("/api/admin/candidates/{cid}/preview")
def admin_candidate_repreview(cid: int, request: Request) -> dict:
def admin_candidate_repreview(cid: int, request: Request, deep: bool = False) -> dict:
with get_conn() as conn:
_require_admin(conn, request)
cand = conn.execute("SELECT feed_url FROM source_candidates WHERE id = ?", (cid,)).fetchone()
if not cand:
raise HTTPException(status_code=404, detail="candidate not found")
url = cand["feed_url"]
preview = _preview_or_502(url)
preview = _preview_or_502(url, deep=deep) # conn released during the (slow) model pass
with get_conn() as conn:
_require_admin(conn, request)
row = sources.save_candidate(conn, url, preview=preview)
+31
View File
@@ -67,6 +67,37 @@ def upsert_sources(conn: sqlite3.Connection, source_defs: list[dict]) -> int:
return count
# --- Duplicate detection (catch the same feed added twice) --------------------
def feed_key(url: str) -> str:
    """Return a loose, compare-only key for a feed URL.

    The key drops the scheme, a leading ``www.`` on the host and trailing
    slashes on the path, and is fully lower-cased; a non-empty query string
    is kept (prefixed with ``?``). It exists purely so that two spellings of
    the same feed compare equal — it is never a replacement for the URL the
    caller stores.

    Falsy input (``None``, ``""``) yields ``""``. If the URL cannot be split,
    the stripped, lower-cased input is returned unchanged.
    """
    cleaned = (url or "").strip().lower()
    try:
        parts = urlsplit(cleaned)
        key = parts.netloc.removeprefix("www.") + parts.path.rstrip("/")
        if parts.query:
            key += "?" + parts.query
        return key
    except Exception:  # noqa: BLE001 — never let a weird URL break add
        return cleaned
def find_existing_feed(conn: sqlite3.Connection, url: str) -> dict | None:
    """Look up a feed that is already known under a loosely-equal URL.

    The ``sources`` table is checked first, then ``source_candidates``
    (skipping candidates whose status is 'rejected' or 'promoted'). Rows are
    compared via ``feed_key``, so the match is on the normalized key rather
    than the exact stored URL.

    Returns a dict with ``kind`` ("source" or "candidate"), ``id``, ``name``
    and ``status`` for the first match, or ``None`` when nothing matches. A
    candidate without a name is reported under its feed URL instead.
    """
    wanted = feed_key(url)

    source_rows = conn.execute("SELECT id, name, feed_url, status FROM sources")
    hit = next((row for row in source_rows if feed_key(row["feed_url"]) == wanted), None)
    if hit is not None:
        return {"kind": "source", "id": hit["id"], "name": hit["name"], "status": hit["status"]}

    # Only reached when no live source matched: fall through to the queue.
    candidate_rows = conn.execute(
        "SELECT id, name, feed_url, status FROM source_candidates WHERE status NOT IN ('rejected','promoted')"
    )
    hit = next((row for row in candidate_rows if feed_key(row["feed_url"]) == wanted), None)
    if hit is not None:
        label = hit["name"] or hit["feed_url"]
        return {"kind": "candidate", "id": hit["id"], "name": label, "status": hit["status"]}

    return None
# --- Supervised source candidates (staging before the real sources table) ----