Sources: LLM deep-preview, source search, duplicate-add guard

Three admin Sources upgrades:
- Deep preview: a per-candidate "🔬 Deep preview" button runs the REAL
  classifier on an 8-item sample (the same model that judges live articles),
  versus the fast keyword heuristic the add/Re-preview path uses. Preview now
  carries `classified`, surfaced as a "model-checked" vs "quick estimate"
  badge — so the acceptance % is no longer ambiguously heuristic. conn is
  released during the ~30-60s model pass; postJSON has no client timeout.
- Search: free-text box over the sources table (name / category / feed URL /
  homepage), folded into the existing status filter, with a live match count
  and empty state. Makes "is this already added?" a glance.
- Duplicate-add guard: sources.find_existing_feed() + feed_key() normalize
  scheme/www/trailing-slash/case, so re-adding a feed that's already a live
  source or a queued candidate is refused with a 409 naming where it lives
  (the DB already enforced exact-URL uniqueness; this also catches the near-miss
  variants and the overwrite-on-promote footgun).

Tests: test_candidate_deep_preview_and_dedup (deep flag wires the model +
uses the small sample; exact/www/slash/case variants all 409). 224 pytest +
11 vitest green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-11 21:19:15 -04:00
parent ba1a29d12a
commit e1ac19351e
4 changed files with 126 additions and 11 deletions
+45 -6
View File
@@ -192,15 +192,21 @@
let paused = $derived(sources.filter((s) => sstatus(s) === 'paused').length);
let retired = $derived(sources.filter((s) => sstatus(s) === 'retired').length);
// Sources table filter.
// Sources table filter + free-text search (name / category / feed URL).
let srcFilter = $state('all');
let srcSearch = $state('');
let shownSources = $derived(
srcFilter === 'healthy' ? sources.filter((s) => sstatus(s) === 'active' && !s.failures && !s.review_flag)
(srcFilter === 'healthy' ? sources.filter((s) => sstatus(s) === 'active' && !s.failures && !s.review_flag)
: srcFilter === 'resting' ? sources.filter((s) => sstatus(s) === 'active' && s.failures > 0)
: srcFilter === 'flagged' ? sources.filter((s) => s.review_flag)
: srcFilter === 'paused' ? sources.filter((s) => sstatus(s) === 'paused')
: srcFilter === 'retired' ? sources.filter((s) => sstatus(s) === 'retired')
: sources
).filter((s) => {
const q = srcSearch.trim().toLowerCase();
if (!q) return true;
return `${s.name} ${s.category || ''} ${s.feed_url || ''} ${s.homepage_url || ''}`.toLowerCase().includes(q);
})
);
// Lifecycle (status: active/paused/retired) keeps `active` mirrored server-side;
@@ -276,6 +282,13 @@
try { Object.assign(c, await postJSON(`/api/admin/candidates/${c.id}/preview`)); }
catch (e) { c._err = e?.message || 'Re-preview failed.'; }
}
// Deep preview: score a small sample with the REAL classifier (~30-60s) so the
// candidate carries the model's true acceptance view rather than the fast
// heuristic estimate. `_deep` is true only while the request is in flight;
// `_err` holds the failure message, if any.
async function deepPreview(c) {
  c._err = '';
  c._deep = true;
  try {
    const refreshed = await postJSON(`/api/admin/candidates/${c.id}/preview?deep=true`);
    Object.assign(c, refreshed);
  } catch (e) {
    c._err = e?.message || 'Deep preview failed.';
  } finally {
    // Runs after the merge, so a stray `_deep` in the response cannot stick.
    c._deep = false;
  }
}
async function promoteCandidate(c) {
c._err = '';
try {
@@ -560,6 +573,7 @@
{#if c.preview}
<div class="cprev">
{c.preview.accepted ?? 0}/{c.preview.sampled ?? 0} sampled would pass{#if c.preview.acceptance_rate != null} · {Math.round(c.preview.acceptance_rate * 100)}% accept{/if}{#if c.preview.recent_7d != null} · {c.preview.recent_7d} in last 7d{/if}
{#if c.preview.classified}<span class="vbadge model" title="Scored by the real classifier — the true acceptance view">model-checked</span>{:else}<span class="vbadge fast" title="Fast keyword heuristic — an estimate. Run Deep preview for the model's real verdict.">quick estimate</span>{/if}
{#if c.preview.examples_accepted?.length}<div class="cex">e.g. {c.preview.examples_accepted.slice(0, 3).join(' · ')}</div>{/if}
</div>
{/if}
@@ -569,6 +583,7 @@
<label class="cchk"><input type="checkbox" bind:checked={c._activate} /> Activate immediately</label>
<button class="csend" onclick={() => promoteCandidate(c)}>Promote{c._activate ? '' : ' as paused'}</button>
<button class="act" onclick={() => repreviewCandidate(c)}>Re-preview</button>
<button class="act" title="Run the real model on a sample (~30-60s)" onclick={() => deepPreview(c)} disabled={c._deep}>{c._deep ? 'Deep-checking…' : '🔬 Deep preview'}</button>
<button class="act del" onclick={() => rejectCandidate(c)}>Reject</button>
</div>
</li>
@@ -579,10 +594,16 @@
<h2>Sources <a class="exportlink" href="/api/admin/export/sources.csv" download>export CSV ↓</a></h2>
<p class="sub2">{healthy} healthy · {resting} resting · {flagged} flagged · {paused} paused · {retired} retired · {sources.length} total</p>
<div class="filterchips">
{#each [['all', 'All'], ['healthy', 'Healthy'], ['resting', 'Resting'], ['flagged', 'Flagged'], ['paused', 'Paused'], ['retired', 'Retired']] as [key, label] (key)}
<button class="chip" class:on={srcFilter === key} onclick={() => (srcFilter = key)}>{label}</button>
{/each}
<div class="srctools">
<div class="filterchips">
{#each [['all', 'All'], ['healthy', 'Healthy'], ['resting', 'Resting'], ['flagged', 'Flagged'], ['paused', 'Paused'], ['retired', 'Retired']] as [key, label] (key)}
<button class="chip" class:on={srcFilter === key} onclick={() => (srcFilter = key)}>{label}</button>
{/each}
</div>
<div class="srcsearch">
<input type="search" placeholder="Search name, category, or URL…" bind:value={srcSearch} autocapitalize="off" autocomplete="off" spellcheck="false" />
{#if srcSearch.trim()}<span class="srccount">{shownSources.length} match{shownSources.length === 1 ? '' : 'es'}</span>{/if}
</div>
</div>
<div class="tablewrap">
<table class="srctable">
@@ -649,6 +670,8 @@
</td>
</tr>
{/if}
{:else}
<!-- Empty state for the sources table: echo the search term (in matched curly quotes) when a search is active. -->
<tr><td colspan="10" class="srcempty">{srcSearch.trim() ? `No sources match “${srcSearch.trim()}”.` : 'No sources in this view.'}</td></tr>
{/each}
</tbody>
</table>
@@ -1056,6 +1079,17 @@
}
.filterchips .chip:hover { border-color: var(--accent); }
.filterchips .chip.on { background: var(--accent); border-color: var(--accent); color: #fff; }
/* Sources filter row + search */
.srctools { display: flex; align-items: center; justify-content: space-between; gap: 12px; flex-wrap: wrap; margin: 0 0 14px; }
.srctools .filterchips { margin: 0; }
.srcsearch { display: inline-flex; align-items: center; gap: 8px; }
.srcsearch input {
font: inherit; font-size: 0.84rem; padding: 6px 12px; border: 1px solid var(--line);
border-radius: 999px; background: var(--surface); color: var(--ink); width: 230px;
}
.srcsearch input:focus { outline: none; border-color: var(--accent); }
.srccount { font-size: 0.78rem; color: var(--muted); white-space: nowrap; }
.srcempty { text-align: center; color: var(--muted); font-style: italic; padding: 22px 10px; }
/* Add a source + candidate queue */
.addsrc { background: var(--surface); border: 1px solid var(--line); border-radius: 14px; padding: 14px 16px; margin-bottom: 6px; }
@@ -1070,6 +1104,11 @@
.curl { font-size: 0.76rem; color: var(--muted); word-break: break-all; margin-top: 2px; }
.cprev { font-size: 0.84rem; color: var(--ink); margin-top: 7px; }
.cprev .cex { color: var(--muted); font-size: 0.8rem; margin-top: 2px; font-style: italic; }
/* Heuristic-vs-model preview badge */
.vbadge { display: inline-block; margin-left: 8px; padding: 1px 8px; border-radius: 999px;
font-size: 0.68rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.04em; cursor: help; }
.vbadge.model { background: #e3efe4; color: #3f7048; }
.vbadge.fast { background: var(--line); color: var(--muted); }
.cactions { display: flex; gap: 9px; align-items: center; flex-wrap: wrap; margin-top: 10px; }
.cactions .ccat { font: inherit; font-size: 0.8rem; padding: 5px 9px; border: 1px solid var(--line); border-radius: 8px; background: var(--bg); color: var(--ink); width: 150px; }
.cactions .cchk { font-size: 0.8rem; color: var(--muted); display: inline-flex; align-items: center; gap: 5px; }
+25 -5
View File
@@ -1143,10 +1143,20 @@ def create_app() -> FastAPI:
rows = sources.list_candidates(conn)
return [_candidate_dict(r) for r in rows]
def _preview_or_502(url: str) -> dict:
# SSRF-safe fetch (admin-pasted URL is untrusted); heuristic-only (fast).
def _preview_or_502(url: str, deep: bool = False) -> dict:
# SSRF-safe fetch (admin-pasted URL is untrusted). Default is the fast
# heuristic; deep=True also runs the real LLM classifier on a small sample
# (slower, ~5-7s/item — the true acceptance view, not an estimate).
client = None
if deep:
try:
client = LocalModelClient.from_env()
except Exception: # noqa: BLE001 — fall back to heuristic if the model is down
client = None
try:
return feeds.preview_feed(url, sample=20, fetcher=feeds.safe_fetch_feed)
return feeds.preview_feed(
url, sample=(8 if deep else 20), fetcher=feeds.safe_fetch_feed, client=client
)
except Exception as exc: # noqa: BLE001 — surface a readable reason
raise HTTPException(status_code=502, detail=f"Couldn't preview that feed: {exc}")
@@ -1157,6 +1167,16 @@ def create_app() -> FastAPI:
raise HTTPException(status_code=422, detail="feed_url is required")
with get_conn() as conn: # gate BEFORE the outbound fetch
_require_admin(conn, request)
# Don't re-add a feed that's already live or already queued (catches
# http/https · www · trailing-slash variants, not just exact dups).
existing = sources.find_existing_feed(conn, url)
if existing:
    where = "already a source" if existing["kind"] == "source" else "already in the candidate queue"
    # 409 Conflict: name the existing entry (in matched curly quotes) and say
    # which list it lives in so the admin can find it.
    raise HTTPException(
        status_code=409,
        detail=f"“{existing['name']}” is {where} ({existing['status']}). "
        "Find it below — Re-preview it there if you want a fresh read.",
    )
preview = _preview_or_502(url) # no DB connection held during network I/O
with get_conn() as conn:
_require_admin(conn, request)
@@ -1164,14 +1184,14 @@ def create_app() -> FastAPI:
return _candidate_dict(row)
@app.post("/api/admin/candidates/{cid}/preview")
def admin_candidate_repreview(cid: int, request: Request) -> dict:
def admin_candidate_repreview(cid: int, request: Request, deep: bool = False) -> dict:
with get_conn() as conn:
_require_admin(conn, request)
cand = conn.execute("SELECT feed_url FROM source_candidates WHERE id = ?", (cid,)).fetchone()
if not cand:
raise HTTPException(status_code=404, detail="candidate not found")
url = cand["feed_url"]
preview = _preview_or_502(url)
preview = _preview_or_502(url, deep=deep) # conn released during the (slow) model pass
with get_conn() as conn:
_require_admin(conn, request)
row = sources.save_candidate(conn, url, preview=preview)
+31
View File
@@ -67,6 +67,37 @@ def upsert_sources(conn: sqlite3.Connection, source_defs: list[dict]) -> int:
return count
# --- Duplicate detection (catch the same feed added twice) --------------------
def feed_key(url: str) -> str:
    """Build a loose comparison key for spotting the same feed added twice.

    Trivial differences are folded away: the scheme, a leading ``www.``, the
    scheme's default port (``:80`` for http, ``:443`` for https), trailing
    slashes on the path, and letter case. The fragment is ignored; the query
    string is kept. Compare-only — the feed_url is always STORED exactly as
    entered; this just powers dup warnings.

    Args:
        url: Feed URL as entered; ``None`` or empty yields an empty key.

    Returns:
        ``host + path`` (plus ``?query`` when present), lower-cased. If the URL
        cannot be parsed, the stripped, lower-cased input is returned instead.
    """
    try:
        p = urlsplit((url or "").strip().lower())
        host = p.netloc.removeprefix("www.")
        # The scheme is dropped from the key, so its default port must go too;
        # otherwise https://host:443/feed would not match https://host/feed.
        default_port = {"http": ":80", "https": ":443"}.get(p.scheme)
        if default_port and host.endswith(default_port):
            host = host[: -len(default_port)]
        path = p.path.rstrip("/")
        return host + path + (("?" + p.query) if p.query else "")
    except Exception:  # noqa: BLE001 — never let a weird URL break add
        return (url or "").strip().lower()
def find_existing_feed(conn: sqlite3.Connection, url: str) -> dict | None:
    """Look up an existing entry for this feed, comparing by loose ``feed_key``.

    Rows in ``sources`` are checked first, then rows in ``source_candidates``
    whose status is neither 'rejected' nor 'promoted'. Because the comparison
    uses ``feed_key``, http/https, www and trailing-slash variants all match.

    Returns:
        ``{"kind", "id", "name", "status"}`` for the first match, where
        ``kind`` is "source" or "candidate" (a candidate without a name falls
        back to its feed_url), or ``None`` when nothing matches.
    """
    wanted = feed_key(url)

    def first_match(rows):
        # First row whose stored feed_url collapses to the same loose key.
        return next((row for row in rows if feed_key(row["feed_url"]) == wanted), None)

    live = first_match(conn.execute("SELECT id, name, feed_url, status FROM sources"))
    if live is not None:
        return {"kind": "source", "id": live["id"], "name": live["name"], "status": live["status"]}

    queued = first_match(
        conn.execute(
            "SELECT id, name, feed_url, status FROM source_candidates WHERE status NOT IN ('rejected','promoted')"
        )
    )
    if queued is not None:
        return {
            "kind": "candidate",
            "id": queued["id"],
            "name": queued["name"] or queued["feed_url"],
            "status": queued["status"],
        }
    return None
# --- Supervised source candidates (staging before the real sources table) ----
+25
View File
@@ -141,6 +141,31 @@ def test_candidate_suggest_promote_paused(tmp_path, monkeypatch):
assert any(s["name"] == "Good Feed" for s in tc.get("/api/admin/stats").json()["sources"])
def test_candidate_deep_preview_and_dedup(tmp_path, monkeypatch):
    """Deep preview wires in the model client on the 8-item sample; duplicate adds get a 409."""
    app, api = _make(tmp_path, monkeypatch, admin_email="boss@x.com")

    def fake_preview(url, **kwargs):
        # Report back the sample size used and whether an LLM client was passed in.
        return {
            "url": url,
            "sampled": kwargs.get("sample"),
            "accepted": 4,
            "classified": kwargs.get("client") is not None,
        }

    monkeypatch.setattr(api.feeds, "preview_feed", fake_preview)

    # Deep preview builds a model client; stub it so the real LAN model is never touched.
    stub_client_cls = type("C", (), {"from_env": staticmethod(lambda: object())})
    monkeypatch.setattr(api, "LocalModelClient", stub_client_cls)

    tc = _signin(app, api, "boss@x.com")
    add_url = "/api/admin/candidates"

    # A plain add goes through the fast heuristic path (no client).
    cand = tc.post(add_url, json={"feed_url": "https://news.test/feed"}).json()
    assert cand["preview"]["classified"] is False

    # Deep preview runs the classifier on the smaller sample.
    deep = tc.post(f"/api/admin/candidates/{cand['id']}/preview?deep=true").json()
    assert deep["preview"]["classified"] is True
    assert deep["preview"]["sampled"] == 8

    # Dedup while queued: the exact URL and a scheme/www/trailing-slash variant are refused.
    for variant in ("https://news.test/feed", "http://www.news.test/feed/"):
        assert tc.post(add_url, json={"feed_url": variant}).status_code == 409

    # Once promoted to a live source, a case variant is still refused.
    tc.post(f"/api/admin/candidates/{cand['id']}/promote", json={})
    assert tc.post(add_url, json={"feed_url": "https://NEWS.test/feed"}).status_code == 409
def test_candidate_reject_and_gating(tmp_path, monkeypatch):
app, api = _make(tmp_path, monkeypatch, admin_email="boss@x.com")
monkeypatch.setattr(api.feeds, "preview_feed", lambda url, **k: {"url": url, "sampled": 1, "accepted": 0})