Sources: LLM deep-preview, source search, duplicate-add guard
Three admin Sources upgrades:
- Deep preview: a per-candidate "🔬 Deep preview" button runs the REAL classifier on an 8-item sample (the same model that judges live articles), versus the fast keyword heuristic that the add/Re-preview path uses. The preview now carries `classified`, surfaced as a "model-checked" vs "quick estimate" badge — so the acceptance % is no longer ambiguously heuristic. The DB connection is released during the ~30-60s model pass; postJSON has no client timeout.
- Search: a free-text box over the sources table (name / category / feed URL / homepage), folded into the existing status filter, with a live match count and an empty state. Makes "is this already added?" a glance.
- Duplicate-add guard: sources.find_existing_feed() + feed_key() normalize scheme/www/trailing-slash/case, so re-adding a feed that's already a live source or a queued candidate is refused with a 409 naming where it lives (the DB already enforced exact-URL uniqueness; this catches the near-miss variants and the overwrite-on-promote footgun).
Tests: test_candidate_deep_preview_and_dedup (deep flag wires the model + uses the small sample; exact/www/slash/case variants all 409). 224 pytest + 11 vitest green.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -192,15 +192,21 @@
|
||||
let paused = $derived(sources.filter((s) => sstatus(s) === 'paused').length);
|
||||
let retired = $derived(sources.filter((s) => sstatus(s) === 'retired').length);
|
||||
|
||||
// Sources table filter.
|
||||
// Sources table filter + free-text search (name / category / feed URL / homepage URL).
|
||||
let srcFilter = $state('all');
|
||||
let srcSearch = $state('');
|
||||
let shownSources = $derived(
|
||||
srcFilter === 'healthy' ? sources.filter((s) => sstatus(s) === 'active' && !s.failures && !s.review_flag)
|
||||
(srcFilter === 'healthy' ? sources.filter((s) => sstatus(s) === 'active' && !s.failures && !s.review_flag)
|
||||
: srcFilter === 'resting' ? sources.filter((s) => sstatus(s) === 'active' && s.failures > 0)
|
||||
: srcFilter === 'flagged' ? sources.filter((s) => s.review_flag)
|
||||
: srcFilter === 'paused' ? sources.filter((s) => sstatus(s) === 'paused')
|
||||
: srcFilter === 'retired' ? sources.filter((s) => sstatus(s) === 'retired')
|
||||
: sources
|
||||
).filter((s) => {
|
||||
const q = srcSearch.trim().toLowerCase();
|
||||
if (!q) return true;
|
||||
return `${s.name} ${s.category || ''} ${s.feed_url || ''} ${s.homepage_url || ''}`.toLowerCase().includes(q);
|
||||
})
|
||||
);
|
||||
|
||||
// Lifecycle (status: active/paused/retired) keeps `active` mirrored server-side;
|
||||
@@ -276,6 +282,13 @@
|
||||
try { Object.assign(c, await postJSON(`/api/admin/candidates/${c.id}/preview`)); }
|
||||
catch (e) { c._err = e?.message || 'Re-preview failed.'; }
|
||||
}
|
||||
/**
 * Deep preview: re-run a candidate's preview with the REAL classifier on a
 * small sample (~30-60s) — the model's true acceptance view, not the fast
 * heuristic estimate.
 *
 * Mutates `c` in place: the server response is merged onto it, `c._deep` is
 * true while the request is in flight (the Deep preview button disables
 * itself on that flag), and `c._err` carries a readable message on failure.
 *
 * @param {object} c - Candidate row; its `id` is used to build the request URL.
 * @returns {Promise<void>}
 */
async function deepPreview(c) {
  c._err = '';
  c._deep = true;
  try {
    Object.assign(c, await postJSON(`/api/admin/candidates/${c.id}/preview?deep=true`));
  } catch (e) {
    c._err = e?.message || 'Deep preview failed.';
  } finally {
    // Always clear the in-flight flag, whether the request succeeded or not.
    c._deep = false;
  }
}
|
||||
async function promoteCandidate(c) {
|
||||
c._err = '';
|
||||
try {
|
||||
@@ -560,6 +573,7 @@
|
||||
{#if c.preview}
|
||||
<div class="cprev">
|
||||
{c.preview.accepted ?? 0}/{c.preview.sampled ?? 0} sampled would pass{#if c.preview.acceptance_rate != null} · {Math.round(c.preview.acceptance_rate * 100)}% accept{/if}{#if c.preview.recent_7d != null} · {c.preview.recent_7d} in last 7d{/if}
|
||||
{#if c.preview.classified}<span class="vbadge model" title="Scored by the real classifier — the true acceptance view">model-checked</span>{:else}<span class="vbadge fast" title="Fast keyword heuristic — an estimate. Run Deep preview for the model's real verdict.">quick estimate</span>{/if}
|
||||
{#if c.preview.examples_accepted?.length}<div class="cex">e.g. {c.preview.examples_accepted.slice(0, 3).join(' · ')}</div>{/if}
|
||||
</div>
|
||||
{/if}
|
||||
@@ -569,6 +583,7 @@
|
||||
<label class="cchk"><input type="checkbox" bind:checked={c._activate} /> Activate immediately</label>
|
||||
<button class="csend" onclick={() => promoteCandidate(c)}>Promote{c._activate ? '' : ' as paused'}</button>
|
||||
<button class="act" onclick={() => repreviewCandidate(c)}>Re-preview</button>
|
||||
<button class="act" title="Run the real model on a sample (~30-60s)" onclick={() => deepPreview(c)} disabled={c._deep}>{c._deep ? 'Deep-checking…' : '🔬 Deep preview'}</button>
|
||||
<button class="act del" onclick={() => rejectCandidate(c)}>Reject</button>
|
||||
</div>
|
||||
</li>
|
||||
@@ -579,10 +594,16 @@
|
||||
|
||||
<h2>Sources <a class="exportlink" href="/api/admin/export/sources.csv" download>export CSV ↓</a></h2>
|
||||
<p class="sub2">{healthy} healthy · {resting} resting · {flagged} flagged · {paused} paused · {retired} retired · {sources.length} total</p>
|
||||
<div class="filterchips">
|
||||
{#each [['all', 'All'], ['healthy', 'Healthy'], ['resting', 'Resting'], ['flagged', 'Flagged'], ['paused', 'Paused'], ['retired', 'Retired']] as [key, label] (key)}
|
||||
<button class="chip" class:on={srcFilter === key} onclick={() => (srcFilter = key)}>{label}</button>
|
||||
{/each}
|
||||
<div class="srctools">
|
||||
<div class="filterchips">
|
||||
{#each [['all', 'All'], ['healthy', 'Healthy'], ['resting', 'Resting'], ['flagged', 'Flagged'], ['paused', 'Paused'], ['retired', 'Retired']] as [key, label] (key)}
|
||||
<button class="chip" class:on={srcFilter === key} onclick={() => (srcFilter = key)}>{label}</button>
|
||||
{/each}
|
||||
</div>
|
||||
<div class="srcsearch">
|
||||
<input type="search" placeholder="Search name, category, or URL…" bind:value={srcSearch} autocapitalize="off" autocomplete="off" spellcheck="false" />
|
||||
{#if srcSearch.trim()}<span class="srccount">{shownSources.length} match{shownSources.length === 1 ? '' : 'es'}</span>{/if}
|
||||
</div>
|
||||
</div>
|
||||
<div class="tablewrap">
|
||||
<table class="srctable">
|
||||
@@ -649,6 +670,8 @@
|
||||
</td>
|
||||
</tr>
|
||||
{/if}
|
||||
{:else}
|
||||
<tr><td colspan="10" class="srcempty">{srcSearch.trim() ? `No sources match “${srcSearch.trim()}”.` : 'No sources in this view.'}</td></tr>
|
||||
{/each}
|
||||
</tbody>
|
||||
</table>
|
||||
@@ -1056,6 +1079,17 @@
|
||||
}
|
||||
.filterchips .chip:hover { border-color: var(--accent); }
|
||||
.filterchips .chip.on { background: var(--accent); border-color: var(--accent); color: #fff; }
|
||||
/* Sources filter row + search */
|
||||
.srctools { display: flex; align-items: center; justify-content: space-between; gap: 12px; flex-wrap: wrap; margin: 0 0 14px; }
|
||||
.srctools .filterchips { margin: 0; }
|
||||
.srcsearch { display: inline-flex; align-items: center; gap: 8px; }
|
||||
.srcsearch input {
|
||||
font: inherit; font-size: 0.84rem; padding: 6px 12px; border: 1px solid var(--line);
|
||||
border-radius: 999px; background: var(--surface); color: var(--ink); width: 230px;
|
||||
}
|
||||
.srcsearch input:focus { outline: none; border-color: var(--accent); }
|
||||
.srccount { font-size: 0.78rem; color: var(--muted); white-space: nowrap; }
|
||||
.srcempty { text-align: center; color: var(--muted); font-style: italic; padding: 22px 10px; }
|
||||
|
||||
/* Add a source + candidate queue */
|
||||
.addsrc { background: var(--surface); border: 1px solid var(--line); border-radius: 14px; padding: 14px 16px; margin-bottom: 6px; }
|
||||
@@ -1070,6 +1104,11 @@
|
||||
.curl { font-size: 0.76rem; color: var(--muted); word-break: break-all; margin-top: 2px; }
|
||||
.cprev { font-size: 0.84rem; color: var(--ink); margin-top: 7px; }
|
||||
.cprev .cex { color: var(--muted); font-size: 0.8rem; margin-top: 2px; font-style: italic; }
|
||||
/* Heuristic-vs-model preview badge */
|
||||
.vbadge { display: inline-block; margin-left: 8px; padding: 1px 8px; border-radius: 999px;
|
||||
font-size: 0.68rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.04em; cursor: help; }
|
||||
.vbadge.model { background: #e3efe4; color: #3f7048; }
|
||||
.vbadge.fast { background: var(--line); color: var(--muted); }
|
||||
.cactions { display: flex; gap: 9px; align-items: center; flex-wrap: wrap; margin-top: 10px; }
|
||||
.cactions .ccat { font: inherit; font-size: 0.8rem; padding: 5px 9px; border: 1px solid var(--line); border-radius: 8px; background: var(--bg); color: var(--ink); width: 150px; }
|
||||
.cactions .cchk { font-size: 0.8rem; color: var(--muted); display: inline-flex; align-items: center; gap: 5px; }
|
||||
|
||||
+25
-5
@@ -1143,10 +1143,20 @@ def create_app() -> FastAPI:
|
||||
rows = sources.list_candidates(conn)
|
||||
return [_candidate_dict(r) for r in rows]
|
||||
|
||||
def _preview_or_502(url: str) -> dict:
|
||||
# SSRF-safe fetch (admin-pasted URL is untrusted); heuristic-only (fast).
|
||||
def _preview_or_502(url: str, deep: bool = False) -> dict:
|
||||
# SSRF-safe fetch (admin-pasted URL is untrusted). Default is the fast
|
||||
# heuristic; deep=True also runs the real LLM classifier on a small sample
|
||||
# (slower, ~5-7s/item — the true acceptance view, not an estimate).
|
||||
client = None
|
||||
if deep:
|
||||
try:
|
||||
client = LocalModelClient.from_env()
|
||||
except Exception: # noqa: BLE001 — fall back to heuristic if the model is down
|
||||
client = None
|
||||
try:
|
||||
return feeds.preview_feed(url, sample=20, fetcher=feeds.safe_fetch_feed)
|
||||
return feeds.preview_feed(
|
||||
url, sample=(8 if deep else 20), fetcher=feeds.safe_fetch_feed, client=client
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001 — surface a readable reason
|
||||
raise HTTPException(status_code=502, detail=f"Couldn't preview that feed: {exc}")
|
||||
|
||||
@@ -1157,6 +1167,16 @@ def create_app() -> FastAPI:
|
||||
raise HTTPException(status_code=422, detail="feed_url is required")
|
||||
with get_conn() as conn: # gate BEFORE the outbound fetch
|
||||
_require_admin(conn, request)
|
||||
# Don't re-add a feed that's already live or already queued (catches
|
||||
# http/https · www · trailing-slash variants, not just exact dups).
|
||||
existing = sources.find_existing_feed(conn, url)
|
||||
if existing:
|
||||
where = "already a source" if existing["kind"] == "source" else "already in the candidate queue"
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail=f"“{existing['name']}” is {where} ({existing['status']}). "
|
||||
"Find it below — Re-preview it there if you want a fresh read.",
|
||||
)
|
||||
preview = _preview_or_502(url) # no DB connection held during network I/O
|
||||
with get_conn() as conn:
|
||||
_require_admin(conn, request)
|
||||
@@ -1164,14 +1184,14 @@ def create_app() -> FastAPI:
|
||||
return _candidate_dict(row)
|
||||
|
||||
@app.post("/api/admin/candidates/{cid}/preview")
|
||||
def admin_candidate_repreview(cid: int, request: Request) -> dict:
|
||||
def admin_candidate_repreview(cid: int, request: Request, deep: bool = False) -> dict:
|
||||
with get_conn() as conn:
|
||||
_require_admin(conn, request)
|
||||
cand = conn.execute("SELECT feed_url FROM source_candidates WHERE id = ?", (cid,)).fetchone()
|
||||
if not cand:
|
||||
raise HTTPException(status_code=404, detail="candidate not found")
|
||||
url = cand["feed_url"]
|
||||
preview = _preview_or_502(url)
|
||||
preview = _preview_or_502(url, deep=deep) # conn released during the (slow) model pass
|
||||
with get_conn() as conn:
|
||||
_require_admin(conn, request)
|
||||
row = sources.save_candidate(conn, url, preview=preview)
|
||||
|
||||
@@ -67,6 +67,37 @@ def upsert_sources(conn: sqlite3.Connection, source_defs: list[dict]) -> int:
|
||||
return count
|
||||
|
||||
|
||||
# --- Duplicate detection (catch the same feed added twice) --------------------
|
||||
|
||||
|
||||
def feed_key(url: str) -> str:
    """Return a loose comparison key for spotting the same feed added twice.

    The key ignores trivial differences: scheme, a leading ``www.``, trailing
    slashes on the path, letter case, and any ``#fragment``. The query string
    is kept. Compare-only — the feed_url is always STORED exactly as entered;
    this just powers dup warnings.

    Args:
        url: Feed URL as entered; ``None`` or empty is treated as "".

    Returns:
        ``host + path`` (plus ``"?" + query`` when there is one), lower-cased.
        If the URL cannot be parsed, the stripped, lower-cased input.
    """
    cleaned = (url or "").strip().lower()
    try:
        parts = urlsplit(cleaned)
        if not parts.scheme and not parts.netloc:
            # Scheme-less paste ("www.news.test/feed"): re-parse it as a
            # network-path reference so the host is recognised and "www."
            # can be dropped like it is for full URLs.
            parts = urlsplit("//" + cleaned)
        host = parts.netloc.removeprefix("www.")
        path = parts.path.rstrip("/")
        query = f"?{parts.query}" if parts.query else ""
        return host + path + query
    except Exception:  # noqa: BLE001 — never let a weird URL break add
        return cleaned
|
||||
|
||||
|
||||
def find_existing_feed(conn: sqlite3.Connection, url: str) -> dict | None:
|
||||
"""Is this feed already a live source or a pending candidate? Matches on the
|
||||
loose key, so http/https + www + trailing-slash variants are all caught."""
|
||||
key = feed_key(url)
|
||||
for r in conn.execute("SELECT id, name, feed_url, status FROM sources"):
|
||||
if feed_key(r["feed_url"]) == key:
|
||||
return {"kind": "source", "id": r["id"], "name": r["name"], "status": r["status"]}
|
||||
for r in conn.execute(
|
||||
"SELECT id, name, feed_url, status FROM source_candidates WHERE status NOT IN ('rejected','promoted')"
|
||||
):
|
||||
if feed_key(r["feed_url"]) == key:
|
||||
return {"kind": "candidate", "id": r["id"], "name": r["name"] or r["feed_url"], "status": r["status"]}
|
||||
return None
|
||||
|
||||
|
||||
# --- Supervised source candidates (staging before the real sources table) ----
|
||||
|
||||
|
||||
|
||||
@@ -141,6 +141,31 @@ def test_candidate_suggest_promote_paused(tmp_path, monkeypatch):
|
||||
assert any(s["name"] == "Good Feed" for s in tc.get("/api/admin/stats").json()["sources"])
|
||||
|
||||
|
||||
def test_candidate_deep_preview_and_dedup(tmp_path, monkeypatch):
    """Deep preview wires in the model client with the small sample, and
    re-adding a known feed (exact or trivially different) is refused with 409."""
    app, api = _make(tmp_path, monkeypatch, admin_email="boss@x.com")

    def fake_preview(url, **k):
        # Echo back whether the LLM client was wired in + the sample size used.
        return {"url": url, "sampled": k.get("sample"), "accepted": 4,
                "classified": k.get("client") is not None}

    monkeypatch.setattr(api.feeds, "preview_feed", fake_preview)
    # Deep preview builds a model client; stub it so we never touch the real LAN model.
    monkeypatch.setattr(api, "LocalModelClient", type("C", (), {"from_env": staticmethod(lambda: object())}))
    tc = _signin(app, api, "boss@x.com")

    cand = tc.post("/api/admin/candidates", json={"feed_url": "https://news.test/feed"}).json()
    assert cand["preview"]["classified"] is False  # add uses the fast heuristic

    # Deep preview runs the real classifier on the smaller sample.
    deep = tc.post(f"/api/admin/candidates/{cand['id']}/preview?deep=true").json()
    assert deep["preview"]["classified"] is True and deep["preview"]["sampled"] == 8

    # Dedup: exact + trivial variants (scheme / www / trailing slash / case) are refused.
    queued_dup = tc.post("/api/admin/candidates", json={"feed_url": "https://news.test/feed"})
    assert queued_dup.status_code == 409
    assert "already in the candidate queue" in queued_dup.json()["detail"]
    assert tc.post("/api/admin/candidates", json={"feed_url": "http://www.news.test/feed/"}).status_code == 409

    # Once promoted to a live source, re-adding is still refused. Checking the
    # message proves the refusal now comes from the sources table — without it
    # a silently failed promote would still yield a 409 from the queue.
    tc.post(f"/api/admin/candidates/{cand['id']}/promote", json={})
    live_dup = tc.post("/api/admin/candidates", json={"feed_url": "https://NEWS.test/feed"})
    assert live_dup.status_code == 409
    assert "already a source" in live_dup.json()["detail"]
|
||||
|
||||
|
||||
def test_candidate_reject_and_gating(tmp_path, monkeypatch):
|
||||
app, api = _make(tmp_path, monkeypatch, admin_email="boss@x.com")
|
||||
monkeypatch.setattr(api.feeds, "preview_feed", lambda url, **k: {"url": url, "sampled": 1, "accepted": 0})
|
||||
|
||||
Reference in New Issue
Block a user