Sources hardening (Codex audit): promote-time dedup, postJSON timeout, host-only feed_key

Three follow-ups from Codex's audit of the deep-preview/search/dedup work:
- Promote-time duplicate guard: promote_candidate() now re-checks
  find_existing_feed() and raises DuplicateFeedError → 409, so an
  old/CLI/direct-DB candidate or a race can't bypass the add-time check and
  silently overwrite a live source's settings via upsert. (sources scanned
  first, so a real source collision wins over the candidate matching itself.)
- postJSON/putJSON/delJSON gain opt-in {timeout} (AbortController, default
  none so other calls are unchanged); deep preview uses 120s and surfaces a
  calm "timed out" message instead of pinning the button on "Deep-checking…"
  if the LAN model stalls.
- feed_key() now lowercases the host only, not the whole URL — paths/queries
  can be case-significant; scheme/www/trailing-slash/host-case still collapse.

Tests: test_candidate_deep_preview_and_dedup extended — promote succeeds once,
then a re-promote of the same candidate is refused 409. 224 pytest + 11 vitest.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-11 21:31:39 -04:00
parent e1ac19351e
commit 3afc1ed37e
5 changed files with 75 additions and 27 deletions
+34 -18
View File
@@ -10,34 +10,50 @@ export async function getJSON(url, { timeout = 10000 } = {}) {
}
}
export async function postJSON(url, body) {
return sendJSON('POST', url, body);
export async function postJSON(url, body, opts) {
return sendJSON('POST', url, body, opts);
}
export async function putJSON(url, body) {
return sendJSON('PUT', url, body);
export async function putJSON(url, body, opts) {
return sendJSON('PUT', url, body, opts);
}
export async function delJSON(url) {
return sendJSON('DELETE', url);
export async function delJSON(url, opts) {
return sendJSON('DELETE', url, undefined, opts);
}
async function sendJSON(method, url, body) {
// timeout is opt-in (default: none) so normal calls are unchanged, but slow
// model-backed actions (e.g. deep preview) can bound the wait and fail calmly
// instead of leaving a button stuck if the LAN model hangs.
async function sendJSON(method, url, body, { timeout } = {}) {
const opts = { method, credentials: 'same-origin' };
if (body !== undefined) {
opts.headers = { 'Content-Type': 'application/json' };
opts.body = JSON.stringify(body);
}
const r = await fetch(url, opts);
if (!r.ok) {
let msg = r.statusText;
try {
const j = await r.json();
if (j && j.detail) msg = j.detail;
} catch {
/* non-JSON error body */
}
throw new Error(msg);
let timer;
if (timeout) {
const ctrl = new AbortController();
opts.signal = ctrl.signal;
timer = setTimeout(() => ctrl.abort(), timeout);
}
try {
const r = await fetch(url, opts);
if (!r.ok) {
let msg = r.statusText;
try {
const j = await r.json();
if (j && j.detail) msg = j.detail;
} catch {
/* non-JSON error body */
}
throw new Error(msg);
}
return await r.json();
} catch (e) {
if (e?.name === 'AbortError') throw new Error('Timed out — the server took too long to respond.');
throw e;
} finally {
clearTimeout(timer);
}
return r.json();
}
+6 -2
View File
@@ -286,8 +286,12 @@
// model's true acceptance view, not the fast heuristic estimate.
async function deepPreview(c) {
c._err = ''; c._deep = true;
try { Object.assign(c, await postJSON(`/api/admin/candidates/${c.id}/preview?deep=true`), { _deep: false }); }
catch (e) { c._err = e?.message || 'Deep preview failed.'; c._deep = false; }
try {
// ~5-7s/item on the LAN model; bound the wait so a model stall can't pin
// the button on "Deep-checking…" forever.
const res = await postJSON(`/api/admin/candidates/${c.id}/preview?deep=true`, undefined, { timeout: 120000 });
Object.assign(c, res, { _deep: false });
} catch (e) { c._err = e?.message || 'Deep preview failed.'; c._deep = false; }
}
async function promoteCandidate(c) {
c._err = '';
+7
View File
@@ -1207,6 +1207,13 @@ def create_app() -> FastAPI:
trust_score=body.trust_score, pr_risk_score=body.pr_risk_score,
poll_interval_minutes=body.poll_interval_minutes,
)
except sources.DuplicateFeedError as exc:
ex = exc.existing
raise HTTPException(
status_code=409,
detail=f"{ex['name']}” is already a source ({ex['status']}) — "
"promote skipped so its settings aren't overwritten.",
)
except ValueError:
raise HTTPException(status_code=404, detail="candidate not found")
src = conn.execute(
+22 -4
View File
@@ -70,13 +70,23 @@ def upsert_sources(conn: sqlite3.Connection, source_defs: list[dict]) -> int:
# --- Duplicate detection (catch the same feed added twice) --------------------
class DuplicateFeedError(Exception):
"""Raised when an operation would create a second copy of an existing feed.
Carries the existing match so the caller can name it in the response."""
def __init__(self, existing: dict):
self.existing = existing
super().__init__(f"feed already exists as {existing['kind']}{existing['name']}")
def feed_key(url: str) -> str:
"""A loose comparison key for spotting the same feed added twice despite
trivial differences (scheme, www, trailing slash, case). Compare-only the
feed_url is always STORED exactly as entered; this just powers dup warnings."""
trivial differences (scheme, www, trailing slash, host case). Compare-only
the feed_url is always STORED exactly as entered; this just powers dup checks.
Only the host is lowercased: URL paths/queries can be case-significant."""
try:
p = urlsplit((url or "").strip().lower())
host = p.netloc.removeprefix("www.")
p = urlsplit((url or "").strip())
host = p.netloc.lower().removeprefix("www.")
path = p.path.rstrip("/")
return host + path + (("?" + p.query) if p.query else "")
except Exception: # noqa: BLE001 — never let a weird URL break add
@@ -170,6 +180,14 @@ def promote_candidate(
if cand is None:
raise ValueError(f"no candidate with id {candidate_id}")
# Re-check duplicates at promote time too — the add-time guard can be bypassed
# by old/CLI/direct-DB candidates or a race, and upsert_sources would silently
# overwrite the existing source's settings. (sources are scanned first, so a
# real source collision wins over this candidate matching itself.)
existing = find_existing_feed(conn, cand["feed_url"])
if existing and existing["kind"] == "source":
raise DuplicateFeedError(existing)
name = cand["name"] or urlsplit(cand["feed_url"]).netloc or cand["feed_url"]
upsert_sources(
conn,
+6 -3
View File
@@ -158,12 +158,15 @@ def test_candidate_deep_preview_and_dedup(tmp_path, monkeypatch):
# Deep preview runs the real classifier on the smaller sample.
deep = tc.post(f"/api/admin/candidates/{cand['id']}/preview?deep=true").json()
assert deep["preview"]["classified"] is True and deep["preview"]["sampled"] == 8
# Dedup: exact + trivial variants (scheme / www / trailing slash / case) are refused.
# Dedup at ADD: exact + trivial variants (scheme / www / trailing slash / host case).
assert tc.post("/api/admin/candidates", json={"feed_url": "https://news.test/feed"}).status_code == 409
assert tc.post("/api/admin/candidates", json={"feed_url": "http://www.news.test/feed/"}).status_code == 409
# Once promoted to a live source, re-adding is still refused.
tc.post(f"/api/admin/candidates/{cand['id']}/promote", json={})
# Promote succeeds the first time and creates the live source.
assert tc.post(f"/api/admin/candidates/{cand['id']}/promote", json={}).status_code == 200
assert tc.post("/api/admin/candidates", json={"feed_url": "https://NEWS.test/feed"}).status_code == 409
# Dedup at PROMOTE too: a stale/duplicate candidate (here, re-promoting the
# same one) can't bypass add and overwrite the live source's settings.
assert tc.post(f"/api/admin/candidates/{cand['id']}/promote", json={}).status_code == 409
def test_candidate_reject_and_gating(tmp_path, monkeypatch):