Admin: source Articles inspector (verify metrics against real evidence)

New per-row "Articles" button on the Sources table expands a read-only inline
panel of the source's ACTUAL ingested articles — so the automated metrics
(paywall/image/acceptance/duplicate) can be verified against evidence instead of
trusted blind. Distinct from "Check" (which re-samples the LIVE feed for
would-pass quality); this shows what's already in the DB, which is what the table
metrics are computed from.

- Backend: GET /api/admin/sources/{id}/articles?filter=&limit=&offset= (admin,
  read-only). queries.source_articles + source_articles_summary — per article:
  title, url, date, accepted, reason (the "why"), topic/flavor, paywalled
  (domain rule), has_image, duplicate. Summary = counts + source-level paywall
  rule.
- Frontend: expandable panel with a summary header ("27 ingested · 18 accepted
  · … · paywall rule: ON (domain)"), filter chips (All/Accepted/Rejected/No
  image/Duplicates), compact rows with title→link + badges + reason, Load more.

So "100% paywall" or "0% images" becomes clickable evidence: open two articles
to tell a real paywall from a mis-flagged domain, or a true image gap from an
enrichment failure. Test: test_source_articles_inspector. 241 pytest + 11 vitest.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-12 21:37:51 -04:00
parent 64339aafb0
commit ddcfab3a11
9 changed files with 445 additions and 61 deletions
+18
View File
@@ -1146,6 +1146,24 @@ def create_app() -> FastAPI:
url = src["feed_url"]
return _preview_or_502(url) # safe fetch, no DB connection held
@app.get("/api/admin/sources/{sid}/articles")
def admin_source_articles(sid: int, request: Request, filter: str = "all",
limit: int = 25, offset: int = 0) -> dict:
# Read-only inspector: the REAL ingested articles behind a source's metrics,
# so paywall/image/acceptance/duplicate signals can be verified against evidence.
limit = max(1, min(int(limit), 100))
offset = max(0, int(offset))
with get_conn() as conn:
_require_admin(conn, request)
if not conn.execute("SELECT 1 FROM sources WHERE id = ?", (sid,)).fetchone():
raise HTTPException(status_code=404, detail="source not found")
arts = queries.source_articles(conn, sid, filter, limit, offset)
return {
"articles": arts,
"summary": queries.source_articles_summary(conn, sid) if offset == 0 else None,
"has_more": len(arts) == limit,
}
# --- Source candidates (supervised add-a-source pipeline) ----------------
def _candidate_dict(row) -> dict:
+79 -34
View File
@@ -518,42 +518,79 @@ def generate_wordsearch_puzzle(conn: sqlite3.Connection, date: str, client=None)
return json.loads(row["payload_json"])
_WS_CROSS_TARGET = 0.5 # aim: about half the placements cross an existing word
def _zone(r: int, c: int, size: int) -> tuple[int, int]:
"""Which quadrant a cell falls in — coarse occupancy used to spread words."""
return (r * 2 // size, c * 2 // size)
def _place_words(words: list[str], size: int, seed: int) -> tuple[list[list[str | None]], list[tuple[str, list[tuple[int, int]]]]]:
"""Core placement (date-seeded, deterministic). Returns the letter grid (None
where unfilled) and [(word, cells)] for every word genuinely placed.
Interlock is a TARGET, not a side effect: each word either (a) must cross an
already-placed word — when crossings are running below ~half of placements —
or (b) anchors in open ground. Both modes steer toward the least crowded /
least developed quadrant, so crossings attach to lonely words at the edges of
structure rather than thickening one knot, and anchors spread across the
board. All valid spots are enumerated (the grid is tiny) — earlier random
sampling kept missing the rare crossing spots, which is why grids came out
as disconnected "clean" words."""
rng = random.Random(seed)
grid: list[list[str | None]] = [[None] * size for _ in range(size)]
zone_fill = {(zr, zc): 0 for zr in (0, 1) for zc in (0, 1)}
placements: list[tuple[str, list[tuple[int, int]]]] = []
crossed = 0
for word in sorted(words, key=len, reverse=True):
n = len(word)
if n > size:
continue
cands = [] # (overlap, cells) over every legal placement
for dr, dc in _DIRS:
for r0 in range(size):
for c0 in range(size):
if not (0 <= r0 + dr * (n - 1) < size and 0 <= c0 + dc * (n - 1) < size):
continue
cells = [(r0 + dr * i, c0 + dc * i) for i in range(n)]
if not all(grid[r][c] in (None, word[i]) for i, (r, c) in enumerate(cells)):
continue
cands.append((sum(1 for i, (r, c) in enumerate(cells) if grid[r][c] == word[i]), cells))
if not cands:
continue
crossing = [t for t in cands if t[0] > 0]
want_cross = bool(crossing) and crossed < _WS_CROSS_TARGET * len(placements)
scored = [] # (score, overlap, cells)
for overlap, cells in crossing if want_cross else cands:
crowd = _neighbour_fill(grid, cells, size)
zload = sum(zone_fill[_zone(r, c, size)] for r, c in cells) // n
# Crossing mode rewards extra overlaps; anchor mode is overlap-neutral
# (crowding already steers it to open ground).
scored.append(((overlap * 4 if want_cross else 0) - 2 * crowd - zload, overlap, cells))
scored.sort(key=lambda t: t[0], reverse=True)
top = [t for t in scored if t[0] >= scored[0][0] - 1] # near-best: variety without losing intent
_, overlap, cells = rng.choice(top)
for i, (r, c) in enumerate(cells):
if grid[r][c] is None:
grid[r][c] = word[i]
zone_fill[_zone(r, c, size)] += 1
placements.append((word, cells))
if overlap:
crossed += 1
return grid, placements
def _build_grid(words: list[str], size: int, seed: int) -> tuple[list[str], list[str]]:
"""Place words in a size×size grid (date-seeded, deterministic) and fill the
rest. Returns (rows, placed_words). Every returned word is genuinely placed."""
rng = random.Random(seed)
grid: list[list[str | None]] = [[None] * size for _ in range(size)]
placed = []
for word in sorted(words, key=len, reverse=True):
if len(word) > size:
continue
# Gather valid placements and SCORE them: reward crossing an existing word
# (so the grid interlocks like a real puzzle) but penalise crowding, so
# words spread across the board instead of all clustering around the ones
# placed first. Pick at random among the best ~20% to keep organic variety.
scored = [] # (score, cells)
for _ in range(400):
dr, dc = rng.choice(_DIRS)
r0, c0 = rng.randrange(size), rng.randrange(size)
cells = [(r0 + dr * i, c0 + dc * i) for i in range(len(word))]
if any(not (0 <= r < size and 0 <= c < size) for r, c in cells):
continue
if not all(grid[r][c] in (None, word[i]) for i, (r, c) in enumerate(cells)):
continue
overlap = sum(1 for i, (r, c) in enumerate(cells) if grid[r][c] == word[i])
scored.append((overlap * 4 - _neighbour_fill(grid, cells, size), cells))
if not scored:
continue
scored.sort(key=lambda t: t[0], reverse=True)
_, cells = rng.choice(scored[: max(1, len(scored) // 5)])
for i, (r, c) in enumerate(cells):
grid[r][c] = word[i]
placed.append(word)
grid, placements = _place_words(words, size, seed)
rng = random.Random(_seed(str(seed), "fill"))
for r in range(size):
for c in range(size):
if grid[r][c] is None:
grid[r][c] = chr(65 + rng.randrange(26))
return ["".join(row) for row in grid], placed
return ["".join(row) for row in grid], [w for w, _ in placements]
# --- Cross-device game state sync -------------------------------------------
@@ -562,17 +599,18 @@ def _build_grid(words: list[str], size: int, seed: int) -> tuple[list[str], list
def _merge_wordsearch(a: dict, b: dict) -> dict:
"""Union the found words (a find is monotonic — you can't un-find one, so the
union is always correct), keep the earliest start and the best (min) time."""
union is always correct), credit the most ACTIVE play time either device has
banked (max — the clock only runs while the puzzle is on screen, so wall-clock
gaps between sittings never count), and keep the best (min) finish time."""
by_word = {}
for fw in list(a.get("foundWords") or []) + list(b.get("foundWords") or []):
w = fw.get("word") if isinstance(fw, dict) else None
if w and w not in by_word:
by_word[w] = fw
starts = [s for s in (a.get("startTime"), b.get("startTime")) if s]
times = [m for m in (a.get("ms"), b.get("ms")) if m]
return {
"foundWords": list(by_word.values()),
"startTime": min(starts) if starts else 0,
"played": max(_int(a.get("played")), _int(b.get("played"))),
"ms": min(times) if times else 0,
}
@@ -615,6 +653,13 @@ def _int(x) -> int:
return 0
_WS_MS_CAP = 86_400_000 # clamp client-sent timings to one day — beyond that is junk
def _ms(x) -> int:
return max(0, min(_int(x), _WS_MS_CAP))
def _sanitize_wordsearch(conn: sqlite3.Connection, variant: str, date: str, state: dict) -> dict:
"""Trust only finds that are real for THIS puzzle: word in the day's list and
cells that actually spell it in the grid (validated when the puzzle exists,
@@ -656,8 +701,8 @@ def _sanitize_wordsearch(conn: sqlite3.Connection, variant: str, date: str, stat
seen.add(w)
clean.append({"word": w, "cells": cells, "ci": len(clean) % 10})
done = bool(words) and len(clean) == len(words)
return {"foundWords": clean, "startTime": _int(state.get("startTime")),
"ms": _int(state.get("ms")) if done else 0}
return {"foundWords": clean, "played": _ms(state.get("played")),
"ms": _ms(state.get("ms")) if done else 0}
_WORD_COLOURS = {"absent", "present", "correct"}
+69
View File
@@ -454,6 +454,75 @@ def _attention(content: dict, sources: list[dict], feedback_unread: int, now: da
return items
# --- Source article inspector: the real articles behind the source metrics -----
_SRC_ART_FILTERS = {
"accepted": "AND s.accepted = 1",
"rejected": "AND s.accepted = 0",
"no_image": "AND (a.image_url IS NULL OR a.image_url = '')",
"duplicates": "AND a.duplicate_of IS NOT NULL",
}
def source_articles(conn: sqlite3.Connection, source_id: int, filter: str = "all",
limit: int = 25, offset: int = 0) -> list[dict]:
"""The actual ingested articles for a source, newest first — so admins can
verify the metric (paywall/image/acceptance) against real evidence."""
where = _SRC_ART_FILTERS.get(filter, "")
rows = conn.execute(
f"""
SELECT a.id, a.title, a.canonical_url, a.published_at, a.discovered_at,
a.image_url, a.duplicate_of,
s.accepted, s.reason_code, s.reason_text, s.topic, s.flavor
FROM articles a
LEFT JOIN article_scores s ON s.article_id = a.id
WHERE a.source_id = ? {where}
ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
LIMIT ? OFFSET ?
""",
(source_id, limit, offset),
).fetchall()
return [
{
"id": r["id"],
"title": r["title"],
"url": r["canonical_url"],
"published_at": r["published_at"] or r["discovered_at"],
"accepted": r["accepted"],
"reason": r["reason_text"] or r["reason_code"], # the "why" behind accept/reject
"topic": r["topic"],
"flavor": r["flavor"],
"paywalled": is_paywalled(r["canonical_url"]), # domain rule — same for the source
"has_image": bool(r["image_url"]),
"duplicate": r["duplicate_of"] is not None,
}
for r in rows
]
def source_articles_summary(conn: sqlite3.Connection, source_id: int) -> dict:
"""Counts behind the table metrics + the source-level paywall rule, so the
panel header reads e.g. '120 · 96 accepted · 24 rejected · 3 no image · paywall: ON'."""
agg = conn.execute(
"""
SELECT COUNT(*) total,
COALESCE(SUM(s.accepted = 1), 0) accepted,
COALESCE(SUM(s.accepted = 0), 0) rejected,
COALESCE(SUM(a.image_url IS NULL OR a.image_url = ''), 0) no_image,
COALESCE(SUM(a.duplicate_of IS NOT NULL), 0) duplicates
FROM articles a LEFT JOIN article_scores s ON s.article_id = a.id
WHERE a.source_id = ?
""",
(source_id,),
).fetchone()
one = conn.execute("SELECT canonical_url FROM articles WHERE source_id = ? LIMIT 1", (source_id,)).fetchone()
return {
"total": agg["total"], "accepted": agg["accepted"], "rejected": agg["rejected"],
"no_image": agg["no_image"], "duplicates": agg["duplicates"],
"paywalled": is_paywalled(one["canonical_url"]) if one else False,
}
def admin_stats(conn: sqlite3.Connection, days: int = 30) -> dict:
"""Aggregate, non-personal usage stats for the admin dashboard."""
since = f"-{days} days"