Admin: source Articles inspector (verify metrics against real evidence)

New per-row "Articles" button on the Sources table expands a read-only inline panel of the source's ACTUAL ingested articles — so the automated metrics (paywall/image/acceptance/duplicate) can be verified against evidence instead of being trusted blindly. Distinct from "Check" (which re-samples the LIVE feed for would-pass quality); this shows what's already in the DB, which is what the table metrics are computed from.

- Backend: GET /api/admin/sources/{id}/articles?filter=&limit=&offset= (admin, read-only). queries.source_articles + source_articles_summary — per article: title, url, date, accepted, reason (the "why"), topic/flavor, paywalled (domain rule), has_image, duplicate. Summary = counts + source-level paywall rule.
- Frontend: expandable panel with a summary header ("27 ingested · 18 accepted · … · paywall rule: ON (domain)"), filter chips (All/Accepted/Rejected/No image/Duplicates), compact rows with title→link + badges + reason, Load more.

So "100% paywall" or "0% images" become clickable evidence: open two articles to tell a real paywall from a mis-flagged domain, or a true image gap from an enrichment failure.

Test: test_source_articles_inspector. 241 pytest + 11 vitest.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-12 21:37:51 -04:00
parent 64339aafb0
commit ddcfab3a11
9 changed files with 445 additions and 61 deletions
@@ -1146,6 +1146,24 @@ def create_app() -> FastAPI:
            url = src["feed_url"]
        return _preview_or_502(url)  # safe fetch, no DB connection held

+    @app.get("/api/admin/sources/{sid}/articles")
+    def admin_source_articles(sid: int, request: Request, filter: str = "all",
+                              limit: int = 25, offset: int = 0) -> dict:
+        # Read-only inspector endpoint: one page of the articles already stored for
+        # a source, so the paywall/image/acceptance/duplicate metrics can be checked
+        # against the rows they were computed from.
+        #
+        # Response keys: "articles" (the page), "summary" (counts; only on the
+        # first page, None afterwards) and "has_more" (True when the page came
+        # back full, i.e. another page may exist).
+        page_size = min(max(int(limit), 1), 100)  # clamp to 1..100
+        start = max(int(offset), 0)               # negative offsets start at 0
+        with get_conn() as conn:
+            # Admin check runs before any data is read; presumably raises for
+            # non-admin callers (helper defined elsewhere — confirm).
+            _require_admin(conn, request)
+            source_row = conn.execute("SELECT 1 FROM sources WHERE id = ?", (sid,)).fetchone()
+            if not source_row:
+                raise HTTPException(status_code=404, detail="source not found")
+            page = queries.source_articles(conn, sid, filter, page_size, start)
+            summary = None
+            if start == 0:
+                # The summary covers the whole source, so it is only sent once.
+                summary = queries.source_articles_summary(conn, sid)
+            return {"articles": page, "summary": summary, "has_more": len(page) == page_size}
+
    # --- Source candidates (supervised add-a-source pipeline) ----------------

    def _candidate_dict(row) -> dict:
@@ -518,42 +518,79 @@ def generate_wordsearch_puzzle(conn: sqlite3.Connection, date: str, client=None)
    return json.loads(row["payload_json"])


+_WS_CROSS_TARGET = 0.5  # aim: about half the placements cross an existing word
+
+
+def _zone(r: int, c: int, size: int) -> tuple[int, int]:
+    """Map a cell to its quadrant as (row_half, col_half).
+
+    Each component is 0 for the first half of the axis and 1 for the second
+    (for 0 <= r, c < size); used as a coarse occupancy key to spread words."""
+    row_half = (2 * r) // size
+    col_half = (2 * c) // size
+    return row_half, col_half
+
+
+def _place_words(words: list[str], size: int, seed: int) -> tuple[list[list[str | None]], list[tuple[str, list[tuple[int, int]]]]]:
+    """Core placement (date-seeded, deterministic). Returns the letter grid (None
+    where unfilled) and [(word, cells)] for every word genuinely placed.
+
+    Interlock is a TARGET, not a side effect: each word either (a) must cross an
+    already-placed word — when crossings are running below ~half of placements —
+    or (b) anchors in open ground. Both modes steer toward the least crowded /
+    least developed quadrant, so crossings attach to lonely words at the edges of
+    structure rather than thickening one knot, and anchors spread across the
+    board. All valid spots are enumerated (the grid is tiny) — earlier random
+    sampling kept missing the rare crossing spots, which is why grids came out
+    as disconnected "clean" words.
+
+    Words are tried longest first. An empty word, a word longer than ``size``,
+    or a word with no legal spot left is skipped and does not appear in the
+    returned placements."""
+    rng = random.Random(seed)
+    grid: list[list[str | None]] = [[None] * size for _ in range(size)]
+    # Letters written so far in each quadrant, keyed the way _zone() returns.
+    zone_fill = {(zr, zc): 0 for zr in (0, 1) for zc in (0, 1)}
+    placements: list[tuple[str, list[tuple[int, int]]]] = []
+    crossed = 0  # placed words that reused at least one existing letter
+    for word in sorted(words, key=len, reverse=True):
+        n = len(word)
+        # Skip empty words too: they have no cells, and the per-cell average
+        # below (`// n`) would raise ZeroDivisionError.
+        if n == 0 or n > size:
+            continue
+        cands = []  # (overlap, cells) over every legal placement
+        for dr, dc in _DIRS:
+            for r0 in range(size):
+                for c0 in range(size):
+                    # (r0, c0) is in range, so checking the far end is enough:
+                    # every cell lies between the two end points.
+                    if not (0 <= r0 + dr * (n - 1) < size and 0 <= c0 + dc * (n - 1) < size):
+                        continue
+                    cells = [(r0 + dr * i, c0 + dc * i) for i in range(n)]
+                    # A cell is usable when empty or already holding the same letter.
+                    if not all(grid[r][c] in (None, word[i]) for i, (r, c) in enumerate(cells)):
+                        continue
+                    # overlap = number of cells that reuse an existing letter.
+                    cands.append((sum(1 for i, (r, c) in enumerate(cells) if grid[r][c] == word[i]), cells))
+        if not cands:
+            continue
+        crossing = [t for t in cands if t[0] > 0]
+        # Force a crossing only when one exists and crossings lag the target share.
+        want_cross = bool(crossing) and crossed < _WS_CROSS_TARGET * len(placements)
+        scored = []  # (score, overlap, cells)
+        for overlap, cells in crossing if want_cross else cands:
+            # Crowding penalty from a helper defined elsewhere — presumably counts
+            # filled cells around the candidate; confirm against _neighbour_fill.
+            crowd = _neighbour_fill(grid, cells, size)
+            # Floor of the mean quadrant load over the candidate's cells.
+            zload = sum(zone_fill[_zone(r, c, size)] for r, c in cells) // n
+            # Crossing mode rewards extra overlaps; anchor mode is overlap-neutral
+            # (crowding already steers it to open ground).
+            # NOTE(review): a candidate lying entirely on existing letters
+            # (overlap == n, e.g. a word contained in a longer one) scores highest
+            # in crossing mode — confirm that burying a word this way is intended.
+            scored.append(((overlap * 4 if want_cross else 0) - 2 * crowd - zload, overlap, cells))
+        scored.sort(key=lambda t: t[0], reverse=True)
+        top = [t for t in scored if t[0] >= scored[0][0] - 1]  # near-best: variety without losing intent
+        _, overlap, cells = rng.choice(top)
+        for i, (r, c) in enumerate(cells):
+            # Only newly written letters count toward quadrant load.
+            if grid[r][c] is None:
+                grid[r][c] = word[i]
+                zone_fill[_zone(r, c, size)] += 1
+        placements.append((word, cells))
+        if overlap:
+            crossed += 1
+    return grid, placements
+
+
 def _build_grid(words: list[str], size: int, seed: int) -> tuple[list[str], list[str]]:
    """Place words in a size×size grid (date-seeded, deterministic) and fill the
    rest. Returns (rows, placed_words). Every returned word is genuinely placed."""
-    rng = random.Random(seed)
-    grid: list[list[str | None]] = [[None] * size for _ in range(size)]
-    placed = []
-    for word in sorted(words, key=len, reverse=True):
-        if len(word) > size:
-            continue
-        # Gather valid placements and SCORE them: reward crossing an existing word
-        # (so the grid interlocks like a real puzzle) but penalise crowding, so
-        # words spread across the board instead of all clustering around the ones
-        # placed first. Pick at random among the best ~20% to keep organic variety.
-        scored = []  # (score, cells)
-        for _ in range(400):
-            dr, dc = rng.choice(_DIRS)
-            r0, c0 = rng.randrange(size), rng.randrange(size)
-            cells = [(r0 + dr * i, c0 + dc * i) for i in range(len(word))]
-            if any(not (0 <= r < size and 0 <= c < size) for r, c in cells):
-                continue
-            if not all(grid[r][c] in (None, word[i]) for i, (r, c) in enumerate(cells)):
-                continue
-            overlap = sum(1 for i, (r, c) in enumerate(cells) if grid[r][c] == word[i])
-            scored.append((overlap * 4 - _neighbour_fill(grid, cells, size), cells))
-        if not scored:
-            continue
-        scored.sort(key=lambda t: t[0], reverse=True)
-        _, cells = rng.choice(scored[: max(1, len(scored) // 5)])
-        for i, (r, c) in enumerate(cells):
-            grid[r][c] = word[i]
-        placed.append(word)
+    grid, placements = _place_words(words, size, seed)
+    # Filler letters come from their own RNG, seeded via _seed(str(seed), "fill"),
+    # rather than continuing the placement RNG created inside _place_words.
+    rng = random.Random(_seed(str(seed), "fill"))
    for r in range(size):
        for c in range(size):
            if grid[r][c] is None:
                grid[r][c] = chr(65 + rng.randrange(26))
-    return ["".join(row) for row in grid], placed
+    # Only the words are returned; the cell coordinates from _place_words are dropped.
+    return ["".join(row) for row in grid], [w for w, _ in placements]


 # --- Cross-device game state sync -------------------------------------------
@@ -562,17 +599,18 @@ def _build_grid(words: list[str], size: int, seed: int) -> tuple[list[str], list

 def _merge_wordsearch(a: dict, b: dict) -> dict:
    """Union the found words (a find is monotonic — you can't un-find one, so the
-    union is always correct), keep the earliest start and the best (min) time."""
+    union is always correct), credit the most ACTIVE play time either device has
+    banked (max — the clock only runs while the puzzle is on screen, so wall-clock
+    gaps between sittings never count), and keep the best (min) finish time."""
    by_word = {}
+    # a's entries are visited first, so when both sides found the same word the
+    # record from a is the one kept; non-dict entries and empty words are dropped.
    for fw in list(a.get("foundWords") or []) + list(b.get("foundWords") or []):
        w = fw.get("word") if isinstance(fw, dict) else None
        if w and w not in by_word:
            by_word[w] = fw
-    starts = [s for s in (a.get("startTime"), b.get("startTime")) if s]
+    # Falsy ms (0 or missing) is not a candidate best time; if neither side has
+    # one, the merged ms is 0.
    times = [m for m in (a.get("ms"), b.get("ms")) if m]
    return {
        "foundWords": list(by_word.values()),
-        "startTime": min(starts) if starts else 0,
+        "played": max(_int(a.get("played")), _int(b.get("played"))),
        "ms": min(times) if times else 0,
    }

@@ -615,6 +653,13 @@ def _int(x) -> int:
        return 0


+_WS_MS_CAP = 86_400_000  # clamp client-sent timings to one day — beyond that is junk
+
+
+def _ms(x) -> int:
+    """Coerce a client-sent duration to an int clamped to [0, _WS_MS_CAP] ms."""
+    value = _int(x)
+    if value < 0:
+        return 0
+    return _WS_MS_CAP if value > _WS_MS_CAP else value
+
+
 def _sanitize_wordsearch(conn: sqlite3.Connection, variant: str, date: str, state: dict) -> dict:
    """Trust only finds that are real for THIS puzzle: word in the day's list and
    cells that actually spell it in the grid (validated when the puzzle exists,
@@ -656,8 +701,8 @@ def _sanitize_wordsearch(conn: sqlite3.Connection, variant: str, date: str, stat
        seen.add(w)
        clean.append({"word": w, "cells": cells, "ci": len(clean) % 10})
    done = bool(words) and len(clean) == len(words)
-    return {"foundWords": clean, "startTime": _int(state.get("startTime")),
-            "ms": _int(state.get("ms")) if done else 0}
+    return {"foundWords": clean, "played": _ms(state.get("played")),
+            "ms": _ms(state.get("ms")) if done else 0}


 _WORD_COLOURS = {"absent", "present", "correct"}
@@ -454,6 +454,75 @@ def _attention(content: dict, sources: list[dict], feedback_unread: int, now: da
    return items


+# --- Source article inspector: the real articles behind the source metrics -----
+
+# Static SQL fragments keyed by the inspector's filter name; source_articles
+# appends the chosen fragment to its WHERE clause. A name not listed here
+# (including "all") adds no condition. The fragments are interpolated into the
+# query text, so they must stay fixed strings — never build them from input.
+# "accepted"/"rejected" test s.accepted, which is NULL for an article with no
+# article_scores row (the query LEFT JOINs), so unscored articles match neither.
+_SRC_ART_FILTERS = {
+    "accepted": "AND s.accepted = 1",
+    "rejected": "AND s.accepted = 0",
+    "no_image": "AND (a.image_url IS NULL OR a.image_url = '')",
+    "duplicates": "AND a.duplicate_of IS NOT NULL",
+}
+
+
+def source_articles(conn: sqlite3.Connection, source_id: int, filter: str = "all",
+                    limit: int = 25, offset: int = 0) -> list[dict]:
+    """The actual ingested articles for a source, newest first — so admins can
+    verify the metric (paywall/image/acceptance) against real evidence.
+
+    ``filter`` picks a fragment from ``_SRC_ART_FILTERS``; any other value
+    (including the default "all") applies no extra condition. Rows are ordered
+    by published date, falling back to discovery date, with the article id as a
+    tiebreaker so LIMIT/OFFSET paging does not repeat or skip rows when
+    timestamps tie (assumes article ids are unique).
+
+    Each dict has: id, title, url, published_at, accepted, reason, topic,
+    flavor, paywalled, has_image, duplicate. The score-derived fields
+    (accepted, reason, topic, flavor) are None for an article that has no
+    article_scores row."""
+    # Safe to interpolate: the fragment comes from a fixed dict, never from input.
+    where = _SRC_ART_FILTERS.get(filter, "")
+    rows = conn.execute(
+        f"""
+        SELECT a.id, a.title, a.canonical_url, a.published_at, a.discovered_at,
+               a.image_url, a.duplicate_of,
+               s.accepted, s.reason_code, s.reason_text, s.topic, s.flavor
+        FROM articles a
+        LEFT JOIN article_scores s ON s.article_id = a.id
+        WHERE a.source_id = ? {where}
+        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC, a.id DESC
+        LIMIT ? OFFSET ?
+        """,
+        (source_id, limit, offset),
+    ).fetchall()
+    return [
+        {
+            "id": r["id"],
+            "title": r["title"],
+            "url": r["canonical_url"],
+            "published_at": r["published_at"] or r["discovered_at"],
+            "accepted": r["accepted"],
+            "reason": r["reason_text"] or r["reason_code"],  # the "why" behind accept/reject
+            "topic": r["topic"],
+            "flavor": r["flavor"],
+            "paywalled": is_paywalled(r["canonical_url"]),    # domain rule — same for the source
+            "has_image": bool(r["image_url"]),
+            "duplicate": r["duplicate_of"] is not None,
+        }
+        for r in rows
+    ]
+
+
+def source_articles_summary(conn: sqlite3.Connection, source_id: int) -> dict:
+    """Whole-source counts for the inspector header, plus the paywall flag.
+
+    Returns total / accepted / rejected / no_image / duplicates for every stored
+    article of the source, and ``paywalled`` — the domain rule evaluated on one
+    of the source's article URLs (False when the source has no articles).
+    Articles without an article_scores row count toward ``total`` only, so
+    accepted + rejected may be less than total."""
+    counts = conn.execute(
+        """
+        SELECT COUNT(*) total,
+               COALESCE(SUM(s.accepted = 1), 0) accepted,
+               COALESCE(SUM(s.accepted = 0), 0) rejected,
+               COALESCE(SUM(a.image_url IS NULL OR a.image_url = ''), 0) no_image,
+               COALESCE(SUM(a.duplicate_of IS NOT NULL), 0) duplicates
+        FROM articles a LEFT JOIN article_scores s ON s.article_id = a.id
+        WHERE a.source_id = ?
+        """,
+        (source_id,),
+    ).fetchone()
+    # Any single article will do: the paywall rule is applied to its URL
+    # (per the inspector's design, a per-domain rule).
+    sample = conn.execute("SELECT canonical_url FROM articles WHERE source_id = ? LIMIT 1", (source_id,)).fetchone()
+    summary = {name: counts[name] for name in ("total", "accepted", "rejected", "no_image", "duplicates")}
+    summary["paywalled"] = False if not sample else is_paywalled(sample["canonical_url"])
+    return summary
+
+
 def admin_stats(conn: sqlite3.Connection, days: int = 30) -> dict:
    """Aggregate, non-personal usage stats for the admin dashboard."""
    since = f"-{days} days"