Attention strip: richer source-health items (stale/reject/dup/thin/rate-limit)

Per Codex — make the Overview strip diagnostic without making the operator hunt
through tables. Items are aggregated (one calm line per condition, with a
count), volume-gated, and use conservative thresholds:

* Stale: active+visible source, last success > 10 days ago (warn).
* High rejection: >=20 ingested, acceptance < 25% (info).
* High duplicate: >=10 accepted, accepted-dup > 50% (info).
* Thin images: >=10 served, per-source image coverage < 25% (info).
* Long rate-limit: retry_after_at more than 12h out (info).

source_health gains a per-source images count + image_coverage. _attention takes
an optional now (for tests). Existing site-wide items (global image coverage,
thin brief, unread feedback) unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-09 11:50:17 -04:00
parent 01de5a3ef0
commit d2e2b303ac
2 changed files with 93 additions and 2 deletions
+53 -2
View File
@@ -8,6 +8,7 @@ the original source — never stored bodies.
from __future__ import annotations
import sqlite3
from datetime import UTC, datetime, timedelta
from .feeds import MAX_BACKOFF_MINUTES
@@ -307,6 +308,9 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
(SELECT COUNT(*) FROM articles a WHERE a.source_id = s.id AND a.duplicate_of IS NOT NULL) AS duplicates,
(SELECT COUNT(*) FROM articles a JOIN article_scores sc ON sc.article_id = a.id
WHERE a.source_id = s.id AND sc.accepted = 1 AND a.duplicate_of IS NULL) AS served,
(SELECT COUNT(*) FROM articles a JOIN article_scores sc ON sc.article_id = a.id
WHERE a.source_id = s.id AND sc.accepted = 1 AND a.duplicate_of IS NULL
AND a.image_url IS NOT NULL AND a.image_url != '') AS images,
datetime(
(SELECT MAX(r.finished_at) FROM ingest_runs r
WHERE r.source_id = s.id AND r.finished_at IS NOT NULL),
@@ -327,6 +331,7 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
# Curation quality: of what this source got ACCEPTED, how much was a
# duplicate of content already served (accepted_total - served = accepted dupes).
d["accepted_dup_rate"] = round(100 * (accepted - d["served"]) / accepted) if accepted else None
d["image_coverage"] = round(100 * (d["images"] or 0) / d["served"]) if d["served"] else None
# Match the REAL scheduler gate: due = the later of the streak-backoff time
# and any retry_after_at rest (UTC strings sort chronologically).
due_times = [t for t in (d["next_due_at"], d["retry_after_at"]) if t]
@@ -335,20 +340,66 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
return out
def _attention(content: dict, sources: list[dict], feedback_unread: int) -> list[dict]:
# Attention thresholds — conservative + volume-gated, so the strip is a calm
# operator nudge rather than a noisy scold. Items are aggregated (one line per
# condition with a count), not one line per offending source.
#
# Each "*_MIN_*" value is the volume gate for its condition: a source below the
# gate is never reported, however bad its rate looks. The rate constants are
# whole-number percentages compared against the per-source rates.
# Days: a last_success_at older than now minus this many days counts as stale.
_STALE_DAYS = 10
# Minimum total_articles before a source's acceptance_rate is judged.
_REJECT_MIN_INGESTED = 20
_REJECT_RATE = 25 # acceptance below this % (with enough volume)
# Minimum accepted_total before a source's accepted_dup_rate is judged.
_DUP_MIN_ACCEPTED = 10
_DUP_RATE = 50 # accepted-duplicate above this %
# Minimum served count before a source's image_coverage is judged.
_IMG_MIN_SERVED = 10
_IMG_COVERAGE = 25 # per-source image coverage below this %
# Hours: a retry_after_at later than now plus this many hours is a "long rest".
_LONG_REST_HOURS = 12
def _attention(content: dict, sources: list[dict], feedback_unread: int, now: datetime | None = None) -> list[dict]:
"""The 'Attention Needed' strip: what an operator should look at, soft-toned
(warn = act soon, info = worth a glance). Derived from the same data shown
elsewhere, so it never disagrees with the detail sections."""
items: list[dict] = []
n = lambda c: "" if c == 1 else "s" # noqa: E731 — tiny pluralizer
now = now or datetime.now(UTC)
active = [s for s in sources if (s.get("status") or ("active" if s.get("active") else "paused")) == "active"]
resting = [s for s in sources if s.get("active") and (s.get("failures") or 0) > 0]
resting = [s for s in active if (s.get("failures") or 0) > 0]
if resting:
items.append({"level": "warn", "text": f"{len(resting)} source{n(len(resting))} backing off after failures"})
flagged = [s for s in sources if s.get("review_flag")]
if flagged:
items.append({"level": "warn", "text": f"{len(flagged)} source{n(len(flagged))} flagged for review"})
# Stale: active, visible feeds whose last success is well in the past.
stale_cutoff = (now - timedelta(days=_STALE_DAYS)).strftime("%Y-%m-%d %H:%M:%S")
stale = [s for s in active if s.get("content_visible", 1) and s.get("last_success_at") and s["last_success_at"] < stale_cutoff]
if stale:
items.append({"level": "warn", "text": f"{len(stale)} source{n(len(stale))} haven't updated in over {_STALE_DAYS} days"})
# High rejection: enough ingested volume, low acceptance.
rejecting = [s for s in active if (s.get("total_articles") or 0) >= _REJECT_MIN_INGESTED
and s.get("acceptance_rate") is not None and s["acceptance_rate"] < _REJECT_RATE]
if rejecting:
items.append({"level": "info", "text": f"{len(rejecting)} source{n(len(rejecting))} accepting under {_REJECT_RATE}% of submissions"})
# High accepted-duplicate: enough accepted volume, mostly echoing others.
duping = [s for s in active if (s.get("accepted_total") or 0) >= _DUP_MIN_ACCEPTED
and s.get("accepted_dup_rate") is not None and s["accepted_dup_rate"] > _DUP_RATE]
if duping:
items.append({"level": "info", "text": f"{len(duping)} source{n(len(duping))} mostly duplicating other feeds"})
# Low image coverage (info, not a warning).
thin = [s for s in active if (s.get("served") or 0) >= _IMG_MIN_SERVED
and s.get("image_coverage") is not None and s["image_coverage"] < _IMG_COVERAGE]
if thin:
items.append({"level": "info", "text": f"{len(thin)} source{n(len(thin))} with thin image coverage (under {_IMG_COVERAGE}%)"})
# Long rate-limit rest (info).
rest_cutoff = (now + timedelta(hours=_LONG_REST_HOURS)).strftime("%Y-%m-%d %H:%M:%S")
long_rest = [s for s in sources if s.get("retry_after_at") and s["retry_after_at"] > rest_cutoff]
if long_rest:
items.append({"level": "info", "text": f"{len(long_rest)} source{n(len(long_rest))} rate-limited for {_LONG_REST_HOURS}h+"})
# Site-wide signals.
served = content.get("served") or 0
with_image = content.get("with_image") or 0
if served and (with_image / served) < 0.70:
+40
View File
@@ -74,3 +74,43 @@ def test_attention_clear_when_healthy():
content = {"served": 100, "with_image": 95, "latest_brief_size": 7}
sources = [{"failures": 0, "review_flag": 0}]
assert queries._attention(content, sources, feedback_unread=0) == []
def test_attention_richer_items_fire_and_are_quiet_below_threshold():
    """Each richer source-health condition produces its aggregated attention line.

    One fixture source per condition (stale, high rejection, high duplicate,
    thin images, long rate-limit rest), evaluated against a pinned clock.
    """
    from datetime import UTC, datetime

    from goodnews import queries

    pinned_now = datetime(2026, 6, 9, 12, 0, 0, tzinfo=UTC)
    # Healthy site-wide numbers, so no site-wide items are expected.
    site_content = {"served": 100, "with_image": 95, "latest_brief_size": 7}

    # Stale: active, visible, last success 20 days before the pinned clock.
    stale_source = {"status": "active", "content_visible": 1, "last_success_at": "2026-05-20 00:00:00"}
    # High rejection: 40 ingested, 10% acceptance.
    rejecting_source = {"status": "active", "content_visible": 1, "total_articles": 40, "acceptance_rate": 10}
    # High duplicate: 30 accepted, 70% accepted-dup.
    duplicating_source = {"status": "active", "content_visible": 1, "accepted_total": 30, "accepted_dup_rate": 70}
    # Thin images: 40 served, 5% coverage.
    thin_image_source = {"status": "active", "content_visible": 1, "served": 40, "image_coverage": 5}
    # Long rate-limit rest: retry time is 2 days after the pinned clock.
    resting_source = {"status": "active", "content_visible": 1, "retry_after_at": "2026-06-11 12:00:00"}

    fixture_sources = [
        stale_source,
        rejecting_source,
        duplicating_source,
        thin_image_source,
        resting_source,
    ]

    attention_items = queries._attention(site_content, fixture_sources, 0, now=pinned_now)
    joined_text = " | ".join(item["text"] for item in attention_items)

    expected_fragments = (
        "haven't updated in over 10 days",
        "accepting under 25%",
        "mostly duplicating",
        "thin image coverage",
        "rate-limited for 12h+",
    )
    for fragment in expected_fragments:
        assert fragment in joined_text
def test_attention_quiet_below_thresholds():
    """Sources under every volume gate, or paused, yield no attention items."""
    from datetime import UTC, datetime

    from goodnews import queries

    pinned_now = datetime(2026, 6, 9, 12, 0, 0, tzinfo=UTC)
    site_content = {"served": 100, "with_image": 95, "latest_brief_size": 7}

    # 1 day since last success — fresh.
    fresh_source = {"status": "active", "content_visible": 1, "last_success_at": "2026-06-08 00:00:00"}
    # Bad rates, but each is below its volume gate.
    low_volume_rejecting = {"status": "active", "content_visible": 1, "total_articles": 5, "acceptance_rate": 10}
    low_volume_duplicating = {"status": "active", "content_visible": 1, "accepted_total": 3, "accepted_dup_rate": 90}
    low_volume_thin_images = {"status": "active", "content_visible": 1, "served": 5, "image_coverage": 0}
    # Very old last success, but the source is paused → ignored.
    paused_source = {"status": "paused", "content_visible": 1, "last_success_at": "2020-01-01 00:00:00"}

    quiet_sources = [
        fresh_source,
        low_volume_rejecting,
        low_volume_duplicating,
        low_volume_thin_images,
        paused_source,
    ]

    attention_items = queries._attention(site_content, quiet_sources, 0, now=pinned_now)
    assert attention_items == []