Durable image quality: stop trusting feed thumbnails; cycle enriches Latest

Make "no blurry images" sustainable, not a one-off cleanup. RSS feed thumbnails
(~44% were ~90px) were stored at ingest and upscaled to mush, so new articles
would reintroduce them. Now image_url is filled ONLY by the quality-gated
og:image enrichment:

* insert_article no longer stores the feed image (was canonicalize_url(item...)).
* enrich_recent_images(): each cycle fetches a quality og:image for the newest
  accepted, non-duplicate, imageless articles (bounded: at most 40 per run by
  default), keeping Latest photo-rich.
* Brief + on-open enrichment unchanged.

Net: every stored image is a validated, ≥450px og:image; the rest are clean
placeholders.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-07 15:55:57 -04:00
parent b134c2dab6
commit 50dc2167cd
4 changed files with 56 additions and 2 deletions
+10 -1
View File
@@ -9,7 +9,7 @@ from .briefs import build_daily_brief, show_brief
from .db import connect, init_db
from .localtime import local_today
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
from .enrich import enrich_brief_images, enrich_summarized_images
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
from .summarize import generate_summary, get_summary
from .feeds import (
fetch_feed,
@@ -461,6 +461,15 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
except Exception as exc:
print(f"brief: skipped ({exc})")
# Keep the Latest feed photo-rich: fetch quality og:images for the newest
# accepted articles that lack one (bounded per cycle).
try:
recent = enrich_recent_images(conn)
if recent:
print(f"recent images: {recent} enriched")
except Exception as exc:
print(f"recent images: skipped ({exc})")
# Pre-warm summaries for today's brief so Today reads as a calm briefing.
# Idempotent: cached items are skipped, so this only hits the LLM for new ones.
try:
+26
View File
@@ -360,6 +360,32 @@ def enrich_article_image(
return bool(image)
def enrich_recent_images(
    conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 40, retry_days: int = 7
) -> int:
    """Enrich the newest accepted, imageless articles with an og:image.

    Selects up to ``limit`` articles that are accepted, are not marked as a
    duplicate, have no ``image_url`` (NULL or empty), and have either never
    been image-checked or were last checked more than ``retry_days`` days ago.
    Candidates are ordered newest first by ``published_at``, falling back to
    ``discovered_at``, and each one is handed to ``enrich_article_image``.

    The per-run limit keeps the work tied to fresh content rather than
    sweeping the whole archive.

    Args:
        conn: Open SQLite connection; rows must be addressable by column name.
        fetch: Image fetcher forwarded unchanged to ``enrich_article_image``.
        limit: Maximum number of candidate articles to process in this run.
        retry_days: Minimum age, in days, of a previous image check before an
            article is tried again; also forwarded to ``enrich_article_image``.

    Returns:
        The number of candidates for which ``enrich_article_image`` reported
        success.
    """
    # SQLite date modifier, e.g. "-7 days", applied to datetime('now').
    recheck_modifier = f"-{retry_days} days"
    candidates = conn.execute(
        """
        SELECT a.id FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
        AND (a.image_url IS NULL OR a.image_url = '')
        AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """,
        (recheck_modifier, limit),
    ).fetchall()
    # Count successes while visiting candidates in query order (newest first).
    return sum(
        1
        for candidate in candidates
        if enrich_article_image(
            conn, candidate["id"], fetch=fetch, retry_days=retry_days
        )
    )
def enrich_summarized_images(
conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7
) -> int:
+4 -1
View File
@@ -323,7 +323,10 @@ def insert_article(conn: sqlite3.Connection, source: sqlite3.Row, item: FeedItem
description,
clean_text(item.author, max_len=250),
item.published_at,
canonicalize_url(item.image_url),
# Don't store the feed's image: RSS thumbnails are often tiny
# (~90px) and upscale to mush. image_url is filled only by the
# quality-gated og:image enrichment (brief / recent / on-open).
None,
item.language,
item.raw_guid,
url_hash,
+16
View File
@@ -78,3 +78,19 @@ def test_image_dimensions_parses_headers():
gif = b"GIF89a" + struct.pack("<HH", 90, 90)
assert enrich._image_dimensions(gif) == (90, 90)
assert enrich._image_dimensions(b"not an image at all") is None
def test_enrich_recent_targets_newest_imageless_accepted(tmp_path):
    """The recent-image pass visits the newest accepted, imageless article first
    and stores the fetched image on it."""
    conn = _setup(tmp_path)
    insert_prefix = (
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,published_at) "
    )
    # Two accepted articles without images; neither has a summary, which this
    # pass does not require (unlike the backfill).
    conn.execute(insert_prefix + "VALUES (1,1,'u1','T1','h1',NULL,'2026-01-01T00:00:00')")
    conn.execute(insert_prefix + "VALUES (2,1,'u2','T2','h2',NULL,'2026-06-01T00:00:00')")  # newest
    conn.execute("INSERT INTO article_scores (article_id,accepted) VALUES (1,1),(2,1)")
    conn.commit()

    fetched_urls = []

    def fake_fetch(url):
        # Record the order in which articles are fetched; always "find" an image.
        fetched_urls.append(url)
        return "http://og.jpg"

    enrich.enrich_recent_images(conn, fetch=fake_fetch, limit=10)

    # The newest article (id=2) is processed first and its image is persisted.
    assert fetched_urls[0].endswith("u2")
    stored_image = conn.execute("SELECT image_url FROM articles WHERE id=2").fetchone()[0]
    assert stored_image == "http://og.jpg"