From 50dc2167cda328811f71fe182e4cc0091d7fba56 Mon Sep 17 00:00:00 2001 From: jay Date: Sun, 7 Jun 2026 15:55:57 -0400 Subject: [PATCH] Durable image quality: stop trusting feed thumbnails; cycle enriches Latest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make "no blurry images" sustainable, not a one-off cleanup. RSS feed thumbnails (~44% were ~90px) were stored at ingest and upscaled to mush, so new articles would reintroduce them. Now image_url is filled ONLY by the quality-gated og:image enrichment: * insert_article no longer stores the feed image (was canonicalize_url(item...)). * enrich_recent_images(): the cycle fetches a quality og:image for the newest accepted, imageless articles each run (bounded), keeping Latest photo-rich. * Brief + on-open enrichment unchanged. Net: every stored image is a validated, ≥450px og:image; the rest are clean placeholders. Co-Authored-By: Claude Opus 4.8 (1M context) --- goodnews/cli.py | 11 ++++++++++- goodnews/enrich.py | 26 ++++++++++++++++++++++++++ goodnews/feeds.py | 5 ++++- tests/test_enrich_images.py | 16 ++++++++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/goodnews/cli.py b/goodnews/cli.py index ed1be69..dae2702 100644 --- a/goodnews/cli.py +++ b/goodnews/cli.py @@ -9,7 +9,7 @@ from .briefs import build_daily_brief, show_brief from .db import connect, init_db from .localtime import local_today from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup -from .enrich import enrich_brief_images, enrich_summarized_images +from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images from .summarize import generate_summary, get_summary from .feeds import ( fetch_feed, @@ -461,6 +461,15 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non except Exception as exc: print(f"brief: skipped ({exc})") + # Keep the Latest feed photo-rich: fetch quality og:images for the newest + # accepted articles that lack one (bounded per cycle). + try: + recent = enrich_recent_images(conn) + if recent: + print(f"recent images: {recent} enriched") + except Exception as exc: + print(f"recent images: skipped ({exc})") + # Pre-warm summaries for today's brief so Today reads as a calm briefing. # Idempotent: cached items are skipped, so this only hits the LLM for new ones. try: diff --git a/goodnews/enrich.py b/goodnews/enrich.py index eeec566..0cd7a66 100644 --- a/goodnews/enrich.py +++ b/goodnews/enrich.py @@ -360,6 +360,32 @@ def enrich_article_image( return bool(image) +def enrich_recent_images( + conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 40, retry_days: int = 7 +) -> int: + """Keep the Latest feed photo-rich: fetch a quality og:image for the newest + accepted, non-duplicate articles that lack one. Bounded per run, so it tracks + fresh content without blanket-fetching the archive. Returns newly-found count. + """ + rows = conn.execute( + """ + SELECT a.id FROM articles a + JOIN article_scores s ON s.article_id = a.id + WHERE s.accepted = 1 AND a.duplicate_of IS NULL + AND (a.image_url IS NULL OR a.image_url = '') + AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?)) + ORDER BY COALESCE(a.published_at, a.discovered_at) DESC + LIMIT ? + """, + (f"-{retry_days} days", limit), + ).fetchall() + found = 0 + for row in rows: + if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days): + found += 1 + return found + + def enrich_summarized_images( conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7 ) -> int: diff --git a/goodnews/feeds.py b/goodnews/feeds.py index 0ee1b46..f49f797 100644 --- a/goodnews/feeds.py +++ b/goodnews/feeds.py @@ -323,7 +323,10 @@ def insert_article(conn: sqlite3.Connection, source: sqlite3.Row, item: FeedItem description, clean_text(item.author, max_len=250), item.published_at, - canonicalize_url(item.image_url), + # Don't store the feed's image: RSS thumbnails are often tiny + # (~90px) and upscale to mush. image_url is filled only by the + # quality-gated og:image enrichment (brief / recent / on-open). + None, item.language, item.raw_guid, url_hash, diff --git a/tests/test_enrich_images.py b/tests/test_enrich_images.py index 680e26a..a83cc2b 100644 --- a/tests/test_enrich_images.py +++ b/tests/test_enrich_images.py @@ -78,3 +78,19 @@ def test_image_dimensions_parses_headers(): gif = b"GIF89a" + struct.pack("