Durable image quality: stop trusting feed thumbnails; cycle enriches Latest
Make "no blurry images" sustainable, not a one-off cleanup. RSS feed thumbnails (~44% were ~90px) were stored at ingest and upscaled to mush, so new articles would reintroduce them.

Now image_url is filled ONLY by the quality-gated og:image enrichment:
* insert_article no longer stores the feed image (was canonicalize_url(item...)).
* enrich_recent_images(): the cycle fetches a quality og:image for the newest accepted, imageless articles each run (bounded), keeping Latest photo-rich.
* Brief + on-open enrichment unchanged.

Net: every stored image is a validated, ≥450px og:image; the rest are clean placeholders.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+10
-1
@@ -9,7 +9,7 @@ from .briefs import build_daily_brief, show_brief
|
|||||||
from .db import connect, init_db
|
from .db import connect, init_db
|
||||||
from .localtime import local_today
|
from .localtime import local_today
|
||||||
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
|
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
|
||||||
from .enrich import enrich_brief_images, enrich_summarized_images
|
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
|
||||||
from .summarize import generate_summary, get_summary
|
from .summarize import generate_summary, get_summary
|
||||||
from .feeds import (
|
from .feeds import (
|
||||||
fetch_feed,
|
fetch_feed,
|
||||||
@@ -461,6 +461,15 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
|
|||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"brief: skipped ({exc})")
|
print(f"brief: skipped ({exc})")
|
||||||
|
|
||||||
|
# Keep the Latest feed photo-rich: fetch quality og:images for the newest
|
||||||
|
# accepted articles that lack one (bounded per cycle).
|
||||||
|
try:
|
||||||
|
recent = enrich_recent_images(conn)
|
||||||
|
if recent:
|
||||||
|
print(f"recent images: {recent} enriched")
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"recent images: skipped ({exc})")
|
||||||
|
|
||||||
# Pre-warm summaries for today's brief so Today reads as a calm briefing.
|
# Pre-warm summaries for today's brief so Today reads as a calm briefing.
|
||||||
# Idempotent: cached items are skipped, so this only hits the LLM for new ones.
|
# Idempotent: cached items are skipped, so this only hits the LLM for new ones.
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -360,6 +360,32 @@ def enrich_article_image(
|
|||||||
return bool(image)
|
return bool(image)
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_recent_images(
    conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 40, retry_days: int = 7
) -> int:
    """Fetch og:images for the newest accepted articles that still lack one.

    Selects at most ``limit`` accepted, non-duplicate articles whose
    ``image_url`` is NULL or empty and whose ``image_checked_at`` is either
    unset or older than ``retry_days`` days. Candidates are ordered newest
    first by ``published_at`` (falling back to ``discovered_at``) and each is
    handed to ``enrich_article_image`` with the supplied ``fetch`` callable.

    Returns the number of candidates for which ``enrich_article_image``
    returned a truthy result.
    """
    # SQLite datetime() modifier, e.g. "-7 days": rows checked more recently
    # than this are left alone until the retry window has elapsed.
    retry_cutoff = f"-{retry_days} days"
    candidates = conn.execute(
        """
        SELECT a.id FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
        AND (a.image_url IS NULL OR a.image_url = '')
        AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """,
        (retry_cutoff, limit),
    ).fetchall()
    # Rows are already materialised, so enrichment runs in query order.
    return sum(
        1
        for candidate in candidates
        if enrich_article_image(conn, candidate["id"], fetch=fetch, retry_days=retry_days)
    )
|
||||||
|
|
||||||
|
|
||||||
def enrich_summarized_images(
|
def enrich_summarized_images(
|
||||||
conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7
|
conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7
|
||||||
) -> int:
|
) -> int:
|
||||||
|
|||||||
+4
-1
@@ -323,7 +323,10 @@ def insert_article(conn: sqlite3.Connection, source: sqlite3.Row, item: FeedItem
|
|||||||
description,
|
description,
|
||||||
clean_text(item.author, max_len=250),
|
clean_text(item.author, max_len=250),
|
||||||
item.published_at,
|
item.published_at,
|
||||||
canonicalize_url(item.image_url),
|
# Don't store the feed's image: RSS thumbnails are often tiny
|
||||||
|
# (~90px) and upscale to mush. image_url is filled only by the
|
||||||
|
# quality-gated og:image enrichment (brief / recent / on-open).
|
||||||
|
None,
|
||||||
item.language,
|
item.language,
|
||||||
item.raw_guid,
|
item.raw_guid,
|
||||||
url_hash,
|
url_hash,
|
||||||
|
|||||||
@@ -78,3 +78,19 @@ def test_image_dimensions_parses_headers():
|
|||||||
gif = b"GIF89a" + struct.pack("<HH", 90, 90)
|
gif = b"GIF89a" + struct.pack("<HH", 90, 90)
|
||||||
assert enrich._image_dimensions(gif) == (90, 90)
|
assert enrich._image_dimensions(gif) == (90, 90)
|
||||||
assert enrich._image_dimensions(b"not an image at all") is None
|
assert enrich._image_dimensions(b"not an image at all") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_recent_targets_newest_imageless_accepted(tmp_path):
    """enrich_recent_images enriches accepted, imageless articles newest-first.

    Two accepted articles without images are inserted with different
    ``published_at`` values. The stub ``fetch`` records every URL it is asked
    for, so the test can check both the processing order and that each
    article ends up with the fetched image.
    """
    c = _setup(tmp_path)
    # newest first by published_at; summary not required (unlike the backfill)
    c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,published_at) "
              "VALUES (1,1,'u1','T1','h1',NULL,'2026-01-01T00:00:00')")
    c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,published_at) "
              "VALUES (2,1,'u2','T2','h2',NULL,'2026-06-01T00:00:00')")  # newest
    c.execute("INSERT INTO article_scores (article_id,accepted) VALUES (1,1),(2,1)")
    c.commit()
    seen = []
    enrich.enrich_recent_images(c, fetch=lambda url: seen.append(url) or "http://og.jpg", limit=10)
    # both enriched; newest processed first
    assert len(seen) == 2
    assert seen[0].endswith("u2")
    assert seen[1].endswith("u1")
    assert c.execute("SELECT image_url FROM articles WHERE id=2").fetchone()[0] == "http://og.jpg"
    assert c.execute("SELECT image_url FROM articles WHERE id=1").fetchone()[0] == "http://og.jpg"
|
||||||
|
|||||||
Reference in New Issue
Block a user