Files
upbeatBytes/tests/test_enrich_images.py
thejayman77 50dc2167cd Durable image quality: stop trusting feed thumbnails; cycle enriches Latest
Make "no blurry images" sustainable, not a one-off cleanup. RSS feed thumbnails
(~44% were ~90px) were stored at ingest and upscaled to mush, so new articles
would reintroduce them. Now image_url is filled ONLY by the quality-gated
og:image enrichment:

* insert_article no longer stores the feed image (was canonicalize_url(item...)).
* enrich_recent_images(): the cycle fetches a quality og:image for the newest
  accepted, imageless articles each run (bounded), keeping Latest photo-rich.
* Brief + on-open enrichment unchanged.

Net: every stored image is a validated, ≥450px og:image; the rest are clean
placeholders.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 15:55:57 -04:00

97 lines
4.6 KiB
Python

from goodnews.db import connect, init_db
from goodnews import enrich
def _setup(tmp_path):
    """Create a fresh test database under ``tmp_path`` and seed one source.

    Opens ``t.db`` inside the pytest temp directory, initialises the schema
    and inserts the source row with id 1. Returns the open connection; the
    source insert is left uncommitted here.
    """
    db_file = tmp_path / "t.db"
    conn = connect(str(db_file))
    init_db(conn)
    conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
    return conn
def _add(c, aid, *, image=None, accepted=1, dup=None, summarized=True):
    """Insert article ``aid`` (for source 1) with its score row, then commit.

    Args:
        c: open database connection.
        aid: article id; also used to derive the URL, title and url_hash.
        image: value stored in ``image_url`` (``None`` means imageless).
        accepted: value stored in ``article_scores.accepted``.
        dup: value stored in ``duplicate_of``.
        summarized: when true, also insert an ``article_summaries`` row.
    """
    article_params = (aid, f"http://s/{aid}", f"T{aid}", f"h{aid}", image, dup)
    c.execute(
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of,published_at) "
        "VALUES (?,1,?,?,?,?,?,datetime('now'))",
        article_params,
    )

    score_params = (aid, accepted)
    c.execute("INSERT INTO article_scores (article_id,accepted) VALUES (?,?)", score_params)

    if summarized:
        c.execute("INSERT INTO article_summaries (article_id,summary) VALUES (?, 's')", (aid,))

    c.commit()
def test_enrich_article_image_fills_missing(tmp_path):
    """An imageless article receives the fetched image and a check timestamp."""
    conn = _setup(tmp_path)
    _add(conn, 1, image=None)

    def fake_fetch(url):
        return "http://img/og.jpg"

    got = enrich.enrich_article_image(conn, 1, fetch=fake_fetch)
    assert got is True

    stored = conn.execute("SELECT image_url FROM articles WHERE id=1").fetchone()
    assert stored[0] == "http://img/og.jpg"

    # The check timestamp must be recorded so the article is not re-fetched endlessly.
    checked = conn.execute("SELECT image_checked_at FROM articles WHERE id=1").fetchone()
    assert checked[0] is not None
def test_enrich_skips_when_image_present(tmp_path):
    """An article that already has an image is left alone and never fetched."""
    conn = _setup(tmp_path)
    _add(conn, 1, image="http://existing/img.jpg")

    calls = []

    def recording_fetch(url):
        # Record every fetch attempt so the test can prove none happened.
        calls.append(url)
        return "http://new.jpg"

    got = enrich.enrich_article_image(conn, 1, fetch=recording_fetch)

    assert got is False
    assert calls == []  # never fetched

    stored = conn.execute("SELECT image_url FROM articles WHERE id=1").fetchone()
    assert stored[0] == "http://existing/img.jpg"
def test_enrich_records_checked_even_when_none_found(tmp_path):
    """A fruitless fetch still stamps the check time, blocking an immediate retry."""
    conn = _setup(tmp_path)
    _add(conn, 1, image=None)

    def fetch_nothing(url):
        return None

    def fetch_late(url):
        return "http://late.jpg"

    first = enrich.enrich_article_image(conn, 1, fetch=fetch_nothing)
    assert first is False

    # The timestamp is what prevents a retry before the retry window has passed.
    checked = conn.execute("SELECT image_checked_at FROM articles WHERE id=1").fetchone()
    assert checked[0] is not None

    again = enrich.enrich_article_image(conn, 1, fetch=fetch_late, retry_days=7)
    assert again is False  # still within retry window
def test_prune_clears_only_broken_images(tmp_path):
    """Pruning nulls out images that fail the check and keeps the ones that pass."""
    conn = _setup(tmp_path)
    _add(conn, 1, image="http://good/a.jpg")
    _add(conn, 2, image="http://broken/b.jpg")
    _add(conn, 3, image=None)  # nothing to check

    def only_good_host_loads(u):
        # Pretend only the 'good' host loads.
        return "good" in u

    cleared = enrich.prune_broken_images(conn, check=only_good_host_loads)
    assert cleared == 1

    kept = conn.execute("SELECT image_url FROM articles WHERE id=1").fetchone()
    assert kept[0] == "http://good/a.jpg"

    pruned = conn.execute("SELECT image_url FROM articles WHERE id=2").fetchone()
    assert pruned[0] is None
def test_backfill_only_targets_summarized_accepted_imageless(tmp_path):
    """Backfill enriches only accepted, summarized articles that lack an image.

    Four articles are seeded, exactly one of which is eligible. Besides the
    returned count, the final ``image_url`` of every row is checked so that
    enriching the wrong article (or overwriting an existing image) cannot
    slip through with a correct-looking count.
    """
    c = _setup(tmp_path)
    _add(c, 1, image=None, summarized=True, accepted=1)   # eligible
    _add(c, 2, image=None, summarized=False, accepted=1)  # no summary → skip
    _add(c, 3, image=None, summarized=True, accepted=0)   # rejected → skip
    _add(c, 4, image="http://has.jpg", summarized=True)   # has image → skip

    n = enrich.enrich_summarized_images(c, fetch=lambda url: "http://og.jpg", limit=50)
    assert n == 1

    def image_of(aid):
        # Current image_url of one article row.
        return c.execute("SELECT image_url FROM articles WHERE id=?", (aid,)).fetchone()[0]

    assert image_of(1) == "http://og.jpg"   # the one eligible article was enriched
    assert image_of(2) is None              # unsummarized stays imageless
    assert image_of(3) is None              # rejected stays imageless
    assert image_of(4) == "http://has.jpg"  # existing image is not overwritten
def test_image_dimensions_parses_headers():
    """``_image_dimensions`` reads (width, height) from PNG and GIF header bytes.

    Uses the module-level ``enrich`` import; bytes that match no known
    signature must yield ``None``.
    """
    import struct

    # PNG: 8-byte signature, 4-byte chunk length (13), b"IHDR", then
    # big-endian 32-bit width and height.
    png = b"\x89PNG\r\n\x1a\n" + b"\x00\x00\x00\x0d" + b"IHDR" + struct.pack(">II", 1200, 630)
    assert enrich._image_dimensions(png) == (1200, 630)

    # GIF: 6-byte signature followed by little-endian 16-bit width and height.
    gif = b"GIF89a" + struct.pack("<HH", 90, 90)
    assert enrich._image_dimensions(gif) == (90, 90)

    assert enrich._image_dimensions(b"not an image at all") is None
def test_enrich_recent_targets_newest_imageless_accepted(tmp_path):
    """Recent-image enrichment fetches the newest accepted imageless article first.

    Two accepted, imageless articles are inserted without summaries (a summary
    is not required here, unlike the backfill); article 2 has the later
    ``published_at``.
    """
    conn = _setup(tmp_path)

    insert_prefix = (
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,published_at) "
    )
    older_values = "VALUES (1,1,'u1','T1','h1',NULL,'2026-01-01T00:00:00')"
    newest_values = "VALUES (2,1,'u2','T2','h2',NULL,'2026-06-01T00:00:00')"
    conn.execute(insert_prefix + older_values)
    conn.execute(insert_prefix + newest_values)  # newest
    conn.execute("INSERT INTO article_scores (article_id,accepted) VALUES (1,1),(2,1)")
    conn.commit()

    seen = []

    def recording_fetch(url):
        # Track the order in which article URLs are fetched.
        seen.append(url)
        return "http://og.jpg"

    enrich.enrich_recent_images(conn, fetch=recording_fetch, limit=10)

    # Newest article is processed first and ends up with the fetched image.
    assert seen[0].endswith("u2")
    newest_image = conn.execute("SELECT image_url FROM articles WHERE id=2").fetchone()
    assert newest_image[0] == "http://og.jpg"