50dc2167cd
Make "no blurry images" sustainable, not a one-off cleanup.

RSS feed thumbnails (~44% were ~90px) were stored at ingest and upscaled to mush, so new articles would reintroduce them. Now image_url is filled ONLY by the quality-gated og:image enrichment:

* insert_article no longer stores the feed image (was canonicalize_url(item...)).
* enrich_recent_images(): the cycle fetches a quality og:image for the newest accepted, imageless articles each run (bounded), keeping Latest photo-rich.
* Brief + on-open enrichment unchanged.

Net: every stored image is a validated, ≥450px og:image; the rest are clean placeholders.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
97 lines
4.6 KiB
Python
97 lines
4.6 KiB
Python
from goodnews.db import connect, init_db
|
|
from goodnews import enrich
|
|
|
|
|
|
def _setup(tmp_path):
    """Create a database file under ``tmp_path`` and seed a single source row.

    The database is opened with ``connect`` and initialised with ``init_db``;
    one row (id=1) is then inserted into ``sources`` so article fixtures can
    reference it. The insert is not committed here.

    Returns:
        The object returned by ``connect``.
    """
    db_file = tmp_path / "t.db"
    conn = connect(str(db_file))
    init_db(conn)
    conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
    return conn
|
|
|
|
|
|
def _add(c, aid, *, image=None, accepted=1, dup=None, summarized=True):
    """Insert one article fixture plus its score row and optional summary row.

    Args:
        c: Connection-like object exposing ``execute`` and ``commit``.
        aid: Article id; also used to derive the URL, title and URL hash.
        image: Value stored in ``image_url`` (``None`` leaves it NULL).
        accepted: Value stored in ``article_scores.accepted``.
        dup: Value stored in ``duplicate_of``.
        summarized: When true, a summary row with text ``'s'`` is added too.

    All inserts are committed before returning.
    """
    # Column values in placeholder order; source_id and published_at are
    # supplied by the SQL itself.
    article_values = (aid, f"http://s/{aid}", f"T{aid}", f"h{aid}", image, dup)
    c.execute(
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of,published_at) "
        "VALUES (?,1,?,?,?,?,?,datetime('now'))",
        article_values,
    )

    score_values = (aid, accepted)
    c.execute("INSERT INTO article_scores (article_id,accepted) VALUES (?,?)", score_values)

    if summarized:
        c.execute("INSERT INTO article_summaries (article_id,summary) VALUES (?, 's')", (aid,))

    c.commit()
|
|
|
|
|
|
def test_enrich_article_image_fills_missing(tmp_path):
    """An imageless article gets the fetched URL stored and its check time stamped."""
    conn = _setup(tmp_path)
    _add(conn, 1, image=None)

    def fake_fetch(url):
        return "http://img/og.jpg"

    filled = enrich.enrich_article_image(conn, 1, fetch=fake_fetch)
    assert filled is True

    stored_url = conn.execute("SELECT image_url FROM articles WHERE id=1").fetchone()[0]
    assert stored_url == "http://img/og.jpg"

    # image_checked_at must be set so the article is not re-fetched endlessly.
    checked_at = conn.execute("SELECT image_checked_at FROM articles WHERE id=1").fetchone()[0]
    assert checked_at is not None
|
|
|
|
|
|
def test_enrich_skips_when_image_present(tmp_path):
    """An article that already has an image is left alone and never fetched."""
    conn = _setup(tmp_path)
    _add(conn, 1, image="http://existing/img.jpg")

    fetched_urls = []

    def recording_fetch(url):
        # Record every call so the test can prove no fetch happened.
        fetched_urls.append(url)
        return "http://new.jpg"

    filled = enrich.enrich_article_image(conn, 1, fetch=recording_fetch)

    assert filled is False
    assert fetched_urls == []  # never fetched

    stored_url = conn.execute("SELECT image_url FROM articles WHERE id=1").fetchone()[0]
    assert stored_url == "http://existing/img.jpg"
|
|
|
|
|
|
def test_enrich_records_checked_even_when_none_found(tmp_path):
    """A fetch that finds nothing still stamps the check time, blocking an immediate retry."""
    conn = _setup(tmp_path)
    _add(conn, 1, image=None)

    def fetch_nothing(url):
        return None

    first_attempt = enrich.enrich_article_image(conn, 1, fetch=fetch_nothing)
    assert first_attempt is False

    # The stamp is what stops a retry until the retry window has passed.
    checked_at = conn.execute("SELECT image_checked_at FROM articles WHERE id=1").fetchone()[0]
    assert checked_at is not None

    def fetch_late(url):
        return "http://late.jpg"

    second_attempt = enrich.enrich_article_image(conn, 1, fetch=fetch_late, retry_days=7)
    assert second_attempt is False  # still within retry window
|
|
|
|
|
|
def test_prune_clears_only_broken_images(tmp_path):
    """Pruning clears image URLs that fail the check and keeps those that pass."""
    conn = _setup(tmp_path)
    _add(conn, 1, image="http://good/a.jpg")
    _add(conn, 2, image="http://broken/b.jpg")
    _add(conn, 3, image=None)  # nothing to check

    def loads_ok(image_url):
        # Pretend only the 'good' host loads.
        return "good" in image_url

    cleared_count = enrich.prune_broken_images(conn, check=loads_ok)
    assert cleared_count == 1

    kept = conn.execute("SELECT image_url FROM articles WHERE id=1").fetchone()[0]
    assert kept == "http://good/a.jpg"

    pruned = conn.execute("SELECT image_url FROM articles WHERE id=2").fetchone()[0]
    assert pruned is None
|
|
|
|
|
|
def test_backfill_only_targets_summarized_accepted_imageless(tmp_path):
    """The backfill reports one enrichment and leaves the unsummarized article imageless."""
    conn = _setup(tmp_path)

    fixtures = (
        (1, {"image": None, "summarized": True, "accepted": 1}),   # eligible
        (2, {"image": None, "summarized": False, "accepted": 1}),  # no summary -> skip
        (3, {"image": None, "summarized": True, "accepted": 0}),   # rejected -> skip
        (4, {"image": "http://has.jpg", "summarized": True}),      # has image -> skip
    )
    for article_id, options in fixtures:
        _add(conn, article_id, **options)

    def fake_fetch(url):
        return "http://og.jpg"

    enriched = enrich.enrich_summarized_images(conn, fetch=fake_fetch, limit=50)
    assert enriched == 1

    unsummarized_image = conn.execute("SELECT image_url FROM articles WHERE id=2").fetchone()[0]
    assert unsummarized_image is None
|
|
|
|
|
|
def test_image_dimensions_parses_headers():
    """``_image_dimensions`` reads width/height from PNG and GIF headers.

    Bytes that match neither header must yield ``None``. The module-level
    ``enrich`` import is used; the function-local re-import was redundant.
    """
    import struct

    # PNG: 8-byte signature, IHDR chunk length (13), chunk type, then
    # width and height packed as big-endian unsigned 32-bit integers.
    png = b"\x89PNG\r\n\x1a\n" + b"\x00\x00\x00\x0d" + b"IHDR" + struct.pack(">II", 1200, 630)
    assert enrich._image_dimensions(png) == (1200, 630)

    # GIF: 6-byte "GIF89a" header, then width and height packed as
    # little-endian unsigned 16-bit integers.
    gif = b"GIF89a" + struct.pack("<HH", 90, 90)
    assert enrich._image_dimensions(gif) == (90, 90)

    assert enrich._image_dimensions(b"not an image at all") is None
|
|
|
|
|
|
def test_enrich_recent_targets_newest_imageless_accepted(tmp_path):
    """The recent-image pass fetches the newest accepted, imageless article first."""
    conn = _setup(tmp_path)

    # Two accepted, imageless articles with no summary rows (a summary is not
    # required here, unlike the backfill). Article 2 has the later published_at.
    conn.execute(
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,published_at) "
        "VALUES (1,1,'u1','T1','h1',NULL,'2026-01-01T00:00:00')"
    )
    conn.execute(
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,published_at) "
        "VALUES (2,1,'u2','T2','h2',NULL,'2026-06-01T00:00:00')"
    )  # newest
    conn.execute("INSERT INTO article_scores (article_id,accepted) VALUES (1,1),(2,1)")
    conn.commit()

    fetch_order = []

    def recording_fetch(url):
        # Keep the call order so the newest-first expectation can be checked.
        fetch_order.append(url)
        return "http://og.jpg"

    enrich.enrich_recent_images(conn, fetch=recording_fetch, limit=10)

    # The newest article is processed first and ends up with the fetched image.
    assert fetch_order[0].endswith("u2")
    newest_image = conn.execute("SELECT image_url FROM articles WHERE id=2").fetchone()[0]
    assert newest_image == "http://og.jpg"
|