"""Tests for goodnews.enrich: og:image parsing, host guard, brief image enrichment."""

import pytest

from goodnews.db import connect, init_db
from goodnews.enrich import og_image_from_html, _host_is_public, enrich_brief_images


def test_og_image_parser_handles_attr_order_and_fallback():
    """og_image_from_html yields the image URL, or None when nothing matches."""
    # NOTE(review): the byte literals below are empty in this copy of the file,
    # yet each assertion expects a different URL. The HTML fixture markup looks
    # like it was stripped; restore the original snippets before trusting these.
    assert og_image_from_html(b'') == "https://x.com/a.jpg"
    assert og_image_from_html(b'') == "https://x.com/b.jpg"
    assert og_image_from_html(b'') == "https://x.com/c.jpg"
    assert og_image_from_html(b"nope") is None


def test_og_image_only_reads_head():
    """A meta tag located outside the document head must not be picked up."""
    # NOTE(review): fixture markup appears stripped here as well (empty literal);
    # presumably it held a meta tag placed after the closing head tag — confirm.
    assert og_image_from_html(b'') is None


def test_host_public_guard_blocks_internal_ranges():
    """_host_is_public accepts a public address and rejects everything else listed."""
    assert _host_is_public("8.8.8.8")  # public
    assert not _host_is_public("127.0.0.1")  # loopback
    assert not _host_is_public("10.0.0.1")  # private
    assert not _host_is_public("192.168.1.1")  # private
    assert not _host_is_public("169.254.0.1")  # link-local
    assert not _host_is_public("0.0.0.0")  # unspecified
    assert not _host_is_public("")
    assert not _host_is_public(None)


@pytest.fixture
def conn():
    """Yield an in-memory database seeded with one source, article, brief and brief item.

    The connection is closed once the requesting test has finished.
    """
    c = connect(":memory:")
    init_db(c)
    c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
    c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'https://phys.org/x','t1','h1')")
    c.execute("INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')")
    c.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)")
    c.commit()
    yield c
    c.close()


def test_enrich_sets_image_and_stamps(conn):
    """A successful fetch stores the image URL and stamps image_checked_at."""
    calls = []
    # list.append returns None, so `or` falls through to the URL while the
    # requested address is still recorded in `calls`.
    found = enrich_brief_images(
        conn,
        "2026-05-31",
        fetch=lambda u: calls.append(u) or "https://img.example/p.jpg",
    )
    # Separate asserts so a failure pinpoints which expectation broke.
    assert found == 1
    assert calls == ["https://phys.org/x"]
    r = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
    assert r["image_url"] == "https://img.example/p.jpg"
    assert r["image_checked_at"] is not None


def test_enrich_caches_failure_and_does_not_retry(conn):
    """A failed fetch is still stamped as checked and is not attempted again."""
    calls = []

    def fail(u):
        """Record the requested URL and report that no image was found."""
        calls.append(u)
        return None

    assert enrich_brief_images(conn, "2026-05-31", fetch=fail) == 0
    r = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
    # checked, cached
    assert r["image_url"] is None
    assert r["image_checked_at"] is not None
    assert enrich_brief_images(conn, "2026-05-31", fetch=fail) == 0
    assert len(calls) == 1  # not retried once checked


def test_enrich_upgrades_existing_feed_image():
    """A brief item with a (small) feed image should be upgraded to og:image."""
    c = connect(":memory:")
    # try/finally so the connection is released even when an assertion fails.
    try:
        init_db(c)
        c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
        c.execute(
            "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
            "VALUES (1,1,'https://bbc.com/x','t1','h1','https://bbc.com/tiny-thumb.jpg')"
        )
        c.execute("INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')")
        c.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)")
        c.commit()
        found = enrich_brief_images(c, "2026-05-31", fetch=lambda u: "https://bbc.com/big-og.jpg")
        assert found == 1
        row = c.execute("SELECT image_url FROM articles WHERE id=1").fetchone()
        assert row["image_url"] == "https://bbc.com/big-og.jpg"
    finally:
        c.close()


def test_rejects_generic_share_images():
    """Generic share images are skipped while real article images come through."""
    # NOTE(review): every HTML literal in this test is empty in this copy of the
    # file; the fixture markup looks stripped. Restore the original snippets
    # before trusting these assertions.

    # genuine placeholder/default share images are skipped
    assert og_image_from_html(b'') is None
    assert og_image_from_html(b'') is None
    # a real article image comes through
    assert og_image_from_html(b'') == "https://x.com/real-photo.jpg"
    # BBC's branded_news path is a real photo (logo baked in) — keep it, but swap
    # to the clean cpsprodpb variant so the hero isn't branded
    assert og_image_from_html(b'') == "https://ichef.bbci.co.uk/news/1024/cpsprodpb/x.jpg"
    # if the first og is a generic placeholder but a later one is real, take the real one
    html = (
        b''
        b''
    )
    assert og_image_from_html(html) == "https://x.com/article.jpg"