acbc06a9e5
BBC's og:image comes from the "branded_news" CDN path with a "BBC NEWS" logo baked into the picture (shows as "…EWS" once the hero crops it). The identical photo is served under "cpsprodpb" with no logo, so rewrite branded_news → cpsprodpb. Best of both: full-resolution hero, no burned-in branding. Re-enriched recent briefs so live images swap over. 99 tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
87 lines
4.8 KiB
Python
87 lines
4.8 KiB
Python
import pytest
|
|
|
|
from goodnews.db import connect, init_db
|
|
from goodnews.enrich import og_image_from_html, _host_is_public, enrich_brief_images
|
|
|
|
|
|
def test_og_image_parser_handles_attr_order_and_fallback():
    """The parser copes with either attribute order and with twitter:image."""
    expectations = (
        # property first, content second, wrapped in <head>
        (b'<head><meta property="og:image" content="https://x.com/a.jpg"></head>', "https://x.com/a.jpg"),
        # content first, property second
        (b'<meta content="https://x.com/b.jpg" property="og:image">', "https://x.com/b.jpg"),
        # only a twitter:image meta is present
        (b'<meta name="twitter:image" content="https://x.com/c.jpg">', "https://x.com/c.jpg"),
    )
    for markup, expected_url in expectations:
        assert og_image_from_html(markup) == expected_url

    # no image meta at all -> nothing found
    assert og_image_from_html(b"<html><body>nope</body></html>") is None
|
|
|
|
|
|
def test_og_image_only_reads_head():
    """A meta tag that appears after </head> must be ignored."""
    markup = b'<head></head><meta property="og:image" content="https://x.com/d.jpg">'
    assert og_image_from_html(markup) is None
|
|
|
|
|
|
def test_host_public_guard_blocks_internal_ranges():
    """A public address passes the guard; internal, empty and missing hosts do not."""
    assert _host_is_public("8.8.8.8")  # public

    rejected_hosts = (
        "127.0.0.1",    # loopback
        "10.0.0.1",     # private
        "192.168.1.1",  # private
        "169.254.0.1",  # link-local
        "0.0.0.0",      # unspecified
        "",             # empty host
        None,           # missing host
    )
    for host in rejected_hosts:
        assert not _host_is_public(host)
|
|
|
|
|
|
@pytest.fixture
def conn():
    """Yield an in-memory database seeded with one source, article, brief and brief item.

    The connection is closed once the requesting test has finished.
    """
    db = connect(":memory:")
    init_db(db)

    seed_statements = (
        "INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)",
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'https://phys.org/x','t1','h1')",
        "INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')",
        "INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)",
    )
    for statement in seed_statements:
        db.execute(statement)
    db.commit()

    yield db

    db.close()
|
|
|
|
|
|
def test_enrich_sets_image_and_stamps(conn):
    """A successful fetch stores the image URL and stamps image_checked_at."""
    requested = []

    def fake_fetch(u):
        # record the URL we were asked for, then pretend an image was found
        requested.append(u)
        return "https://img.example/p.jpg"

    found = enrich_brief_images(conn, "2026-05-31", fetch=fake_fetch)
    assert found == 1
    assert requested == ["https://phys.org/x"]

    row = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
    assert row["image_url"] == "https://img.example/p.jpg"
    assert row["image_checked_at"] is not None
|
|
|
|
|
|
def test_enrich_caches_failure_and_does_not_retry(conn):
    """A failed lookup is still stamped as checked, so a second run does not refetch."""
    requested = []

    def failing_fetch(u):
        # record the attempt and report that no image was found
        requested.append(u)
        return None

    first_run = enrich_brief_images(conn, "2026-05-31", fetch=failing_fetch)
    assert first_run == 0

    row = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
    # checked, cached
    assert row["image_url"] is None
    assert row["image_checked_at"] is not None

    second_run = enrich_brief_images(conn, "2026-05-31", fetch=failing_fetch)
    assert second_run == 0
    assert len(requested) == 1  # not retried once checked
|
|
|
|
|
|
def test_enrich_upgrades_existing_feed_image():
    """A brief item that already has a (small) feed image is upgraded to og:image.

    This test builds its own database instead of using the ``conn`` fixture
    because the seeded article must start out with a non-null ``image_url``.
    """
    c = connect(":memory:")
    try:
        init_db(c)
        c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
        c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
                  "VALUES (1,1,'https://bbc.com/x','t1','h1','https://bbc.com/tiny-thumb.jpg')")
        c.execute("INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')")
        c.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)")
        c.commit()

        found = enrich_brief_images(c, "2026-05-31", fetch=lambda u: "https://bbc.com/big-og.jpg")
        assert found == 1

        row = c.execute("SELECT image_url FROM articles WHERE id=1").fetchone()
        assert row["image_url"] == "https://bbc.com/big-og.jpg"
    finally:
        # Close the connection even when setup or an assertion fails.
        c.close()
|
|
|
|
|
|
def test_rejects_generic_share_images():
    """Placeholder share images are skipped; real article images are returned."""
    # genuine placeholder/default share images are skipped
    placeholders = (
        b'<meta name="twitter:image" content="https://media.npr.org/include/images/facebook-default-wide-s.jpg">',
        b'<meta property="og:image" content="https://x.com/og-default.jpg">',
    )
    for markup in placeholders:
        assert og_image_from_html(markup) is None

    # a real article image comes through
    real_photo = b'<meta property="og:image" content="https://x.com/real-photo.jpg">'
    assert og_image_from_html(real_photo) == "https://x.com/real-photo.jpg"

    # BBC's branded_news path is a real photo (logo baked in) — keep it, but swap
    # to the clean cpsprodpb variant so the hero isn't branded
    bbc_branded = b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">'
    assert og_image_from_html(bbc_branded) == "https://ichef.bbci.co.uk/news/1024/cpsprodpb/x.jpg"

    # if the first og is a generic placeholder but a later one is real, take the real one
    html = (b'<meta property="og:image" content="https://x.com/og-default.jpg">'
            b'<meta property="og:image" content="https://x.com/article.jpg">')
    assert og_image_from_html(html) == "https://x.com/article.jpg"
|