Files
thejayman77 acbc06a9e5 Use BBC's clean image variant (cpsprodpb) instead of the branded one
BBC's og:image comes from the "branded_news" CDN path with a "BBC NEWS" logo
baked into the picture (shows as "…EWS" once the hero crops it). The identical
photo is served under "cpsprodpb" with no logo, so rewrite branded_news →
cpsprodpb. Best of both: full-resolution hero, no burned-in branding. Re-enriched
recent briefs so live images swap over. 99 tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 07:51:51 +00:00

87 lines
4.8 KiB
Python

import pytest
from goodnews.db import connect, init_db
from goodnews.enrich import og_image_from_html, _host_is_public, enrich_brief_images
def test_og_image_parser_handles_attr_order_and_fallback():
    """The parser accepts either attribute order and the twitter:image fallback.

    A document without any image meta tag yields None.
    """
    expectations = (
        # property attribute before content
        (b'<head><meta property="og:image" content="https://x.com/a.jpg"></head>', "https://x.com/a.jpg"),
        # content attribute before property
        (b'<meta content="https://x.com/b.jpg" property="og:image">', "https://x.com/b.jpg"),
        # twitter:image used when there is no og:image
        (b'<meta name="twitter:image" content="https://x.com/c.jpg">', "https://x.com/c.jpg"),
    )
    for markup, expected_url in expectations:
        assert og_image_from_html(markup) == expected_url

    no_image_markup = b"<html><body>nope</body></html>"
    assert og_image_from_html(no_image_markup) is None
def test_og_image_only_reads_head():
    """An og:image meta that appears after the closing </head> tag is ignored."""
    late_meta_markup = b'<head></head><meta property="og:image" content="https://x.com/d.jpg">'
    extracted = og_image_from_html(late_meta_markup)
    assert extracted is None
def test_host_public_guard_blocks_internal_ranges():
    """_host_is_public accepts a public address and rejects internal or empty hosts."""
    assert _host_is_public("8.8.8.8")  # public

    rejected_hosts = (
        "127.0.0.1",    # loopback
        "10.0.0.1",     # private
        "192.168.1.1",  # private
        "169.254.0.1",  # link-local
        "0.0.0.0",      # unspecified
        "",
        None,
    )
    for host in rejected_hosts:
        assert not _host_is_public(host)
@pytest.fixture
def conn():
    """Yield a ":memory:" database seeded with one source, one article and a
    one-item brief dated 2026-05-31; the connection is closed after the test."""
    db = connect(":memory:")
    init_db(db)

    seed_statements = (
        "INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)",
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'https://phys.org/x','t1','h1')",
        "INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')",
        "INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)",
    )
    for statement in seed_statements:
        db.execute(statement)
    db.commit()

    yield db

    db.close()
def test_enrich_sets_image_and_stamps(conn):
    """A successful fetch stores the image URL and records image_checked_at."""
    requested_urls = []

    def fake_fetch(url):
        # Record which URL was asked for, then pretend an image was found.
        requested_urls.append(url)
        return "https://img.example/p.jpg"

    found = enrich_brief_images(conn, "2026-05-31", fetch=fake_fetch)
    assert found == 1
    assert requested_urls == ["https://phys.org/x"]

    row = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
    assert row["image_url"] == "https://img.example/p.jpg"
    assert row["image_checked_at"] is not None
def test_enrich_caches_failure_and_does_not_retry(conn):
    """A fetch that finds nothing is still stamped as checked, so a second
    enrichment run does not fetch the same article again."""
    attempts = []

    def failing_fetch(url):
        # Record the attempt and report that no image was found.
        attempts.append(url)
        return None

    first_run = enrich_brief_images(conn, "2026-05-31", fetch=failing_fetch)
    assert first_run == 0

    row = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
    assert row["image_url"] is None
    assert row["image_checked_at"] is not None  # checked, cached

    second_run = enrich_brief_images(conn, "2026-05-31", fetch=failing_fetch)
    assert second_run == 0
    assert len(attempts) == 1  # not retried once checked
def test_enrich_upgrades_existing_feed_image(tmp_path):
    """A brief item that already has a (small) feed image is upgraded to og:image.

    Seeds its own database because the article row needs a pre-existing
    image_url, then checks that enrichment overwrites it with the fetched URL.

    NOTE(review): the ``tmp_path`` fixture is requested but never used; it is
    kept only so the test's signature stays unchanged.
    """
    # connect/init_db are already imported at module level, so no local
    # aliased re-import is needed.
    c = connect(":memory:")
    try:
        init_db(c)
        c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
        c.execute(
            "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
            "VALUES (1,1,'https://bbc.com/x','t1','h1','https://bbc.com/tiny-thumb.jpg')"
        )
        c.execute("INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')")
        c.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)")
        c.commit()

        found = enrich_brief_images(c, "2026-05-31", fetch=lambda u: "https://bbc.com/big-og.jpg")
        assert found == 1

        row = c.execute("SELECT image_url FROM articles WHERE id=1").fetchone()
        assert row["image_url"] == "https://bbc.com/big-og.jpg"
    finally:
        # Close even when setup or an assertion fails, so the connection
        # is not leaked on the failure path.
        c.close()
def test_rejects_generic_share_images():
    """Generic placeholder share images are skipped, real article images pass
    through, and BBC branded_news URLs come back as the cpsprodpb variant."""
    # genuine placeholder/default share images are skipped
    placeholder_pages = (
        b'<meta name="twitter:image" content="https://media.npr.org/include/images/facebook-default-wide-s.jpg">',
        b'<meta property="og:image" content="https://x.com/og-default.jpg">',
    )
    for markup in placeholder_pages:
        assert og_image_from_html(markup) is None

    # a real article image comes through
    real_page = b'<meta property="og:image" content="https://x.com/real-photo.jpg">'
    assert og_image_from_html(real_page) == "https://x.com/real-photo.jpg"

    # BBC's branded_news path is a real photo (logo baked in) — keep it, but swap
    # to the clean cpsprodpb variant so the hero isn't branded
    branded_page = b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">'
    clean_url = "https://ichef.bbci.co.uk/news/1024/cpsprodpb/x.jpg"
    assert og_image_from_html(branded_page) == clean_url

    # if the first og is a generic placeholder but a later one is real, take the real one
    placeholder_then_real = b"".join((
        b'<meta property="og:image" content="https://x.com/og-default.jpg">',
        b'<meta property="og:image" content="https://x.com/article.jpg">',
    ))
    assert og_image_from_html(placeholder_then_real) == "https://x.com/article.jpg"