Stop rejecting BBC's branded_news images (the blurry-hero bug)

og:image extraction treated any URL containing "branded_news" as a generic share
image and rejected it, but that is BBC's normal CDN path for real article photos.
As a result, every BBC hero image fell back to the 240px RSS thumbnail, which is
blurry when shown large. This commit drops that marker and keeps the genuine
placeholder markers (facebook-default, og-default, etc.). The test now asserts that
BBC branded_news paths pass through. 99 tests pass.

(One-time follow-up: cleared image_checked_at on the 57 previously checked articles
and re-enriched recent briefs so that their existing thumbnails are upgraded to
og:images.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-02 07:47:08 +00:00
parent 6d5bcb13e5
commit 2145622b59
2 changed files with 13 additions and 9 deletions
+5 -4
View File
@@ -34,11 +34,12 @@ MAX_REDIRECTS = 3
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
# Substrings that mark a branded/generic share image rather than the article's
# own picture (e.g. BBC burns "BBC NEWS" into branded_news; NPR uses a default).
# We'd rather show no image (typographic hero) than a competitor logo.
# Substrings that mark a generic placeholder/default share image rather than the
# article's own picture (e.g. NPR's facebook-default). We'd rather show no image
# (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" —
# that's BBC's normal CDN path for real article photos, so rejecting it threw away
# every BBC hero image and fell back to the tiny RSS thumbnail.
_GENERIC_IMAGE_MARKERS = (
"branded_news",
"facebook-default",
"default-wide",
"default-fb",
+8 -5
View File
@@ -71,12 +71,15 @@ def test_enrich_upgrades_existing_feed_image(tmp_path):
c.close()
def test_rejects_branded_and_generic_share_images():
assert og_image_from_html(b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">') is None
def test_rejects_generic_share_images():
# genuine placeholder/default share images are skipped
assert og_image_from_html(b'<meta name="twitter:image" content="https://media.npr.org/include/images/facebook-default-wide-s.jpg">') is None
# a real article image still comes through
assert og_image_from_html(b'<meta property="og:image" content="https://x.com/og-default.jpg">') is None
# a real article image comes through
assert og_image_from_html(b'<meta property="og:image" content="https://x.com/real-photo.jpg">') == "https://x.com/real-photo.jpg"
# if the first og is branded but a later one is real, take the real one
html = (b'<meta property="og:image" content="https://x.com/branded_news/logo.jpg">'
# BBC's branded_news path is a REAL photo path, not a placeholder — keep it
assert og_image_from_html(b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">') == "https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg"
# if the first og is a generic placeholder but a later one is real, take the real one
html = (b'<meta property="og:image" content="https://x.com/og-default.jpg">'
b'<meta property="og:image" content="https://x.com/article.jpg">')
assert og_image_from_html(html) == "https://x.com/article.jpg"