Stop rejecting BBC's branded_news images (the blurry-hero bug)
og:image extraction rejected any URL containing "branded_news" as a generic share image, but that's BBC's normal CDN path for real article photos. As a result, every BBC hero fell back to the 240px RSS thumbnail (blurry when shown large). Drop that marker; keep the genuine placeholder markers (facebook-default, og-default, etc.). Updated the test to assert that BBC branded_news paths pass through. 99 tests pass.

(One-time: cleared image_checked_at on the 57 previously-checked articles and re-enriched recent briefs so existing thumbnails upgrade to og:images.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+5
-4
@@ -34,11 +34,12 @@ MAX_REDIRECTS = 3
|
||||
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
|
||||
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
|
||||
|
||||
# Substrings that mark a branded/generic share image rather than the article's
|
||||
# own picture (e.g. BBC burns "BBC NEWS" into branded_news; NPR uses a default).
|
||||
# We'd rather show no image (typographic hero) than a competitor logo.
|
||||
# Substrings that mark a generic placeholder/default share image rather than the
|
||||
# article's own picture (e.g. NPR's facebook-default). We'd rather show no image
|
||||
# (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" —
|
||||
# that's BBC's normal CDN path for real article photos, so rejecting it threw away
|
||||
# every BBC hero image and fell back to the tiny RSS thumbnail.
|
||||
_GENERIC_IMAGE_MARKERS = (
|
||||
"branded_news",
|
||||
"facebook-default",
|
||||
"default-wide",
|
||||
"default-fb",
|
||||
|
||||
@@ -71,12 +71,15 @@ def test_enrich_upgrades_existing_feed_image(tmp_path):
|
||||
c.close()
|
||||
|
||||
|
||||
def test_rejects_branded_and_generic_share_images():
|
||||
assert og_image_from_html(b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">') is None
|
||||
def test_rejects_generic_share_images():
|
||||
# genuine placeholder/default share images are skipped
|
||||
assert og_image_from_html(b'<meta name="twitter:image" content="https://media.npr.org/include/images/facebook-default-wide-s.jpg">') is None
|
||||
# a real article image still comes through
|
||||
assert og_image_from_html(b'<meta property="og:image" content="https://x.com/og-default.jpg">') is None
|
||||
# a real article image comes through
|
||||
assert og_image_from_html(b'<meta property="og:image" content="https://x.com/real-photo.jpg">') == "https://x.com/real-photo.jpg"
|
||||
# if the first og is branded but a later one is real, take the real one
|
||||
html = (b'<meta property="og:image" content="https://x.com/branded_news/logo.jpg">'
|
||||
# BBC's branded_news path is a REAL photo path, not a placeholder — keep it
|
||||
assert og_image_from_html(b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">') == "https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg"
|
||||
# if the first og is a generic placeholder but a later one is real, take the real one
|
||||
html = (b'<meta property="og:image" content="https://x.com/og-default.jpg">'
|
||||
b'<meta property="og:image" content="https://x.com/article.jpg">')
|
||||
assert og_image_from_html(html) == "https://x.com/article.jpg"
|
||||
|
||||
Reference in New Issue
Block a user