Use BBC's clean image variant (cpsprodpb) instead of the branded one
BBC's og:image comes from the "branded_news" CDN path with a "BBC NEWS" logo baked into the picture (shows as "…EWS" once the hero crops it). The identical photo is served under "cpsprodpb" with no logo, so rewrite branded_news → cpsprodpb. Best of both: full-resolution hero, no burned-in branding. Re-enriched recent briefs so live images swap over. 99 tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+15
-2
@@ -58,6 +58,19 @@ def _is_generic_image(url: str) -> bool:
|
|||||||
return any(marker in lowered for marker in _GENERIC_IMAGE_MARKERS)
|
return any(marker in lowered for marker in _GENERIC_IMAGE_MARKERS)
|
||||||
|
|
||||||
|
|
||||||
|
def _prefer_unbranded(url: str) -> str:
|
||||||
|
"""Swap BBC's logo-branded image variant for its clean one.
|
||||||
|
|
||||||
|
BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
|
||||||
|
logo baked into the picture (it shows as "…EWS" once the hero crops it). The
|
||||||
|
identical photo is served under "cpsprodpb" with no logo, so prefer that — a
|
||||||
|
clean hero at the same full resolution.
|
||||||
|
"""
|
||||||
|
if "ichef.bbci.co.uk" in url and "/branded_news/" in url:
|
||||||
|
return url.replace("/branded_news/", "/cpsprodpb/")
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
def _attr(tag: bytes, name: bytes) -> bytes | None:
|
def _attr(tag: bytes, name: bytes) -> bytes | None:
|
||||||
m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
|
m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
|
||||||
return m.group(1) if m else None
|
return m.group(1) if m else None
|
||||||
@@ -73,9 +86,9 @@ def og_image_from_html(html: bytes) -> str | None:
|
|||||||
if not content:
|
if not content:
|
||||||
continue
|
continue
|
||||||
image = canonicalize_url(content.decode("utf-8", "replace"))
|
image = canonicalize_url(content.decode("utf-8", "replace"))
|
||||||
# Skip branded/generic share images; keep scanning for a real one.
|
# Skip generic placeholders; keep scanning for a real one.
|
||||||
if image and not _is_generic_image(image):
|
if image and not _is_generic_image(image):
|
||||||
return image
|
return _prefer_unbranded(image)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -77,8 +77,9 @@ def test_rejects_generic_share_images():
|
|||||||
assert og_image_from_html(b'<meta property="og:image" content="https://x.com/og-default.jpg">') is None
|
assert og_image_from_html(b'<meta property="og:image" content="https://x.com/og-default.jpg">') is None
|
||||||
# a real article image comes through
|
# a real article image comes through
|
||||||
assert og_image_from_html(b'<meta property="og:image" content="https://x.com/real-photo.jpg">') == "https://x.com/real-photo.jpg"
|
assert og_image_from_html(b'<meta property="og:image" content="https://x.com/real-photo.jpg">') == "https://x.com/real-photo.jpg"
|
||||||
# BBC's branded_news path is a REAL photo path, not a placeholder — keep it
|
# BBC's branded_news path is a real photo (logo baked in) — keep it, but swap
|
||||||
assert og_image_from_html(b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">') == "https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg"
|
# to the clean cpsprodpb variant so the hero isn't branded
|
||||||
|
assert og_image_from_html(b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">') == "https://ichef.bbci.co.uk/news/1024/cpsprodpb/x.jpg"
|
||||||
# if the first og is a generic placeholder but a later one is real, take the real one
|
# if the first og is a generic placeholder but a later one is real, take the real one
|
||||||
html = (b'<meta property="og:image" content="https://x.com/og-default.jpg">'
|
html = (b'<meta property="og:image" content="https://x.com/og-default.jpg">'
|
||||||
b'<meta property="og:image" content="https://x.com/article.jpg">')
|
b'<meta property="og:image" content="https://x.com/article.jpg">')
|
||||||
|
|||||||
Reference in New Issue
Block a user