diff --git a/goodnews/enrich.py b/goodnews/enrich.py
index 1f2fa4e..e41f4a0 100644
--- a/goodnews/enrich.py
+++ b/goodnews/enrich.py
@@ -58,6 +58,19 @@ def _is_generic_image(url: str) -> bool:
     return any(marker in lowered for marker in _GENERIC_IMAGE_MARKERS)
 
 
+def _prefer_unbranded(url: str) -> str:
+    """Swap BBC's logo-branded image variant for its clean one.
+
+    BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
+    logo baked into the picture (it shows as "…EWS" once the hero crops it). The
+    identical photo is served under "cpsprodpb" with no logo, so prefer that — a
+    clean hero at the same full resolution.
+    """
+    if "ichef.bbci.co.uk" in url and "/branded_news/" in url:
+        return url.replace("/branded_news/", "/cpsprodpb/")
+    return url
+
+
 def _attr(tag: bytes, name: bytes) -> bytes | None:
     m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
     return m.group(1) if m else None
@@ -73,9 +86,9 @@ def og_image_from_html(html: bytes) -> str | None:
         if not content:
             continue
         image = canonicalize_url(content.decode("utf-8", "replace"))
-        # Skip branded/generic share images; keep scanning for a real one.
+        # Skip generic placeholders; keep scanning for a real one.
         if image and not _is_generic_image(image):
-            return image
+            return _prefer_unbranded(image)
     return None
 
 
diff --git a/tests/test_enrich.py b/tests/test_enrich.py
index 7afde6d..1ac7c50 100644
--- a/tests/test_enrich.py
+++ b/tests/test_enrich.py
@@ -77,8 +77,9 @@ def test_rejects_generic_share_images():
     assert og_image_from_html(b'') is None
     # a real article image comes through
     assert og_image_from_html(b'') == "https://x.com/real-photo.jpg"
-    # BBC's branded_news path is a REAL photo path, not a placeholder — keep it
-    assert og_image_from_html(b'') == "https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg"
+    # BBC's branded_news path is a real photo (logo baked in) — keep it, but swap
+    # to the clean cpsprodpb variant so the hero isn't branded
+    assert og_image_from_html(b'') == "https://ichef.bbci.co.uk/news/1024/cpsprodpb/x.jpg"
     # if the first og is a generic placeholder but a later one is real, take the real one
     html = (b''
             b'')