Files
upbeatBytes/goodnews/enrich.py
T
thejayman77 acbc06a9e5 Use BBC's clean image variant (cpsprodpb) instead of the branded one
BBC's og:image comes from the "branded_news" CDN path with a "BBC NEWS" logo
baked into the picture (shows as "…EWS" once the hero crops it). The identical
photo is served under "cpsprodpb" with no logo, so rewrite branded_news →
cpsprodpb. Best of both: full-resolution hero, no burned-in branding. Re-enriched
recent briefs so live images swap over. 99 tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 07:51:51 +00:00

210 lines
7.5 KiB
Python

"""Bounded hero-image enrichment.
The grid stays purely typographic; the daily brief's items are the one place we
make an exception and fetch a real image — because the hero is the single
intentional visual doorway. We fetch ONLY the article page's <head> metadata
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
the body. This is opt-in, brief-only, once per build.
Security (this is the one place we fetch user-/source-supplied pages):
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
- every hop's host is DNS-resolved and rejected if ANY resolved address is
private / loopback / link-local / multicast / reserved / unspecified (SSRF).
Failures are cached by the caller (image_checked_at) so an article is never
retried forever.
"""
from __future__ import annotations
import ipaddress
import re
import socket
import sqlite3
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlsplit
from .text import canonicalize_url
# User-Agent header sent with every article-page request.
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
# Timeout, in seconds, handed to each opener.open() call.
TIMEOUT = 6
# Upper bound on the number of response-body bytes read per page.
MAX_BYTES = 300_000
# Number of redirect hops fetch_og_image allows before giving up.
MAX_REDIRECTS = 3
# Matches one complete <meta ...> tag in raw page bytes (case-insensitive).
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
# Matches the closing </head> tag (case-insensitive).
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
# URL fragments that give away a site-wide default share card (NPR's
# "facebook-default", for instance) instead of a photo that belongs to the
# article. A typographic hero with no image beats a generic logo card.
# NOTE: keep "branded_news" OUT of this list — it is BBC's ordinary CDN path for
# real article photos; rejecting it discarded every BBC hero image and left only
# the tiny RSS thumbnail.
_GENERIC_IMAGE_MARKERS = (
    "facebook-default",
    "default-wide",
    "default-fb",
    "og-default",
    "default-og",
    "twitter-default",
    "default-image",
    "/placeholder",
    "share-default",
    "social-default",
)


def _is_generic_image(url: str) -> bool:
    """Report whether *url* looks like a generic placeholder share image.

    The test is a case-insensitive substring match of *url* against every
    entry in ``_GENERIC_IMAGE_MARKERS``.
    """
    haystack = url.lower()
    for marker in _GENERIC_IMAGE_MARKERS:
        if marker in haystack:
            return True
    return False
def _prefer_unbranded(url: str) -> str:
    """Return BBC's logo-free rendition of *url* when one applies.

    BBC publishes its og:image under the "branded_news" CDN path, where a
    "BBC NEWS" logo is burned into the picture (once the hero crops it, what is
    left reads "…EWS"). The very same photo lives under "cpsprodpb" without the
    logo, at the same full resolution, so that path is substituted. Any URL that
    is not a BBC branded image is handed back untouched.
    """
    on_bbc_cdn = "ichef.bbci.co.uk" in url
    is_branded = "/branded_news/" in url
    return url.replace("/branded_news/", "/cpsprodpb/") if on_bbc_cdn and is_branded else url
def _attr(tag: bytes, name: bytes) -> bytes | None:
m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
return m.group(1) if m else None
def og_image_from_html(html: bytes) -> str | None:
    """Extract og:image / twitter:image from a page's <head> bytes.

    Scans the <meta> tags that appear before the first closing </head> tag (or
    the whole input when there is none) and returns the first usable image URL:
    one whose property/name is og:image, og:image:url or twitter:image, whose
    content is non-empty, and which is not a generic placeholder. BBC branded
    images are swapped for their unbranded variant. Returns None when no such
    tag is found.
    """
    # Split with the same case-insensitive pattern used for detection; a plain
    # bytes split on b"</head>" missed "</HEAD>" and let the body be scanned.
    head = _HEAD_END_RE.split(html, maxsplit=1)[0]
    for tag in _META_RE.findall(head):
        key = _attr(tag, b"property") or _attr(tag, b"name")
        if not key or key.lower() not in (b"og:image", b"og:image:url", b"twitter:image"):
            continue
        content = _attr(tag, b"content")
        if not content:
            continue
        # Attribute values are entity-encoded in HTML, so query separators
        # arrive as "&amp;". Decode just that one: a full entity decode would
        # also rewrite bare legacy names such as "&reg" inside a sloppy,
        # unescaped "&region=" query string.
        raw_url = content.decode("utf-8", "replace").replace("&amp;", "&")
        image = canonicalize_url(raw_url)
        # Skip generic placeholders; keep scanning for a real one.
        if image and not _is_generic_image(image):
            return _prefer_unbranded(image)
    return None
def _host_is_public(host: str | None) -> bool:
"""True only if the host resolves and ALL its addresses are public."""
if not host:
return False
try:
infos = socket.getaddrinfo(host, None)
except (socket.gaierror, UnicodeError, OSError):
return False
addrs = {info[4][0] for info in infos}
if not addrs:
return False
for addr in addrs:
try:
ip = ipaddress.ip_address(addr.split("%")[0]) # strip scope id
except ValueError:
return False
if (
ip.is_private or ip.is_loopback or ip.is_link_local
or ip.is_multicast or ip.is_reserved or ip.is_unspecified
):
return False
return True
class _NoRedirect(urllib.request.HTTPRedirectHandler):
    """Redirect handler that never builds a follow-up request.

    Every hop's host has to be re-validated by our own code, so urllib must not
    chase a Location header by itself.
    """

    def redirect_request(self, *_args, **_kwargs):
        # None means "this handler will not follow the redirect".
        # NOTE(review): when no handler follows a redirect, urllib's default
        # error handler raises HTTPError for the 3xx instead of returning the
        # response — callers need to account for that.
        return None
def fetch_og_image(url: str | None) -> str | None:
    """Fetch a page's head metadata and return its og:image URL, or None.

    Best-effort and safe: returns None on any error, bad scheme, malformed URL,
    non-HTML response, non-redirect HTTP error status, redirect without a
    Location, too many redirects, or a host that resolves to a non-public
    address. Redirects are followed by hand (at most MAX_REDIRECTS hops) so
    that every hop's scheme and host are validated before it is requested, and
    at most MAX_BYTES of the final body are read.
    """
    import http.client  # function-scope: only the except clauses below need it

    redirect_codes = (301, 302, 303, 307, 308)
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        if not url:
            return None
        try:
            parts = urlsplit(url)
            hostname = parts.hostname
        except ValueError:  # e.g. a malformed bracketed IPv6 host
            return None
        if parts.scheme not in ("http", "https") or not _host_is_public(hostname):
            return None
        request = urllib.request.Request(
            url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"}
        )
        redirected = False
        location = None
        try:
            response = opener.open(request, timeout=TIMEOUT)
        except urllib.error.HTTPError as err:
            # _NoRedirect declines every redirect, so urllib reports the 3xx as
            # an HTTPError. It must be handled before the generic URLError
            # clause (HTTPError is a subclass), otherwise no redirect is ever
            # followed.
            redirected = err.code in redirect_codes
            if redirected and err.headers is not None:
                location = err.headers.get("Location")
            err.close()
            if not redirected:
                return None
        except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError):
            return None
        else:
            # Kept for handlers that hand a 3xx back as an ordinary response.
            status = getattr(response, "status", 200) or 200
            if status in redirect_codes:
                redirected = True
                location = response.headers.get("Location")
                response.close()
        if redirected:
            if not location:
                return None
            try:
                url = urljoin(url, location)
            except ValueError:
                return None
            continue  # the next iteration re-validates scheme and host
        try:
            ctype = response.headers.get("Content-Type", "")
            if "html" not in ctype.lower():
                return None
            body = response.read(MAX_BYTES)
        except (http.client.HTTPException, OSError, ValueError):
            # A timeout or truncated transfer mid-read is just another failure.
            return None
        finally:
            response.close()
        return og_image_from_html(body)
    return None  # too many redirects
def enrich_brief_images(
    conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 7, retry_days: int = 2
) -> int:
    """Look up a hero-quality image for the items of one daily brief.

    Any item of the brief may end up as the hero (client fallback or a replace),
    so the whole brief is covered — `limit` defaults to the brief size, 7 —
    rather than only its top entries.

    An item is selected when it has never been checked (even if it already has
    an image: feed thumbnails are often tiny while the hero is shown large, so
    the page's og:image is the better visual), or when it still has no image and
    its last check is older than `retry_days` days, so a transient failure or a
    weaker earlier extractor does not leave it imageless forever. Items are
    processed in rank order.

    `fetch` is called with each item's canonical URL; anything it raises counts
    as "no image". Every processed item gets its check timestamp refreshed, and
    its stored image is replaced only when `fetch` produced a value. Rows are
    read by column name. Changes are committed once, at the end.

    Returns how many images were newly found.
    """
    select_pending = """
        SELECT a.id, a.canonical_url
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        WHERE b.brief_date = ?
        AND (
        a.image_checked_at IS NULL
        OR ((a.image_url IS NULL OR a.image_url = '')
        AND a.image_checked_at < datetime('now', ?))
        )
        ORDER BY bi.rank
        LIMIT ?
        """
    mark_checked = (
        "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
        "WHERE id = ?"
    )
    retry_modifier = f"-{retry_days} days"
    pending = conn.execute(select_pending, (brief_date, retry_modifier, limit)).fetchall()

    newly_found = 0
    for item in pending:
        hero_url = None
        try:
            hero_url = fetch(item["canonical_url"])
        except Exception:
            # Deliberately best-effort: a failing fetch is recorded as "checked,
            # nothing found" rather than aborting the rest of the brief.
            pass
        conn.execute(mark_checked, (hero_url, item["id"]))
        newly_found += 1 if hero_url else 0
    conn.commit()
    return newly_found