acbc06a9e5
BBC's og:image comes from the "branded_news" CDN path with a "BBC NEWS" logo baked into the picture (shows as "…EWS" once the hero crops it). The identical photo is served under "cpsprodpb" with no logo, so rewrite branded_news → cpsprodpb. Best of both: full-resolution hero, no burned-in branding. Re-enriched recent briefs so live images swap over. 99 tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
210 lines
7.5 KiB
Python
210 lines
7.5 KiB
Python
"""Bounded hero-image enrichment.
|
|
|
|
The grid stays purely typographic; the daily brief's items are the one place we
|
|
make an exception and fetch a real image — because the hero is the single
|
|
intentional visual doorway. We fetch ONLY the article page's <head> metadata
|
|
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
|
|
the body. This is opt-in, brief-only, once per build.
|
|
|
|
Security (this is the one place we fetch user-/source-supplied pages):
|
|
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
|
|
- every hop's host is DNS-resolved and rejected if ANY resolved address is
|
|
private / loopback / link-local / multicast / reserved / unspecified (SSRF).
|
|
Failures are cached by the caller (image_checked_at) so an article is never
|
|
retried forever.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import ipaddress
|
|
import re
|
|
import socket
|
|
import sqlite3
|
|
import urllib.error
|
|
import urllib.request
|
|
from urllib.parse import urljoin, urlsplit
|
|
|
|
from .text import canonicalize_url
|
|
|
|
# Sent as the User-Agent header on every page request made by this module.
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
# Per-request timeout in seconds, handed to the opener.
TIMEOUT = 6
# Largest number of response-body bytes read from a single page.
MAX_BYTES = 300_000
# How many redirect hops are followed before the fetch gives up.
MAX_REDIRECTS = 3

# One whole <meta ...> tag, matched case-insensitively on raw bytes.
_META_RE = re.compile(rb"<meta\b[^>]*>", flags=re.IGNORECASE)
# The closing </head> tag, matched case-insensitively on raw bytes.
_HEAD_END_RE = re.compile(rb"</head>", flags=re.IGNORECASE)
|
|
|
|
# Substrings that mark a generic placeholder/default share image rather than the
|
|
# article's own picture (e.g. NPR's facebook-default). We'd rather show no image
|
|
# (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" —
|
|
# that's BBC's normal CDN path for real article photos, so rejecting it threw away
|
|
# every BBC hero image and fell back to the tiny RSS thumbnail.
|
|
# Compared as lowercase substrings of the candidate image URL.
_GENERIC_IMAGE_MARKERS = (
    "facebook-default",
    "default-wide",
    "default-fb",
    "og-default",
    "default-og",
    "twitter-default",
    "default-image",
    "/placeholder",
    "share-default",
    "social-default",
)
|
|
|
|
|
|
def _is_generic_image(url: str) -> bool:
    """Report whether *url* looks like a site-wide placeholder share image.

    The URL is lowercased and tested for each entry of
    ``_GENERIC_IMAGE_MARKERS`` as a plain substring.
    """
    haystack = url.lower()
    for marker in _GENERIC_IMAGE_MARKERS:
        if marker in haystack:
            return True
    return False
|
|
|
|
|
|
def _prefer_unbranded(url: str) -> str:
    """Swap BBC's logo-branded image variant for its clean one.

    BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
    logo baked into the picture (it shows as "…EWS" once the hero crops it). The
    identical photo is served under "cpsprodpb" with no logo, so prefer that — a
    clean hero at the same full resolution.

    The rewrite applies only when the URL's *host* is ``ichef.bbci.co.uk`` (or a
    subdomain of it), compared case-insensitively. A URL on another host that
    merely mentions the BBC host in its path or query is returned unchanged, as
    is any URL that cannot be parsed.
    """
    if "/branded_news/" not in url:
        return url
    try:
        host = (urlsplit(url).hostname or "").lower()
    except ValueError:
        # Malformed authority (e.g. a broken IPv6 literal): leave it untouched.
        return url
    if host == "ichef.bbci.co.uk" or host.endswith(".ichef.bbci.co.uk"):
        return url.replace("/branded_news/", "/cpsprodpb/")
    return url
|
|
|
|
|
|
def _attr(tag: bytes, name: bytes) -> bytes | None:
    """Return the quoted value of attribute *name* inside a raw tag, or None.

    Matching is case-insensitive. The attribute name must start at an attribute
    boundary, so asking for ``name`` does not match ``data-name``. The value
    runs to the matching closing quote, so a double-quoted value may contain
    an apostrophe and a single-quoted value may contain a double quote.
    Unquoted attribute values are not recognised. An empty quoted value yields
    ``b""``.
    """
    pattern = (
        rb"(?<![\w:.-])"  # not in the middle of a longer attribute name
        + re.escape(name)
        + rb"""\s*=\s*(?:"([^"]*)"|'([^']*)')"""
    )
    found = re.search(pattern, tag, re.IGNORECASE)
    if found is None:
        return None
    double_quoted, single_quoted = found.groups()
    return double_quoted if double_quoted is not None else single_quoted
|
|
|
|
|
|
def og_image_from_html(html: bytes) -> str | None:
    """Extract og:image / twitter:image from a page's <head> bytes.

    Only the part of *html* before the first ``</head>`` (any letter case) is
    scanned; if there is no closing head tag the whole input is scanned. Meta
    tags are visited in document order and the first one whose ``property`` or
    ``name`` is ``og:image``, ``og:image:url`` or ``twitter:image`` and whose
    content survives the checks below wins.

    Returns the image URL as a string, or None when no usable tag is found.
    """
    # Split with the same case-insensitive pattern used to find the tag, so
    # "</HEAD>" cuts the document exactly like "</head>" does. With no match,
    # split() returns the input unchanged as its only element.
    head = _HEAD_END_RE.split(html, maxsplit=1)[0]
    wanted_keys = (b"og:image", b"og:image:url", b"twitter:image")
    for tag in _META_RE.findall(head):
        key = _attr(tag, b"property") or _attr(tag, b"name")
        if not key or key.lower() not in wanted_keys:
            continue
        content = _attr(tag, b"content")
        if not content:
            continue
        # Undecodable bytes become U+FFFD rather than raising.
        image = canonicalize_url(content.decode("utf-8", "replace"))
        # Skip generic placeholders; keep scanning for a real one.
        if image and not _is_generic_image(image):
            return _prefer_unbranded(image)
    return None
|
|
|
|
|
|
def _host_is_public(host: str | None) -> bool:
    """True only if the host resolves and ALL its addresses are public.

    "Public" means globally routable per ``ipaddress``: an address is rejected
    when it is not ``is_global`` or is private, loopback, link-local,
    multicast, reserved or unspecified. The ``is_global`` test also rejects
    ranges that are not flagged private but are still not internet-routable,
    such as the shared address space 100.64.0.0/10.

    Returns False for an empty/None host, a resolution failure, an empty
    result, or an address string that cannot be parsed.
    """
    if not host:
        return False
    try:
        infos = socket.getaddrinfo(host, None)
    except (OSError, ValueError):
        # OSError covers socket.gaierror; ValueError covers UnicodeError from
        # hostname encoding.
        return False
    addresses = {info[4][0] for info in infos}
    if not addresses:
        return False
    for address in addresses:
        try:
            ip = ipaddress.ip_address(address.split("%")[0])  # strip scope id
        except ValueError:
            return False
        if (
            not ip.is_global
            or ip.is_private or ip.is_loopback or ip.is_link_local
            or ip.is_multicast or ip.is_reserved or ip.is_unspecified
        ):
            return False
    return True
|
|
|
|
|
|
class _NoRedirect(urllib.request.HTTPRedirectHandler):
    """Redirect handler that never follows a redirect.

    Automatic following is disabled so that each hop's host can be
    re-validated by this module before any request is sent to it.
    """

    def redirect_request(self, *_args, **_kwargs):
        # None means "no follow-up request" for every redirect, whatever the
        # arguments urllib passes in.
        return None
|
|
|
|
|
|
def fetch_og_image(url: str | None) -> str | None:
    """Fetch a page's head metadata and return its og:image URL, or None.

    Best-effort and safe: returns None on any error, bad scheme, redirect loop,
    or a host that resolves to a non-public address.

    Redirects are followed by hand, at most ``MAX_REDIRECTS`` times, and every
    hop goes through the same scheme and ``_host_is_public`` checks as the
    first URL. Only responses with a 2xx status and an HTML content type are
    read, and at most ``MAX_BYTES`` of the body.

    NOTE(review): the host is resolved once for the check and again by urllib
    when it connects; a DNS answer that changes between the two lookups is not
    covered by this check.
    """
    import http.client  # function-scope: not part of the module's import block

    redirect_statuses = (301, 302, 303, 307, 308)
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        if not url:
            return None
        try:
            parts = urlsplit(url)
        except ValueError:
            return None
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            return None
        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
        try:
            response = opener.open(request, timeout=TIMEOUT)
        except urllib.error.HTTPError as err:
            # _NoRedirect declines every redirect, so urllib reports a 3xx by
            # raising HTTPError. It must be caught before URLError (its base
            # class) or redirects would be dropped instead of followed. The
            # exception object doubles as the response (code, headers, close).
            response = err
        except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError):
            return None
        try:
            status = getattr(response, "code", None) or getattr(response, "status", None) or 200
            if status in redirect_statuses:
                location = response.headers.get("Location")
                if not location:
                    return None
                # Location may be relative; resolve it against the current hop.
                url = urljoin(url, location)
                continue
            if not 200 <= status < 300:
                return None
            ctype = response.headers.get("Content-Type", "")
            if "html" not in ctype.lower():
                return None
            body = response.read(MAX_BYTES)
        except (http.client.HTTPException, OSError, ValueError):
            # Timeout or truncated body while reading: same answer as any
            # other fetch failure.
            return None
        finally:
            response.close()
        return og_image_from_html(body)
    return None  # too many redirects
|
|
|
|
|
|
def enrich_brief_images(
    conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 7, retry_days: int = 2
) -> int:
    """Look up a hero-quality image for the items of one daily brief.

    Selects up to *limit* articles of the brief dated *brief_date*, in rank
    order, that either have never been checked (``image_checked_at`` is NULL,
    whether or not a feed image is already stored) or were checked more than
    *retry_days* days ago and still have no image. Articles that were checked
    and have an image are skipped.

    For each selected article ``fetch(canonical_url)`` is called; any exception
    it raises counts as "no image". The article's ``image_checked_at`` is
    always stamped, and ``image_url`` is overwritten only when an image came
    back. All updates are committed together at the end.

    Rows are read by column name, so *conn* needs a row factory that supports
    that (for example ``sqlite3.Row``).

    Returns the number of articles for which ``fetch`` produced an image.
    """
    # Never-checked items are fetched even when a feed image exists: feed
    # thumbnails are often tiny and the hero is shown large, so a page's
    # og:image is the better hero visual.
    select_pending = """
        SELECT a.id, a.canonical_url
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        WHERE b.brief_date = ?
        AND (
        a.image_checked_at IS NULL
        OR ((a.image_url IS NULL OR a.image_url = '')
        AND a.image_checked_at < datetime('now', ?))
        )
        ORDER BY bi.rank
        LIMIT ?
        """
    store_result = (
        "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
        "WHERE id = ?"
    )
    retry_cutoff = f"-{retry_days} days"
    pending = conn.execute(select_pending, (brief_date, retry_cutoff, limit)).fetchall()

    newly_found = 0
    for article in pending:
        try:
            image = fetch(article["canonical_url"])
        except Exception:
            # Best-effort boundary: a failing fetch must not abort the build.
            image = None
        # COALESCE keeps the stored image_url when nothing new was found.
        conn.execute(store_result, (image, article["id"]))
        if image:
            newly_found += 1
    conn.commit()
    return newly_found
|