upbeatBytes/goodnews/enrich.py

"""Bounded hero-image enrichment.

The grid stays purely typographic; the daily brief's items are the one place we
make an exception and fetch a real image — because the hero is the single
intentional visual doorway. We fetch ONLY the article page's <head> metadata
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
the body. This is opt-in, brief-only, once per build.

Security (this is the one place we fetch user-/source-supplied pages):
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
- every hop's host is DNS-resolved and rejected if ANY resolved address is
  private / loopback / link-local / multicast / reserved / unspecified (SSRF).
Failures are cached by the caller (image_checked_at) so an article is never
retried forever.
"""

from __future__ import annotations

import ipaddress
import re
import socket
import sqlite3
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlsplit

from .text import canonicalize_url

# Sent as the User-Agent header on every page fetch.
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
# Passed as `timeout=` to each opener.open() call (urllib interprets it as seconds).
TIMEOUT = 6
# Maximum number of body bytes read from a single response.
MAX_BYTES = 300_000
# Redirect hops allowed after the initial request (the fetch loop runs
# MAX_REDIRECTS + 1 times in total).
MAX_REDIRECTS = 3

# Matches any single <meta ...> tag in raw page bytes, case-insensitively.
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
# Matches the closing </head> tag in any letter case.
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)

# Substrings that mark a generic placeholder/default share image rather than the
# article's own picture (e.g. NPR's facebook-default). We'd rather show no image
# (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" —
# that's BBC's normal CDN path for real article photos, so rejecting it threw away
# every BBC hero image and fell back to the tiny RSS thumbnail.
# Markers are compared against the lower-cased URL, so keep every entry lowercase.
_GENERIC_IMAGE_MARKERS = (
    "facebook-default",
    "default-wide",
    "default-fb",
    "og-default",
    "default-og",
    "twitter-default",
    "default-image",
    "/placeholder",
    "share-default",
    "social-default",
)


def _is_generic_image(url: str) -> bool:
    """Return True when `url` contains any known placeholder-image marker.

    The comparison is a case-insensitive substring test against every entry
    in `_GENERIC_IMAGE_MARKERS`.
    """
    haystack = url.lower()
    for marker in _GENERIC_IMAGE_MARKERS:
        if marker in haystack:
            return True
    return False


def _prefer_unbranded(url: str) -> str:
    """Swap BBC's logo-branded image variant for its clean one.

    BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
    logo baked into the picture (it shows as "…EWS" once the hero crops it). The
    identical photo is served under "cpsprodpb" with no logo, so prefer that — a
    clean hero at the same full resolution.
    """
    if "ichef.bbci.co.uk" in url and "/branded_news/" in url:
        return url.replace("/branded_news/", "/cpsprodpb/")
    return url


def _attr(tag: bytes, name: bytes) -> bytes | None:
    m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
    return m.group(1) if m else None


def og_image_from_html(html: bytes) -> str | None:
    """Extract og:image / twitter:image from a page's <head> bytes.

    Only <meta> tags before the first closing </head> (any letter case) are
    inspected; if there is no closing tag the whole input is scanned. Tags
    are tried in document order, and the first one that yields a usable,
    non-placeholder image wins.

    Args:
        html: Raw page bytes (may be truncated).

    Returns:
        The canonicalised image URL, with BBC's branded variant swapped for
        the clean one, or None when no suitable tag is found.
    """
    # Split with the same case-insensitive pattern used everywhere else, so
    # "</HEAD>" trims the document exactly like "</head>" does. When the tag
    # is absent, split() returns the input unchanged as its only element.
    head = _HEAD_END_RE.split(html, maxsplit=1)[0]
    wanted = (b"og:image", b"og:image:url", b"twitter:image")
    for tag in _META_RE.findall(head):
        # Open Graph uses property=, Twitter cards use name=.
        key = _attr(tag, b"property") or _attr(tag, b"name")
        if not key or key.strip().lower() not in wanted:
            continue
        content = _attr(tag, b"content")
        if not content:
            continue
        image = canonicalize_url(content.decode("utf-8", "replace"))
        # Skip generic placeholders; keep scanning for a real one.
        if image and not _is_generic_image(image):
            return _prefer_unbranded(image)
    return None


def _host_is_public(host: str | None) -> bool:
    """True only if the host resolves and ALL its addresses are public."""
    if not host:
        return False
    try:
        infos = socket.getaddrinfo(host, None)
    except (socket.gaierror, UnicodeError, OSError):
        return False
    addrs = {info[4][0] for info in infos}
    if not addrs:
        return False
    for addr in addrs:
        try:
            ip = ipaddress.ip_address(addr.split("%")[0])  # strip scope id
        except ValueError:
            return False
        if (
            ip.is_private or ip.is_loopback or ip.is_link_local
            or ip.is_multicast or ip.is_reserved or ip.is_unspecified
        ):
            return False
    return True


class _NoRedirect(urllib.request.HTTPRedirectHandler):
    # Don't auto-follow — we re-validate each hop's host ourselves.
    def redirect_request(self, *args, **kwargs):
        return None


def fetch_og_image(url: str | None) -> str | None:
    """Fetch a page's head metadata and return its og:image URL, or None.

    Best-effort and safe: returns None on any error, bad scheme, malformed
    URL, non-HTML response, HTTP error status, too many redirects, or a host
    that resolves to a non-public address. Redirects are followed manually
    (at most MAX_REDIRECTS hops) so every hop is validated before it is
    contacted.

    Args:
        url: Absolute http(s) URL of the article page; None/empty yields None.

    Returns:
        The image URL extracted from the first MAX_BYTES of the page, or None.
    """
    # Function-scope import: only needed for the exception type caught below.
    from http.client import HTTPException

    redirect_codes = (301, 302, 303, 307, 308)
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        if not url:
            return None
        try:
            parts = urlsplit(url)
            hostname = parts.hostname
        except ValueError:
            # urlsplit rejects some malformed URLs (e.g. a broken "[" host).
            return None
        # NOTE(review): the host is validated here but resolved again by
        # urllib when it connects; the checked address is not pinned.
        if parts.scheme not in ("http", "https") or not _host_is_public(hostname):
            return None
        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
        try:
            response = opener.open(request, timeout=TIMEOUT)
        except urllib.error.HTTPError as err:
            # With auto-redirects declined, urllib raises HTTPError for 3xx.
            # HTTPError is also a response object, so use it as one for
            # redirects; every other error status remains a failure.
            if err.code not in redirect_codes:
                err.close()
                return None
            response = err
        except (urllib.error.URLError, HTTPException, OSError, ValueError):
            return None
        try:
            status = getattr(response, "status", None) or getattr(response, "code", None) or 200
            if status in redirect_codes:
                location = response.headers.get("Location")
                if not location:
                    return None
                # Location may be relative; resolve it against the current hop.
                url = urljoin(url, location)
                continue
            ctype = response.headers.get("Content-Type", "")
            if "html" not in ctype.lower():
                return None
            # Byte cap: og:image lives in <head>, so the page start suffices.
            body = response.read(MAX_BYTES)
        except (HTTPException, OSError, ValueError):
            # Read timeouts, resets and truncated bodies are plain failures.
            return None
        finally:
            response.close()
        return og_image_from_html(body)
    return None  # too many redirects


def enrich_brief_images(
    conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 7, retry_days: int = 2
) -> int:
    """Fetch a hero-quality image for the brief's items that still need a check.

    Any of the brief's items can become the hero (via the client's fallback or a
    replace), so this covers the whole brief (limit defaults to the brief size, 7),
    not just the top few. An item is selected when it has never been checked
    (even if it already carries a feed image, which a successful fetch then
    replaces), or when it is still imageless and its last check is older than
    `retry_days` — so a transient fetch failure or a weaker earlier extractor
    doesn't mark an article imageless forever. Every selected item gets its
    `image_checked_at` stamped, whether or not an image was found.

    Args:
        conn: Open SQLite connection; rows are read by column name.
        brief_date: Value matched against `daily_briefs.brief_date`.
        fetch: Callable taking the article's canonical URL and returning an
            image URL or None. Any exception it raises counts as "no image".
        limit: Maximum number of items processed in this call.
        retry_days: Age in days after which an imageless item is re-checked.

    Returns:
        How many of the processed items got an image from `fetch`.
    """
    # Fetch even when a feed image exists, because feed thumbnails are often tiny
    # and the hero is shown large — a page's og:image is the better hero visual.
    rows = conn.execute(
        """
        SELECT a.id, a.canonical_url
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        WHERE b.brief_date = ?
          AND (
            a.image_checked_at IS NULL
            OR ((a.image_url IS NULL OR a.image_url = '')
                AND a.image_checked_at < datetime('now', ?))
          )
        ORDER BY bi.rank
        LIMIT ?
        """,
        (brief_date, f"-{retry_days} days", limit),
    ).fetchall()

    found = 0
    for row in rows:
        try:
            # Normalise any falsy result (e.g. "") to None: COALESCE only
            # falls back to the existing image_url on NULL, so an empty
            # string would otherwise wipe a feed-supplied image.
            image = fetch(row["canonical_url"]) or None
        except Exception:
            # Deliberate best-effort boundary: one bad page must not abort
            # the rest of the brief. The failed check is still timestamped
            # below so the item is not retried on every build.
            image = None
        conn.execute(
            "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
            "WHERE id = ?",
            (image, row["id"]),
        )
        if image:
            found += 1
    conn.commit()
    return found