upbeatBytes/goodnews/enrich.py

"""Bounded hero-image enrichment.

The grid stays purely typographic; the daily brief's items are the one place we
make an exception and fetch a real image — because the hero is the single
intentional visual doorway. We fetch ONLY the article page's <head> metadata
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
the body. This is opt-in, brief-only, once per build.

Security (this is the one place we fetch user-/source-supplied pages):
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
- every hop's host is DNS-resolved and rejected if ANY resolved address is
  private / loopback / link-local / multicast / reserved / unspecified (SSRF).
Failures are cached by the caller (image_checked_at) so an article is never
retried forever.
"""

from __future__ import annotations

import ipaddress
import re
import socket
import sqlite3
import struct
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlsplit

from .text import canonicalize_url

USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
TIMEOUT = 6
MAX_BYTES = 300_000
MAX_REDIRECTS = 3
# Below this, a feed thumbnail upscales to mush in the card banner. Real share
# images (og:image) are ~1200×630; tiny RSS thumbnails (~90px) are what we reject.
MIN_IMG_WIDTH = 450
MIN_IMG_HEIGHT = 250

_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)

# Substrings that mark a generic placeholder/default share image rather than the
# article's own picture (e.g. NPR's facebook-default). We'd rather show no image
# (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" —
# that's BBC's normal CDN path for real article photos, so rejecting it threw away
# every BBC hero image and fell back to the tiny RSS thumbnail.
_GENERIC_IMAGE_MARKERS = (
    "facebook-default",
    "default-wide",
    "default-fb",
    "og-default",
    "default-og",
    "twitter-default",
    "default-image",
    "/placeholder",
    "share-default",
    "social-default",
    # tracking pixels / spacers / data-URIs — never a real share image
    "data:image",
    "/pixel",
    "1x1",
    "spacer",
    "/blank.",
    "transparent.",
)


def _is_generic_image(url: str) -> bool:
    lowered = url.lower()
    return any(marker in lowered for marker in _GENERIC_IMAGE_MARKERS)


def _prefer_unbranded(url: str) -> str:
    """Swap BBC's logo-branded image variant for its clean one.

    BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
    logo baked into the picture (it shows as "…EWS" once the hero crops it). The
    identical photo is served under "cpsprodpb" with no logo, so prefer that — a
    clean hero at the same full resolution.
    """
    if "ichef.bbci.co.uk" in url and "/branded_news/" in url:
        return url.replace("/branded_news/", "/cpsprodpb/")
    return url


def _attr(tag: bytes, name: bytes) -> bytes | None:
    m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
    return m.group(1) if m else None


def og_image_from_html(html: bytes) -> str | None:
    """Extract og:image / twitter:image from a page's <head> bytes."""
    head = html.split(b"</head>", 1)[0] if _HEAD_END_RE.search(html) else html
    for tag in _META_RE.findall(head):
        key = _attr(tag, b"property") or _attr(tag, b"name")
        if key and key.lower() in (b"og:image", b"og:image:url", b"twitter:image"):
            content = _attr(tag, b"content")
            if not content:
                continue
            image = canonicalize_url(content.decode("utf-8", "replace"))
            # Skip generic placeholders; keep scanning for a real one.
            if image and not _is_generic_image(image):
                return _prefer_unbranded(image)
    return None


def _host_is_public(host: str | None) -> bool:
    """True only if the host resolves and ALL its addresses are public."""
    if not host:
        return False
    try:
        infos = socket.getaddrinfo(host, None)
    except (socket.gaierror, UnicodeError, OSError):
        return False
    addrs = {info[4][0] for info in infos}
    if not addrs:
        return False
    for addr in addrs:
        try:
            ip = ipaddress.ip_address(addr.split("%")[0])  # strip scope id
        except ValueError:
            return False
        if (
            ip.is_private or ip.is_loopback or ip.is_link_local
            or ip.is_multicast or ip.is_reserved or ip.is_unspecified
        ):
            return False
    return True


class _NoRedirect(urllib.request.HTTPRedirectHandler):
    # Don't auto-follow — we re-validate each hop's host ourselves.
    def redirect_request(self, *args, **kwargs):
        return None


def fetch_og_image(url: str | None) -> str | None:
    """Fetch a page's head metadata and return its og:image URL, or None.

    Best-effort and safe: returns None on any error, bad scheme, redirect loop,
    or a host that resolves to a non-public address.
    """
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        if not url:
            return None
        parts = urlsplit(url)
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            return None
        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
        try:
            response = opener.open(request, timeout=TIMEOUT)
        except (urllib.error.URLError, OSError, ValueError):
            return None
        status = getattr(response, "status", 200) or 200
        if status in (301, 302, 303, 307, 308):
            location = response.headers.get("Location")
            response.close()
            if not location:
                return None
            url = urljoin(url, location)
            continue
        ctype = response.headers.get("Content-Type", "")
        if "html" not in ctype.lower():
            response.close()
            return None
        try:
            body = response.read(MAX_BYTES)
        finally:
            response.close()
        image = og_image_from_html(body)
        # A stored URL is not proof it renders — confirm it actually loads.
        return image if (image and _image_loads(image)) else None
    return None  # too many redirects


# Word counting reads more of the body than image metadata (which only needs <head>).
_READ_MAX_BYTES = 900_000


def fetch_source_words(url: str | None) -> int | None:
    """Fetch a page and return its full-article word count (furniture stripped), or
    None on any failure or a too-thin extraction (JS/video/paywall pages). Same SSRF
    safety as fetch_og_image; we read the count only, never store the body."""
    from .readtime import source_read_minutes, word_count_from_html
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        if not url:
            return None
        parts = urlsplit(url)
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            return None
        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
        try:
            response = opener.open(request, timeout=TIMEOUT)
        except (urllib.error.URLError, OSError, ValueError):
            return None
        status = getattr(response, "status", 200) or 200
        if status in (301, 302, 303, 307, 308):
            location = response.headers.get("Location")
            response.close()
            if not location:
                return None
            url = urljoin(url, location)
            continue
        if "html" not in response.headers.get("Content-Type", "").lower():
            response.close()
            return None
        try:
            body = response.read(_READ_MAX_BYTES)
        finally:
            response.close()
        words = word_count_from_html(body)
        return words if source_read_minutes(words) is not None else None
    return None  # too many redirects


def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
    """Best-effort (width, height) from an image file's header bytes — PNG, GIF,
    JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
    if len(data) < 10:
        return None
    if len(data) >= 24 and data[:8] == b"\x89PNG\r\n\x1a\n" and data[12:16] == b"IHDR":
        return struct.unpack(">II", data[16:24])
    if data[:6] in (b"GIF87a", b"GIF89a"):
        return struct.unpack("<HH", data[6:10])
    if data[:2] == b"\xff\xd8":  # JPEG: scan for a Start-Of-Frame marker
        i, n = 2, len(data)
        while i < n - 9:
            if data[i] != 0xFF:
                i += 1
                continue
            marker = data[i + 1]
            if marker in (0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7, 0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF):
                h, w = struct.unpack(">HH", data[i + 5:i + 9])
                return (w, h)
            if marker == 0xD8 or marker == 0xD9 or 0xD0 <= marker <= 0xD7:
                i += 2
                continue
            i += 2 + struct.unpack(">H", data[i + 2:i + 4])[0]
        return None
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        fmt = data[12:16]
        try:
            if fmt == b"VP8 ":
                return (struct.unpack("<H", data[26:28])[0] & 0x3FFF,
                        struct.unpack("<H", data[28:30])[0] & 0x3FFF)
            if fmt == b"VP8X":
                return ((data[24] | data[25] << 8 | data[26] << 16) + 1,
                        (data[27] | data[28] << 8 | data[29] << 16) + 1)
        except (struct.error, IndexError):
            return None
    return None


def _image_loads(url: str) -> bool:
    """Confirm an image URL returns a real, big-enough image (HTTP 200 + image/*
    + dimensions ≥ the minimum).

    Two failure modes this guards against: signed/hotlink-protected URLs that
    401/403 on a direct load (e.g. the Guardian's i.guim.co.uk), and tiny feed
    thumbnails (~90px) that upscale to mush in the card banner. We request as the
    browser does — no referrer — with the same per-hop host safety as the page
    fetch. Images we can't measure (SVG/AVIF) pass on content-type alone.
    """
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        if not url:
            return False
        parts = urlsplit(url)
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            return False
        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "image/*,*/*"})
        try:
            response = opener.open(request, timeout=TIMEOUT)
        except (urllib.error.URLError, OSError, ValueError):
            return False
        try:
            status = getattr(response, "status", 200) or 200
            if status in (301, 302, 303, 307, 308):
                location = response.headers.get("Location")
                if not location:
                    return False
                url = urljoin(url, location)
                continue
            ctype = (response.headers.get("Content-Type") or "").lower()
            if status != 200 or not ctype.startswith("image/"):
                return False
            head = response.read(200_000)
        finally:
            response.close()
        dims = _image_dimensions(head)
        if dims and (dims[0] < MIN_IMG_WIDTH or dims[1] < MIN_IMG_HEIGHT):
            return False  # too small — would upscale to mush
        return True
    return False


def prune_broken_images(conn: sqlite3.Connection, check=_image_loads, limit: int = 3000) -> int:
    """Clear stored image URLs that no longer load (signed/expired/hotlink-
    protected), so coverage is honest and those cards fall back to the calm
    placeholder cleanly instead of attempting a doomed fetch. Returns count cleared.
    """
    rows = conn.execute(
        "SELECT id, image_url FROM articles WHERE image_url IS NOT NULL AND image_url != '' "
        "ORDER BY id DESC LIMIT ?",
        (limit,),
    ).fetchall()
    cleared = 0
    for row in rows:
        if not check(row["image_url"]):
            conn.execute(
                "UPDATE articles SET image_url = NULL, image_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
                (row["id"],),
            )
            cleared += 1
    conn.commit()
    return cleared


def enrich_brief_images(
    conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 7, retry_days: int = 2
) -> int:
    """Fetch a hero-quality image for brief items that lack one.

    Any of the brief's items can become the hero (via the client's fallback or a
    replace), so this covers the whole brief (limit defaults to the brief size, 7),
    not just the top few. Items already carrying an image are left alone; items
    still without one are retried after `retry_days` so a transient fetch failure
    or a weaker earlier extractor doesn't mark an article imageless forever.
    Returns how many images were newly found.
    """
    # Fetch even when a feed image exists, because feed thumbnails are often tiny
    # and the hero is shown large — a page's og:image is the better hero visual.
    rows = conn.execute(
        """
        SELECT a.id, a.canonical_url
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        WHERE b.brief_date = ?
          AND (
            a.image_checked_at IS NULL
            OR ((a.image_url IS NULL OR a.image_url = '')
                AND a.image_checked_at < datetime('now', ?))
          )
        ORDER BY bi.rank
        LIMIT ?
        """,
        (brief_date, f"-{retry_days} days", limit),
    ).fetchall()

    found = 0
    for row in rows:
        try:
            image = fetch(row["canonical_url"])
        except Exception:
            image = None
        conn.execute(
            "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
            "WHERE id = ?",
            (image, row["id"]),
        )
        if image:
            found += 1
    conn.commit()
    return found


def enrich_article_image(
    conn: sqlite3.Connection, article_id: int, fetch=fetch_og_image, retry_days: int = 7
) -> bool:
    """Attention-triggered: fetch an og:image for ONE article that lacks one.

    Called when an article earns a summary (i.e. it's actually being read), so we
    only spend a fetch on articles a reader has reached. Leaves an existing image
    alone; retries a still-imageless article only after `retry_days`. Returns True
    if a new image was stored. Best-effort — never raises.
    """
    row = conn.execute(
        """
        SELECT id, canonical_url FROM articles
        WHERE id = ?
          AND (image_url IS NULL OR image_url = '')
          AND (image_checked_at IS NULL OR image_checked_at < datetime('now', ?))
        """,
        (article_id, f"-{retry_days} days"),
    ).fetchone()
    if not row:
        return False  # has an image already, or checked too recently
    try:
        image = fetch(row["canonical_url"])
    except Exception:
        image = None
    conn.execute(
        "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
        "WHERE id = ?",
        (image, article_id),
    )
    conn.commit()
    return bool(image)


def enrich_recent_images(
    conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 40, retry_days: int = 7
) -> int:
    """Keep the Latest feed photo-rich: fetch a quality og:image for the newest
    accepted, non-duplicate articles that lack one. Bounded per run, so it tracks
    fresh content without blanket-fetching the archive. Returns newly-found count.
    """
    rows = conn.execute(
        """
        SELECT a.id FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
          AND (a.image_url IS NULL OR a.image_url = '')
          AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """,
        (f"-{retry_days} days", limit),
    ).fetchall()
    found = 0
    for row in rows:
        if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
            found += 1
    return found


def enrich_summarized_images(
    conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7
) -> int:
    """Slow backfill: give already-summarized, accepted articles an image if they
    lack one. Run in modest batches so we never blast publishers. Returns count
    of newly-found images.
    """
    rows = conn.execute(
        """
        SELECT a.id FROM articles a
        JOIN article_summaries m ON m.article_id = a.id
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
          AND (a.image_url IS NULL OR a.image_url = '')
          AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
        ORDER BY a.id DESC
        LIMIT ?
        """,
        (f"-{retry_days} days", limit),
    ).fetchall()
    found = 0
    for row in rows:
        if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
            found += 1
    return found


def enrich_read_times(
    conn: sqlite3.Connection, fetch=fetch_source_words, limit: int = 40, retry_days: int = 14
) -> int:
    """Give recent accepted articles a full-article word count, so the front door can
    show "Full story · ~N min" next to our one-minute gist. Bounded per run (mirrors
    the image enrichers); fetches each article once, retrying a failed/too-thin
    extraction only after `retry_days`. Returns how many real counts were stored."""
    rows = conn.execute(
        """
        SELECT a.id, a.canonical_url FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
          AND a.source_words IS NULL
          AND (a.read_checked_at IS NULL OR a.read_checked_at < datetime('now', ?))
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """,
        (f"-{retry_days} days", limit),
    ).fetchall()
    found = 0
    for row in rows:
        try:
            words = fetch(row["canonical_url"])
        except Exception:
            words = None
        # Only ever write a REAL count; never overwrite a good value with null/zero.
        # Always stamp the check time so failed/thin pages aren't re-fetched until retry.
        if words:
            conn.execute(
                "UPDATE articles SET source_words = ?, read_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
                (words, row["id"]),
            )
            found += 1
        else:
            conn.execute("UPDATE articles SET read_checked_at = CURRENT_TIMESTAMP WHERE id = ?", (row["id"],))
    conn.commit()
    return found