# upbeatBytes/goodnews/newsimg.py

"""Local image cache + downscale for news article images.

Article images used to be hotlinked from the source, so a slow/flaky third-party CDN
left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy
to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache),
and the API serves only cache HITS — it never fetches, so the public endpoint has no
SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU
eviction, so it can't grow without limit no matter the ingest rate.

Security posture (the fetch runs only in the trusted cycle, but feed image URLs are
still externally supplied, so we treat them as untrusted):
  * SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation
    (same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets,
    http(s) only, every redirect hop re-checked.
  * Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and
    anything undecodable is REJECTED (never retained as a same-origin file).
  * Decompression-bomb + dimension guards.
  * Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched
    every cycle; transient network errors are not, so they retry.
Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no
two fetches race; writes are atomic (temp + rename) regardless.
"""
from __future__ import annotations

import hashlib
import io
import os
import time
import urllib.error
import urllib.request
from pathlib import Path
from urllib.parse import urljoin, urlsplit

from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public

# Headers sent with every outbound image request made by _safe_fetch.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
_MIN_IMAGE_BYTES = 500                     # fetch_and_cache rejects (and negative-caches) smaller bodies
_MAX_FETCH_BYTES = 20 * 1024 * 1024       # never pull an absurd original into memory
_MAX_PIXELS = 50_000_000                   # decompression-bomb ceiling (≈50 MP)
_MAX_DIM = 12000                           # reject pathological single-axis dimensions
DISPLAY_WIDTH = 800                        # cards / feed never show wider than this
WEBP_QUALITY = 80                          # `quality` handed to the WebP encoder in _encode
DEFAULT_CAP_BYTES = 1024 * 1024 * 1024     # 1 GB hard ceiling (override via GOODNEWS_IMG_CACHE_CAP)
_FAIL_TTL_S = 3 * 24 * 3600                # don't refetch a definitively-bad URL for 3 days
                                           # (prune() also sweeps .fail markers older than this)


def cache_dir() -> Path:
    """Return the image-cache directory, creating it (and any parents) if missing.

    A non-empty GOODNEWS_IMG_CACHE names the directory outright. Otherwise the cache
    is ``img_cache`` beside the database file named by GOODNEWS_DB (default
    ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_IMG_CACHE")
    if explicit:
        target = Path(explicit)
    else:
        db_file = os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3")
        target = Path(db_file).parent / "img_cache"
    target.mkdir(parents=True, exist_ok=True)
    return target


def cap_bytes() -> int:
    """Return the cache size ceiling in bytes.

    Reads GOODNEWS_IMG_CACHE_CAP; when the variable is unset or is not a valid
    integer, DEFAULT_CAP_BYTES is used instead.
    """
    configured = os.environ.get("GOODNEWS_IMG_CACHE_CAP")
    if configured is None:
        return DEFAULT_CAP_BYTES
    try:
        return int(configured)
    except ValueError:
        return DEFAULT_CAP_BYTES


def _key(url: str) -> str:
    """Return the cache file stem for *url*: the hex SHA-1 of its UTF-8 bytes."""
    digest = hashlib.sha1()
    digest.update(url.encode("utf-8"))
    return digest.hexdigest()


def display_url(article_id: int, image_policy: str | None, raw_url: str | None) -> str | None:
    """Pick the image URL the frontend should render, per the SOURCE's image policy.

      'cache'  → the locally re-hosted copy at /api/img/<id>; reserved for sources
                 cleared for re-hosting (open license / explicit permission /
                 public-domain).
      'none'   → no image at all (typographic cover).
      anything else, including None → treated as 'remote', the conservative DEFAULT:
                 the publisher's own URL is hotlinked and never re-hosted.

    Returns None when the article has no image URL or the policy is 'none'."""
    if not raw_url or image_policy == "none":
        return None
    # 'cache' is the only policy that swaps in our own endpoint; the rest hotlink.
    return f"/api/img/{article_id}" if image_policy == "cache" else raw_url


class _FetchError(Exception):
    """A failed image fetch, tagged with whether retrying is worthwhile.

    ``permanent`` True → the caller negative-caches the URL (no retry soon);
    False → transient, so a later cycle tries again.
    """

    def __init__(self, msg: str, permanent: bool):
        Exception.__init__(self, msg)
        self.permanent = permanent


def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
    """SSRF-safe fetch of an untrusted image URL.

    http(s) only; every hop (including each redirect target) is re-validated with
    _host_is_public; at most MAX_REDIRECTS redirects are followed; at most
    _MAX_FETCH_BYTES + 1 bytes of body are read (the extra byte lets the caller detect
    an oversize body).

    Returns ``(body, content_type)`` where content_type is "" when the header is absent.

    Raises _FetchError for EVERY failure, so callers need catch nothing else:
      * permanent=True  — policy refusals, malformed URLs, redirect loops, and 4xx
        responses other than 429;
      * permanent=False — network errors, protocol errors, 429 and 5xx.
    """
    # Local import: only the exception classes are needed. http.client errors are NOT
    # OSError/URLError subclasses and urllib lets them through unwrapped, so they must be
    # caught by name (InvalidURL fires for URLs containing spaces/control characters).
    import http.client

    opener = urllib.request.build_opener(_NoRedirect)
    current = url
    for _ in range(MAX_REDIRECTS + 1):
        try:
            parts = urlsplit(current)
        except ValueError as exc:          # e.g. unbalanced IPv6 brackets in the netloc
            raise _FetchError(f"malformed url: {current}", permanent=True) from exc
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            raise _FetchError(f"non-public or non-http(s): {current}", permanent=True)
        req = urllib.request.Request(current, headers=_UA)
        try:
            resp = opener.open(req, timeout=timeout)
        except urllib.error.HTTPError as exc:
            # _NoRedirect makes urllib RAISE on 3xx (rather than return a response), so
            # redirects arrive here. Re-validate the destination on the next loop. 4xx
            # (except 429) is a permanent miss → negative-cache; 429/5xx → transient.
            code = exc.code
            loc = exc.headers.get("Location")
            exc.close()                    # release the error response's socket on every path
            if code in (301, 302, 303, 307, 308):
                if not loc:
                    raise _FetchError("redirect without location", permanent=True) from exc
                try:
                    current = urljoin(current, loc)
                except ValueError as join_exc:
                    raise _FetchError("malformed redirect location", permanent=True) from join_exc
                continue
            permanent = 400 <= code < 500 and code != 429
            raise _FetchError(f"http {code}", permanent=permanent) from exc
        except http.client.InvalidURL as exc:
            # The URL itself is unusable (whitespace/control chars, bad port): retrying
            # cannot help, so negative-cache it.
            raise _FetchError(f"invalid url: {exc}", permanent=True) from exc
        except (urllib.error.URLError, OSError, ValueError, http.client.HTTPException) as exc:
            raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
        try:
            with resp:
                ctype = resp.headers.get("Content-Type") or ""
                body = resp.read(_MAX_FETCH_BYTES + 1)
        except (OSError, http.client.HTTPException) as exc:
            # Timeout / reset / truncated body while streaming — transient by nature.
            raise _FetchError(f"read failed: {exc}", permanent=False) from exc
        return body, ctype
    raise _FetchError("too many redirects", permanent=True)


def _encode(data: bytes) -> bytes | None:
    """Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode it as WebP.

    Returns the WebP bytes, or None if *data* isn't a decodable raster (e.g. SVG), is a
    decompression bomb, or has pathological dimensions — the caller then REJECTS it
    (never stores arbitrary bytes). Never raises.

    The EXIF orientation is baked into the pixels before encoding, so that a photo
    carrying an Orientation tag is not cached sideways once the tag no longer applies.
    """
    try:
        from PIL import Image, ImageOps
        Image.MAX_IMAGE_PIXELS = _MAX_PIXELS   # backstop; Pillow only WARNS at this, raises ~2x
        im = Image.open(io.BytesIO(data))      # lazy: header (size) read without decoding pixels
        # Enforce the pixel/dimension ceiling BEFORE load() so a decompression bomb is never
        # actually decoded (Pillow's own MAX_IMAGE_PIXELS only warns at the threshold).
        if (im.width * im.height > _MAX_PIXELS or im.width > _MAX_DIM or im.height > _MAX_DIM
                or im.width < 1 or im.height < 1):
            return None
        im.load()                              # decode now (also catches truncated data)
        try:
            # Apply the EXIF Orientation tag (rotate/flip) while the metadata is still
            # attached — i.e. before convert()/resize().
            im = ImageOps.exif_transpose(im)
        except Exception:  # noqa: BLE001 — malformed EXIF must not cost us a decodable image
            pass
        if im.mode not in ("RGB", "RGBA"):
            # Keep transparency where the source can carry it (alpha band or palette).
            im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
        if im.width > DISPLAY_WIDTH:
            h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
            im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
        out = io.BytesIO()
        im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
        return out.getvalue()
    except Exception:  # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated …
        return None


def _fail_path(url: str) -> Path:
    """Return the path of the negative-cache marker (``<key>.fail``) for *url*."""
    folder = cache_dir()
    return folder.joinpath(f"{_key(url)}.fail")


def _mark_failed(url: str) -> None:
    """Create (or refresh the mtime of) the ``.fail`` marker for *url*.

    Best-effort: any OSError while locating or touching the marker is ignored.
    """
    try:
        marker = _fail_path(url)
        marker.touch()
    except OSError:
        return None
    return None


def _failed_recently(url: str) -> bool:
    """True when *url* has a ``.fail`` marker younger than _FAIL_TTL_S seconds.

    A missing or unreadable marker counts as "not failed".
    """
    try:
        marked_at = _fail_path(url).stat().st_mtime
    except OSError:
        return False
    age = time.time() - marked_at
    return age < _FAIL_TTL_S


def path_for(url: str) -> Path | None:
    """Look up the cached WebP for *url*; never fetches.

    On a hit the file's mtime is bumped (it doubles as the LRU marker used by
    eviction) and its path is returned. Returns None for an empty URL or a miss.
    """
    if not url:
        return None
    cached = cache_dir().joinpath(_key(url) + ".webp")
    if not cached.exists():
        return None
    try:
        os.utime(cached, None)   # touch → records "last used" for LRU eviction
    except OSError:
        pass                     # a failed touch must not turn a hit into a miss
    return cached


def fetch_and_cache(url: str | None) -> Path | None:
    """Fetch *url* (SSRF-safe), downscale it to WebP, and store it atomically.

    CYCLE-ONLY — the API endpoint never calls this. Returns the cached file's path, or
    None on any failure. Definitive failures (permanent fetch errors, a body outside the
    size bounds, anything _encode rejects) are negative-cached so they aren't retried
    every cycle; transient fetch errors and local write errors are not.
    """
    if not url:
        return None
    if not url.startswith(("http://", "https://")):
        return None

    try:
        payload, _content_type = _safe_fetch(url)
    except _FetchError as err:
        if err.permanent:
            _mark_failed(url)
        return None

    # Out-of-bounds bodies are never handed to the decoder; they fail like a bad image.
    size_ok = _MIN_IMAGE_BYTES <= len(payload) <= _MAX_FETCH_BYTES
    webp = _encode(payload) if size_ok else None
    if webp is None:                         # too small/large, SVG, undecodable, bomb …
        _mark_failed(url)
        return None

    folder = cache_dir()
    digest = _key(url)
    scratch = folder / f".{digest}.tmp"
    final = folder / f"{digest}.webp"
    try:
        scratch.write_bytes(webp)
        os.replace(scratch, final)           # atomic swap into place
    except OSError:
        try:
            scratch.unlink()                 # don't leave a half-written temp behind
        except OSError:
            pass
        return None
    return final


def purge_source(conn, source_id: int) -> int:
    """Remove every cached artefact (``.webp`` and ``.fail``) tied to one source.

    Called when a source leaves 'cache' policy (revoked permission / re-classified), so
    the re-hosted copies come down immediately rather than lingering inaccessible on
    disk. Looks up the source's distinct, non-empty article image URLs via *conn* and
    deletes the matching cache files; filesystem errors are ignored per file.

    Returns the number of ``.webp`` files deleted (markers are not counted).
    """
    query = (
        "SELECT DISTINCT image_url FROM articles WHERE source_id = ? "
        "AND image_url IS NOT NULL AND image_url != ''"
    )
    rows = conn.execute(query, (source_id,)).fetchall()
    folder = cache_dir()
    webps_removed = 0
    for row in rows:
        stem = _key(row[0])
        for ext in (".webp", ".fail"):
            victim = folder / f"{stem}{ext}"
            try:
                if not victim.exists():
                    continue
                victim.unlink()
            except OSError:
                continue
            if ext == ".webp":
                webps_removed += 1
    return webps_removed


def warm(conn, limit: int = 200) -> int:
    """Pre-fetch display copies so the API only ever has to serve cache hits.

    Candidates are the image URLs of the newest ACCEPTED, CANONICAL (non-duplicate)
    articles whose SOURCE has image_policy='cache', capped at *limit* rows. URLs that
    are already cached or were recently negative-cached are skipped. Because sources
    default to 'remote' (hotlink, never re-hosted), nothing is cached until a source is
    explicitly switched to 'cache'.

    Returns the number of images newly cached by this call.
    """
    sql = (
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
        "JOIN sources src ON src.id = a.source_id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND src.image_policy='cache' "
        "AND a.image_url IS NOT NULL AND a.image_url != '' ORDER BY a.id DESC LIMIT ?"
    )
    candidates = conn.execute(sql, (limit,)).fetchall()
    newly_cached = 0
    for row in candidates:
        image_url = row[0]
        already_handled = path_for(image_url) or _failed_recently(image_url)
        if not already_handled and fetch_and_cache(image_url):
            newly_cached += 1
    return newly_cached


def prune(cap: int | None = None) -> dict:
    """Enforce the cache size ceiling and sweep expired ``.fail`` markers.

    *cap* is the ceiling in bytes; when None it comes from cap_bytes(). Dot-files
    (in-flight temp files) are ignored. ``.fail`` markers at least _FAIL_TTL_S old are
    deleted. If the ``.webp`` files together exceed the cap, they are deleted
    least-recently-used first (oldest mtime) until the total is at or under it.

    Returns ``{"before", "after", "removed", "cap"}`` — byte totals before/after, the
    number of WebPs evicted, and the cap that was applied.
    """
    limit = cap_bytes() if cap is None else cap
    started = time.time()
    entries = []
    used = 0
    for entry in cache_dir().iterdir():
        if entry.name.startswith("."):
            continue
        ext = entry.suffix
        if ext == ".fail":
            try:
                expired = started - entry.stat().st_mtime >= _FAIL_TTL_S
                if expired:
                    entry.unlink()
            except OSError:
                pass
            continue
        if ext != ".webp" or not entry.is_file():
            continue
        try:
            info = entry.stat()
        except OSError:
            continue
        entries.append((info.st_mtime, info.st_size, entry))
        used += info.st_size

    size_before = used
    evicted = 0
    if used > limit:
        # Tuples sort by mtime first, so the least recently used files go first.
        for _mtime, size, victim in sorted(entries):
            if used <= limit:
                break
            try:
                victim.unlink()
            except OSError:
                continue                    # couldn't delete: its bytes still count
            used -= size
            evicted += 1
    return {"before": size_before, "after": used, "removed": evicted, "cap": limit}