# upbeatBytes/goodnews/newsimg.py

"""Local image cache + downscale for news article images.

Article images used to be hotlinked from the source, so a slow/flaky third-party CDN
left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy
to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache),
and the API serves only cache HITS — it never fetches, so the public endpoint has no
SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU
eviction, so it can't grow without limit no matter the ingest rate.

Security posture (the fetch runs only in the trusted cycle, but feed image URLs are
still externally supplied, so we treat them as untrusted):
  * SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation
    (same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets,
    http(s) only, every redirect hop re-checked.
  * Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and
    anything undecodable is REJECTED (never retained as a same-origin file).
  * Decompression-bomb + dimension guards.
  * Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched
    every cycle; transient network errors are not, so they retry.
Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no
two fetches race; writes are atomic (temp + rename) regardless.
"""
from __future__ import annotations

import hashlib
import io
import os
import time
import urllib.error
import urllib.request
from pathlib import Path
from urllib.parse import urljoin, urlsplit

from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public

# Request headers sent with every image fetch.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
# Bodies shorter than this are treated as "not a real image" by fetch_and_cache.
_MIN_IMAGE_BYTES = 500
# Upper bound on how much of an original is read into memory (20 MiB).
_MAX_FETCH_BYTES = 20 * 1024**2
# Pixel budget (50 megapixels); assigned to PIL's Image.MAX_IMAGE_PIXELS in _encode.
_MAX_PIXELS = 50 * 10**6
# Largest width or height accepted on a single axis.
_MAX_DIM = 12_000
# Stored copies are never wider than this; wider originals are downscaled.
DISPLAY_WIDTH = 800
# WebP encoder quality used for stored copies.
WEBP_QUALITY = 80
# Default cache size ceiling: 1 GiB (GOODNEWS_IMG_CACHE_CAP overrides it, see cap_bytes).
DEFAULT_CAP_BYTES = 1024**3
# Lifetime of a .fail marker: 3 days, expressed in seconds.
_FAIL_TTL_S = 3 * 24 * 60 * 60


def cache_dir() -> Path:
    """Return the image cache directory, creating it (with parents) when missing.

    A non-empty ``GOODNEWS_IMG_CACHE`` names the directory outright. Otherwise the cache
    is ``img_cache/`` beside the database file given by ``GOODNEWS_DB`` (which defaults
    to ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_IMG_CACHE")
    if explicit:
        target = Path(explicit)
    else:
        db_file = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_file.parent / "img_cache"
    target.mkdir(parents=True, exist_ok=True)
    return target


def cap_bytes() -> int:
    """Return the cache size ceiling in bytes.

    Reads ``GOODNEWS_IMG_CACHE_CAP``; when it is unset, or set to something ``int()``
    rejects with ``ValueError``, the result is ``DEFAULT_CAP_BYTES``.
    """
    configured = os.environ.get("GOODNEWS_IMG_CACHE_CAP", DEFAULT_CAP_BYTES)
    try:
        ceiling = int(configured)
    except ValueError:
        ceiling = DEFAULT_CAP_BYTES
    return ceiling


def _key(url: str) -> str:
    """Map a URL to its cache filename stem: the hex SHA-1 digest of its UTF-8 bytes."""
    digest = hashlib.sha1()
    digest.update(url.encode("utf-8"))
    return digest.hexdigest()


class _FetchError(Exception):
    """Failure raised while fetching an image.

    The ``permanent`` attribute classifies the failure: True means the caller should
    negative-cache the URL (no retry soon); False means it was transient and the URL
    may be retried.
    """

    def __init__(self, msg: str, permanent: bool):
        Exception.__init__(self, msg)
        self.permanent = permanent


def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
    """SSRF-safe fetch of an untrusted image URL.

    http(s) only. The initial URL and every redirect hop are checked with
    ``_host_is_public`` before being requested, at most ``MAX_REDIRECTS`` redirects are
    followed, and the body read is capped at ``_MAX_FETCH_BYTES + 1`` bytes (the extra
    byte lets the caller distinguish "over the limit" from "exactly at the limit").

    Returns ``(body, content_type)``; ``content_type`` is ``""`` when the header is absent.

    Failures of the fetch itself are reported as ``_FetchError``:
      * permanent: scheme / non-public-host refusals, unparseable or invalid URLs,
        a redirect without Location, too many redirects, and HTTP 4xx client errors
        other than 408 / 425 / 429;
      * transient: connection, timeout, protocol and body-read errors, plus every other
        HTTP error status.
    """
    import http.client  # local import: only needed for the HTTPException hierarchy

    # 4xx statuses that describe a temporary condition rather than a dead URL.
    retryable_client_errors = (408, 425, 429)

    opener = urllib.request.build_opener(_NoRedirect)
    current = url
    for _ in range(MAX_REDIRECTS + 1):
        try:
            parts = urlsplit(current)
        except ValueError as exc:  # e.g. a malformed IPv6 literal in an untrusted URL
            raise _FetchError(f"unparseable url: {current}", permanent=True) from exc
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            raise _FetchError(f"non-public or non-http(s): {current}", permanent=True)
        req = urllib.request.Request(current, headers=_UA)
        try:
            resp = opener.open(req, timeout=timeout)
        except urllib.error.HTTPError as exc:
            # A 404/410/403... will not fix itself by next cycle: negative-cache it.
            # Everything else (3xx surfaced as errors, 5xx, retryable 4xx) stays transient.
            code = exc.code
            permanent = 400 <= code < 500 and code not in retryable_client_errors
            raise _FetchError(f"fetch failed: {exc}", permanent=permanent) from exc
        except http.client.InvalidURL as exc:
            # http.client refuses URLs with whitespace/control characters; this is not an
            # OSError, so it must be caught explicitly. The URL itself is bad.
            raise _FetchError(f"fetch failed: {exc}", permanent=True) from exc
        except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError) as exc:
            raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
        status = getattr(resp, "status", 200) or 200
        if status in (301, 302, 303, 307, 308):
            loc = resp.headers.get("Location")
            resp.close()
            if not loc:
                raise _FetchError("redirect without location", permanent=True)
            try:
                current = urljoin(current, loc)
            except ValueError as exc:
                raise _FetchError(f"unparseable redirect target: {loc}", permanent=True) from exc
            continue  # the next iteration re-validates the new hop before requesting it
        try:
            body = resp.read(_MAX_FETCH_BYTES + 1)
            ctype = resp.headers.get("Content-Type") or ""
        except (http.client.HTTPException, OSError) as exc:
            # Timeout / reset / incomplete read while streaming the body.
            raise _FetchError(f"read failed: {exc}", permanent=False) from exc
        finally:
            resp.close()
        return body, ctype
    raise _FetchError("too many redirects", permanent=True)


def _encode(data: bytes) -> bytes | None:
    """Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode as WebP. None if
    it isn't a decodable raster (e.g. SVG), is a decompression bomb, or has pathological
    dimensions — the caller then REJECTS it (never stores arbitrary bytes)."""
    try:
        from PIL import Image
        Image.MAX_IMAGE_PIXELS = _MAX_PIXELS   # raise DecompressionBombError past this
        im = Image.open(io.BytesIO(data))
        im.load()                              # forces decode → catches truncated/bomb here
        if im.width > _MAX_DIM or im.height > _MAX_DIM or im.width < 1 or im.height < 1:
            return None
        if im.mode not in ("RGB", "RGBA"):
            im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
        if im.width > DISPLAY_WIDTH:
            h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
            im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
        out = io.BytesIO()
        im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
        return out.getvalue()
    except Exception:  # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated …
        return None


def _fail_path(url: str) -> Path:
    """Path of the negative-cache marker for *url*: ``<cache_dir>/<key>.fail``."""
    directory = cache_dir()
    return directory / (_key(url) + ".fail")


def _mark_failed(url: str) -> None:
    """Create (or refresh the mtime of) the .fail marker for *url*.

    Best effort: an OSError while locating or touching the marker is ignored.
    """
    try:
        marker = _fail_path(url)
        marker.touch()
    except OSError:
        return


def _failed_recently(url: str) -> bool:
    """True when *url* has a .fail marker younger than ``_FAIL_TTL_S`` seconds.

    A missing or unreadable marker (any OSError) counts as "not failed".
    """
    now = time.time()
    try:
        marked_at = _fail_path(url).stat().st_mtime
    except OSError:
        return False
    return (now - marked_at) < _FAIL_TTL_S


def path_for(url: str) -> Path | None:
    """The cached WebP for this URL if present (and bump its mtime, the LRU marker).
    A pure cache lookup — never fetches."""
    if not url:
        return None
    p = cache_dir() / f"{_key(url)}.webp"
    if p.exists():
        try:
            os.utime(p, None)   # touch → last-used time for LRU eviction
        except OSError:
            pass
        return p
    return None


def fetch_and_cache(url: str | None) -> Path | None:
    """Fetch (SSRF-safe), downscale to WebP, and cache atomically. CYCLE-ONLY — the API
    endpoint never calls this. None on any failure; definitive failures are negative-cached
    so they aren't retried every cycle."""
    if not url or not url.startswith(("http://", "https://")):
        return None
    try:
        data, _ctype = _safe_fetch(url)
    except _FetchError as exc:
        if exc.permanent:
            _mark_failed(url)
        return None
    if not (_MIN_IMAGE_BYTES <= len(data) <= _MAX_FETCH_BYTES):
        _mark_failed(url)
        return None
    blob = _encode(data)
    if blob is None:                         # SVG / undecodable / bomb / bad dimensions
        _mark_failed(url)
        return None
    cdir = cache_dir()
    key = _key(url)
    tmp = cdir / f".{key}.tmp"
    dest = cdir / f"{key}.webp"
    try:
        tmp.write_bytes(blob)
        os.replace(tmp, dest)                # atomic
    except OSError:
        try:
            tmp.unlink()
        except OSError:
            pass
        return None
    return dest


def warm(conn, limit: int = 200) -> int:
    """Pre-fetch display copies for the newest accepted, non-duplicate articles with an image.

    Queries up to *limit* distinct image URLs (highest article id first), skips those
    already cached or recently marked as failed, and fetches the rest. Returns the number
    of images newly cached by this call.
    """
    query = (
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
        "AND a.image_url != '' ORDER BY a.id DESC LIMIT ?"
    )
    newly_cached = 0
    for row in conn.execute(query, (limit,)).fetchall():
        image_url = row[0]
        already_handled = path_for(image_url) or _failed_recently(image_url)
        if not already_handled and fetch_and_cache(image_url):
            newly_cached += 1
    return newly_cached


def prune(cap: int | None = None) -> dict:
    """Enforce the cache size ceiling and sweep expired .fail markers.

    *cap* is the ceiling in bytes; None means "use cap_bytes()". When the .webp files in
    the cache directory add up to more than the ceiling, the ones with the oldest mtime
    (least recently used) are deleted first until the total fits. Dot-prefixed entries
    are ignored. Returns ``{"before", "after", "removed", "cap"}`` where before/after are
    total .webp bytes and removed is the number of .webp files deleted.
    """
    limit = cap_bytes() if cap is None else cap
    now = time.time()

    webps = []
    used = 0
    for entry in cache_dir().iterdir():
        if entry.name.startswith("."):
            continue
        suffix = entry.suffix
        if suffix == ".fail":
            # Negative-cache marker: drop it once its TTL has elapsed.
            try:
                if now - entry.stat().st_mtime >= _FAIL_TTL_S:
                    entry.unlink()
            except OSError:
                pass
            continue
        if suffix != ".webp" or not entry.is_file():
            continue
        try:
            info = entry.stat()
        except OSError:
            continue
        webps.append((info.st_mtime, info.st_size, entry))
        used += info.st_size

    before = used
    removed = 0
    if used > limit:
        # Ascending mtime order puts the least recently used files first.
        for _mtime, size, victim in sorted(webps):
            if used <= limit:
                break
            try:
                victim.unlink()
            except OSError:
                continue                     # could not delete: keep counting its bytes
            used -= size
            removed += 1
    return {"before": before, "after": used, "removed": removed, "cap": limit}