upbeatBytes/goodnews/newsimg.py

"""Local image cache + downscale for news article images.

The hub, feed, and article pages used to hotlink each article's image_url straight
from the source's server, so a slow / rate-limited / flaky third-party CDN left a
blank graphic until a refresh. Instead we cache a downscaled display copy on our own
origin (beside the DB, like art_cache) and serve that. The cache is bounded by a HARD
size ceiling with LRU eviction (prune), so it can't grow without limit no matter the
ingest rate. Network + Pillow calls are isolated so tests can monkeypatch them.

Keyed by a hash of the source URL: a given image_url always maps to the same file.
The API resolves an article id -> its image_url (a tight allowlist — we only ever
fetch URLs already in our own corpus, so it is not an open proxy)."""
from __future__ import annotations

import hashlib
import io
import os
import urllib.request
from pathlib import Path

# Request headers sent with every outbound image fetch (see _http_bytes).
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
# get_or_fetch rejects any response body shorter than this many bytes.
_MIN_IMAGE_BYTES = 500
_MAX_FETCH_BYTES = 20 * 1024 * 1024       # never pull an absurd original into memory
DISPLAY_WIDTH = 800                        # cards / feed never show wider than this
# Quality value handed to Pillow when _encode saves the WebP display copy.
WEBP_QUALITY = 80
DEFAULT_CAP_BYTES = 1024 * 1024 * 1024     # 1 GB hard ceiling (override via env)


def cache_dir() -> Path:
    """Return the directory holding cached images, creating it when missing.

    A non-empty ``GOODNEWS_IMG_CACHE`` names the directory outright. Otherwise it is
    an ``img_cache`` folder next to the database file given by ``GOODNEWS_DB``
    (default ``data/goodnews.sqlite3``), so the host cycle and the API container
    work on the same mounted volume (mirrors art.cache_dir)."""
    env = os.environ
    explicit = env.get("GOODNEWS_IMG_CACHE")
    target = (
        Path(explicit)
        if explicit
        else Path(env.get("GOODNEWS_DB", "data/goodnews.sqlite3")).parent / "img_cache"
    )
    # Parents are created too, and an already-existing directory is not an error.
    target.mkdir(parents=True, exist_ok=True)
    return target


def cap_bytes() -> int:
    """Size ceiling for the cache in bytes.

    Reads ``GOODNEWS_IMG_CACHE_CAP``; when the variable is unset or does not parse
    as an integer, ``DEFAULT_CAP_BYTES`` is returned instead."""
    raw = os.environ.get("GOODNEWS_IMG_CACHE_CAP")
    if raw is None:
        return DEFAULT_CAP_BYTES
    try:
        parsed = int(raw)
    except ValueError:
        return DEFAULT_CAP_BYTES
    return parsed


def _key(url: str) -> str:
    return hashlib.sha1(url.encode("utf-8")).hexdigest()


def _http_bytes(url: str, timeout: int = 12) -> tuple[bytes, str]:
    """Fetch *url* and return ``(body, content_type)``.

    The request carries the ``_UA`` headers. At most ``_MAX_FETCH_BYTES + 1`` bytes
    are read; the one extra byte lets the caller distinguish an oversized response
    from one exactly at the limit. ``content_type`` is ``""`` when the response has
    no Content-Type header. Errors raised by urllib are not caught here."""
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read(_MAX_FETCH_BYTES + 1)
        content_type = response.headers.get("Content-Type") or ""
    return body, content_type


def _encode(data: bytes) -> bytes | None:
    """Downscale to DISPLAY_WIDTH and re-encode as WebP. None if it isn't a decodable
    raster image (e.g. SVG) — the caller then stores the original bytes as-is."""
    try:
        from PIL import Image
        im = Image.open(io.BytesIO(data))
        im.load()
        if im.mode not in ("RGB", "RGBA"):
            im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
        if im.width > DISPLAY_WIDTH:
            h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
            im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
        out = io.BytesIO()
        im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
        return out.getvalue()
    except Exception:  # noqa: BLE001 — not a decodable raster image
        return None


def _ext_for(ctype: str) -> str:
    c = ctype.lower()
    if "png" in c:
        return ".png"
    if "gif" in c:
        return ".gif"
    if "svg" in c:
        return ".svg"
    if "webp" in c:
        return ".webp"
    return ".jpg"


def path_for(url: str) -> Path | None:
    """Look up the cached file for *url*.

    Returns the first file in the cache directory named ``<key>.<anything>``, or
    None when there is none. A hit has its mtime bumped to now, since mtime is the
    last-used marker that LRU eviction sorts on; a failure to bump is ignored."""
    directory = cache_dir()
    found = next(directory.glob(_key(url) + ".*"), None)
    if found is None:
        return None
    try:
        os.utime(found, None)   # touch -> last-used time for LRU eviction
    except OSError:
        pass
    return found


def get_or_fetch(url: str | None) -> Path | None:
    """Cached display copy for a source image URL, fetching + caching on first miss.

    Steps: reject anything that is not an http(s) URL; return the cached file when
    there is one; otherwise download, re-encode to a WebP display copy (or keep the
    original bytes when they cannot be re-encoded but the response was served as
    ``image/*``), and write the result into the cache.

    The write is atomic (temp file, then rename) so a reader never sees a half-file.
    Each attempt uses its own temp file, so concurrent fetches of the same URL cannot
    truncate or rename each other's partially written data.

    Returns the cached path, or None on any failure — callers degrade gracefully
    (the endpoint answers 404 and the frontend falls back)."""
    if not url or not url.startswith(("http://", "https://")):
        return None
    hit = path_for(url)
    if hit:
        return hit
    try:
        data, ctype = _http_bytes(url)
    except Exception:  # noqa: BLE001 — source down/slow/blocked
        return None
    # Too small to be a real image, or longer than the fetch limit (the fetch reads
    # one byte past the limit precisely so that oversize shows up here).
    if len(data) < _MIN_IMAGE_BYTES or len(data) > _MAX_FETCH_BYTES:
        return None
    encoded = _encode(data)
    if encoded is not None:
        blob, ext = encoded, ".webp"
    elif ctype.startswith("image/"):
        blob, ext = data, _ext_for(ctype)   # couldn't re-encode (e.g. SVG): keep original
    else:
        return None
    key = _key(url)
    cdir = cache_dir()
    # Unique per attempt: with one shared temp name, a second fetcher of the same URL
    # could truncate the file just before the first one renames it into place, which
    # publishes a partial image. The leading dot keeps temp files out of path_for's
    # glob and out of prune's accounting.
    tmp = cdir / f".{key}.{os.getpid()}.{os.urandom(4).hex()}.tmp"
    dest = cdir / f"{key}{ext}"
    try:
        tmp.write_bytes(blob)
        os.replace(tmp, dest)               # atomic
    except OSError:
        # Best-effort cleanup of our own temp file; it may never have been created.
        try:
            tmp.unlink()
        except OSError:
            pass
        return None
    return dest


def warm(conn, limit: int = 200) -> int:
    """Pre-cache display copies so the first page view is already a local hit.

    Selects up to *limit* distinct image URLs from accepted, non-duplicate articles
    that have a non-empty image_url, newest article id first. URLs that are already
    cached are skipped; the rest are fetched. Returns how many were newly cached
    (failed fetches are not counted)."""
    query = (
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
        "AND a.image_url != '' ORDER BY a.id DESC LIMIT ?"
    )
    candidates = [row[0] for row in conn.execute(query, (limit,)).fetchall()]
    # Lazy generator: each URL is checked for a cache hit first, and only fetched on a miss.
    return sum(
        1
        for image_url in candidates
        if not path_for(image_url) and get_or_fetch(image_url)
    )


def prune(cap: int | None = None) -> dict:
    """Bring the cache back under its size ceiling by deleting least-recently-used files.

    *cap* is the ceiling in bytes; when None it comes from cap_bytes(). Only regular
    files whose names do not start with "." are counted and eligible for deletion.
    Files are removed oldest mtime first until the running total is at or below the
    cap. A file that cannot be stat'ed is left out of the accounting, and a file that
    cannot be deleted stays counted in the total.

    Returns {before, after, removed, cap}: byte totals before and after, the number
    of files deleted, and the cap that was applied."""
    limit = cap_bytes() if cap is None else cap
    entries = []
    for entry in cache_dir().iterdir():
        if entry.name.startswith(".") or not entry.is_file():
            continue
        try:
            info = entry.stat()
        except OSError:
            continue
        entries.append((info.st_mtime, info.st_size, entry))
    before = sum(size for _mtime, size, _path in entries)
    remaining, removed = before, 0
    # Sort only when over the cap; ascending mtime puts the least recently used first.
    victims = sorted(entries) if before > limit else ()
    for _mtime, size, victim in victims:
        if remaining <= limit:
            break
        try:
            victim.unlink()
        except OSError:
            continue
        remaining -= size
        removed += 1
    return {"before": before, "after": remaining, "removed": removed, "cap": limit}