"""Local image cache + downscale for news article images. Article images used to be hotlinked from the source, so a slow/flaky third-party CDN left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache), and the API serves only cache HITS — it never fetches, so the public endpoint has no SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU eviction, so it can't grow without limit no matter the ingest rate. Security posture (the fetch runs only in the trusted cycle, but feed image URLs are still externally supplied, so we treat them as untrusted): * SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation (same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets, http(s) only, every redirect hop re-checked. * Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and anything undecodable is REJECTED (never retained as a same-origin file). * Decompression-bomb + dimension guards. * Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched every cycle; transient network errors are not, so they retry. Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no two fetches race; writes are atomic (temp + rename) regardless. """ from __future__ import annotations import hashlib import io import os import time import urllib.error import urllib.request from pathlib import Path from urllib.parse import urljoin, urlsplit from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public _UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"} _MIN_IMAGE_BYTES = 500 _MAX_FETCH_BYTES = 20 * 1024 * 1024 # never pull an absurd original into memory _MAX_PIXELS = 50_000_000 # decompression-bomb ceiling (≈50 MP) _MAX_DIM = 12000 # reject pathological single-axis dimensions DISPLAY_WIDTH = 800 # cards / feed never show wider than this WEBP_QUALITY = 80 DEFAULT_CAP_BYTES = 1024 * 1024 * 1024 # 1 GB hard ceiling (override via env) _FAIL_TTL_S = 3 * 24 * 3600 # don't refetch a definitively-bad URL for 3 days def cache_dir() -> Path: override = os.environ.get("GOODNEWS_IMG_CACHE") db = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3")) d = Path(override) if override else db.parent / "img_cache" d.mkdir(parents=True, exist_ok=True) return d def cap_bytes() -> int: try: return int(os.environ.get("GOODNEWS_IMG_CACHE_CAP", DEFAULT_CAP_BYTES)) except ValueError: return DEFAULT_CAP_BYTES def _key(url: str) -> str: return hashlib.sha1(url.encode("utf-8")).hexdigest() def display_url(article_id: int, image_policy: str | None, raw_url: str | None) -> str | None: """The image URL the frontend should use, honoring the SOURCE's image policy: 'cache' → our locally-cached copy (/api/img/) — only for sources we've cleared to re-host (open license / explicit permission / public-domain). 'remote' → the publisher's own URL (hotlinked + the frontend's graceful retry). The conservative DEFAULT: we display but never re-host. 'none' → no image (typographic cover). Returns None when there's no image or the policy is 'none'.""" if not raw_url: return None if image_policy == "cache": return f"/api/img/{article_id}" if image_policy == "none": return None return raw_url # 'remote' (default) — hotlink, never re-hosted class _FetchError(Exception): """permanent=True → negative-cache (won't retry soon); False → transient, retry.""" def __init__(self, msg: str, permanent: bool): super().__init__(msg) self.permanent = permanent def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]: """SSRF-safe fetch of an untrusted image URL: http(s) only, every redirect hop re-validated against public IPs, bounded redirects, body capped. Raises _FetchError (permanent for policy refusals, transient for network errors).""" opener = urllib.request.build_opener(_NoRedirect) current = url for _ in range(MAX_REDIRECTS + 1): parts = urlsplit(current) if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname): raise _FetchError(f"non-public or non-http(s): {current}", permanent=True) req = urllib.request.Request(current, headers=_UA) try: resp = opener.open(req, timeout=timeout) except urllib.error.HTTPError as exc: # _NoRedirect makes urllib RAISE on 3xx (rather than return a response), so # redirects arrive here. Re-validate the destination on the next loop. 4xx # (except 429) is a permanent miss → negative-cache; 429/5xx → transient. if exc.code in (301, 302, 303, 307, 308): loc = exc.headers.get("Location") exc.close() if not loc: raise _FetchError("redirect without location", permanent=True) from exc current = urljoin(current, loc) continue permanent = 400 <= exc.code < 500 and exc.code != 429 raise _FetchError(f"http {exc.code}", permanent=permanent) from exc except (urllib.error.URLError, OSError, ValueError) as exc: raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc try: return resp.read(_MAX_FETCH_BYTES + 1), (resp.headers.get("Content-Type") or "") finally: resp.close() raise _FetchError("too many redirects", permanent=True) def _encode(data: bytes) -> bytes | None: """Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode as WebP. None if it isn't a decodable raster (e.g. SVG), is a decompression bomb, or has pathological dimensions — the caller then REJECTS it (never stores arbitrary bytes).""" try: from PIL import Image Image.MAX_IMAGE_PIXELS = _MAX_PIXELS # backstop; Pillow only WARNS at this, raises ~2x im = Image.open(io.BytesIO(data)) # lazy: header (size) read without decoding pixels # Enforce the pixel/dimension ceiling BEFORE load() so a decompression bomb is never # actually decoded (Pillow's own MAX_IMAGE_PIXELS only warns at the threshold). if (im.width * im.height > _MAX_PIXELS or im.width > _MAX_DIM or im.height > _MAX_DIM or im.width < 1 or im.height < 1): return None im.load() # decode now (also catches truncated data) if im.mode not in ("RGB", "RGBA"): im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB") if im.width > DISPLAY_WIDTH: h = max(1, round(im.height * DISPLAY_WIDTH / im.width)) im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS) out = io.BytesIO() im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4) return out.getvalue() except Exception: # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated … return None def _fail_path(url: str) -> Path: return cache_dir() / f"{_key(url)}.fail" def _mark_failed(url: str) -> None: try: _fail_path(url).touch() except OSError: pass def _failed_recently(url: str) -> bool: try: return (time.time() - _fail_path(url).stat().st_mtime) < _FAIL_TTL_S except OSError: return False def path_for(url: str) -> Path | None: """The cached WebP for this URL if present (and bump its mtime, the LRU marker). A pure cache lookup — never fetches.""" if not url: return None p = cache_dir() / f"{_key(url)}.webp" if p.exists(): try: os.utime(p, None) # touch → last-used time for LRU eviction except OSError: pass return p return None def fetch_and_cache(url: str | None) -> Path | None: """Fetch (SSRF-safe), downscale to WebP, and cache atomically. CYCLE-ONLY — the API endpoint never calls this. None on any failure; definitive failures are negative-cached so they aren't retried every cycle.""" if not url or not url.startswith(("http://", "https://")): return None try: data, _ctype = _safe_fetch(url) except _FetchError as exc: if exc.permanent: _mark_failed(url) return None if not (_MIN_IMAGE_BYTES <= len(data) <= _MAX_FETCH_BYTES): _mark_failed(url) return None blob = _encode(data) if blob is None: # SVG / undecodable / bomb / bad dimensions _mark_failed(url) return None cdir = cache_dir() key = _key(url) tmp = cdir / f".{key}.tmp" dest = cdir / f"{key}.webp" try: tmp.write_bytes(blob) os.replace(tmp, dest) # atomic except OSError: try: tmp.unlink() except OSError: pass return None return dest def purge_source(conn, source_id: int) -> int: """Delete every cached file for a source's article image URLs. Called when a source leaves 'cache' policy (revoked permission / re-classified), so the re-hosted copies come down immediately rather than lingering inaccessible on disk. Returns webp count.""" rows = conn.execute( "SELECT DISTINCT image_url FROM articles WHERE source_id = ? " "AND image_url IS NOT NULL AND image_url != ''", (source_id,), ).fetchall() cdir = cache_dir() removed = 0 for r in rows: key = _key(r[0]) for suffix in (".webp", ".fail"): p = cdir / f"{key}{suffix}" try: if p.exists(): p.unlink() if suffix == ".webp": removed += 1 except OSError: pass return removed def warm(conn, limit: int = 200) -> int: """Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles whose SOURCE is cleared to cache (image_policy='cache'), so the API only ever serves cache hits. Bounded; skips already-cached and recently-failed URLs. Returns how many it newly cached. Sources default to 'remote' (hotlink, never re-hosted), so this caches nothing until a source is explicitly set to 'cache'.""" rows = conn.execute( "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id " "JOIN sources src ON src.id = a.source_id " "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND src.image_policy='cache' " "AND a.image_url IS NOT NULL AND a.image_url != '' ORDER BY a.id DESC LIMIT ?", (limit,), ).fetchall() made = 0 for r in rows: url = r[0] if path_for(url) or _failed_recently(url): continue if fetch_and_cache(url): made += 1 return made def prune(cap: int | None = None) -> dict: """Enforce the size ceiling: delete least-recently-used WebPs (oldest mtime first) until under the cap; also sweep stale .fail markers. Returns {before, after, removed, cap}.""" if cap is None: cap = cap_bytes() now = time.time() files, total = [], 0 for p in cache_dir().iterdir(): if p.name.startswith("."): continue if p.suffix == ".fail": try: if now - p.stat().st_mtime >= _FAIL_TTL_S: p.unlink() except OSError: pass continue if p.suffix != ".webp" or not p.is_file(): continue try: st = p.stat() except OSError: continue files.append((st.st_mtime, st.st_size, p)) total += st.st_size before, removed = total, 0 if total > cap: files.sort() # oldest mtime first = least recently used for _mtime, size, p in files: if total <= cap: break try: p.unlink() total -= size removed += 1 except OSError: pass return {"before": before, "after": total, "removed": removed, "cap": cap}