Files
upbeatBytes/goodnews/newsimg.py
T
thejayman77 a55ba185a8 images: harden the cache per Codex audit (SSRF-safe, cache-only endpoint, WebP-only)
Blocker fixes for the image cache:
- /api/img/{id} now serves cache HITS ONLY and is restricted to ACCEPTED, CANONICAL
  articles. It never fetches — the cycle (newsimg.warm) owns all fetching — so the
  public endpoint has no SSRF/worker-exhaustion surface. Dropped 1-year immutable
  caching (image_url can change) → public, max-age=86400.
- newsimg._safe_fetch: SSRF-safe (reuses enrich._host_is_public + _NoRedirect, http(s)
  only, every redirect hop re-validated, body capped). _FetchError distinguishes
  permanent refusals (negative-cached via a .fail marker) from transient errors (retry).
- _encode re-encodes only decoded RASTER images to WebP and REJECTS everything else
  (SVG, undecodable, decompression bombs via MAX_IMAGE_PIXELS, pathological dimensions);
  originals are never retained. prune() also sweeps stale .fail markers.
- Concurrency: fetching only runs inside the cycle lock; writes stay atomic.

Smaller fixes:
- share.py visible image has onerror→this.remove() (degrade to the text unfurl, no
  broken icon when an image isn't cached yet).
- share-page Back follows history only on a SAME-ORIGIN referrer (never bounce to an
  external site); menu now honors Escape + resets crossing back to desktop (HubBar parity).

Tests: private host, redirect-to-private, hostile SVG/non-image, transient-vs-permanent
failure, LRU prune, warm (accepted+canonical only, idempotent), cache-only endpoint
(404 on not-cached/unaccepted/duplicate, never fetches), share chrome parity. 441 pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 12:19:57 -04:00

252 lines
9.6 KiB
Python

"""Local image cache + downscale for news article images.
Article images used to be hotlinked from the source, so a slow/flaky third-party CDN
left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy
to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache),
and the API serves only cache HITS — it never fetches, so the public endpoint has no
SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU
eviction, so it can't grow without limit no matter the ingest rate.
Security posture (the fetch runs only in the trusted cycle, but feed image URLs are
still externally supplied, so we treat them as untrusted):
* SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation
(same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets,
http(s) only, every redirect hop re-checked.
* Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and
anything undecodable is REJECTED (never retained as a same-origin file).
* Decompression-bomb + dimension guards.
* Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched
every cycle; transient network errors are not, so they retry.
Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no
two fetches race; writes are atomic (temp + rename) regardless.
"""
from __future__ import annotations
import hashlib
import io
import os
import time
import urllib.error
import urllib.request
from pathlib import Path
from urllib.parse import urljoin, urlsplit
from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public
# Request headers sent with every image fetch (passed to urllib.request.Request).
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
# Fetched bodies shorter than this are rejected (and negative-cached) by fetch_and_cache.
_MIN_IMAGE_BYTES = 500
_MAX_FETCH_BYTES = 20 * 1024 * 1024  # never pull an absurd original into memory
_MAX_PIXELS = 50_000_000  # decompression-bomb ceiling (≈50 MP); assigned to PIL's MAX_IMAGE_PIXELS
_MAX_DIM = 12000  # reject pathological single-axis dimensions (width or height, in pixels)
DISPLAY_WIDTH = 800  # cards / feed never show wider than this; wider images are downscaled
WEBP_QUALITY = 80  # quality setting passed to the WebP encoder
DEFAULT_CAP_BYTES = 1024 * 1024 * 1024  # 1 GB hard ceiling (override via GOODNEWS_IMG_CACHE_CAP)
_FAIL_TTL_S = 3 * 24 * 3600  # don't refetch a definitively-bad URL for 3 days (seconds)
def cache_dir() -> Path:
    """Return the image-cache directory, creating it (and any parents) if missing.

    A non-empty ``GOODNEWS_IMG_CACHE`` environment variable names the directory
    outright. Otherwise the cache is an ``img_cache`` folder beside the database
    file named by ``GOODNEWS_DB`` (default ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_IMG_CACHE")
    if explicit:
        target = Path(explicit)
    else:
        db_file = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_file.parent / "img_cache"
    target.mkdir(parents=True, exist_ok=True)
    return target
def cap_bytes() -> int:
    """Return the cache size ceiling in bytes.

    Reads ``GOODNEWS_IMG_CACHE_CAP`` from the environment; when it is unset, or its
    value cannot be parsed as an integer, ``DEFAULT_CAP_BYTES`` is used instead.
    """
    configured = os.environ.get("GOODNEWS_IMG_CACHE_CAP", DEFAULT_CAP_BYTES)
    try:
        limit = int(configured)
    except ValueError:
        limit = DEFAULT_CAP_BYTES
    return limit
def _key(url: str) -> str:
return hashlib.sha1(url.encode("utf-8")).hexdigest()
class _FetchError(Exception):
"""permanent=True → negative-cache (won't retry soon); False → transient, retry."""
def __init__(self, msg: str, permanent: bool):
super().__init__(msg)
self.permanent = permanent
def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
    """SSRF-safe fetch of an untrusted image URL.

    Only http(s) URLs are fetched; every hop (the initial URL and each redirect
    target) is re-validated with ``_host_is_public`` before it is requested, the
    number of hops is bounded by ``MAX_REDIRECTS``, and at most
    ``_MAX_FETCH_BYTES + 1`` bytes of body are read (the extra byte lets the
    caller detect an over-size body).

    Args:
        url: The externally supplied image URL.
        timeout: Per-request timeout in seconds.

    Returns:
        ``(body, content_type)``; ``content_type`` is ``""`` when the header is absent.

    Raises:
        _FetchError: with ``permanent=True`` for policy refusals (bad scheme,
            non-public host, malformed URL, redirect without Location, too many
            redirects) and for definitive HTTP 4xx answers; with
            ``permanent=False`` for network/protocol errors, including failures
            while reading the body.
    """
    import http.client  # local: only needed for the HTTPException base class

    opener = urllib.request.build_opener(_NoRedirect)
    current = url
    for _ in range(MAX_REDIRECTS + 1):
        try:
            parts = urlsplit(current)
            host = parts.hostname
        except ValueError as exc:
            # e.g. "http://[::1" (bad IPv6 literal) — the URL itself is unusable, so
            # retrying can never help; must not escape as a bare ValueError.
            raise _FetchError(f"malformed url: {current}", permanent=True) from exc
        if parts.scheme not in ("http", "https") or not _host_is_public(host):
            raise _FetchError(f"non-public or non-http(s): {current}", permanent=True)
        req = urllib.request.Request(current, headers=_UA)
        try:
            resp = opener.open(req, timeout=timeout)
        except urllib.error.HTTPError as exc:
            # The server answered. A 4xx (other than timeout / too-early / rate-limit)
            # is a definitive "no", so negative-cache it instead of refetching every
            # cycle; everything else (5xx etc.) stays retryable.
            definitive = 400 <= exc.code < 500 and exc.code not in (408, 425, 429)
            raise _FetchError(f"fetch failed: {exc}", permanent=definitive) from exc
        except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError) as exc:
            raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
        status = getattr(resp, "status", 200) or 200
        if status in (301, 302, 303, 307, 308):
            loc = resp.headers.get("Location")
            resp.close()
            if not loc:
                raise _FetchError("redirect without location", permanent=True)
            # Resolve relative redirects against the hop that issued them; the new
            # target is validated at the top of the next iteration.
            current = urljoin(current, loc)
            continue
        try:
            body = resp.read(_MAX_FETCH_BYTES + 1)
            ctype = resp.headers.get("Content-Type") or ""
        except (OSError, http.client.HTTPException) as exc:
            # Mid-body timeouts / truncated transfers are network trouble: retry later.
            raise _FetchError(f"read failed: {exc}", permanent=False) from exc
        finally:
            resp.close()
        return body, ctype
    raise _FetchError("too many redirects", permanent=True)
def _encode(data: bytes) -> bytes | None:
"""Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode as WebP. None if
it isn't a decodable raster (e.g. SVG), is a decompression bomb, or has pathological
dimensions — the caller then REJECTS it (never stores arbitrary bytes)."""
try:
from PIL import Image
Image.MAX_IMAGE_PIXELS = _MAX_PIXELS # raise DecompressionBombError past this
im = Image.open(io.BytesIO(data))
im.load() # forces decode → catches truncated/bomb here
if im.width > _MAX_DIM or im.height > _MAX_DIM or im.width < 1 or im.height < 1:
return None
if im.mode not in ("RGB", "RGBA"):
im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
if im.width > DISPLAY_WIDTH:
h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
out = io.BytesIO()
im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
return out.getvalue()
except Exception: # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated …
return None
def _fail_path(url: str) -> Path:
    """Return the path of the negative-cache marker (``<key>.fail``) for *url*."""
    directory = cache_dir()
    return directory / f"{_key(url)}.fail"
def _mark_failed(url: str) -> None:
    """Create (or refresh the mtime of) the ``.fail`` marker for *url*.

    Best-effort: a filesystem error is ignored, which only means the URL may be
    retried sooner than intended.
    """
    marker = _fail_path(url)
    try:
        marker.touch()
    except OSError:
        return
def _failed_recently(url: str) -> bool:
    """Return True when *url* has a ``.fail`` marker younger than ``_FAIL_TTL_S``.

    A missing or unreadable marker counts as "not failed recently".
    """
    now = time.time()
    try:
        marked_at = _fail_path(url).stat().st_mtime
    except OSError:
        return False
    return (now - marked_at) < _FAIL_TTL_S
def path_for(url: str) -> Path | None:
    """Look up the cached WebP for *url* without ever fetching.

    Returns the file's path when it exists, after touching it so its mtime
    records the last use (the LRU marker). Returns None for an empty URL or a
    cache miss.
    """
    if not url:
        return None
    candidate = cache_dir() / f"{_key(url)}.webp"
    if not candidate.exists():
        return None
    try:
        os.utime(candidate, None)  # touch → last-used time for LRU eviction
    except OSError:
        pass  # a failed touch must not turn a hit into a miss
    return candidate
def fetch_and_cache(url: str | None) -> Path | None:
    """Fetch *url* (SSRF-safe), downscale it to WebP and store it atomically.

    CYCLE-ONLY — the API endpoint never calls this. Returns the cached file's
    path, or None on any failure. Definitive failures (permanent fetch refusal,
    body outside the allowed size range, undecodable/unsafe image) are
    negative-cached so they aren't retried every cycle; transient fetch errors
    and local write errors are not.
    """
    if not url or not url.startswith(("http://", "https://")):
        return None

    try:
        payload, _content_type = _safe_fetch(url)
    except _FetchError as err:
        if err.permanent:
            _mark_failed(url)
        return None

    # Too small to be a real image, or larger than the fetch cap: don't even decode.
    size_ok = _MIN_IMAGE_BYTES <= len(payload) <= _MAX_FETCH_BYTES
    webp = _encode(payload) if size_ok else None
    if webp is None:  # bad size / SVG / undecodable / bomb / bad dimensions
        _mark_failed(url)
        return None

    folder = cache_dir()
    digest = _key(url)
    scratch = folder / f".{digest}.tmp"
    final = folder / f"{digest}.webp"
    try:
        scratch.write_bytes(webp)
        os.replace(scratch, final)  # atomic
    except OSError:
        try:
            scratch.unlink()
        except OSError:
            pass
        return None
    return final
def warm(conn, limit: int = 200) -> int:
    """Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles.

    Selects up to *limit* distinct non-empty image URLs (newest article id
    first) from accepted, non-duplicate articles, skips any that are already
    cached or were recently negative-cached, and fetches the rest so the API
    only ever serves cache hits.

    Returns:
        How many images were newly cached by this call.
    """
    query = (
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
        "AND a.image_url != '' ORDER BY a.id DESC LIMIT ?"
    )
    candidates = [row[0] for row in conn.execute(query, (limit,)).fetchall()]
    newly_cached = 0
    for image_url in candidates:
        handled = path_for(image_url) or _failed_recently(image_url)
        if not handled and fetch_and_cache(image_url):
            newly_cached += 1
    return newly_cached
def prune(cap: int | None = None) -> dict:
    """Enforce the cache size ceiling and sweep expired ``.fail`` markers.

    Dot-prefixed entries are ignored. ``.fail`` markers at least ``_FAIL_TTL_S``
    old are deleted. If the ``.webp`` files together exceed *cap* (default:
    ``cap_bytes()``), the least-recently-used ones (oldest mtime first) are
    deleted until the total is at or under the cap.

    Returns:
        ``{"before", "after", "removed", "cap"}`` — total WebP bytes before and
        after eviction, the number of WebPs removed, and the cap applied.
    """
    if cap is None:
        cap = cap_bytes()
    started = time.time()

    entries = []
    used = 0
    for entry in cache_dir().iterdir():
        if entry.name.startswith("."):
            continue
        suffix = entry.suffix
        if suffix == ".fail":
            try:
                expired = started - entry.stat().st_mtime >= _FAIL_TTL_S
                if expired:
                    entry.unlink()
            except OSError:
                pass
            continue
        if suffix != ".webp" or not entry.is_file():
            continue
        try:
            info = entry.stat()
        except OSError:
            continue
        entries.append((info.st_mtime, info.st_size, entry))
        used += info.st_size

    before = used
    removed = 0
    if used > cap:
        # Tuple order puts the oldest mtime first = least recently used.
        for _mtime, nbytes, victim in sorted(entries):
            if used <= cap:
                break
            try:
                victim.unlink()
            except OSError:
                continue  # couldn't delete: its bytes still count
            used -= nbytes
            removed += 1
    return {"before": before, "after": used, "removed": removed, "cap": cap}