a55ba185a8
Blocker fixes for the image cache:
- /api/img/{id} now serves cache HITS ONLY and is restricted to ACCEPTED, CANONICAL
articles. It never fetches — the cycle (newsimg.warm) owns all fetching — so the
public endpoint has no SSRF/worker-exhaustion surface. Dropped 1-year immutable
caching (image_url can change) → public, max-age=86400.
- newsimg._safe_fetch: SSRF-safe (reuses enrich._host_is_public + _NoRedirect, http(s)
only, every redirect hop re-validated, body capped). _FetchError distinguishes
permanent refusals (negative-cached via a .fail marker) from transient errors (retry).
- _encode re-encodes only decoded RASTER images to WebP and REJECTS everything else
(SVG, undecodable, decompression bombs via MAX_IMAGE_PIXELS, pathological dimensions);
originals are never retained. prune() also sweeps stale .fail markers.
- Concurrency: fetching only runs inside the cycle lock; writes stay atomic.
Smaller fixes:
- share.py visible image has onerror→this.remove() (degrade to the text unfurl, no
broken icon when an image isn't cached yet).
- share-page Back follows history only on a SAME-ORIGIN referrer (never bounce to an
external site); menu now honors Escape + resets crossing back to desktop (HubBar parity).
Tests: private host, redirect-to-private, hostile SVG/non-image, transient-vs-permanent
failure, LRU prune, warm (accepted+canonical only, idempotent), cache-only endpoint
(404 on not-cached/unaccepted/duplicate, never fetches), share chrome parity. 441 pass.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
252 lines
9.6 KiB
Python
252 lines
9.6 KiB
Python
"""Local image cache + downscale for news article images.

Article images used to be hotlinked from the source, so a slow/flaky third-party CDN
left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy
to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache),
and the API serves only cache HITS — it never fetches, so the public endpoint has no
SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU
eviction, so it can't grow without limit no matter the ingest rate.

Security posture (the fetch runs only in the trusted cycle, but feed image URLs are
still externally supplied, so we treat them as untrusted):
  * SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation
    (same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets,
    http(s) only, every redirect hop re-checked.
  * Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and
    anything undecodable is REJECTED (never retained as a same-origin file).
  * Decompression-bomb + dimension guards.
  * Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched
    every cycle; transient network errors are not, so they retry.
Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no
two fetches race; writes are atomic (temp + rename) regardless.
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import io
|
|
import os
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin, urlsplit
|
|
|
|
from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public
|
|
|
|
# --- HTTP identity ---------------------------------------------------------------------
# Header set sent with every image request.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}

# --- Fetch / decode guards -------------------------------------------------------------
# Bodies shorter than this are rejected by fetch_and_cache.
_MIN_IMAGE_BYTES = 500
# Never pull an absurd original into memory (20 MiB).
_MAX_FETCH_BYTES = 20 << 20
# Decompression-bomb ceiling (≈50 MP).
_MAX_PIXELS = 50 * 1_000_000
# Reject pathological single-axis dimensions.
_MAX_DIM = 12_000

# --- Output format ---------------------------------------------------------------------
# Cards / feed never show wider than this.
DISPLAY_WIDTH = 800
WEBP_QUALITY = 80

# --- Cache policy ----------------------------------------------------------------------
# 1 GB hard ceiling (override via env).
DEFAULT_CAP_BYTES = 1 << 30
# Don't refetch a definitively-bad URL for 3 days.
_FAIL_TTL_S = 3 * 86_400
|
|
|
|
|
|
def cache_dir() -> Path:
    """Return the image-cache directory, creating it (and any parents) if missing.

    A non-empty ``GOODNEWS_IMG_CACHE`` environment variable names the directory
    outright. Otherwise the cache is an ``img_cache`` folder next to the database file
    named by ``GOODNEWS_DB`` (default ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_IMG_CACHE")
    if explicit:
        target = Path(explicit)
    else:
        db_file = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_file.parent / "img_cache"
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
|
|
|
|
def cap_bytes() -> int:
    """Return the cache size ceiling in bytes.

    Reads ``GOODNEWS_IMG_CACHE_CAP``; when it is unset, or set to something ``int()``
    rejects with ValueError, DEFAULT_CAP_BYTES is returned instead.
    """
    try:
        configured = os.environ.get("GOODNEWS_IMG_CACHE_CAP", DEFAULT_CAP_BYTES)
        limit = int(configured)
    except ValueError:
        limit = DEFAULT_CAP_BYTES
    return limit
|
|
|
|
|
|
def _key(url: str) -> str:
    """Return the cache key for *url*: the hex SHA-1 digest of its UTF-8 encoding."""
    hasher = hashlib.sha1()
    hasher.update(url.encode("utf-8"))
    return hasher.hexdigest()
|
|
|
|
|
|
class _FetchError(Exception):
    """An image fetch failed; ``permanent`` says whether retrying soon is pointless.

    permanent=True  → the caller may negative-cache the URL (won't retry soon).
    permanent=False → transient failure, retry on a later pass.
    """

    def __init__(self, msg: str, permanent: bool):
        self.permanent = permanent
        super().__init__(msg)
|
|
|
|
|
|
def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
    """SSRF-safe fetch of an untrusted image URL.

    http(s) only; the host of every hop (the original URL and each redirect target) is
    checked with ``_host_is_public`` before it is requested; at most ``MAX_REDIRECTS``
    redirects are followed; at most ``_MAX_FETCH_BYTES + 1`` body bytes are read, so the
    caller can detect an oversize body without pulling all of it into memory.

    Args:
        url: the untrusted image URL.
        timeout: per-request timeout in seconds, passed to the opener.

    Returns:
        ``(body, content_type)``; ``content_type`` is ``""`` when the header is absent.

    Raises:
        _FetchError: for every fetch failure handled here, so callers that catch only
            _FetchError are not broken by raw urllib / http.client / socket errors.
            ``permanent=True`` for policy refusals and definitive failures (bad scheme,
            non-public host, malformed URL, redirect without Location, too many
            redirects, HTTP 4xx other than 408/425/429). ``permanent=False`` for
            network errors, timeouts, truncated bodies and other HTTP errors (5xx).
    """
    import http.client  # function-scope: keeps the file's import block untouched

    redirect_codes = (301, 302, 303, 307, 308)
    # 4xx normally means "this URL is wrong"; these three mean "try again later".
    retryable_4xx = (408, 425, 429)

    opener = urllib.request.build_opener(_NoRedirect)
    current = url
    for _ in range(MAX_REDIRECTS + 1):
        try:
            parts = urlsplit(current)
        except ValueError as exc:  # e.g. unbalanced IPv6 brackets in a feed-supplied URL
            raise _FetchError(f"malformed url: {current}", permanent=True) from exc
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            raise _FetchError(f"non-public or non-http(s): {current}", permanent=True)

        req = urllib.request.Request(current, headers=_UA)
        try:
            resp = opener.open(req, timeout=timeout)
        except urllib.error.HTTPError as exc:
            # HTTPError is a URLError subclass, so it must be handled before the generic
            # clause or every 404 would be classed as transient and refetched each cycle.
            code = exc.code or 0
            if code not in redirect_codes:
                permanent = 400 <= code < 500 and code not in retryable_4xx
                raise _FetchError(f"fetch failed: {exc}", permanent=permanent) from exc
            # NOTE(review): depending on how enrich._NoRedirect is written, a 3xx is
            # either returned as a response or raised as HTTPError. HTTPError is itself
            # response-like (headers/close), so treat it as the response here.
            resp = exc
        except http.client.InvalidURL as exc:
            # Raised un-wrapped by http.client, e.g. for a URL containing a space or
            # control character; retrying the same URL cannot succeed.
            raise _FetchError(f"fetch failed: {exc}", permanent=True) from exc
        except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError) as exc:
            raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc

        status = getattr(resp, "status", None) or getattr(resp, "code", None) or 200
        if status in redirect_codes:
            loc = resp.headers.get("Location")
            resp.close()
            if not loc:
                raise _FetchError("redirect without location", permanent=True)
            try:
                current = urljoin(current, loc)
            except ValueError as exc:
                raise _FetchError(f"malformed redirect target: {loc}", permanent=True) from exc
            continue  # the next iteration re-validates the new target before requesting it

        try:
            ctype = resp.headers.get("Content-Type") or ""
            # One byte past the cap lets the caller tell "too large" from "exactly at cap".
            body = resp.read(_MAX_FETCH_BYTES + 1)
        except (http.client.HTTPException, OSError) as exc:
            # Timeouts, resets and IncompleteRead during the body read are transient.
            raise _FetchError(f"read failed: {exc}", permanent=False) from exc
        finally:
            resp.close()
        return body, ctype

    raise _FetchError("too many redirects", permanent=True)
|
|
|
|
|
|
def _encode(data: bytes) -> bytes | None:
    """Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode it as WebP.

    Returns the WebP bytes, or None when the input is not a decodable raster (e.g. SVG),
    exceeds the pixel ceiling, or has pathological dimensions — the caller then REJECTS
    it (arbitrary bytes are never stored).

    Guards, in order:
      * Size limits are checked on the header dimensions BEFORE any pixel data is
        decoded, so an oversized image never reaches ``load()``.
      * ``_MAX_PIXELS`` is enforced explicitly: Pillow raises DecompressionBombError only
        above twice ``Image.MAX_IMAGE_PIXELS`` and merely warns between 1x and 2x.
      * EXIF orientation is applied before resizing, because the re-encoded file carries
        no EXIF and would otherwise come out rotated.
    """
    try:
        from PIL import Image, ImageOps

        Image.MAX_IMAGE_PIXELS = _MAX_PIXELS  # Pillow's own bomb check (raises past 2x)
        im = Image.open(io.BytesIO(data))  # parses the header; pixels are not decoded yet
        width, height = im.size
        if width < 1 or height < 1 or width > _MAX_DIM or height > _MAX_DIM:
            return None
        if width * height > _MAX_PIXELS:
            return None
        im.load()  # forces decode → truncated/corrupt data fails here

        try:
            upright = ImageOps.exif_transpose(im)
        except Exception:  # noqa: BLE001 — malformed EXIF must not reject a decodable image
            upright = None
        if upright is not None:
            im = upright

        if im.mode not in ("RGB", "RGBA"):
            im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
        if im.width > DISPLAY_WIDTH:
            h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
            im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
        out = io.BytesIO()
        im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
        return out.getvalue()
    except Exception:  # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated …
        return None
|
|
|
|
|
|
def _fail_path(url: str) -> Path:
    """Return the negative-cache marker path for *url*: ``<cache_dir>/<key>.fail``."""
    root = cache_dir()
    return root / (_key(url) + ".fail")
|
|
|
|
|
|
def _mark_failed(url: str) -> None:
    """Create (or refresh the mtime of) the ``.fail`` marker for *url*.

    Best-effort: any OSError while locating or touching the marker is ignored.
    """
    try:
        marker = _fail_path(url)
        marker.touch()
    except OSError:
        return
|
|
|
|
|
|
def _failed_recently(url: str) -> bool:
    """True when *url* has a ``.fail`` marker younger than ``_FAIL_TTL_S`` seconds.

    A missing or unreadable marker (OSError) counts as "not failed".
    """
    try:
        now = time.time()
        marked_at = _fail_path(url).stat().st_mtime
    except OSError:
        return False
    age = now - marked_at
    return age < _FAIL_TTL_S
|
|
|
|
|
|
def path_for(url: str) -> Path | None:
    """Look up the cached WebP for *url* without ever fetching.

    Returns the file path on a hit, after bumping its mtime (the LRU marker used for
    eviction); returns None for an empty URL or a cache miss. A failure to bump the
    mtime is ignored and the hit is still returned.
    """
    if not url:
        return None
    cached = cache_dir() / f"{_key(url)}.webp"
    if not cached.exists():
        return None
    try:
        os.utime(cached, None)  # touch → last-used time for LRU eviction
    except OSError:
        pass
    return cached
|
|
|
|
|
|
def fetch_and_cache(url: str | None) -> Path | None:
    """Fetch (SSRF-safe), downscale to WebP, and store the result atomically.

    CYCLE-ONLY — the API endpoint never calls this. Returns the cached file's path, or
    None when the URL is empty / not http(s), the fetch fails, the body size is outside
    ``[_MIN_IMAGE_BYTES, _MAX_FETCH_BYTES]``, the bytes can't be encoded, or the write
    fails. Definitive failures (a permanent fetch error, bad size, unencodable image) are
    negative-cached so they aren't retried every cycle; transient fetch errors and write
    errors are not.
    """
    if not url:
        return None
    if not url.startswith(("http://", "https://")):
        return None

    try:
        payload, _content_type = _safe_fetch(url)
    except _FetchError as err:
        if err.permanent:
            _mark_failed(url)
        return None

    # Only attempt to encode when the body size is plausible for an image.
    size_ok = _MIN_IMAGE_BYTES <= len(payload) <= _MAX_FETCH_BYTES
    webp = _encode(payload) if size_ok else None
    if webp is None:  # wrong size / SVG / undecodable / bomb / bad dimensions
        _mark_failed(url)
        return None

    folder = cache_dir()
    digest = _key(url)
    scratch = folder / f".{digest}.tmp"
    final_path = folder / f"{digest}.webp"
    try:
        scratch.write_bytes(webp)
        os.replace(scratch, final_path)  # atomic swap into place
    except OSError:
        # Don't leave a partial temp file behind; cleanup itself is best-effort.
        try:
            scratch.unlink()
        except OSError:
            pass
        return None
    return final_path
|
|
|
|
|
|
def warm(conn, limit: int = 200) -> int:
    """Pre-fetch display copies so the API only ever serves cache hits.

    Selects up to *limit* distinct image URLs of ACCEPTED, CANONICAL (non-duplicate)
    articles, ordered by article id descending, skips any that are already cached or
    were recently marked failed, and fetches the rest. Returns how many were newly cached.
    """
    query = (
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
        "AND a.image_url != '' ORDER BY a.id DESC LIMIT ?"
    )
    candidates = conn.execute(query, (limit,)).fetchall()

    newly_cached = 0
    for row in candidates:
        image_url = row[0]
        already_handled = path_for(image_url) or _failed_recently(image_url)
        if not already_handled and fetch_and_cache(image_url):
            newly_cached += 1
    return newly_cached
|
|
|
|
|
|
def prune(cap: int | None = None) -> dict:
    """Enforce the cache size ceiling and sweep stale ``.fail`` markers.

    Walks the cache directory once. Dot-files are ignored; ``.fail`` markers at least
    ``_FAIL_TTL_S`` seconds old are deleted; ``.webp`` files are tallied. If the tally
    exceeds *cap* (``cap_bytes()`` when None), WebPs are deleted least-recently-used
    first (oldest mtime) until the total is at or under the cap. Files that can't be
    stat'ed or deleted are skipped.

    Returns ``{"before", "after", "removed", "cap"}``: WebP bytes before and after, the
    number of WebPs deleted, and the cap applied.
    """
    limit = cap_bytes() if cap is None else cap
    started = time.time()

    entries = []
    used = 0
    for entry in cache_dir().iterdir():
        if entry.name.startswith("."):
            continue
        ext = entry.suffix
        if ext == ".fail":
            try:
                expired = started - entry.stat().st_mtime >= _FAIL_TTL_S
                if expired:
                    entry.unlink()
            except OSError:
                pass
            continue
        if ext != ".webp" or not entry.is_file():
            continue
        try:
            info = entry.stat()
        except OSError:
            continue
        entries.append((info.st_mtime, info.st_size, entry))
        used += info.st_size

    size_before = used
    evicted = 0
    if used > limit:
        # Ascending mtime → least recently used goes first.
        for _mtime, nbytes, victim in sorted(entries):
            if used <= limit:
                break
            try:
                victim.unlink()
            except OSError:
                continue
            used -= nbytes
            evicted += 1
    return {"before": size_before, "after": used, "removed": evicted, "cap": limit}
|