2dc4419024
- newsimg.purge_source(): when a source leaves 'cache' (permission revoked / re-classified),
the admin image-policy endpoint now deletes that source's re-hosted copies immediately,
rather than leaving them inaccessible-but-on-disk. Endpoint returns {purged}.
- Admin "Engaged readers" carries a warm-up note: tracking began 2026-06-30, so low
rolling windows are partly warm-up, not all bots (compare d7 after a week, the window
after its full span). Guards against misreading "6 engaged vs 135 visits" as 129 bots.
Tests: purge_source removes only the target source's copies; endpoint reports purged.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
305 lines
12 KiB
Python
305 lines
12 KiB
Python
"""Local image cache + downscale for news article images.
|
|
|
|
Article images used to be hotlinked from the source, so a slow/flaky third-party CDN
|
|
left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy
|
|
to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache),
|
|
and the API serves only cache HITS — it never fetches, so the public endpoint has no
|
|
SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU
|
|
eviction, so it can't grow without limit no matter the ingest rate.
|
|
|
|
Security posture (the fetch runs only in the trusted cycle, but feed image URLs are
|
|
still externally supplied, so we treat them as untrusted):
|
|
* SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation
|
|
(same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets,
|
|
http(s) only, every redirect hop re-checked.
|
|
* Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and
|
|
anything undecodable is REJECTED (never retained as a same-origin file).
|
|
* Decompression-bomb + dimension guards.
|
|
* Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched
|
|
every cycle; transient network errors are not, so they retry.
|
|
Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no
|
|
two fetches race; writes are atomic (temp + rename) regardless.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import io
|
|
import os
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin, urlsplit
|
|
|
|
from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public
|
|
|
|
# Sent with every outbound image request so publishers can identify the fetcher.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}

# --- fetch / decode guards -------------------------------------------------
# Bodies smaller than this are treated as "not a real image" by fetch_and_cache.
_MIN_IMAGE_BYTES = 500
# Never pull an absurd original into memory.
_MAX_FETCH_BYTES = 20 * 1024**2
# Decompression-bomb ceiling (about 50 megapixels).
_MAX_PIXELS = 50_000_000
# Reject pathological single-axis dimensions.
_MAX_DIM = 12000

# --- output shape ----------------------------------------------------------
# Cards / feed never show wider than this, so stored copies are capped here.
DISPLAY_WIDTH = 800
WEBP_QUALITY = 80

# --- cache housekeeping ----------------------------------------------------
# 1 GB hard ceiling on the cache (override via env, see cap_bytes()).
DEFAULT_CAP_BYTES = 1024**3
# Don't refetch a definitively-bad URL for 3 days.
_FAIL_TTL_S = 3 * 24 * 60 * 60
|
|
|
|
|
|
def cache_dir() -> Path:
    """Return the image-cache directory, creating it (and parents) if missing.

    GOODNEWS_IMG_CACHE, when set and non-empty, names the directory outright.
    Otherwise it is `img_cache/` next to the database file named by GOODNEWS_DB
    (default `data/goodnews.sqlite3`)."""
    explicit = os.environ.get("GOODNEWS_IMG_CACHE")
    if explicit:
        folder = Path(explicit)
    else:
        db_file = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        folder = db_file.parent / "img_cache"
    folder.mkdir(parents=True, exist_ok=True)
    return folder
|
|
|
|
|
|
def cap_bytes() -> int:
    """Return the cache size ceiling in bytes.

    Reads GOODNEWS_IMG_CACHE_CAP; an unset or non-integer value falls back to
    DEFAULT_CAP_BYTES."""
    configured = os.environ.get("GOODNEWS_IMG_CACHE_CAP")
    if configured is None:
        return DEFAULT_CAP_BYTES
    try:
        return int(configured)
    except ValueError:
        return DEFAULT_CAP_BYTES
|
|
|
|
|
|
def _key(url: str) -> str:
    """Return the cache filename stem for a URL: the hex SHA-1 of its UTF-8 bytes."""
    encoded = url.encode("utf-8")
    return hashlib.sha1(encoded).hexdigest()
|
|
|
|
|
|
def display_url(article_id: int, image_policy: str | None, raw_url: str | None) -> str | None:
    """Pick the image URL the frontend should use, honoring the SOURCE's image policy.

    'cache'  → our locally-cached copy (/api/img/<id>) — only for sources we've cleared
               to re-host (open license / explicit permission / public-domain).
    'none'   → no image (typographic cover).
    anything else, including None → the publisher's own URL, hotlinked and never
               re-hosted. This is the conservative 'remote' default.

    Returns None when there is no raw URL or the policy is 'none'."""
    if not raw_url or image_policy == "none":
        return None
    return f"/api/img/{article_id}" if image_policy == "cache" else raw_url
|
|
|
|
|
|
class _FetchError(Exception):
    """Raised by the image fetcher when a URL cannot be retrieved.

    `permanent` tells the caller how to react: True → negative-cache the URL (won't be
    retried soon); False → transient, leave it eligible for retry."""

    def __init__(self, msg: str, permanent: bool) -> None:
        super().__init__(msg)
        # Read by fetch_and_cache to decide whether to write a .fail marker.
        self.permanent = permanent
|
|
|
|
|
|
def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
    """SSRF-safe fetch of an untrusted image URL.

    http(s) only; the host of every hop (including each redirect target) is re-checked
    with _host_is_public; at most MAX_REDIRECTS redirects are followed; at most
    _MAX_FETCH_BYTES + 1 bytes of body are read (the extra byte lets the caller detect
    an over-size body).

    Returns (body, content_type) where content_type is "" if the header is absent.

    Raises _FetchError for EVERY failure so callers need catch nothing else:
      * permanent=True  — policy refusals, malformed/invalid URLs, 4xx other than 429,
                          redirects with no Location, too many redirects;
      * permanent=False — network errors, protocol errors, 429/5xx, failed body reads.
    """
    # Function-scope import: only this block needs the http.client exception types.
    import http.client

    opener = urllib.request.build_opener(_NoRedirect)
    current = url
    for _ in range(MAX_REDIRECTS + 1):
        # urlsplit raises ValueError on e.g. an unbalanced IPv6 bracket; the URL is
        # externally supplied, so that must become a _FetchError, not a crash.
        try:
            parts = urlsplit(current)
            host = parts.hostname
        except ValueError as exc:
            raise _FetchError(f"malformed url: {current}", permanent=True) from exc
        if parts.scheme not in ("http", "https") or not _host_is_public(host):
            raise _FetchError(f"non-public or non-http(s): {current}", permanent=True)
        req = urllib.request.Request(current, headers=_UA)
        try:
            resp = opener.open(req, timeout=timeout)
        except urllib.error.HTTPError as exc:
            # _NoRedirect makes urllib RAISE on 3xx (rather than return a response), so
            # redirects arrive here. Re-validate the destination on the next loop. 4xx
            # (except 429) is a permanent miss → negative-cache; 429/5xx → transient.
            code = exc.code
            is_redirect = code in (301, 302, 303, 307, 308)
            loc = exc.headers.get("Location") if is_redirect else None
            # HTTPError wraps an open response; release it on every path.
            exc.close()
            if is_redirect:
                if not loc:
                    raise _FetchError("redirect without location", permanent=True) from exc
                try:
                    current = urljoin(current, loc)
                except ValueError as join_exc:
                    raise _FetchError(
                        f"malformed redirect: {loc}", permanent=True
                    ) from join_exc
                continue
            permanent = 400 <= code < 500 and code != 429
            raise _FetchError(f"http {code}", permanent=permanent) from exc
        except http.client.InvalidURL as exc:
            # e.g. control characters / spaces in the path — will never succeed.
            raise _FetchError(f"invalid url: {exc}", permanent=True) from exc
        except (
            urllib.error.URLError,
            http.client.HTTPException,  # not an OSError, so urllib does not wrap it
            OSError,
            ValueError,
        ) as exc:
            raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
        try:
            body = resp.read(_MAX_FETCH_BYTES + 1)
            ctype = resp.headers.get("Content-Type") or ""
        except (http.client.HTTPException, OSError) as exc:
            # Timeout / connection drop / IncompleteRead mid-body: retry next cycle.
            raise _FetchError(f"read failed: {exc}", permanent=False) from exc
        finally:
            resp.close()
        return body, ctype
    raise _FetchError("too many redirects", permanent=True)
|
|
|
|
|
|
def _encode(data: bytes) -> bytes | None:
    """Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode as WebP.

    Returns the WebP bytes, or None if the input isn't a decodable raster (e.g. SVG),
    exceeds the pixel/dimension ceilings, or fails at any step — the caller then
    REJECTS it (never stores arbitrary bytes).

    The EXIF Orientation tag is applied to the pixels before encoding: the WebP is
    saved without EXIF, so an original that relied on that tag would otherwise be
    stored rotated relative to how the publisher's copy displays."""
    try:
        from PIL import Image, ImageOps

        Image.MAX_IMAGE_PIXELS = _MAX_PIXELS  # backstop; Pillow only WARNS at this, raises ~2x
        im = Image.open(io.BytesIO(data))  # lazy: header (size) read without decoding pixels
        # Enforce the pixel/dimension ceiling BEFORE load() so a decompression bomb is never
        # actually decoded (Pillow's own MAX_IMAGE_PIXELS only warns at the threshold).
        if (
            im.width * im.height > _MAX_PIXELS
            or im.width > _MAX_DIM
            or im.height > _MAX_DIM
            or im.width < 1
            or im.height < 1
        ):
            return None
        im.load()  # decode now (also catches truncated data)
        # Bake the EXIF orientation into the pixels. Best-effort: a malformed EXIF block
        # must not cause an otherwise-decodable image to be rejected.
        try:
            im = ImageOps.exif_transpose(im) or im
        except Exception:  # noqa: BLE001 — keep the untransposed image
            pass
        if im.mode not in ("RGB", "RGBA"):
            # Palette images may carry transparency, so they go to RGBA as well.
            im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
        if im.width > DISPLAY_WIDTH:
            h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
            im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
        out = io.BytesIO()
        im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
        return out.getvalue()
    except Exception:  # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated …
        return None
|
|
|
|
|
|
def _fail_path(url: str) -> Path:
    """Return where the negative-cache marker (`<key>.fail`) for this URL lives."""
    marker_name = _key(url) + ".fail"
    return cache_dir() / marker_name
|
|
|
|
|
|
def _mark_failed(url: str) -> None:
    """Create (or refresh the mtime of) the URL's .fail marker. Best-effort: any
    OSError along the way is ignored."""
    try:
        marker = _fail_path(url)
        marker.touch()
    except OSError:
        return
|
|
|
|
|
|
def _failed_recently(url: str) -> bool:
    """True if the URL has a .fail marker younger than _FAIL_TTL_S. A missing or
    unreadable marker counts as "not failed"."""
    try:
        marked_at = _fail_path(url).stat().st_mtime
    except OSError:
        return False
    age = time.time() - marked_at
    return age < _FAIL_TTL_S
|
|
|
|
|
|
def path_for(url: str) -> Path | None:
    """Look up the cached WebP for this URL; None on a miss or an empty URL.

    A hit also gets its mtime bumped, since mtime is the LRU marker used for eviction.
    This is a pure cache lookup — it never fetches."""
    if not url:
        return None
    hit = cache_dir() / f"{_key(url)}.webp"
    if not hit.exists():
        return None
    try:
        os.utime(hit, None)  # touch → last-used time for LRU eviction
    except OSError:
        pass  # a failed touch must not turn a hit into a miss
    return hit
|
|
|
|
|
|
def fetch_and_cache(url: str | None) -> Path | None:
    """Fetch (SSRF-safe), downscale to WebP, and store atomically; return the cached path.

    CYCLE-ONLY — the API endpoint never calls this. Returns None on any failure.
    Definitive failures (permanent fetch error, body size out of range, not an encodable
    image) are negative-cached so they aren't retried every cycle; transient fetch
    errors and local write errors are not."""
    if not url or not url.startswith(("http://", "https://")):
        return None

    try:
        raw, _ctype = _safe_fetch(url)
    except _FetchError as exc:
        if exc.permanent:
            _mark_failed(url)
        return None

    # Size gate first; only an in-range body is handed to the decoder.
    size_ok = _MIN_IMAGE_BYTES <= len(raw) <= _MAX_FETCH_BYTES
    encoded = _encode(raw) if size_ok else None
    if encoded is None:  # wrong size / SVG / undecodable / bomb / bad dimensions
        _mark_failed(url)
        return None

    folder = cache_dir()
    stem = _key(url)
    final = folder / f"{stem}.webp"
    scratch = folder / f".{stem}.tmp"
    try:
        scratch.write_bytes(encoded)
        os.replace(scratch, final)  # atomic
    except OSError:
        # Don't leave a half-written temp file behind.
        try:
            scratch.unlink()
        except OSError:
            pass
        return None
    return final
|
|
|
|
|
|
def purge_source(conn, source_id: int) -> int:
    """Delete the cached files for every distinct, non-empty image URL of a source's articles.

    Called when a source leaves 'cache' policy (revoked permission / re-classified), so
    the re-hosted copies come down immediately rather than lingering inaccessible on
    disk. Both the .webp copy and any .fail marker are removed; only removed .webp
    files are counted in the return value. Per-file OSErrors are ignored."""
    rows = conn.execute(
        "SELECT DISTINCT image_url FROM articles WHERE source_id = ? "
        "AND image_url IS NOT NULL AND image_url != ''",
        (source_id,),
    ).fetchall()
    folder = cache_dir()
    removed = 0
    for row in rows:
        stem = _key(row[0])
        for suffix, counted in ((".webp", True), (".fail", False)):
            target = folder / f"{stem}{suffix}"
            try:
                if not target.exists():
                    continue
                target.unlink()
            except OSError:
                continue
            if counted:
                removed += 1
    return removed
|
|
|
|
|
|
def warm(conn, limit: int = 200) -> int:
    """Pre-fetch display copies so the API only ever has to serve cache hits.

    Considers up to `limit` distinct image URLs, newest article id first, from articles
    that are accepted, not marked as a duplicate, and whose source has
    image_policy='cache'. URLs that are already cached or were recently negative-cached
    are skipped. Returns how many were newly cached; with no source set to 'cache' the
    query matches nothing and this is a no-op."""
    rows = conn.execute(
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
        "JOIN sources src ON src.id = a.source_id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND src.image_policy='cache' "
        "AND a.image_url IS NOT NULL AND a.image_url != '' ORDER BY a.id DESC LIMIT ?",
        (limit,),
    ).fetchall()
    made = 0
    for row in rows:
        url = row[0]
        skip = path_for(url) or _failed_recently(url)
        if not skip and fetch_and_cache(url):
            made += 1
    return made
|
|
|
|
|
|
def prune(cap: int | None = None) -> dict:
    """Enforce the cache size ceiling and sweep expired .fail markers.

    `cap` is the ceiling in bytes; None means "use cap_bytes()". Dot-files (in-flight
    temp files) are ignored. While the total size of the .webp files exceeds the cap,
    the least-recently-used ones (oldest mtime first) are deleted. Returns
    {before, after, removed, cap}, with sizes in bytes and `removed` a count of WebPs."""
    limit = cap_bytes() if cap is None else cap
    now = time.time()

    candidates = []
    in_use = 0
    for entry in cache_dir().iterdir():
        if entry.name.startswith("."):
            continue
        if entry.suffix == ".fail":
            # Negative-cache marker: drop it once it has outlived its TTL.
            try:
                if now - entry.stat().st_mtime >= _FAIL_TTL_S:
                    entry.unlink()
            except OSError:
                pass
        elif entry.suffix == ".webp" and entry.is_file():
            try:
                info = entry.stat()
            except OSError:
                continue
            candidates.append((info.st_mtime, info.st_size, entry))
            in_use += info.st_size

    before = in_use
    removed = 0
    if in_use > limit:
        candidates.sort()  # oldest mtime first = least recently used
        for _mtime, size, victim in candidates:
            if in_use <= limit:
                break
            try:
                victim.unlink()
            except OSError:
                continue  # couldn't delete: its bytes still count
            in_use -= size
            removed += 1
    return {"before": before, "after": in_use, "removed": removed, "cap": limit}
|