Files
upbeatBytes/goodnews/newsimg.py
T
thejayman77 2dc4419024 images/analytics: purge on policy revoke + engagement warm-up note (Codex close-out)
- newsimg.purge_source(): when a source leaves 'cache' (permission revoked / re-classified),
  the admin image-policy endpoint now deletes that source's re-hosted copies immediately,
  rather than leaving them inaccessible-but-on-disk. Endpoint returns {purged}.
- Admin "Engaged readers" carries a warm-up note: tracking began 2026-06-30, so low
  rolling windows are partly warm-up, not all bots (compare d7 after a week, the window
  after its full span). Guards against misreading "6 engaged vs 135 visits" as 129 bots.
Tests: purge_source removes only the target source's copies; endpoint reports purged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 14:29:55 -04:00

305 lines
12 KiB
Python

"""Local image cache + downscale for news article images.
Article images used to be hotlinked from the source, so a slow/flaky third-party CDN
left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy
to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache),
and the API serves only cache HITS — it never fetches, so the public endpoint has no
SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU
eviction, so it can't grow without limit no matter the ingest rate.
Security posture (the fetch runs only in the trusted cycle, but feed image URLs are
still externally supplied, so we treat them as untrusted):
* SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation
(same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets,
http(s) only, every redirect hop re-checked.
* Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and
anything undecodable is REJECTED (never retained as a same-origin file).
* Decompression-bomb + dimension guards.
* Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched
every cycle; transient network errors are not, so they retry.
Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no
two fetches race; writes are atomic (temp + rename) regardless.
"""
from __future__ import annotations
import hashlib
import io
import os
import time
import urllib.error
import urllib.request
from pathlib import Path
from urllib.parse import urljoin, urlsplit
from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public
# Request headers sent with every image fetch (see _safe_fetch).
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
# fetch_and_cache rejects (and negative-caches) any body shorter than this many bytes.
_MIN_IMAGE_BYTES = 500
_MAX_FETCH_BYTES = 20 * 1024 * 1024  # never pull an absurd original into memory
_MAX_PIXELS = 50_000_000  # decompression-bomb ceiling (≈50 MP)
_MAX_DIM = 12000  # reject pathological single-axis dimensions
DISPLAY_WIDTH = 800  # cards / feed never show wider than this
# Value passed as `quality` when the downscaled image is saved as WebP (see _encode).
WEBP_QUALITY = 80
# Used by cap_bytes() when GOODNEWS_IMG_CACHE_CAP is unset or not an integer.
DEFAULT_CAP_BYTES = 1024 * 1024 * 1024  # 1 GB hard ceiling (override via env)
_FAIL_TTL_S = 3 * 24 * 3600  # don't refetch a definitively-bad URL for 3 days
def cache_dir() -> Path:
    """Return the image-cache directory, creating it (and any parents) if missing.

    A non-empty ``GOODNEWS_IMG_CACHE`` is used verbatim. Otherwise the cache is the
    ``img_cache`` folder beside the database file named by ``GOODNEWS_DB`` (default
    ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_IMG_CACHE")
    if explicit:
        target = Path(explicit)
    else:
        db_file = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_file.parent / "img_cache"
    target.mkdir(parents=True, exist_ok=True)
    return target
def cap_bytes() -> int:
    """Return the cache size ceiling in bytes.

    Reads ``GOODNEWS_IMG_CACHE_CAP``; when it is unset, or set to something ``int()``
    rejects with ``ValueError``, ``DEFAULT_CAP_BYTES`` is returned instead.
    """
    configured = os.environ.get("GOODNEWS_IMG_CACHE_CAP", DEFAULT_CAP_BYTES)
    try:
        ceiling = int(configured)
    except ValueError:
        ceiling = DEFAULT_CAP_BYTES
    return ceiling
def _key(url: str) -> str:
    """Return the cache key for *url*: the hex SHA-1 digest of its UTF-8 encoding."""
    digest = hashlib.sha1()
    digest.update(url.encode("utf-8"))
    return digest.hexdigest()
def display_url(article_id: int, image_policy: str | None, raw_url: str | None) -> str | None:
    """Pick the image URL the frontend should use, honoring the SOURCE's image policy.

    'cache'  → our locally-cached copy (``/api/img/<id>``) — only for sources we've
               cleared to re-host (open license / explicit permission / public-domain).
    'none'   → no image (typographic cover).
    anything else, including the conservative default 'remote' → the publisher's own
               URL: hotlinked, never re-hosted.

    Returns None when the article has no image URL or the policy is 'none'.
    """
    if not raw_url or image_policy == "none":
        return None
    if image_policy == "cache":
        return f"/api/img/{article_id}"
    # 'remote' (default) — hand back the publisher's URL untouched.
    return raw_url
class _FetchError(Exception):
    """A failed image fetch, tagged with how the caller should react.

    ``permanent`` is True when the failure should be negative-cached (won't be retried
    soon) and False when it is transient and worth retrying.
    """

    def __init__(self, msg: str, permanent: bool):
        Exception.__init__(self, msg)
        self.permanent = permanent
def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
    """SSRF-safe fetch of an untrusted image URL.

    http(s) only. The original URL and every redirect destination are checked with
    ``_host_is_public`` before being requested, at most ``MAX_REDIRECTS`` redirects are
    followed, and the body read is capped at ``_MAX_FETCH_BYTES + 1`` bytes (the extra
    byte lets the caller tell an over-size body from one exactly at the limit).

    Args:
        url: externally supplied image URL.
        timeout: timeout passed to the opener for each request.

    Returns:
        ``(body, content_type)``; ``content_type`` is ``""`` when the header is absent.

    Raises:
        _FetchError: ``permanent=True`` for policy refusals (malformed URL, non-http(s)
            scheme, non-public host, redirect without Location, too many redirects,
            4xx other than 429); ``permanent=False`` for everything transient (429,
            5xx, connection / protocol / read errors). No other exception type is
            meant to escape, so one bad URL cannot abort the caller's loop.
    """
    # Local import: http.client.HTTPException (BadStatusLine, IncompleteRead, ...) is NOT
    # an OSError subclass, so it has to be named explicitly in the handlers below.
    import http.client

    opener = urllib.request.build_opener(_NoRedirect)
    current = url
    for _ in range(MAX_REDIRECTS + 1):
        try:
            parts = urlsplit(current)
            host = parts.hostname
        except ValueError as exc:
            # e.g. unbalanced IPv6 brackets — the URL itself is broken, retrying won't help.
            raise _FetchError(f"malformed url: {current}", permanent=True) from exc
        if parts.scheme not in ("http", "https") or not _host_is_public(host):
            raise _FetchError(f"non-public or non-http(s): {current}", permanent=True)
        req = urllib.request.Request(current, headers=_UA)
        try:
            resp = opener.open(req, timeout=timeout)
        except urllib.error.HTTPError as exc:
            # _NoRedirect makes urllib RAISE on 3xx (rather than return a response), so
            # redirects arrive here. Re-validate the destination on the next loop. 4xx
            # (except 429) is a permanent miss → negative-cache; 429/5xx → transient.
            code = exc.code
            loc = exc.headers.get("Location")
            exc.close()  # release the error response on every path, not just redirects
            if code in (301, 302, 303, 307, 308):
                if not loc:
                    raise _FetchError("redirect without location", permanent=True) from exc
                current = urljoin(current, loc)
                continue
            permanent = 400 <= code < 500 and code != 429
            raise _FetchError(f"http {code}", permanent=permanent) from exc
        except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError) as exc:
            raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
        try:
            ctype = resp.headers.get("Content-Type") or ""
            body = resp.read(_MAX_FETCH_BYTES + 1)
        except (http.client.HTTPException, OSError) as exc:
            # Timeout / reset / truncated body while streaming: transient, retry later.
            raise _FetchError(f"read failed: {exc}", permanent=False) from exc
        finally:
            resp.close()
        return body, ctype
    raise _FetchError("too many redirects", permanent=True)
def _encode(data: bytes) -> bytes | None:
    """Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode it as WebP.

    Returns the WebP bytes, or None if the input isn't a decodable raster (e.g. SVG),
    exceeds the pixel / single-axis ceilings, or fails anywhere during processing — the
    caller then REJECTS it (never stores arbitrary bytes).

    The EXIF orientation is applied to the pixels before encoding: the WebP is saved
    without any EXIF payload, so an orientation tag left unapplied would be lost and a
    rotated photo would be stored sideways.
    """
    try:
        from PIL import Image, ImageOps

        Image.MAX_IMAGE_PIXELS = _MAX_PIXELS  # backstop; Pillow only WARNS at this, raises ~2x
        im = Image.open(io.BytesIO(data))  # lazy: header (size) read without decoding pixels
        # Enforce the pixel/dimension ceiling BEFORE load() so a decompression bomb is never
        # actually decoded (Pillow's own MAX_IMAGE_PIXELS only warns at the threshold).
        if (im.width * im.height > _MAX_PIXELS or im.width > _MAX_DIM or im.height > _MAX_DIM
                or im.width < 1 or im.height < 1):
            return None
        im.load()  # decode now (also catches truncated data)
        # Best-effort: bake the EXIF orientation into the pixels. A malformed EXIF block
        # (or a Pillow without exif_transpose) must not cost us an otherwise good image,
        # so any failure here just keeps the untransposed pixels.
        try:
            upright = ImageOps.exif_transpose(im)
            if upright is not None:
                im = upright
        except Exception:  # noqa: BLE001 — orientation is cosmetic, never fatal
            pass
        if im.mode not in ("RGB", "RGBA"):
            im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
        # Width is checked AFTER the transpose, since a 90° rotation swaps the axes.
        if im.width > DISPLAY_WIDTH:
            h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
            im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
        out = io.BytesIO()
        im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
        return out.getvalue()
    except Exception:  # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated …
        return None
def _fail_path(url: str) -> Path:
    """Return where the negative-cache marker for *url* lives: ``<cache_dir>/<key>.fail``."""
    directory = cache_dir()
    return directory / (_key(url) + ".fail")
def _mark_failed(url: str) -> None:
    """Negative-cache *url* by touching its ``.fail`` marker (best-effort).

    An OSError while locating or touching the marker is ignored.
    """
    try:
        marker = _fail_path(url)
        marker.touch()
    except OSError:
        return
def _failed_recently(url: str) -> bool:
    """Report whether *url* has a ``.fail`` marker younger than ``_FAIL_TTL_S`` seconds.

    A missing or unreadable marker (any OSError) counts as "not failed".
    """
    now = time.time()
    try:
        marked_at = _fail_path(url).stat().st_mtime
    except OSError:
        return False
    return (now - marked_at) < _FAIL_TTL_S
def path_for(url: str) -> Path | None:
    """Look up the cached WebP for *url*; a pure cache lookup that never fetches.

    On a hit the file's mtime is bumped (it is the LRU marker used for eviction) and the
    path is returned. Returns None for an empty URL or a cache miss.
    """
    if not url:
        return None
    cached = cache_dir() / f"{_key(url)}.webp"
    if not cached.exists():
        return None
    try:
        os.utime(cached, None)  # touch → last-used time for LRU eviction
    except OSError:
        pass  # a failed touch must not turn a hit into a miss
    return cached
def fetch_and_cache(url: str | None) -> Path | None:
    """Fetch (SSRF-safe), downscale to WebP, and store the result atomically.

    CYCLE-ONLY — the API endpoint never calls this. Returns the cached file's path, or
    None on any failure. Definitive failures (a permanent fetch error, a body outside the
    allowed size range, an image that cannot be encoded) are negative-cached so they
    aren't retried every cycle; transient fetch errors and local write errors are not.
    """
    if not url:
        return None
    if not url.startswith(("http://", "https://")):
        return None

    try:
        payload, _content_type = _safe_fetch(url)
    except _FetchError as err:
        if err.permanent:
            _mark_failed(url)
        return None

    # Too small / too large bodies are never handed to the encoder.
    size_ok = _MIN_IMAGE_BYTES <= len(payload) <= _MAX_FETCH_BYTES
    webp = _encode(payload) if size_ok else None
    if webp is None:  # bad size / SVG / undecodable / bomb / bad dimensions
        _mark_failed(url)
        return None

    directory = cache_dir()
    digest = _key(url)
    scratch = directory / f".{digest}.tmp"
    final = directory / f"{digest}.webp"
    try:
        scratch.write_bytes(webp)
        os.replace(scratch, final)  # atomic
    except OSError:
        # Don't leave a partial temp file behind; the URL stays eligible for retry.
        try:
            scratch.unlink()
        except OSError:
            pass
        return None
    return final
def purge_source(conn, source_id: int) -> int:
    """Remove every cached file belonging to one source's article image URLs.

    Called when a source leaves 'cache' policy (revoked permission / re-classified), so
    the re-hosted copies come down immediately rather than lingering inaccessible on
    disk. Both the ``.webp`` copy and the ``.fail`` marker of each URL are deleted;
    per-file OSErrors are ignored.

    Returns the number of ``.webp`` files actually deleted (markers are not counted).
    """
    rows = conn.execute(
        "SELECT DISTINCT image_url FROM articles WHERE source_id = ? "
        "AND image_url IS NOT NULL AND image_url != ''",
        (source_id,),
    ).fetchall()
    cdir = cache_dir()
    removed = 0
    for row in rows:
        digest = _key(row[0])
        # (file, counts-toward-result) — only re-hosted copies are reported.
        targets = (
            (cdir / f"{digest}.webp", True),
            (cdir / f"{digest}.fail", False),
        )
        for target, counted in targets:
            try:
                if not target.exists():
                    continue
                target.unlink()
            except OSError:
                continue
            if counted:
                removed += 1
    return removed
def warm(conn, limit: int = 200) -> int:
    """Pre-fetch display copies so the API only ever serves cache hits.

    Candidates are the distinct image URLs of the newest ACCEPTED, CANONICAL (not a
    duplicate) articles whose SOURCE has image_policy='cache', bounded by *limit*.
    URLs that are already cached or were negative-cached recently are skipped.

    Returns how many images were newly cached. Only sources whose image_policy is
    'cache' are selected, so sources on any other policy are never fetched here.
    """
    query = (
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
        "JOIN sources src ON src.id = a.source_id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND src.image_policy='cache' "
        "AND a.image_url IS NOT NULL AND a.image_url != '' ORDER BY a.id DESC LIMIT ?"
    )
    candidates = (row[0] for row in conn.execute(query, (limit,)).fetchall())
    # Short-circuiting keeps the per-URL order: cache lookup, failure check, then fetch.
    return sum(
        1
        for url in candidates
        if not path_for(url) and not _failed_recently(url) and fetch_and_cache(url)
    )
def prune(cap: int | None = None) -> dict:
    """Enforce the cache size ceiling and sweep expired ``.fail`` markers.

    Dot-files are ignored. ``.fail`` markers at least ``_FAIL_TTL_S`` seconds old are
    deleted. If the ``.webp`` files together exceed *cap* (default: ``cap_bytes()``),
    they are deleted least-recently-used first (oldest mtime) until the total is at or
    under the cap. Files that cannot be stat'ed or unlinked are skipped.

    Returns ``{"before", "after", "removed", "cap"}``: total WebP bytes before and after,
    the number of WebPs deleted, and the cap that was applied.
    """
    if cap is None:
        cap = cap_bytes()
    now = time.time()
    webps = []
    total = 0
    for entry in cache_dir().iterdir():
        if entry.name.startswith("."):
            continue
        suffix = entry.suffix
        if suffix == ".fail":
            try:
                age = now - entry.stat().st_mtime
                if age >= _FAIL_TTL_S:
                    entry.unlink()
            except OSError:
                pass
            continue
        if suffix != ".webp" or not entry.is_file():
            continue
        try:
            info = entry.stat()
        except OSError:
            continue
        webps.append((info.st_mtime, info.st_size, entry))
        total += info.st_size

    before = total
    removed = 0
    if total > cap:
        # Ascending mtime = least recently used first.
        for _mtime, size, victim in sorted(webps):
            if total <= cap:
                break
            try:
                victim.unlink()
            except OSError:
                continue
            total -= size
            removed += 1
    return {"before": before, "after": total, "removed": removed, "cap": cap}