8a3c00db3b
Stop hotlinking news images from third-party CDNs (the source of the "blank until
you refresh a few times" graphic). New goodnews/newsimg.py caches a downscaled WebP
display copy (≤800px) beside the DB, like art_cache:
- GET/HEAD /api/img/{article_id} — resolves id→image_url (allowlisted to our corpus,
not an open proxy), fetch+cache on first miss, serve local after, immutable headers.
- cycle warms display copies for recent accepted-with-image articles (so the FIRST
view is already local) and prunes to a hard size cap (default 1 GB) by LRU eviction.
Frontend now points at /api/img/<id>: the hub lead, every ArticleCard (feed hero +
cards), and the /a/<id> share page's visible image. og:image/twitter:image stay the
source URL so social crawlers fetch the canonical image directly.
Storage is bounded by construction — over the cap, least-recently-used files are
evicted, so it can't grow without limit regardless of ingest rate. Tests cover
fetch/downscale, cache-hit (no refetch), bad-scheme/non-image rejection, fetch
failure, LRU prune, warm, and the endpoint allowlist.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
188 lines
6.5 KiB
Python
188 lines
6.5 KiB
Python
"""Local image cache + downscale for news article images.

The hub, feed, and article pages used to hotlink each article's image_url straight
from the source's server, so a slow / rate-limited / flaky third-party CDN left a
blank graphic until a refresh. Instead we cache a downscaled display copy on our own
origin (beside the DB, like art_cache) and serve that. The cache is bounded by a HARD
size ceiling with LRU eviction (prune), so it can't grow without limit no matter the
ingest rate. Network + Pillow calls are isolated so tests can monkeypatch them.

Keyed by a hash of the source URL: a given image_url always maps to the same file.
The API resolves an article id -> its image_url (a tight allowlist — we only ever
fetch URLs already in our own corpus, so it is not an open proxy)."""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import io
|
|
import os
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
# --- Fetching ---------------------------------------------------------------
# Header sent with every outbound image request.
_UA = {
    "User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)",
}
# Downloads shorter than this many bytes are rejected by get_or_fetch.
_MIN_IMAGE_BYTES = 500
# Never pull an absurd original into memory: 20 MiB ceiling per download.
_MAX_FETCH_BYTES = 20 * 1024**2

# --- Display copy -----------------------------------------------------------
# Cards / feed never show wider than this, so wider originals are downscaled.
DISPLAY_WIDTH = 800
# Quality setting passed to the WebP encoder.
WEBP_QUALITY = 80

# --- Cache size -------------------------------------------------------------
# 1 GB hard ceiling for the whole cache (override via GOODNEWS_IMG_CACHE_CAP).
DEFAULT_CAP_BYTES = 1024**3
|
|
|
|
|
|
def cache_dir() -> Path:
    """Return the directory that holds cached images, creating it if needed.

    A non-empty ``GOODNEWS_IMG_CACHE`` environment variable names the directory
    outright. Otherwise the cache is an ``img_cache`` folder next to the database
    file named by ``GOODNEWS_DB`` (default ``data/goodnews.sqlite3``), so the host
    cycle and the API container share the same mounted volume (mirrors
    art.cache_dir).
    """
    location = os.environ.get("GOODNEWS_IMG_CACHE")
    if not location:
        # No explicit override (unset or empty): live beside the database file.
        db_file = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        location = db_file.parent / "img_cache"
    root = Path(location)
    root.mkdir(parents=True, exist_ok=True)
    return root
|
|
|
|
|
|
def cap_bytes() -> int:
    """Return the cache size ceiling in bytes.

    Reads ``GOODNEWS_IMG_CACHE_CAP`` from the environment; when it is unset or is
    not a valid integer, ``DEFAULT_CAP_BYTES`` is used instead.
    """
    configured = os.environ.get("GOODNEWS_IMG_CACHE_CAP")
    if configured is None:
        return DEFAULT_CAP_BYTES
    try:
        return int(configured)
    except ValueError:
        # Unparseable override (e.g. empty string or "1GB"): fall back silently.
        return DEFAULT_CAP_BYTES
|
|
|
|
|
|
def _key(url: str) -> str:
    """Return the cache key for ``url``: the SHA-1 hex digest of its UTF-8 bytes.

    The same URL always yields the same key, so it always maps to the same file.
    """
    digest = hashlib.sha1()
    digest.update(url.encode("utf-8"))
    return digest.hexdigest()
|
|
|
|
|
|
def _http_bytes(url: str, timeout: int = 12) -> tuple[bytes, str]:
    """Download ``url`` and return ``(body, content_type)``.

    The request carries the module's User-Agent header. At most
    ``_MAX_FETCH_BYTES + 1`` bytes are read, so an oversized original is never
    pulled fully into memory, and the one extra byte lets the caller detect that
    the limit was exceeded. ``content_type`` is ``""`` when the response has no
    Content-Type header. Errors raised by ``urlopen`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read(_MAX_FETCH_BYTES + 1)
        content_type = response.headers.get("Content-Type") or ""
    return body, content_type
|
|
|
|
|
|
def _encode(data: bytes) -> bytes | None:
    """Downscale to DISPLAY_WIDTH and re-encode as WebP.

    Returns the WebP bytes, or None if ``data`` isn't a decodable raster image
    (e.g. SVG) or Pillow is unavailable — the caller then stores the original
    bytes as-is.

    The EXIF orientation is applied to the pixels before resizing. The WebP output
    is saved without EXIF metadata, so an image that relied on its orientation tag
    would otherwise be displayed rotated.
    """
    try:
        from PIL import Image, ImageOps

        im = Image.open(io.BytesIO(data))
        im.load()
        # Bake the EXIF orientation into the pixels; do this before measuring the
        # width, because a 90-degree rotation swaps width and height.
        upright = ImageOps.exif_transpose(im)
        if upright is not None:
            im = upright
        if im.mode not in ("RGB", "RGBA"):
            # Keep transparency for alpha/palette images; flatten the rest to RGB.
            im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
        if im.width > DISPLAY_WIDTH:
            # Preserve the aspect ratio; never let the height collapse to zero.
            h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
            im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
        out = io.BytesIO()
        im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
        return out.getvalue()
    except Exception:  # noqa: BLE001 — not a decodable raster image
        return None
|
|
|
|
|
|
def _ext_for(ctype: str) -> str:
    """Pick a file extension from a Content-Type value (case-insensitive).

    Returns ``.png``, ``.gif``, ``.svg`` or ``.webp`` when that word appears in
    ``ctype`` (checked in that order), and ``.jpg`` for anything else.
    """
    lowered = ctype.lower()
    for marker in ("png", "gif", "svg", "webp"):
        if marker in lowered:
            return "." + marker
    return ".jpg"
|
|
|
|
|
|
def path_for(url: str) -> Path | None:
    """Return the cached file for ``url`` if one exists, else None.

    A hit also has its mtime bumped to "now": mtime is the last-used marker that
    LRU eviction sorts by. A failure to touch the file is ignored and the path is
    still returned.
    """
    cached = next(cache_dir().glob(_key(url) + ".*"), None)
    if cached is None:
        return None
    try:
        os.utime(cached, None)  # touch -> last-used time for LRU eviction
    except OSError:
        pass
    return cached
|
|
|
|
|
|
def get_or_fetch(url: str | None) -> Path | None:
    """Cached display copy for a source image URL, fetching + caching on first miss.

    Only ``http://`` / ``https://`` URLs are accepted. On a miss the original is
    downloaded, rejected if it is smaller than ``_MIN_IMAGE_BYTES`` or larger than
    ``_MAX_FETCH_BYTES``, then stored either as a downscaled WebP or — when it can't
    be re-encoded but the server declared an ``image/*`` type — as the original
    bytes.

    The write is atomic (temp file, then rename) so a reader never sees a half-file.
    Each writer uses its own uniquely named temp file, so concurrent fetches of the
    same URL cannot truncate each other's temp file or publish a partial one.

    Returns the cached path, or None on any failure so callers can degrade
    gracefully.
    """
    if not url or not url.startswith(("http://", "https://")):
        return None
    hit = path_for(url)
    if hit:
        return hit
    try:
        data, ctype = _http_bytes(url)
    except Exception:  # noqa: BLE001 — source down/slow/blocked
        return None
    if not _MIN_IMAGE_BYTES <= len(data) <= _MAX_FETCH_BYTES:
        return None
    encoded = _encode(data)
    if encoded is not None:
        blob, ext = encoded, ".webp"
    elif ctype.strip().lower().startswith("image/"):
        # Couldn't re-encode (e.g. SVG): keep the original. Media types are
        # case-insensitive, so normalize before checking the prefix.
        # NOTE(review): these bytes are stored unmodified — confirm the serving
        # endpoint's headers are safe for untrusted SVG.
        blob, ext = data, _ext_for(ctype)
    else:
        return None
    key = _key(url)
    cdir = cache_dir()
    # Leading dot keeps the temp file out of path_for's glob and prune's accounting;
    # the pid + random suffix makes it unique to this writer.
    tmp = cdir / f".{key}.{os.getpid()}.{os.urandom(4).hex()}.tmp"
    dest = cdir / f"{key}{ext}"
    try:
        tmp.write_bytes(blob)
        os.replace(tmp, dest)  # atomic
    except OSError:
        try:
            tmp.unlink()
        except OSError:
            pass
        return None
    return dest
|
|
|
|
|
|
def warm(conn, limit: int = 200) -> int:
    """Pre-fetch display copies for the newest accepted articles that have an image.

    Selects up to ``limit`` distinct image URLs of accepted, non-duplicate articles
    (newest article id first), skips URLs that are already cached, and fetches the
    rest so the FIRST page view is already a local hit. Returns how many images
    were newly cached.
    """
    query = (
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
        "AND a.image_url != '' ORDER BY a.id DESC LIMIT ?"
    )
    candidates = [row[0] for row in conn.execute(query, (limit,)).fetchall()]
    # Count only URLs that were not cached beforehand AND fetched successfully;
    # the `and` short-circuit means cached URLs never reach get_or_fetch.
    return sum(
        1 for image_url in candidates
        if not path_for(image_url) and get_or_fetch(image_url)
    )
|
|
|
|
|
|
def prune(cap: int | None = None) -> dict:
    """Enforce the size ceiling by deleting least-recently-used cache files.

    ``cap`` is the ceiling in bytes; when None, ``cap_bytes()`` supplies it. Only
    regular files whose names don't start with a dot are counted. If their total
    size exceeds the cap, files are deleted oldest-mtime-first until the total is
    at or under the cap. Files that can't be stat'ed or deleted are skipped.

    Returns ``{"before", "after", "removed", "cap"}``: total bytes before and after
    pruning, the number of files deleted, and the cap that was enforced.
    """
    limit = cap_bytes() if cap is None else cap

    entries = []
    for entry in cache_dir().iterdir():
        if not (entry.is_file() and not entry.name.startswith(".")):
            continue
        try:
            info = entry.stat()
        except OSError:
            continue
        entries.append((info.st_mtime, info.st_size, entry))

    before = sum(size for _mtime, size, _path in entries)
    remaining, removed = before, 0
    if remaining > limit:
        entries.sort()  # oldest mtime first = least recently used
        for _mtime, size, victim in entries:
            if remaining <= limit:
                break
            try:
                victim.unlink()
            except OSError:
                # Couldn't delete: its bytes still count, move on to the next file.
                continue
            remaining -= size
            removed += 1
    return {"before": before, "after": remaining, "removed": removed, "cap": limit}
|