images: harden the cache per Codex audit (SSRF-safe, cache-only endpoint, WebP-only)
Blocker fixes for the image cache:
- /api/img/{id} now serves cache HITS ONLY and is restricted to ACCEPTED, CANONICAL
articles. It never fetches — the cycle (newsimg.warm) owns all fetching — so the
public endpoint has no SSRF/worker-exhaustion surface. Dropped 1-year immutable
caching (image_url can change) → public, max-age=86400.
- newsimg._safe_fetch: SSRF-safe (reuses enrich._host_is_public + _NoRedirect, http(s)
only, every redirect hop re-validated, body capped). _FetchError distinguishes
permanent refusals (negative-cached via a .fail marker) from transient errors (retry).
- _encode re-encodes only decoded RASTER images to WebP and REJECTS everything else
(SVG, undecodable, decompression bombs via MAX_IMAGE_PIXELS, pathological dimensions);
originals are never retained. prune() also sweeps stale .fail markers.
- Concurrency: fetching only runs inside the cycle lock; writes stay atomic.
Smaller fixes:
- share.py visible image has onerror→this.remove() (degrade to the text unfurl, no
broken icon when an image isn't cached yet).
- share-page Back follows history only on a SAME-ORIGIN referrer (never bounce to an
external site); menu now honors Escape + resets crossing back to desktop (HubBar parity).
Tests: private host, redirect-to-private, hostile SVG/non-image, transient-vs-permanent
failure, LRU prune, warm (accepted+canonical only, idempotent), cache-only endpoint
(404 on not-cached/unaccepted/duplicate, never fetches), share chrome parity. 441 pass.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+15
-13
@@ -2339,22 +2339,24 @@ def create_app() -> FastAPI:
|
||||
|
||||
@app.api_route("/api/img/{article_id}", methods=["GET", "HEAD"])
|
||||
def article_image(article_id: int) -> FileResponse:
|
||||
"""Serve a locally-cached, downscaled copy of an article's image instead of
|
||||
hotlinking the source. Allowlisted by construction: the id resolves to a URL
|
||||
already in our corpus (no open proxy). A miss fetches+caches once; a hard
|
||||
failure 404s, which the frontend handles (retry, then the typo cover)."""
|
||||
"""Serve the locally-cached, downscaled WebP for an accepted, canonical article.
|
||||
Cache HITS ONLY — this never fetches (the cycle owns all fetching via newsimg.warm),
|
||||
so the public endpoint has no SSRF/worker-exhaustion surface. A miss 404s; the
|
||||
frontend handles that (retry, then the typographic cover)."""
|
||||
with get_conn() as conn:
|
||||
row = conn.execute("SELECT image_url FROM articles WHERE id = ?", (article_id,)).fetchone()
|
||||
row = conn.execute(
|
||||
"SELECT a.image_url FROM articles a JOIN article_scores s ON s.article_id = a.id "
|
||||
"WHERE a.id = ? AND s.accepted = 1 AND a.duplicate_of IS NULL",
|
||||
(article_id,),
|
||||
).fetchone()
|
||||
url = row["image_url"] if row else None
|
||||
if not url:
|
||||
raise HTTPException(status_code=404, detail="no image for this article")
|
||||
path = newsimg.get_or_fetch(url)
|
||||
path = newsimg.path_for(url) if url else None
|
||||
if not path:
|
||||
raise HTTPException(status_code=404, detail="image unavailable")
|
||||
media = "image/webp" if path.suffix == ".webp" else None
|
||||
# Stable per article id (image_url doesn't change) → cache hard at the edge/browser.
|
||||
return FileResponse(str(path), media_type=media,
|
||||
headers={"Cache-Control": "public, max-age=31536000, immutable"})
|
||||
raise HTTPException(status_code=404, detail="image not cached")
|
||||
# NOT immutable: a re-enriched article can change image_url, so the bytes behind a
|
||||
# given id can change. A day's browser cache is plenty (we're direct-origin, no CDN).
|
||||
return FileResponse(str(path), media_type="image/webp",
|
||||
headers={"Cache-Control": "public, max-age=86400"})
|
||||
|
||||
@app.api_route("/api/art/image/{object_id}", methods=["GET", "HEAD"])
|
||||
def art_image(object_id: int, size: str = Query("")) -> FileResponse:
|
||||
|
||||
+127
-63
@@ -1,40 +1,53 @@
|
||||
"""Local image cache + downscale for news article images.
|
||||
|
||||
The hub, feed, and article pages used to hotlink each article's image_url straight
|
||||
from the source's server, so a slow / rate-limited / flaky third-party CDN left a
|
||||
blank graphic until a refresh. Instead we cache a downscaled display copy on our own
|
||||
origin (beside the DB, like art_cache) and serve that. The cache is bounded by a HARD
|
||||
size ceiling with LRU eviction (prune), so it can't grow without limit no matter the
|
||||
ingest rate. Network + Pillow calls are isolated so tests can monkeypatch them.
|
||||
Article images used to be hotlinked from the source, so a slow/flaky third-party CDN
|
||||
left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy
|
||||
to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache),
|
||||
and the API serves only cache HITS — it never fetches, so the public endpoint has no
|
||||
SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU
|
||||
eviction, so it can't grow without limit no matter the ingest rate.
|
||||
|
||||
Keyed by a hash of the source URL: a given image_url always maps to the same file.
|
||||
The API resolves an article id -> its image_url (a tight allowlist — we only ever
|
||||
fetch URLs already in our own corpus, so it is not an open proxy)."""
|
||||
Security posture (the fetch runs only in the trusted cycle, but feed image URLs are
|
||||
still externally supplied, so we treat them as untrusted):
|
||||
* SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation
|
||||
(same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets,
|
||||
http(s) only, every redirect hop re-checked.
|
||||
* Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and
|
||||
anything undecodable is REJECTED (never retained as a same-origin file).
|
||||
* Decompression-bomb + dimension guards.
|
||||
* Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched
|
||||
every cycle; transient network errors are not, so they retry.
|
||||
Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no
|
||||
two fetches race; writes are atomic (temp + rename) regardless.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlsplit
|
||||
|
||||
from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public
|
||||
|
||||
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
|
||||
_MIN_IMAGE_BYTES = 500
|
||||
_MAX_FETCH_BYTES = 20 * 1024 * 1024 # never pull an absurd original into memory
|
||||
_MAX_PIXELS = 50_000_000 # decompression-bomb ceiling (≈50 MP)
|
||||
_MAX_DIM = 12000 # reject pathological single-axis dimensions
|
||||
DISPLAY_WIDTH = 800 # cards / feed never show wider than this
|
||||
WEBP_QUALITY = 80
|
||||
DEFAULT_CAP_BYTES = 1024 * 1024 * 1024 # 1 GB hard ceiling (override via env)
|
||||
_FAIL_TTL_S = 3 * 24 * 3600 # don't refetch a definitively-bad URL for 3 days
|
||||
|
||||
|
||||
def cache_dir() -> Path:
|
||||
"""Where cached images live — beside the DB, so the host cycle writes and the API
|
||||
container reads the same mounted volume (mirrors art.cache_dir)."""
|
||||
override = os.environ.get("GOODNEWS_IMG_CACHE")
|
||||
if override:
|
||||
d = Path(override)
|
||||
else:
|
||||
db = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
|
||||
d = db.parent / "img_cache"
|
||||
d = Path(override) if override else db.parent / "img_cache"
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
return d
|
||||
|
||||
@@ -50,19 +63,54 @@ def _key(url: str) -> str:
|
||||
return hashlib.sha1(url.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _http_bytes(url: str, timeout: int = 12) -> tuple[bytes, str]:
|
||||
req = urllib.request.Request(url, headers=_UA)
|
||||
with urllib.request.urlopen(req, timeout=timeout) as r:
|
||||
return r.read(_MAX_FETCH_BYTES + 1), (r.headers.get("Content-Type") or "")
|
||||
class _FetchError(Exception):
|
||||
"""permanent=True → negative-cache (won't retry soon); False → transient, retry."""
|
||||
def __init__(self, msg: str, permanent: bool):
|
||||
super().__init__(msg)
|
||||
self.permanent = permanent
|
||||
|
||||
|
||||
def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
|
||||
"""SSRF-safe fetch of an untrusted image URL: http(s) only, every redirect hop
|
||||
re-validated against public IPs, bounded redirects, body capped. Raises _FetchError
|
||||
(permanent for policy refusals, transient for network errors)."""
|
||||
opener = urllib.request.build_opener(_NoRedirect)
|
||||
current = url
|
||||
for _ in range(MAX_REDIRECTS + 1):
|
||||
parts = urlsplit(current)
|
||||
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
|
||||
raise _FetchError(f"non-public or non-http(s): {current}", permanent=True)
|
||||
req = urllib.request.Request(current, headers=_UA)
|
||||
try:
|
||||
resp = opener.open(req, timeout=timeout)
|
||||
except (urllib.error.URLError, OSError, ValueError) as exc:
|
||||
raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
|
||||
status = getattr(resp, "status", 200) or 200
|
||||
if status in (301, 302, 303, 307, 308):
|
||||
loc = resp.headers.get("Location")
|
||||
resp.close()
|
||||
if not loc:
|
||||
raise _FetchError("redirect without location", permanent=True)
|
||||
current = urljoin(current, loc)
|
||||
continue
|
||||
try:
|
||||
return resp.read(_MAX_FETCH_BYTES + 1), (resp.headers.get("Content-Type") or "")
|
||||
finally:
|
||||
resp.close()
|
||||
raise _FetchError("too many redirects", permanent=True)
|
||||
|
||||
|
||||
def _encode(data: bytes) -> bytes | None:
|
||||
"""Downscale to DISPLAY_WIDTH and re-encode as WebP. None if it isn't a decodable
|
||||
raster image (e.g. SVG) — the caller then stores the original bytes as-is."""
|
||||
"""Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode as WebP. None if
|
||||
it isn't a decodable raster (e.g. SVG), is a decompression bomb, or has pathological
|
||||
dimensions — the caller then REJECTS it (never stores arbitrary bytes)."""
|
||||
try:
|
||||
from PIL import Image
|
||||
Image.MAX_IMAGE_PIXELS = _MAX_PIXELS # raise DecompressionBombError past this
|
||||
im = Image.open(io.BytesIO(data))
|
||||
im.load()
|
||||
im.load() # forces decode → catches truncated/bomb here
|
||||
if im.width > _MAX_DIM or im.height > _MAX_DIM or im.width < 1 or im.height < 1:
|
||||
return None
|
||||
if im.mode not in ("RGB", "RGBA"):
|
||||
im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
|
||||
if im.width > DISPLAY_WIDTH:
|
||||
@@ -71,60 +119,66 @@ def _encode(data: bytes) -> bytes | None:
|
||||
out = io.BytesIO()
|
||||
im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
|
||||
return out.getvalue()
|
||||
except Exception: # noqa: BLE001 — not a decodable raster image
|
||||
except Exception: # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated …
|
||||
return None
|
||||
|
||||
|
||||
def _ext_for(ctype: str) -> str:
|
||||
c = ctype.lower()
|
||||
if "png" in c:
|
||||
return ".png"
|
||||
if "gif" in c:
|
||||
return ".gif"
|
||||
if "svg" in c:
|
||||
return ".svg"
|
||||
if "webp" in c:
|
||||
return ".webp"
|
||||
return ".jpg"
|
||||
def _fail_path(url: str) -> Path:
|
||||
return cache_dir() / f"{_key(url)}.fail"
|
||||
|
||||
|
||||
def _mark_failed(url: str) -> None:
|
||||
try:
|
||||
_fail_path(url).touch()
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def _failed_recently(url: str) -> bool:
|
||||
try:
|
||||
return (time.time() - _fail_path(url).stat().st_mtime) < _FAIL_TTL_S
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def path_for(url: str) -> Path | None:
|
||||
"""The cached file for this URL if present (and bump its mtime, the LRU marker)."""
|
||||
for p in cache_dir().glob(_key(url) + ".*"):
|
||||
"""The cached WebP for this URL if present (and bump its mtime, the LRU marker).
|
||||
A pure cache lookup — never fetches."""
|
||||
if not url:
|
||||
return None
|
||||
p = cache_dir() / f"{_key(url)}.webp"
|
||||
if p.exists():
|
||||
try:
|
||||
os.utime(p, None) # touch -> last-used time for LRU eviction
|
||||
os.utime(p, None) # touch → last-used time for LRU eviction
|
||||
except OSError:
|
||||
pass
|
||||
return p
|
||||
return None
|
||||
|
||||
|
||||
def get_or_fetch(url: str | None) -> Path | None:
|
||||
"""Cached display copy for a source image URL, fetching + caching on first miss.
|
||||
Atomic write (temp then rename) so a reader never sees a half-file. None on any
|
||||
failure — callers (endpoint 404 -> frontend retry/typo cover) degrade gracefully."""
|
||||
def fetch_and_cache(url: str | None) -> Path | None:
|
||||
"""Fetch (SSRF-safe), downscale to WebP, and cache atomically. CYCLE-ONLY — the API
|
||||
endpoint never calls this. None on any failure; definitive failures are negative-cached
|
||||
so they aren't retried every cycle."""
|
||||
if not url or not url.startswith(("http://", "https://")):
|
||||
return None
|
||||
hit = path_for(url)
|
||||
if hit:
|
||||
return hit
|
||||
try:
|
||||
data, ctype = _http_bytes(url)
|
||||
except Exception: # noqa: BLE001 — source down/slow/blocked
|
||||
data, _ctype = _safe_fetch(url)
|
||||
except _FetchError as exc:
|
||||
if exc.permanent:
|
||||
_mark_failed(url)
|
||||
return None
|
||||
if len(data) < _MIN_IMAGE_BYTES or len(data) > _MAX_FETCH_BYTES:
|
||||
if not (_MIN_IMAGE_BYTES <= len(data) <= _MAX_FETCH_BYTES):
|
||||
_mark_failed(url)
|
||||
return None
|
||||
encoded = _encode(data)
|
||||
if encoded is not None:
|
||||
blob, ext = encoded, ".webp"
|
||||
elif ctype.startswith("image/"):
|
||||
blob, ext = data, _ext_for(ctype) # couldn't re-encode (e.g. SVG): keep original
|
||||
else:
|
||||
blob = _encode(data)
|
||||
if blob is None: # SVG / undecodable / bomb / bad dimensions
|
||||
_mark_failed(url)
|
||||
return None
|
||||
key = _key(url)
|
||||
cdir = cache_dir()
|
||||
key = _key(url)
|
||||
tmp = cdir / f".{key}.tmp"
|
||||
dest = cdir / f"{key}{ext}"
|
||||
dest = cdir / f"{key}.webp"
|
||||
try:
|
||||
tmp.write_bytes(blob)
|
||||
os.replace(tmp, dest) # atomic
|
||||
@@ -138,9 +192,9 @@ def get_or_fetch(url: str | None) -> Path | None:
|
||||
|
||||
|
||||
def warm(conn, limit: int = 200) -> int:
|
||||
"""Pre-fetch display copies for the newest accepted articles that have an image, so
|
||||
the FIRST page view is already a local hit (no first-view flakiness). Bounded; skips
|
||||
already-cached. Returns how many it newly cached."""
|
||||
"""Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles that have an
|
||||
image, so the API only ever serves cache hits. Bounded; skips already-cached and
|
||||
recently-failed URLs. Returns how many it newly cached."""
|
||||
rows = conn.execute(
|
||||
"SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
|
||||
"WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
|
||||
@@ -150,21 +204,31 @@ def warm(conn, limit: int = 200) -> int:
|
||||
made = 0
|
||||
for r in rows:
|
||||
url = r[0]
|
||||
if path_for(url):
|
||||
if path_for(url) or _failed_recently(url):
|
||||
continue
|
||||
if get_or_fetch(url):
|
||||
if fetch_and_cache(url):
|
||||
made += 1
|
||||
return made
|
||||
|
||||
|
||||
def prune(cap: int | None = None) -> dict:
|
||||
"""Enforce the size ceiling: delete least-recently-used files (oldest mtime first)
|
||||
until the cache is under the cap. Returns {before, after, removed, cap}."""
|
||||
"""Enforce the size ceiling: delete least-recently-used WebPs (oldest mtime first)
|
||||
until under the cap; also sweep stale .fail markers. Returns {before, after, removed, cap}."""
|
||||
if cap is None:
|
||||
cap = cap_bytes()
|
||||
now = time.time()
|
||||
files, total = [], 0
|
||||
for p in cache_dir().iterdir():
|
||||
if not p.is_file() or p.name.startswith("."):
|
||||
if p.name.startswith("."):
|
||||
continue
|
||||
if p.suffix == ".fail":
|
||||
try:
|
||||
if now - p.stat().st_mtime >= _FAIL_TTL_S:
|
||||
p.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
continue
|
||||
if p.suffix != ".webp" or not p.is_file():
|
||||
continue
|
||||
try:
|
||||
st = p.stat()
|
||||
|
||||
+13
-4
@@ -98,13 +98,17 @@ _TOP_BAR_CSS = """
|
||||
"""
|
||||
|
||||
# Burger toggle + signed-in avatar (read from the SPA's localStorage cache, same as HubBar).
|
||||
# Parity with HubBar: Escape closes the menu, and crossing back to desktop width resets it.
|
||||
_TOP_BAR_JS = """<script>
|
||||
(function(){
|
||||
var b=document.querySelector('[data-burger]'), m=document.querySelector('[data-menu]');
|
||||
function close(){ if(m) m.setAttribute('hidden',''); if(b){ b.classList.remove('open'); b.setAttribute('aria-expanded','false'); } }
|
||||
if(b&&m){ b.addEventListener('click',function(){
|
||||
if(m.hasAttribute('hidden')){ m.removeAttribute('hidden'); b.classList.add('open'); b.setAttribute('aria-expanded','true'); }
|
||||
else { m.setAttribute('hidden',''); b.classList.remove('open'); b.setAttribute('aria-expanded','false'); }
|
||||
else { close(); }
|
||||
}); }
|
||||
document.addEventListener('keydown',function(e){ if(e.key==='Escape') close(); });
|
||||
if(window.matchMedia){ var mq=window.matchMedia('(min-width:721px)'); if(mq.addEventListener) mq.addEventListener('change',function(e){ if(e.matches) close(); }); }
|
||||
try{
|
||||
var u=JSON.parse(localStorage.getItem('goodnews:auth_user')||'null');
|
||||
if(u&&u.avatar_url){
|
||||
@@ -117,12 +121,14 @@ _TOP_BAR_JS = """<script>
|
||||
})();
|
||||
</script>"""
|
||||
|
||||
# Single-history Back: return to where you came from in-app, else home (mirrors HubShell).
|
||||
# Single-history Back (mirrors HubShell): go back ONLY when we arrived from our own origin,
|
||||
# else go home — never bounce the reader off to an external referrer.
|
||||
_BACK_JS = """<script>
|
||||
(function(){
|
||||
var b=document.querySelector('[data-back]'); if(!b) return;
|
||||
b.addEventListener('click',function(){
|
||||
if(document.referrer && history.length>1){ history.back(); } else { location.href='/'; }
|
||||
var same=false; try{ same=!!document.referrer && new URL(document.referrer).origin===location.origin; }catch(e){}
|
||||
if(same && history.length>1){ history.back(); } else { location.href='/'; }
|
||||
});
|
||||
})();
|
||||
</script>"""
|
||||
@@ -165,8 +171,11 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None,
|
||||
# The visible image is served from our cached/downscaled copy (not a hotlink), so a
|
||||
# flaky source CDN can't blank it. og:image/twitter:image above stay the source URL
|
||||
# so social crawlers fetch the canonical image directly.
|
||||
# Served from our cache (/api/img/<id>); if it isn't cached yet / fails, drop the
|
||||
# element so the page degrades to the clean text unfurl rather than a broken icon.
|
||||
media = (
|
||||
f'<img class="media" src="/api/img/{aid}" alt="" referrerpolicy="no-referrer">'
|
||||
f'<img class="media" src="/api/img/{aid}" alt="" referrerpolicy="no-referrer" '
|
||||
f'onerror="this.remove()">'
|
||||
if image else ""
|
||||
)
|
||||
|
||||
|
||||
+107
-44
@@ -1,6 +1,6 @@
|
||||
"""News image cache: fetch + downscale to WebP, cache-hit (no refetch), graceful
|
||||
failure, LRU eviction under a size cap, cycle warm, and the /api/img endpoint
|
||||
(allowlisted to our own corpus)."""
|
||||
"""News image cache (hardened): SSRF-safe cycle-only fetch + downscale to WebP, reject
|
||||
SVG/non-raster/private-host/redirect-to-private, negative-cache failures, LRU eviction,
|
||||
and the cache-HITS-ONLY /api/img endpoint (restricted to accepted, canonical articles)."""
|
||||
import io
|
||||
import os
|
||||
|
||||
@@ -11,6 +11,10 @@ from fastapi.testclient import TestClient
|
||||
from goodnews import newsimg
|
||||
from goodnews.db import connect, init_db
|
||||
|
||||
# The real SSRF-safe fetch, captured before any fixture stubs it — the SSRF tests below
|
||||
# restore it so they exercise the genuine host/redirect validation.
|
||||
_REAL_SAFE_FETCH = newsimg._safe_fetch
|
||||
|
||||
|
||||
def _png(w=1600, h=1000, color=(40, 130, 173)):
|
||||
out = io.BytesIO()
|
||||
@@ -21,81 +25,140 @@ def _png(w=1600, h=1000, color=(40, 130, 173)):
|
||||
@pytest.fixture
|
||||
def cache(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache"))
|
||||
# default: treat hosts as public + a happy PNG fetch; individual tests override.
|
||||
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True)
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", lambda url, timeout=12: (_png(), "image/png"))
|
||||
return tmp_path / "img_cache"
|
||||
|
||||
|
||||
def test_get_or_fetch_caches_and_downscales(cache, monkeypatch):
|
||||
calls = []
|
||||
monkeypatch.setattr(newsimg, "_http_bytes",
|
||||
lambda url, timeout=12: (calls.append(url), (_png(), "image/png"))[1])
|
||||
p = newsimg.get_or_fetch("https://example.com/big.png")
|
||||
def test_fetch_and_cache_downscales_to_webp(cache):
|
||||
p = newsimg.fetch_and_cache("https://example.com/big.png")
|
||||
assert p and p.exists() and p.suffix == ".webp"
|
||||
with Image.open(p) as im: # downscaled + re-encoded
|
||||
with Image.open(p) as im:
|
||||
assert im.width == newsimg.DISPLAY_WIDTH and im.format == "WEBP"
|
||||
assert newsimg.get_or_fetch("https://example.com/big.png") == p # cache hit
|
||||
assert len(calls) == 1 # ...not refetched
|
||||
assert newsimg.path_for("https://example.com/big.png") == p # pure cache lookup
|
||||
|
||||
|
||||
def test_get_or_fetch_rejects_non_image_and_bad_scheme(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_http_bytes",
|
||||
lambda url, timeout=12: (b"<html>nope</html>", "text/html"))
|
||||
assert newsimg.get_or_fetch("https://example.com/page.html") is None
|
||||
assert newsimg.get_or_fetch(None) is None
|
||||
assert newsimg.get_or_fetch("ftp://example.com/x.png") is None # http(s) only (no SSRF surface)
|
||||
def test_bad_scheme_and_empty_are_rejected(cache):
|
||||
assert newsimg.fetch_and_cache(None) is None
|
||||
assert newsimg.fetch_and_cache("ftp://example.com/x.png") is None
|
||||
assert not list(cache.glob("*.webp"))
|
||||
|
||||
|
||||
def test_fetch_failure_returns_none(cache, monkeypatch):
|
||||
def test_svg_and_nonimage_rejected_and_negative_cached(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch",
|
||||
lambda url, timeout=12: (b"<svg xmlns='http://www.w3.org/2000/svg'></svg>" + b" " * 600,
|
||||
"image/svg+xml"))
|
||||
url = "https://example.com/evil.svg"
|
||||
assert newsimg.fetch_and_cache(url) is None # decoded-raster only → SVG refused
|
||||
assert not list(cache.glob("*.webp")) # nothing stored
|
||||
assert newsimg._failed_recently(url) # negative-cached (won't refetch)
|
||||
|
||||
|
||||
def test_private_host_refused_and_negative_cached(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH) # exercise real validation
|
||||
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: False)
|
||||
url = "http://169.254.169.254/latest/meta-data/"
|
||||
assert newsimg.fetch_and_cache(url) is None
|
||||
assert newsimg._failed_recently(url) # SSRF target → permanent fail
|
||||
|
||||
|
||||
def test_redirect_to_private_is_refused(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH) # exercise real redirect re-validation
|
||||
hosts = {"good.example": True, "evil.internal": False}
|
||||
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: hosts.get(h, False))
|
||||
|
||||
class _Resp:
|
||||
def __init__(self, status, headers, body=b""):
|
||||
self.status, self.headers, self._b = status, headers, body
|
||||
def read(self, n=-1):
|
||||
return self._b
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
class _Opener:
|
||||
def open(self, req, timeout=None):
|
||||
if "good.example" in req.full_url: # public → 302 to private
|
||||
return _Resp(302, {"Location": "http://evil.internal/x.png"})
|
||||
return _Resp(200, {"Content-Type": "image/png"}, _png()) # should never be reached
|
||||
monkeypatch.setattr(newsimg.urllib.request, "build_opener", lambda *a: _Opener())
|
||||
|
||||
url = "http://good.example/p.png"
|
||||
assert newsimg.fetch_and_cache(url) is None # redirect hop re-validated → refused
|
||||
assert newsimg._failed_recently(url)
|
||||
|
||||
|
||||
def test_transient_failure_is_not_negative_cached(cache, monkeypatch):
|
||||
def boom(url, timeout=12):
|
||||
raise OSError("source down")
|
||||
monkeypatch.setattr(newsimg, "_http_bytes", boom)
|
||||
assert newsimg.get_or_fetch("https://example.com/x.jpg") is None
|
||||
raise newsimg._FetchError("network down", permanent=False)
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", boom)
|
||||
url = "https://example.com/x.jpg"
|
||||
assert newsimg.fetch_and_cache(url) is None
|
||||
assert not newsimg._failed_recently(url) # transient → retried next cycle
|
||||
|
||||
|
||||
def test_prune_evicts_least_recently_used_over_cap(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
|
||||
paths = [newsimg.get_or_fetch(f"https://example.com/{i}.png") for i in range(5)]
|
||||
def test_prune_evicts_least_recently_used_over_cap(cache):
|
||||
paths = [newsimg.fetch_and_cache(f"https://example.com/{i}.png") for i in range(5)]
|
||||
for i, p in enumerate(paths): # 0 = oldest/LRU, 4 = newest
|
||||
os.utime(p, (1000 + i, 1000 + i))
|
||||
sizes = [p.stat().st_size for p in paths]
|
||||
cap = sum(sizes) - sizes[0] - sizes[1] + 1 # room for the 3 newest only
|
||||
r = newsimg.prune(cap)
|
||||
assert r["removed"] == 2 and r["after"] <= cap
|
||||
assert not paths[0].exists() and not paths[1].exists() # the two oldest evicted
|
||||
assert paths[2].exists() and paths[4].exists() # newer kept
|
||||
assert not paths[0].exists() and not paths[1].exists()
|
||||
assert paths[2].exists() and paths[4].exists()
|
||||
|
||||
|
||||
def test_warm_caches_recent_accepted_with_image(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
|
||||
def test_warm_skips_cached_and_failed(cache, monkeypatch):
|
||||
conn = connect(":memory:"); init_db(conn)
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
|
||||
for aid, img in ((1, "https://x/1.jpg"), (2, "https://x/2.jpg"), (3, "")):
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
|
||||
"VALUES (?,1,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img))
|
||||
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,1)", (aid,))
|
||||
for aid, img, acc, dup in ((1, "https://x/1.jpg", 1, None), (2, "https://x/2.jpg", 1, None),
|
||||
(3, "https://x/3.jpg", 0, None), (4, "https://x/4.jpg", 1, 1)):
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of) "
|
||||
"VALUES (?,1,?,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup))
|
||||
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,?)", (aid, acc))
|
||||
conn.commit()
|
||||
assert newsimg.warm(conn) == 2 # the two with an image
|
||||
assert newsimg.warm(conn) == 0 # idempotent (already cached)
|
||||
assert newsimg.warm(conn) == 2 # only the 2 accepted, canonical, with image
|
||||
assert newsimg.warm(conn) == 0 # idempotent — already cached
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("GOODNEWS_DB", str(tmp_path / "t.sqlite3"))
|
||||
monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache"))
|
||||
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
|
||||
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True)
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", lambda url, timeout=12: (_png(), "image/png"))
|
||||
conn = connect(tmp_path / "t.sqlite3"); init_db(conn)
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
|
||||
"VALUES (1,1,'http://s/1','t1','h1','https://x/pic.jpg')")
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
|
||||
"VALUES (2,1,'http://s/2','t2','h2','')") # no image
|
||||
conn.commit(); conn.close()
|
||||
# 1 = accepted+canonical+image, 2 = no image, 3 = NOT accepted, 4 = duplicate
|
||||
rows = ((1, "https://x/pic.jpg", 1, None), (2, "", 1, None),
|
||||
(3, "https://x/p3.jpg", 0, None), (4, "https://x/p4.jpg", 1, 1))
|
||||
for aid, img, acc, dup in rows:
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of) "
|
||||
"VALUES (?,1,?,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup))
|
||||
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,?)", (aid, acc))
|
||||
conn.commit()
|
||||
# the cycle owns fetching — pre-warm so the endpoint has a hit to serve
|
||||
newsimg.warm(conn)
|
||||
conn.close()
|
||||
from goodnews.api import create_app
|
||||
return TestClient(create_app())
|
||||
|
||||
|
||||
def test_img_endpoint_serves_and_allowlists(client):
|
||||
def test_img_endpoint_serves_cached_and_restricts(client):
|
||||
r = client.get("/api/img/1")
|
||||
assert r.status_code == 200 and r.headers["content-type"] == "image/webp"
|
||||
assert "immutable" in r.headers.get("cache-control", "")
|
||||
assert client.get("/api/img/2").status_code == 404 # article has no image
|
||||
assert client.get("/api/img/999").status_code == 404 # unknown id (not in corpus)
|
||||
assert "immutable" not in r.headers.get("cache-control", "") # no 1-year immutable
|
||||
assert client.get("/api/img/2").status_code == 404 # no image
|
||||
assert client.get("/api/img/3").status_code == 404 # not accepted
|
||||
assert client.get("/api/img/4").status_code == 404 # duplicate (non-canonical)
|
||||
assert client.get("/api/img/999").status_code == 404 # unknown id
|
||||
|
||||
|
||||
def test_img_endpoint_never_fetches(client, monkeypatch):
|
||||
# An accepted article whose image was never warmed must 404, not trigger a fetch.
|
||||
called = {"n": 0}
|
||||
monkeypatch.setattr(newsimg, "fetch_and_cache", lambda *a, **k: called.__setitem__("n", called["n"] + 1))
|
||||
# /api/img/1 is cached (warmed in fixture) → still served without any fetch
|
||||
assert client.get("/api/img/1").status_code == 200
|
||||
assert called["n"] == 0
|
||||
|
||||
@@ -105,3 +105,15 @@ def test_complete_page_is_cached_and_served_from_cache(app_complete):
|
||||
assert 1 in api._SHARE_CACHE # finished page cached
|
||||
r2 = tc.get("/a/1") # second hit served from cache
|
||||
assert r2.status_code == 200 and r2.text == r1.text
|
||||
|
||||
|
||||
def test_share_chrome_parity_and_fallbacks():
|
||||
"""HubBar-replica parity Codex asked for: Back only follows a SAME-ORIGIN referrer,
|
||||
the menu honors Escape, and a failed image removes itself (clean text unfurl)."""
|
||||
from goodnews import share
|
||||
html = share.render_share_page(
|
||||
{"id": 7, "title": "T", "image_url": "https://x/p.jpg", "canonical_url": "https://s/x",
|
||||
"source_name": "S"}, "https://upbeatbytes.com")
|
||||
assert "origin===location.origin" in html # Back requires same-origin referrer
|
||||
assert "'Escape'" in html or "Escape" in html # menu closes on Escape (parity)
|
||||
assert 'onerror="this.remove()"' in html # broken image degrades to text unfurl
|
||||
|
||||
Reference in New Issue
Block a user