images: harden the cache per Codex audit (SSRF-safe, cache-only endpoint, WebP-only)

Blocker fixes for the image cache:
- /api/img/{id} now serves cache HITS ONLY and is restricted to ACCEPTED, CANONICAL
  articles. It never fetches — the cycle (newsimg.warm) owns all fetching — so the
  public endpoint has no SSRF/worker-exhaustion surface. Dropped 1-year immutable
  caching (image_url can change) → public, max-age=86400.
- newsimg._safe_fetch: SSRF-safe (reuses enrich._host_is_public + _NoRedirect, http(s)
  only, every redirect hop re-validated, body capped). _FetchError distinguishes
  permanent refusals (negative-cached via a .fail marker) from transient errors (retry).
- _encode re-encodes only decoded RASTER images to WebP and REJECTS everything else
  (SVG, undecodable, decompression bombs via MAX_IMAGE_PIXELS, pathological dimensions);
  originals are never retained. prune() also sweeps stale .fail markers.
- Concurrency: fetching only runs inside the cycle lock; writes stay atomic.

Smaller fixes:
- share.py visible image has onerror→this.remove() (degrade to the text unfurl, no
  broken icon when an image isn't cached yet).
- share-page Back follows history only on a SAME-ORIGIN referrer (never bounce to an
  external site); menu now honors Escape + resets crossing back to desktop (HubBar parity).

Tests: private host, redirect-to-private, hostile SVG/non-image, transient-vs-permanent
failure, LRU prune, warm (accepted+canonical only, idempotent), cache-only endpoint
(404 on not-cached/unaccepted/duplicate, never fetches), share chrome parity. 441 pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-30 12:19:57 -04:00
parent c350a2713b
commit a55ba185a8
5 changed files with 278 additions and 128 deletions
+109 -46
View File
@@ -1,6 +1,6 @@
"""News image cache: fetch + downscale to WebP, cache-hit (no refetch), graceful
failure, LRU eviction under a size cap, cycle warm, and the /api/img endpoint
(allowlisted to our own corpus)."""
"""News image cache (hardened): SSRF-safe cycle-only fetch + downscale to WebP, reject
SVG/non-raster/private-host/redirect-to-private, negative-cache failures, LRU eviction,
and the cache-HITS-ONLY /api/img endpoint (restricted to accepted, canonical articles)."""
import io
import os
@@ -11,6 +11,10 @@ from fastapi.testclient import TestClient
from goodnews import newsimg
from goodnews.db import connect, init_db
# The real SSRF-safe fetch, captured before any fixture stubs it — the SSRF tests below
# restore it so they exercise the genuine host/redirect validation.
_REAL_SAFE_FETCH = newsimg._safe_fetch
def _png(w=1600, h=1000, color=(40, 130, 173)):
out = io.BytesIO()
@@ -21,81 +25,140 @@ def _png(w=1600, h=1000, color=(40, 130, 173)):
@pytest.fixture
def cache(tmp_path, monkeypatch):
monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache"))
# default: treat hosts as public + a happy PNG fetch; individual tests override.
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True)
monkeypatch.setattr(newsimg, "_safe_fetch", lambda url, timeout=12: (_png(), "image/png"))
return tmp_path / "img_cache"
def test_get_or_fetch_caches_and_downscales(cache, monkeypatch):
calls = []
monkeypatch.setattr(newsimg, "_http_bytes",
lambda url, timeout=12: (calls.append(url), (_png(), "image/png"))[1])
p = newsimg.get_or_fetch("https://example.com/big.png")
def test_fetch_and_cache_downscales_to_webp(cache):
p = newsimg.fetch_and_cache("https://example.com/big.png")
assert p and p.exists() and p.suffix == ".webp"
with Image.open(p) as im: # downscaled + re-encoded
with Image.open(p) as im:
assert im.width == newsimg.DISPLAY_WIDTH and im.format == "WEBP"
assert newsimg.get_or_fetch("https://example.com/big.png") == p # cache hit
assert len(calls) == 1 # ...not refetched
assert newsimg.path_for("https://example.com/big.png") == p # pure cache lookup
def test_get_or_fetch_rejects_non_image_and_bad_scheme(cache, monkeypatch):
monkeypatch.setattr(newsimg, "_http_bytes",
lambda url, timeout=12: (b"<html>nope</html>", "text/html"))
assert newsimg.get_or_fetch("https://example.com/page.html") is None
assert newsimg.get_or_fetch(None) is None
assert newsimg.get_or_fetch("ftp://example.com/x.png") is None # http(s) only (no SSRF surface)
def test_bad_scheme_and_empty_are_rejected(cache):
assert newsimg.fetch_and_cache(None) is None
assert newsimg.fetch_and_cache("ftp://example.com/x.png") is None
assert not list(cache.glob("*.webp"))
def test_fetch_failure_returns_none(cache, monkeypatch):
def test_svg_and_nonimage_rejected_and_negative_cached(cache, monkeypatch):
monkeypatch.setattr(newsimg, "_safe_fetch",
lambda url, timeout=12: (b"<svg xmlns='http://www.w3.org/2000/svg'></svg>" + b" " * 600,
"image/svg+xml"))
url = "https://example.com/evil.svg"
assert newsimg.fetch_and_cache(url) is None # decoded-raster only → SVG refused
assert not list(cache.glob("*.webp")) # nothing stored
assert newsimg._failed_recently(url) # negative-cached (won't refetch)
def test_private_host_refused_and_negative_cached(cache, monkeypatch):
monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH) # exercise real validation
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: False)
url = "http://169.254.169.254/latest/meta-data/"
assert newsimg.fetch_and_cache(url) is None
assert newsimg._failed_recently(url) # SSRF target → permanent fail
def test_redirect_to_private_is_refused(cache, monkeypatch):
monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH) # exercise real redirect re-validation
hosts = {"good.example": True, "evil.internal": False}
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: hosts.get(h, False))
class _Resp:
def __init__(self, status, headers, body=b""):
self.status, self.headers, self._b = status, headers, body
def read(self, n=-1):
return self._b
def close(self):
pass
class _Opener:
def open(self, req, timeout=None):
if "good.example" in req.full_url: # public → 302 to private
return _Resp(302, {"Location": "http://evil.internal/x.png"})
return _Resp(200, {"Content-Type": "image/png"}, _png()) # should never be reached
monkeypatch.setattr(newsimg.urllib.request, "build_opener", lambda *a: _Opener())
url = "http://good.example/p.png"
assert newsimg.fetch_and_cache(url) is None # redirect hop re-validated → refused
assert newsimg._failed_recently(url)
def test_transient_failure_is_not_negative_cached(cache, monkeypatch):
def boom(url, timeout=12):
raise OSError("source down")
monkeypatch.setattr(newsimg, "_http_bytes", boom)
assert newsimg.get_or_fetch("https://example.com/x.jpg") is None
raise newsimg._FetchError("network down", permanent=False)
monkeypatch.setattr(newsimg, "_safe_fetch", boom)
url = "https://example.com/x.jpg"
assert newsimg.fetch_and_cache(url) is None
assert not newsimg._failed_recently(url) # transient → retried next cycle
def test_prune_evicts_least_recently_used_over_cap(cache, monkeypatch):
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
paths = [newsimg.get_or_fetch(f"https://example.com/{i}.png") for i in range(5)]
for i, p in enumerate(paths): # 0 = oldest/LRU, 4 = newest
def test_prune_evicts_least_recently_used_over_cap(cache):
paths = [newsimg.fetch_and_cache(f"https://example.com/{i}.png") for i in range(5)]
for i, p in enumerate(paths): # 0 = oldest/LRU, 4 = newest
os.utime(p, (1000 + i, 1000 + i))
sizes = [p.stat().st_size for p in paths]
cap = sum(sizes) - sizes[0] - sizes[1] + 1 # room for the 3 newest only
cap = sum(sizes) - sizes[0] - sizes[1] + 1 # room for the 3 newest only
r = newsimg.prune(cap)
assert r["removed"] == 2 and r["after"] <= cap
assert not paths[0].exists() and not paths[1].exists() # the two oldest evicted
assert paths[2].exists() and paths[4].exists() # newer kept
assert not paths[0].exists() and not paths[1].exists()
assert paths[2].exists() and paths[4].exists()
def test_warm_caches_recent_accepted_with_image(cache, monkeypatch):
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
def test_warm_skips_cached_and_failed(cache, monkeypatch):
conn = connect(":memory:"); init_db(conn)
conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
for aid, img in ((1, "https://x/1.jpg"), (2, "https://x/2.jpg"), (3, "")):
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
"VALUES (?,1,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img))
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,1)", (aid,))
for aid, img, acc, dup in ((1, "https://x/1.jpg", 1, None), (2, "https://x/2.jpg", 1, None),
(3, "https://x/3.jpg", 0, None), (4, "https://x/4.jpg", 1, 1)):
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of) "
"VALUES (?,1,?,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup))
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,?)", (aid, acc))
conn.commit()
assert newsimg.warm(conn) == 2 # the two with an image
assert newsimg.warm(conn) == 0 # idempotent (already cached)
assert newsimg.warm(conn) == 2 # only the 2 accepted, canonical, with image
assert newsimg.warm(conn) == 0 # idempotent already cached
@pytest.fixture
def client(tmp_path, monkeypatch):
monkeypatch.setenv("GOODNEWS_DB", str(tmp_path / "t.sqlite3"))
monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache"))
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True)
monkeypatch.setattr(newsimg, "_safe_fetch", lambda url, timeout=12: (_png(), "image/png"))
conn = connect(tmp_path / "t.sqlite3"); init_db(conn)
conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
"VALUES (1,1,'http://s/1','t1','h1','https://x/pic.jpg')")
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
"VALUES (2,1,'http://s/2','t2','h2','')") # no image
conn.commit(); conn.close()
# 1 = accepted+canonical+image, 2 = no image, 3 = NOT accepted, 4 = duplicate
rows = ((1, "https://x/pic.jpg", 1, None), (2, "", 1, None),
(3, "https://x/p3.jpg", 0, None), (4, "https://x/p4.jpg", 1, 1))
for aid, img, acc, dup in rows:
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of) "
"VALUES (?,1,?,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup))
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,?)", (aid, acc))
conn.commit()
# the cycle owns fetching — pre-warm so the endpoint has a hit to serve
newsimg.warm(conn)
conn.close()
from goodnews.api import create_app
return TestClient(create_app())
def test_img_endpoint_serves_and_allowlists(client):
def test_img_endpoint_serves_cached_and_restricts(client):
r = client.get("/api/img/1")
assert r.status_code == 200 and r.headers["content-type"] == "image/webp"
assert "immutable" in r.headers.get("cache-control", "")
assert client.get("/api/img/2").status_code == 404 # article has no image
assert client.get("/api/img/999").status_code == 404 # unknown id (not in corpus)
assert "immutable" not in r.headers.get("cache-control", "") # no 1-year immutable
assert client.get("/api/img/2").status_code == 404 # no image
assert client.get("/api/img/3").status_code == 404 # not accepted
assert client.get("/api/img/4").status_code == 404 # duplicate (non-canonical)
assert client.get("/api/img/999").status_code == 404 # unknown id
def test_img_endpoint_never_fetches(client, monkeypatch):
# An accepted article whose image was never warmed must 404, not trigger a fetch.
called = {"n": 0}
monkeypatch.setattr(newsimg, "fetch_and_cache", lambda *a, **k: called.__setitem__("n", called["n"] + 1))
# /api/img/1 is cached (warmed in fixture) → still served without any fetch
assert client.get("/api/img/1").status_code == 200
assert called["n"] == 0
+12
View File
@@ -105,3 +105,15 @@ def test_complete_page_is_cached_and_served_from_cache(app_complete):
assert 1 in api._SHARE_CACHE # finished page cached
r2 = tc.get("/a/1") # second hit served from cache
assert r2.status_code == 200 and r2.text == r1.text
def test_share_chrome_parity_and_fallbacks():
"""HubBar-replica parity Codex asked for: Back only follows a SAME-ORIGIN referrer,
the menu honors Escape, and a failed image removes itself (clean text unfurl)."""
from goodnews import share
html = share.render_share_page(
{"id": 7, "title": "T", "image_url": "https://x/p.jpg", "canonical_url": "https://s/x",
"source_name": "S"}, "https://upbeatbytes.com")
assert "origin===location.origin" in html # Back requires same-origin referrer
assert "'Escape'" in html or "Escape" in html # menu closes on Escape (parity)
assert 'onerror="this.remove()"' in html # broken image degrades to text unfurl