diff --git a/goodnews/api.py b/goodnews/api.py index 0507628..46b99fc 100644 --- a/goodnews/api.py +++ b/goodnews/api.py @@ -2339,22 +2339,24 @@ def create_app() -> FastAPI: @app.api_route("/api/img/{article_id}", methods=["GET", "HEAD"]) def article_image(article_id: int) -> FileResponse: - """Serve a locally-cached, downscaled copy of an article's image instead of - hotlinking the source. Allowlisted by construction: the id resolves to a URL - already in our corpus (no open proxy). A miss fetches+caches once; a hard - failure 404s, which the frontend handles (retry, then the typo cover).""" + """Serve the locally-cached, downscaled WebP for an accepted, canonical article. + Cache HITS ONLY — this never fetches (the cycle owns all fetching via newsimg.warm), + so the public endpoint has no SSRF/worker-exhaustion surface. A miss 404s; the + frontend handles that (retry, then the typographic cover).""" with get_conn() as conn: - row = conn.execute("SELECT image_url FROM articles WHERE id = ?", (article_id,)).fetchone() + row = conn.execute( + "SELECT a.image_url FROM articles a JOIN article_scores s ON s.article_id = a.id " + "WHERE a.id = ? AND s.accepted = 1 AND a.duplicate_of IS NULL", + (article_id,), + ).fetchone() url = row["image_url"] if row else None - if not url: - raise HTTPException(status_code=404, detail="no image for this article") - path = newsimg.get_or_fetch(url) + path = newsimg.path_for(url) if url else None if not path: - raise HTTPException(status_code=404, detail="image unavailable") - media = "image/webp" if path.suffix == ".webp" else None - # Stable per article id (image_url doesn't change) → cache hard at the edge/browser. - return FileResponse(str(path), media_type=media, - headers={"Cache-Control": "public, max-age=31536000, immutable"}) + raise HTTPException(status_code=404, detail="image not cached") + # NOT immutable: a re-enriched article can change image_url, so the bytes behind a + # given id can change. A day's browser cache is plenty (we're direct-origin, no CDN). + return FileResponse(str(path), media_type="image/webp", + headers={"Cache-Control": "public, max-age=86400"}) @app.api_route("/api/art/image/{object_id}", methods=["GET", "HEAD"]) def art_image(object_id: int, size: str = Query("")) -> FileResponse: diff --git a/goodnews/newsimg.py b/goodnews/newsimg.py index e527cf8..9b8d1ce 100644 --- a/goodnews/newsimg.py +++ b/goodnews/newsimg.py @@ -1,40 +1,53 @@ """Local image cache + downscale for news article images. -The hub, feed, and article pages used to hotlink each article's image_url straight -from the source's server, so a slow / rate-limited / flaky third-party CDN left a -blank graphic until a refresh. Instead we cache a downscaled display copy on our own -origin (beside the DB, like art_cache) and serve that. The cache is bounded by a HARD -size ceiling with LRU eviction (prune), so it can't grow without limit no matter the -ingest rate. Network + Pillow calls are isolated so tests can monkeypatch them. +Article images used to be hotlinked from the source, so a slow/flaky third-party CDN +left a blank graphic until a refresh. Instead the CYCLE fetches a downscaled WebP copy +to data/img_cache/ (beside the DB, mounted into the API container, mirrors art_cache), +and the API serves only cache HITS — it never fetches, so the public endpoint has no +SSRF or worker-exhaustion surface. The cache is bounded by a hard size ceiling with LRU +eviction, so it can't grow without limit no matter the ingest rate. -Keyed by a hash of the source URL: a given image_url always maps to the same file. -The API resolves an article id -> its image_url (a tight allowlist — we only ever -fetch URLs already in our own corpus, so it is not an open proxy).""" +Security posture (the fetch runs only in the trusted cycle, but feed image URLs are +still externally supplied, so we treat them as untrusted): + * SSRF-safe fetch reuses enrich._host_is_public + bounded redirect re-validation + (same path as feeds.safe_fetch_feed) — no private/loopback/link-local targets, + http(s) only, every redirect hop re-checked. + * Only successfully-decoded RASTER images are re-encoded to WebP and stored; SVG and + anything undecodable is REJECTED (never retained as a same-origin file). + * Decompression-bomb + dimension guards. + * Definitive failures are negative-cached (a .fail marker) so a bad URL isn't refetched + every cycle; transient network errors are not, so they retry. +Concurrency: all fetching happens inside the cycle, which holds an exclusive lock, so no +two fetches race; writes are atomic (temp + rename) regardless. +""" from __future__ import annotations import hashlib import io import os +import time +import urllib.error import urllib.request from pathlib import Path +from urllib.parse import urljoin, urlsplit + +from .enrich import MAX_REDIRECTS, _NoRedirect, _host_is_public _UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"} _MIN_IMAGE_BYTES = 500 _MAX_FETCH_BYTES = 20 * 1024 * 1024 # never pull an absurd original into memory +_MAX_PIXELS = 50_000_000 # decompression-bomb ceiling (≈50 MP) +_MAX_DIM = 12000 # reject pathological single-axis dimensions DISPLAY_WIDTH = 800 # cards / feed never show wider than this WEBP_QUALITY = 80 DEFAULT_CAP_BYTES = 1024 * 1024 * 1024 # 1 GB hard ceiling (override via env) +_FAIL_TTL_S = 3 * 24 * 3600 # don't refetch a definitively-bad URL for 3 days def cache_dir() -> Path: - """Where cached images live — beside the DB, so the host cycle writes and the API - container reads the same mounted volume (mirrors art.cache_dir).""" override = os.environ.get("GOODNEWS_IMG_CACHE") - if override: - d = Path(override) - else: - db = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3")) - d = db.parent / "img_cache" + db = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3")) + d = Path(override) if override else db.parent / "img_cache" d.mkdir(parents=True, exist_ok=True) return d @@ -50,19 +63,54 @@ def _key(url: str) -> str: return hashlib.sha1(url.encode("utf-8")).hexdigest() -def _http_bytes(url: str, timeout: int = 12) -> tuple[bytes, str]: - req = urllib.request.Request(url, headers=_UA) - with urllib.request.urlopen(req, timeout=timeout) as r: - return r.read(_MAX_FETCH_BYTES + 1), (r.headers.get("Content-Type") or "") +class _FetchError(Exception): + """permanent=True → negative-cache (won't retry soon); False → transient, retry.""" + def __init__(self, msg: str, permanent: bool): + super().__init__(msg) + self.permanent = permanent + + +def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]: + """SSRF-safe fetch of an untrusted image URL: http(s) only, every redirect hop + re-validated against public IPs, bounded redirects, body capped. Raises _FetchError + (permanent for policy refusals, transient for network errors).""" + opener = urllib.request.build_opener(_NoRedirect) + current = url + for _ in range(MAX_REDIRECTS + 1): + parts = urlsplit(current) + if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname): + raise _FetchError(f"non-public or non-http(s): {current}", permanent=True) + req = urllib.request.Request(current, headers=_UA) + try: + resp = opener.open(req, timeout=timeout) + except (urllib.error.URLError, OSError, ValueError) as exc: + raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc + status = getattr(resp, "status", 200) or 200 + if status in (301, 302, 303, 307, 308): + loc = resp.headers.get("Location") + resp.close() + if not loc: + raise _FetchError("redirect without location", permanent=True) + current = urljoin(current, loc) + continue + try: + return resp.read(_MAX_FETCH_BYTES + 1), (resp.headers.get("Content-Type") or "") + finally: + resp.close() + raise _FetchError("too many redirects", permanent=True) def _encode(data: bytes) -> bytes | None: - """Downscale to DISPLAY_WIDTH and re-encode as WebP. None if it isn't a decodable - raster image (e.g. SVG) — the caller then stores the original bytes as-is.""" + """Downscale a decoded RASTER image to DISPLAY_WIDTH and re-encode as WebP. None if + it isn't a decodable raster (e.g. SVG), is a decompression bomb, or has pathological + dimensions — the caller then REJECTS it (never stores arbitrary bytes).""" try: from PIL import Image + Image.MAX_IMAGE_PIXELS = _MAX_PIXELS # raise DecompressionBombError past this im = Image.open(io.BytesIO(data)) - im.load() + im.load() # forces decode → catches truncated/bomb here + if im.width > _MAX_DIM or im.height > _MAX_DIM or im.width < 1 or im.height < 1: + return None if im.mode not in ("RGB", "RGBA"): im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB") if im.width > DISPLAY_WIDTH: @@ -71,63 +119,69 @@ def _encode(data: bytes) -> bytes | None: out = io.BytesIO() im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4) return out.getvalue() - except Exception: # noqa: BLE001 — not a decodable raster image + except Exception: # noqa: BLE001 — UnidentifiedImageError, DecompressionBombError, SVG, truncated … return None -def _ext_for(ctype: str) -> str: - c = ctype.lower() - if "png" in c: - return ".png" - if "gif" in c: - return ".gif" - if "svg" in c: - return ".svg" - if "webp" in c: - return ".webp" - return ".jpg" +def _fail_path(url: str) -> Path: + return cache_dir() / f"{_key(url)}.fail" + + +def _mark_failed(url: str) -> None: + try: + _fail_path(url).touch() + except OSError: + pass + + +def _failed_recently(url: str) -> bool: + try: + return (time.time() - _fail_path(url).stat().st_mtime) < _FAIL_TTL_S + except OSError: + return False def path_for(url: str) -> Path | None: - """The cached file for this URL if present (and bump its mtime, the LRU marker).""" - for p in cache_dir().glob(_key(url) + ".*"): + """The cached WebP for this URL if present (and bump its mtime, the LRU marker). + A pure cache lookup — never fetches.""" + if not url: + return None + p = cache_dir() / f"{_key(url)}.webp" + if p.exists(): try: - os.utime(p, None) # touch -> last-used time for LRU eviction + os.utime(p, None) # touch → last-used time for LRU eviction except OSError: pass return p return None -def get_or_fetch(url: str | None) -> Path | None: - """Cached display copy for a source image URL, fetching + caching on first miss. - Atomic write (temp then rename) so a reader never sees a half-file. None on any - failure — callers (endpoint 404 -> frontend retry/typo cover) degrade gracefully.""" +def fetch_and_cache(url: str | None) -> Path | None: + """Fetch (SSRF-safe), downscale to WebP, and cache atomically. CYCLE-ONLY — the API + endpoint never calls this. None on any failure; definitive failures are negative-cached + so they aren't retried every cycle.""" if not url or not url.startswith(("http://", "https://")): return None - hit = path_for(url) - if hit: - return hit try: - data, ctype = _http_bytes(url) - except Exception: # noqa: BLE001 — source down/slow/blocked + data, _ctype = _safe_fetch(url) + except _FetchError as exc: + if exc.permanent: + _mark_failed(url) return None - if len(data) < _MIN_IMAGE_BYTES or len(data) > _MAX_FETCH_BYTES: + if not (_MIN_IMAGE_BYTES <= len(data) <= _MAX_FETCH_BYTES): + _mark_failed(url) return None - encoded = _encode(data) - if encoded is not None: - blob, ext = encoded, ".webp" - elif ctype.startswith("image/"): - blob, ext = data, _ext_for(ctype) # couldn't re-encode (e.g. SVG): keep original - else: + blob = _encode(data) + if blob is None: # SVG / undecodable / bomb / bad dimensions + _mark_failed(url) return None - key = _key(url) cdir = cache_dir() + key = _key(url) tmp = cdir / f".{key}.tmp" - dest = cdir / f"{key}{ext}" + dest = cdir / f"{key}.webp" try: tmp.write_bytes(blob) - os.replace(tmp, dest) # atomic + os.replace(tmp, dest) # atomic except OSError: try: tmp.unlink() @@ -138,9 +192,9 @@ def get_or_fetch(url: str | None) -> Path | None: def warm(conn, limit: int = 200) -> int: - """Pre-fetch display copies for the newest accepted articles that have an image, so - the FIRST page view is already a local hit (no first-view flakiness). Bounded; skips - already-cached. Returns how many it newly cached.""" + """Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles that have an + image, so the API only ever serves cache hits. Bounded; skips already-cached and + recently-failed URLs. Returns how many it newly cached.""" rows = conn.execute( "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id " "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL " @@ -150,21 +204,31 @@ def warm(conn, limit: int = 200) -> int: made = 0 for r in rows: url = r[0] - if path_for(url): + if path_for(url) or _failed_recently(url): continue - if get_or_fetch(url): + if fetch_and_cache(url): made += 1 return made def prune(cap: int | None = None) -> dict: - """Enforce the size ceiling: delete least-recently-used files (oldest mtime first) - until the cache is under the cap. Returns {before, after, removed, cap}.""" + """Enforce the size ceiling: delete least-recently-used WebPs (oldest mtime first) + until under the cap; also sweep stale .fail markers. Returns {before, after, removed, cap}.""" if cap is None: cap = cap_bytes() + now = time.time() files, total = [], 0 for p in cache_dir().iterdir(): - if not p.is_file() or p.name.startswith("."): + if p.name.startswith("."): + continue + if p.suffix == ".fail": + try: + if now - p.stat().st_mtime >= _FAIL_TTL_S: + p.unlink() + except OSError: + pass + continue + if p.suffix != ".webp" or not p.is_file(): continue try: st = p.stat() diff --git a/goodnews/share.py b/goodnews/share.py index 4b2aed8..daa7390 100644 --- a/goodnews/share.py +++ b/goodnews/share.py @@ -98,13 +98,17 @@ _TOP_BAR_CSS = """ """ # Burger toggle + signed-in avatar (read from the SPA's localStorage cache, same as HubBar). +# Parity with HubBar: Escape closes the menu, and crossing back to desktop width resets it. _TOP_BAR_JS = """""" -# Single-history Back: return to where you came from in-app, else home (mirrors HubShell). +# Single-history Back (mirrors HubShell): go back ONLY when we arrived from our own origin, +# else go home — never bounce the reader off to an external referrer. _BACK_JS = """""" @@ -165,8 +171,11 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None, # The visible image is served from our cached/downscaled copy (not a hotlink), so a # flaky source CDN can't blank it. og:image/twitter:image above stay the source URL # so social crawlers fetch the canonical image directly. + # Served from our cache (/api/img/); if it isn't cached yet / fails, drop the + # element so the page degrades to the clean text unfurl rather than a broken icon. media = ( - f'' + f'' if image else "" ) diff --git a/tests/test_newsimg.py b/tests/test_newsimg.py index 515dd05..7d51153 100644 --- a/tests/test_newsimg.py +++ b/tests/test_newsimg.py @@ -1,6 +1,6 @@ -"""News image cache: fetch + downscale to WebP, cache-hit (no refetch), graceful -failure, LRU eviction under a size cap, cycle warm, and the /api/img endpoint -(allowlisted to our own corpus).""" +"""News image cache (hardened): SSRF-safe cycle-only fetch + downscale to WebP, reject +SVG/non-raster/private-host/redirect-to-private, negative-cache failures, LRU eviction, +and the cache-HITS-ONLY /api/img endpoint (restricted to accepted, canonical articles).""" import io import os @@ -11,6 +11,10 @@ from fastapi.testclient import TestClient from goodnews import newsimg from goodnews.db import connect, init_db +# The real SSRF-safe fetch, captured before any fixture stubs it — the SSRF tests below +# restore it so they exercise the genuine host/redirect validation. +_REAL_SAFE_FETCH = newsimg._safe_fetch + def _png(w=1600, h=1000, color=(40, 130, 173)): out = io.BytesIO() @@ -21,81 +25,140 @@ def _png(w=1600, h=1000, color=(40, 130, 173)): @pytest.fixture def cache(tmp_path, monkeypatch): monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache")) + # default: treat hosts as public + a happy PNG fetch; individual tests override. + monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True) + monkeypatch.setattr(newsimg, "_safe_fetch", lambda url, timeout=12: (_png(), "image/png")) return tmp_path / "img_cache" -def test_get_or_fetch_caches_and_downscales(cache, monkeypatch): - calls = [] - monkeypatch.setattr(newsimg, "_http_bytes", - lambda url, timeout=12: (calls.append(url), (_png(), "image/png"))[1]) - p = newsimg.get_or_fetch("https://example.com/big.png") +def test_fetch_and_cache_downscales_to_webp(cache): + p = newsimg.fetch_and_cache("https://example.com/big.png") assert p and p.exists() and p.suffix == ".webp" - with Image.open(p) as im: # downscaled + re-encoded + with Image.open(p) as im: assert im.width == newsimg.DISPLAY_WIDTH and im.format == "WEBP" - assert newsimg.get_or_fetch("https://example.com/big.png") == p # cache hit - assert len(calls) == 1 # ...not refetched + assert newsimg.path_for("https://example.com/big.png") == p # pure cache lookup -def test_get_or_fetch_rejects_non_image_and_bad_scheme(cache, monkeypatch): - monkeypatch.setattr(newsimg, "_http_bytes", - lambda url, timeout=12: (b"nope", "text/html")) - assert newsimg.get_or_fetch("https://example.com/page.html") is None - assert newsimg.get_or_fetch(None) is None - assert newsimg.get_or_fetch("ftp://example.com/x.png") is None # http(s) only (no SSRF surface) +def test_bad_scheme_and_empty_are_rejected(cache): + assert newsimg.fetch_and_cache(None) is None + assert newsimg.fetch_and_cache("ftp://example.com/x.png") is None + assert not list(cache.glob("*.webp")) -def test_fetch_failure_returns_none(cache, monkeypatch): +def test_svg_and_nonimage_rejected_and_negative_cached(cache, monkeypatch): + monkeypatch.setattr(newsimg, "_safe_fetch", + lambda url, timeout=12: (b"" + b" " * 600, + "image/svg+xml")) + url = "https://example.com/evil.svg" + assert newsimg.fetch_and_cache(url) is None # decoded-raster only → SVG refused + assert not list(cache.glob("*.webp")) # nothing stored + assert newsimg._failed_recently(url) # negative-cached (won't refetch) + + +def test_private_host_refused_and_negative_cached(cache, monkeypatch): + monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH) # exercise real validation + monkeypatch.setattr(newsimg, "_host_is_public", lambda h: False) + url = "http://169.254.169.254/latest/meta-data/" + assert newsimg.fetch_and_cache(url) is None + assert newsimg._failed_recently(url) # SSRF target → permanent fail + + +def test_redirect_to_private_is_refused(cache, monkeypatch): + monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH) # exercise real redirect re-validation + hosts = {"good.example": True, "evil.internal": False} + monkeypatch.setattr(newsimg, "_host_is_public", lambda h: hosts.get(h, False)) + + class _Resp: + def __init__(self, status, headers, body=b""): + self.status, self.headers, self._b = status, headers, body + def read(self, n=-1): + return self._b + def close(self): + pass + + class _Opener: + def open(self, req, timeout=None): + if "good.example" in req.full_url: # public → 302 to private + return _Resp(302, {"Location": "http://evil.internal/x.png"}) + return _Resp(200, {"Content-Type": "image/png"}, _png()) # should never be reached + monkeypatch.setattr(newsimg.urllib.request, "build_opener", lambda *a: _Opener()) + + url = "http://good.example/p.png" + assert newsimg.fetch_and_cache(url) is None # redirect hop re-validated → refused + assert newsimg._failed_recently(url) + + +def test_transient_failure_is_not_negative_cached(cache, monkeypatch): def boom(url, timeout=12): - raise OSError("source down") - monkeypatch.setattr(newsimg, "_http_bytes", boom) - assert newsimg.get_or_fetch("https://example.com/x.jpg") is None + raise newsimg._FetchError("network down", permanent=False) + monkeypatch.setattr(newsimg, "_safe_fetch", boom) + url = "https://example.com/x.jpg" + assert newsimg.fetch_and_cache(url) is None + assert not newsimg._failed_recently(url) # transient → retried next cycle -def test_prune_evicts_least_recently_used_over_cap(cache, monkeypatch): - monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png")) - paths = [newsimg.get_or_fetch(f"https://example.com/{i}.png") for i in range(5)] - for i, p in enumerate(paths): # 0 = oldest/LRU, 4 = newest +def test_prune_evicts_least_recently_used_over_cap(cache): + paths = [newsimg.fetch_and_cache(f"https://example.com/{i}.png") for i in range(5)] + for i, p in enumerate(paths): # 0 = oldest/LRU, 4 = newest os.utime(p, (1000 + i, 1000 + i)) sizes = [p.stat().st_size for p in paths] - cap = sum(sizes) - sizes[0] - sizes[1] + 1 # room for the 3 newest only + cap = sum(sizes) - sizes[0] - sizes[1] + 1 # room for the 3 newest only r = newsimg.prune(cap) assert r["removed"] == 2 and r["after"] <= cap - assert not paths[0].exists() and not paths[1].exists() # the two oldest evicted - assert paths[2].exists() and paths[4].exists() # newer kept + assert not paths[0].exists() and not paths[1].exists() + assert paths[2].exists() and paths[4].exists() -def test_warm_caches_recent_accepted_with_image(cache, monkeypatch): - monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png")) +def test_warm_skips_cached_and_failed(cache, monkeypatch): conn = connect(":memory:"); init_db(conn) conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')") - for aid, img in ((1, "https://x/1.jpg"), (2, "https://x/2.jpg"), (3, "")): - conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) " - "VALUES (?,1,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img)) - conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,1)", (aid,)) + for aid, img, acc, dup in ((1, "https://x/1.jpg", 1, None), (2, "https://x/2.jpg", 1, None), + (3, "https://x/3.jpg", 0, None), (4, "https://x/4.jpg", 1, 1)): + conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of) " + "VALUES (?,1,?,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup)) + conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,?)", (aid, acc)) conn.commit() - assert newsimg.warm(conn) == 2 # the two with an image - assert newsimg.warm(conn) == 0 # idempotent (already cached) + assert newsimg.warm(conn) == 2 # only the 2 accepted, canonical, with image + assert newsimg.warm(conn) == 0 # idempotent — already cached @pytest.fixture def client(tmp_path, monkeypatch): monkeypatch.setenv("GOODNEWS_DB", str(tmp_path / "t.sqlite3")) monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache")) - monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png")) + monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True) + monkeypatch.setattr(newsimg, "_safe_fetch", lambda url, timeout=12: (_png(), "image/png")) conn = connect(tmp_path / "t.sqlite3"); init_db(conn) conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')") - conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) " - "VALUES (1,1,'http://s/1','t1','h1','https://x/pic.jpg')") - conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) " - "VALUES (2,1,'http://s/2','t2','h2','')") # no image - conn.commit(); conn.close() + # 1 = accepted+canonical+image, 2 = no image, 3 = NOT accepted, 4 = duplicate + rows = ((1, "https://x/pic.jpg", 1, None), (2, "", 1, None), + (3, "https://x/p3.jpg", 0, None), (4, "https://x/p4.jpg", 1, 1)) + for aid, img, acc, dup in rows: + conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of) " + "VALUES (?,1,?,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup)) + conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,?)", (aid, acc)) + conn.commit() + # the cycle owns fetching — pre-warm so the endpoint has a hit to serve + newsimg.warm(conn) + conn.close() from goodnews.api import create_app return TestClient(create_app()) -def test_img_endpoint_serves_and_allowlists(client): +def test_img_endpoint_serves_cached_and_restricts(client): r = client.get("/api/img/1") assert r.status_code == 200 and r.headers["content-type"] == "image/webp" - assert "immutable" in r.headers.get("cache-control", "") - assert client.get("/api/img/2").status_code == 404 # article has no image - assert client.get("/api/img/999").status_code == 404 # unknown id (not in corpus) + assert "immutable" not in r.headers.get("cache-control", "") # no 1-year immutable + assert client.get("/api/img/2").status_code == 404 # no image + assert client.get("/api/img/3").status_code == 404 # not accepted + assert client.get("/api/img/4").status_code == 404 # duplicate (non-canonical) + assert client.get("/api/img/999").status_code == 404 # unknown id + + +def test_img_endpoint_never_fetches(client, monkeypatch): + # An accepted article whose image was never warmed must 404, not trigger a fetch. + called = {"n": 0} + monkeypatch.setattr(newsimg, "fetch_and_cache", lambda *a, **k: called.__setitem__("n", called["n"] + 1)) + # /api/img/1 is cached (warmed in fixture) → still served without any fetch + assert client.get("/api/img/1").status_code == 200 + assert called["n"] == 0 diff --git a/tests/test_share.py b/tests/test_share.py index 6bc96dc..4bea8ac 100644 --- a/tests/test_share.py +++ b/tests/test_share.py @@ -105,3 +105,15 @@ def test_complete_page_is_cached_and_served_from_cache(app_complete): assert 1 in api._SHARE_CACHE # finished page cached r2 = tc.get("/a/1") # second hit served from cache assert r2.status_code == 200 and r2.text == r1.text + + +def test_share_chrome_parity_and_fallbacks(): + """HubBar-replica parity Codex asked for: Back only follows a SAME-ORIGIN referrer, + the menu honors Escape, and a failed image removes itself (clean text unfurl).""" + from goodnews import share + html = share.render_share_page( + {"id": 7, "title": "T", "image_url": "https://x/p.jpg", "canonical_url": "https://s/x", + "source_name": "S"}, "https://upbeatbytes.com") + assert "origin===location.origin" in html # Back requires same-origin referrer + assert "'Escape'" in html or "Escape" in html # menu closes on Escape (parity) + assert 'onerror="this.remove()"' in html # broken image degrades to text unfurl