diff --git a/frontend/src/lib/components/ArticleCard.svelte b/frontend/src/lib/components/ArticleCard.svelte index f1bcc99..edaded5 100644 --- a/frontend/src/lib/components/ArticleCard.svelte +++ b/frontend/src/lib/components/ArticleCard.svelte @@ -100,7 +100,8 @@ > {#if showImage} - ) instead of hotlinking the source --> + { failed = true; onimageerror?.(); }} /> {:else if usePlaceholder} diff --git a/frontend/src/routes/+page.svelte b/frontend/src/routes/+page.svelte index 1968f15..79cdb71 100644 --- a/frontend/src/routes/+page.svelte +++ b/frontend/src/routes/+page.svelte @@ -112,7 +112,9 @@ const q = P.param(prefs.data); // the reader's boundaries (excluded topics, ceilings) try { const it = (await getJSON(`/api/brief?limit=1${homeq}${q ? '&' + q : ''}`))?.items?.[0]; - if (it) news = { id: it.id, title: it.title, summary: it.summary || it.description || '', image: it.image_url || null, topic: it.topic || null, source_read_minutes: it.source_read_minutes }; + // Serve the photo from our own cached/downscaled copy (/api/img/) rather than + // hotlinking the source — keep image_url only as the "has a photo?" signal. + if (it) news = { id: it.id, title: it.title, summary: it.summary || it.description || '', image: it.image_url ? `/api/img/${it.id}` : null, topic: it.topic || null, source_read_minutes: it.source_read_minutes }; // News images are hotlinked from the source, so a single fetch can transiently // fail (slow CDN, rate limit, hiccup) and leave a blank plate until you refresh. // Probe with a couple of retries and reveal the photo only once it's truly loaded diff --git a/goodnews/api.py b/goodnews/api.py index 7b38dec..3cbf5e4 100644 --- a/goodnews/api.py +++ b/goodnews/api.py @@ -36,7 +36,7 @@ from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel -from . import art, auth, bloom, daily, email_send, feeds, games, oauth_google, onthisday, publishing, queries, quote, readtime, share, sources, summarize, wotd +from . import art, auth, bloom, daily, email_send, feeds, games, newsimg, oauth_google, onthisday, publishing, queries, quote, readtime, share, sources, summarize, wotd from .localtime import local_today from .markup import reply_html_to_text, sanitize_reply_html from .db import connect @@ -2332,6 +2332,25 @@ def create_app() -> FastAPI: "image_url_large": f"/api/art/image/{a['object_id']}?size=full", } + @app.api_route("/api/img/{article_id}", methods=["GET", "HEAD"]) + def article_image(article_id: int) -> FileResponse: + """Serve a locally-cached, downscaled copy of an article's image instead of + hotlinking the source. Allowlisted by construction: the id resolves to a URL + already in our corpus (no open proxy). A miss fetches+caches once; a hard + failure 404s, which the frontend handles (retry, then the typo cover).""" + with get_conn() as conn: + row = conn.execute("SELECT image_url FROM articles WHERE id = ?", (article_id,)).fetchone() + url = row["image_url"] if row else None + if not url: + raise HTTPException(status_code=404, detail="no image for this article") + path = newsimg.get_or_fetch(url) + if not path: + raise HTTPException(status_code=404, detail="image unavailable") + media = "image/webp" if path.suffix == ".webp" else None + # Stable per article id (image_url doesn't change) → cache hard at the edge/browser. + return FileResponse(str(path), media_type=media, + headers={"Cache-Control": "public, max-age=31536000, immutable"}) + @app.api_route("/api/art/image/{object_id}", methods=["GET", "HEAD"]) def art_image(object_id: int, size: str = Query("")) -> FileResponse: cdir = art.cache_dir() diff --git a/goodnews/cli.py b/goodnews/cli.py index dede165..9bf7077 100644 --- a/goodnews/cli.py +++ b/goodnews/cli.py @@ -13,7 +13,7 @@ from .games import generate_daily_puzzles from .localtime import local_today from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup from .geo import tag_articles as tag_geo -from . import art, onthisday, quote, wotd +from . import art, newsimg, onthisday, quote, wotd from .enrich import enrich_brief_images, enrich_read_times, enrich_recent_images, enrich_summarized_images from .summarize import generate_summary, get_summary from .feeds import ( @@ -599,6 +599,17 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non except Exception as exc: print(f"recent images: skipped ({exc})") + # Cache downscaled display copies of those images on our own origin (so the + # hub/feed serve local files, not flaky third-party hotlinks), then enforce the + # size ceiling with LRU eviction. Both bounded + non-fatal. + try: + warmed = newsimg.warm(conn) + pr = newsimg.prune() + print(f"images: cached={warmed} size={pr['after'] // 1048576}MB/" + f"{pr['cap'] // 1048576}MB evicted={pr['removed']}") + except Exception as exc: + print(f"images: skipped ({exc})") + # Full-article read-times: count words for recent accepted articles so the # front door can show "Full story · ~N min" next to our gist (bounded per cycle). try: diff --git a/goodnews/newsimg.py b/goodnews/newsimg.py new file mode 100644 index 0000000..e527cf8 --- /dev/null +++ b/goodnews/newsimg.py @@ -0,0 +1,187 @@ +"""Local image cache + downscale for news article images. + +The hub, feed, and article pages used to hotlink each article's image_url straight +from the source's server, so a slow / rate-limited / flaky third-party CDN left a +blank graphic until a refresh. Instead we cache a downscaled display copy on our own +origin (beside the DB, like art_cache) and serve that. The cache is bounded by a HARD +size ceiling with LRU eviction (prune), so it can't grow without limit no matter the +ingest rate. Network + Pillow calls are isolated so tests can monkeypatch them. + +Keyed by a hash of the source URL: a given image_url always maps to the same file. +The API resolves an article id -> its image_url (a tight allowlist — we only ever +fetch URLs already in our own corpus, so it is not an open proxy).""" +from __future__ import annotations + +import hashlib +import io +import os +import urllib.request +from pathlib import Path + +_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"} +_MIN_IMAGE_BYTES = 500 +_MAX_FETCH_BYTES = 20 * 1024 * 1024 # never pull an absurd original into memory +DISPLAY_WIDTH = 800 # cards / feed never show wider than this +WEBP_QUALITY = 80 +DEFAULT_CAP_BYTES = 1024 * 1024 * 1024 # 1 GB hard ceiling (override via env) + + +def cache_dir() -> Path: + """Where cached images live — beside the DB, so the host cycle writes and the API + container reads the same mounted volume (mirrors art.cache_dir).""" + override = os.environ.get("GOODNEWS_IMG_CACHE") + if override: + d = Path(override) + else: + db = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3")) + d = db.parent / "img_cache" + d.mkdir(parents=True, exist_ok=True) + return d + + +def cap_bytes() -> int: + try: + return int(os.environ.get("GOODNEWS_IMG_CACHE_CAP", DEFAULT_CAP_BYTES)) + except ValueError: + return DEFAULT_CAP_BYTES + + +def _key(url: str) -> str: + return hashlib.sha1(url.encode("utf-8")).hexdigest() + + +def _http_bytes(url: str, timeout: int = 12) -> tuple[bytes, str]: + req = urllib.request.Request(url, headers=_UA) + with urllib.request.urlopen(req, timeout=timeout) as r: + return r.read(_MAX_FETCH_BYTES + 1), (r.headers.get("Content-Type") or "") + + +def _encode(data: bytes) -> bytes | None: + """Downscale to DISPLAY_WIDTH and re-encode as WebP. None if it isn't a decodable + raster image (e.g. SVG) — the caller then stores the original bytes as-is.""" + try: + from PIL import Image + im = Image.open(io.BytesIO(data)) + im.load() + if im.mode not in ("RGB", "RGBA"): + im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB") + if im.width > DISPLAY_WIDTH: + h = max(1, round(im.height * DISPLAY_WIDTH / im.width)) + im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS) + out = io.BytesIO() + im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4) + return out.getvalue() + except Exception: # noqa: BLE001 — not a decodable raster image + return None + + +def _ext_for(ctype: str) -> str: + c = ctype.lower() + if "png" in c: + return ".png" + if "gif" in c: + return ".gif" + if "svg" in c: + return ".svg" + if "webp" in c: + return ".webp" + return ".jpg" + + +def path_for(url: str) -> Path | None: + """The cached file for this URL if present (and bump its mtime, the LRU marker).""" + for p in cache_dir().glob(_key(url) + ".*"): + try: + os.utime(p, None) # touch -> last-used time for LRU eviction + except OSError: + pass + return p + return None + + +def get_or_fetch(url: str | None) -> Path | None: + """Cached display copy for a source image URL, fetching + caching on first miss. + Atomic write (temp then rename) so a reader never sees a half-file. None on any + failure — callers (endpoint 404 -> frontend retry/typo cover) degrade gracefully.""" + if not url or not url.startswith(("http://", "https://")): + return None + hit = path_for(url) + if hit: + return hit + try: + data, ctype = _http_bytes(url) + except Exception: # noqa: BLE001 — source down/slow/blocked + return None + if len(data) < _MIN_IMAGE_BYTES or len(data) > _MAX_FETCH_BYTES: + return None + encoded = _encode(data) + if encoded is not None: + blob, ext = encoded, ".webp" + elif ctype.startswith("image/"): + blob, ext = data, _ext_for(ctype) # couldn't re-encode (e.g. SVG): keep original + else: + return None + key = _key(url) + cdir = cache_dir() + tmp = cdir / f".{key}.tmp" + dest = cdir / f"{key}{ext}" + try: + tmp.write_bytes(blob) + os.replace(tmp, dest) # atomic + except OSError: + try: + tmp.unlink() + except OSError: + pass + return None + return dest + + +def warm(conn, limit: int = 200) -> int: + """Pre-fetch display copies for the newest accepted articles that have an image, so + the FIRST page view is already a local hit (no first-view flakiness). Bounded; skips + already-cached. Returns how many it newly cached.""" + rows = conn.execute( + "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id " + "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL " + "AND a.image_url != '' ORDER BY a.id DESC LIMIT ?", + (limit,), + ).fetchall() + made = 0 + for r in rows: + url = r[0] + if path_for(url): + continue + if get_or_fetch(url): + made += 1 + return made + + +def prune(cap: int | None = None) -> dict: + """Enforce the size ceiling: delete least-recently-used files (oldest mtime first) + until the cache is under the cap. Returns {before, after, removed, cap}.""" + if cap is None: + cap = cap_bytes() + files, total = [], 0 + for p in cache_dir().iterdir(): + if not p.is_file() or p.name.startswith("."): + continue + try: + st = p.stat() + except OSError: + continue + files.append((st.st_mtime, st.st_size, p)) + total += st.st_size + before, removed = total, 0 + if total > cap: + files.sort() # oldest mtime first = least recently used + for _mtime, size, p in files: + if total <= cap: + break + try: + p.unlink() + total -= size + removed += 1 + except OSError: + pass + return {"before": before, "after": total, "removed": removed, "cap": cap} diff --git a/goodnews/share.py b/goodnews/share.py index 81d1457..0b9f877 100644 --- a/goodnews/share.py +++ b/goodnews/share.py @@ -51,8 +51,11 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None, _tag("twitter:image", image, attr="name"), ])) + # The visible image is served from our cached/downscaled copy (not a hotlink), so a + # flaky source CDN can't blank it. og:image/twitter:image above stay the source URL + # so social crawlers fetch the canonical image directly. media = ( - f'' + f'' if image else "" ) diff --git a/tests/test_newsimg.py b/tests/test_newsimg.py new file mode 100644 index 0000000..515dd05 --- /dev/null +++ b/tests/test_newsimg.py @@ -0,0 +1,101 @@ +"""News image cache: fetch + downscale to WebP, cache-hit (no refetch), graceful +failure, LRU eviction under a size cap, cycle warm, and the /api/img endpoint +(allowlisted to our own corpus).""" +import io +import os + +import pytest +from PIL import Image +from fastapi.testclient import TestClient + +from goodnews import newsimg +from goodnews.db import connect, init_db + + +def _png(w=1600, h=1000, color=(40, 130, 173)): + out = io.BytesIO() + Image.new("RGB", (w, h), color).save(out, format="PNG") + return out.getvalue() + + +@pytest.fixture +def cache(tmp_path, monkeypatch): + monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache")) + return tmp_path / "img_cache" + + +def test_get_or_fetch_caches_and_downscales(cache, monkeypatch): + calls = [] + monkeypatch.setattr(newsimg, "_http_bytes", + lambda url, timeout=12: (calls.append(url), (_png(), "image/png"))[1]) + p = newsimg.get_or_fetch("https://example.com/big.png") + assert p and p.exists() and p.suffix == ".webp" + with Image.open(p) as im: # downscaled + re-encoded + assert im.width == newsimg.DISPLAY_WIDTH and im.format == "WEBP" + assert newsimg.get_or_fetch("https://example.com/big.png") == p # cache hit + assert len(calls) == 1 # ...not refetched + + +def test_get_or_fetch_rejects_non_image_and_bad_scheme(cache, monkeypatch): + monkeypatch.setattr(newsimg, "_http_bytes", + lambda url, timeout=12: (b"nope", "text/html")) + assert newsimg.get_or_fetch("https://example.com/page.html") is None + assert newsimg.get_or_fetch(None) is None + assert newsimg.get_or_fetch("ftp://example.com/x.png") is None # http(s) only (no SSRF surface) + + +def test_fetch_failure_returns_none(cache, monkeypatch): + def boom(url, timeout=12): + raise OSError("source down") + monkeypatch.setattr(newsimg, "_http_bytes", boom) + assert newsimg.get_or_fetch("https://example.com/x.jpg") is None + + +def test_prune_evicts_least_recently_used_over_cap(cache, monkeypatch): + monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png")) + paths = [newsimg.get_or_fetch(f"https://example.com/{i}.png") for i in range(5)] + for i, p in enumerate(paths): # 0 = oldest/LRU, 4 = newest + os.utime(p, (1000 + i, 1000 + i)) + sizes = [p.stat().st_size for p in paths] + cap = sum(sizes) - sizes[0] - sizes[1] + 1 # room for the 3 newest only + r = newsimg.prune(cap) + assert r["removed"] == 2 and r["after"] <= cap + assert not paths[0].exists() and not paths[1].exists() # the two oldest evicted + assert paths[2].exists() and paths[4].exists() # newer kept + + +def test_warm_caches_recent_accepted_with_image(cache, monkeypatch): + monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png")) + conn = connect(":memory:"); init_db(conn) + conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')") + for aid, img in ((1, "https://x/1.jpg"), (2, "https://x/2.jpg"), (3, "")): + conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) " + "VALUES (?,1,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img)) + conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,1)", (aid,)) + conn.commit() + assert newsimg.warm(conn) == 2 # the two with an image + assert newsimg.warm(conn) == 0 # idempotent (already cached) + + +@pytest.fixture +def client(tmp_path, monkeypatch): + monkeypatch.setenv("GOODNEWS_DB", str(tmp_path / "t.sqlite3")) + monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache")) + monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png")) + conn = connect(tmp_path / "t.sqlite3"); init_db(conn) + conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')") + conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) " + "VALUES (1,1,'http://s/1','t1','h1','https://x/pic.jpg')") + conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) " + "VALUES (2,1,'http://s/2','t2','h2','')") # no image + conn.commit(); conn.close() + from goodnews.api import create_app + return TestClient(create_app()) + + +def test_img_endpoint_serves_and_allowlists(client): + r = client.get("/api/img/1") + assert r.status_code == 200 and r.headers["content-type"] == "image/webp" + assert "immutable" in r.headers.get("cache-control", "") + assert client.get("/api/img/2").status_code == 404 # article has no image + assert client.get("/api/img/999").status_code == 404 # unknown id (not in corpus)