images: cache + serve article images from our own origin (bounded, LRU-evicted)
Stop hotlinking news images from third-party CDNs (the source of the "blank until
you refresh a few times" graphic). New goodnews/newsimg.py caches a downscaled WebP
display copy (≤800px) beside the DB, like art_cache:
- GET/HEAD /api/img/{article_id} — resolves id→image_url (allowlisted to our corpus,
not an open proxy), fetch+cache on first miss, serve local after, immutable headers.
- cycle warms display copies for recent accepted-with-image articles (so the FIRST
view is already local) and prunes to a hard size cap (default 1 GB) by LRU eviction.
Frontend now points at /api/img/<id>: the hub lead, every ArticleCard (feed hero +
cards), and the /a/<id> share page's visible image. og:image/twitter:image stay the
source URL so social crawlers fetch the canonical image directly.
Storage is bounded by construction — over the cap, least-recently-used files are
evicted, so it can't grow without limit regardless of ingest rate. Tests cover
fetch/downscale, cache-hit (no refetch), bad-scheme/non-image rejection, fetch
failure, LRU prune, warm, and the endpoint allowlist.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -100,7 +100,8 @@
|
||||
>
|
||||
{#if showImage}
|
||||
<a class="media" href={summaryHref} onclick={opened}>
|
||||
<img src={article.image_url} alt="" loading="lazy" referrerpolicy="no-referrer"
|
||||
<!-- our cached/downscaled copy (/api/img/<id>) instead of hotlinking the source -->
|
||||
<img src={article.id ? `/api/img/${article.id}` : article.image_url} alt="" loading="lazy" referrerpolicy="no-referrer"
|
||||
onerror={() => { failed = true; onimageerror?.(); }} />
|
||||
</a>
|
||||
{:else if usePlaceholder}
|
||||
|
||||
@@ -112,7 +112,9 @@
|
||||
const q = P.param(prefs.data); // the reader's boundaries (excluded topics, ceilings)
|
||||
try {
|
||||
const it = (await getJSON(`/api/brief?limit=1${homeq}${q ? '&' + q : ''}`))?.items?.[0];
|
||||
if (it) news = { id: it.id, title: it.title, summary: it.summary || it.description || '', image: it.image_url || null, topic: it.topic || null, source_read_minutes: it.source_read_minutes };
|
||||
// Serve the photo from our own cached/downscaled copy (/api/img/<id>) rather than
|
||||
// hotlinking the source — keep image_url only as the "has a photo?" signal.
|
||||
if (it) news = { id: it.id, title: it.title, summary: it.summary || it.description || '', image: it.image_url ? `/api/img/${it.id}` : null, topic: it.topic || null, source_read_minutes: it.source_read_minutes };
|
||||
// News images are hotlinked from the source, so a single fetch can transiently
|
||||
// fail (slow CDN, rate limit, hiccup) and leave a blank plate until you refresh.
|
||||
// Probe with a couple of retries and reveal the photo only once it's truly loaded
|
||||
|
||||
+20
-1
@@ -36,7 +36,7 @@ from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
|
||||
from . import art, auth, bloom, daily, email_send, feeds, games, oauth_google, onthisday, publishing, queries, quote, readtime, share, sources, summarize, wotd
|
||||
from . import art, auth, bloom, daily, email_send, feeds, games, newsimg, oauth_google, onthisday, publishing, queries, quote, readtime, share, sources, summarize, wotd
|
||||
from .localtime import local_today
|
||||
from .markup import reply_html_to_text, sanitize_reply_html
|
||||
from .db import connect
|
||||
@@ -2332,6 +2332,25 @@ def create_app() -> FastAPI:
|
||||
"image_url_large": f"/api/art/image/{a['object_id']}?size=full",
|
||||
}
|
||||
|
||||
@app.api_route("/api/img/{article_id}", methods=["GET", "HEAD"])
|
||||
def article_image(article_id: int) -> FileResponse:
|
||||
"""Serve a locally-cached, downscaled copy of an article's image instead of
|
||||
hotlinking the source. Allowlisted by construction: the id resolves to a URL
|
||||
already in our corpus (no open proxy). A miss fetches+caches once; a hard
|
||||
failure 404s, which the frontend handles (retry, then the typo cover)."""
|
||||
with get_conn() as conn:
|
||||
row = conn.execute("SELECT image_url FROM articles WHERE id = ?", (article_id,)).fetchone()
|
||||
url = row["image_url"] if row else None
|
||||
if not url:
|
||||
raise HTTPException(status_code=404, detail="no image for this article")
|
||||
path = newsimg.get_or_fetch(url)
|
||||
if not path:
|
||||
raise HTTPException(status_code=404, detail="image unavailable")
|
||||
media = "image/webp" if path.suffix == ".webp" else None
|
||||
# Stable per article id (image_url doesn't change) → cache hard at the edge/browser.
|
||||
return FileResponse(str(path), media_type=media,
|
||||
headers={"Cache-Control": "public, max-age=31536000, immutable"})
|
||||
|
||||
@app.api_route("/api/art/image/{object_id}", methods=["GET", "HEAD"])
|
||||
def art_image(object_id: int, size: str = Query("")) -> FileResponse:
|
||||
cdir = art.cache_dir()
|
||||
|
||||
+12
-1
@@ -13,7 +13,7 @@ from .games import generate_daily_puzzles
|
||||
from .localtime import local_today
|
||||
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
|
||||
from .geo import tag_articles as tag_geo
|
||||
from . import art, onthisday, quote, wotd
|
||||
from . import art, newsimg, onthisday, quote, wotd
|
||||
from .enrich import enrich_brief_images, enrich_read_times, enrich_recent_images, enrich_summarized_images
|
||||
from .summarize import generate_summary, get_summary
|
||||
from .feeds import (
|
||||
@@ -599,6 +599,17 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
|
||||
except Exception as exc:
|
||||
print(f"recent images: skipped ({exc})")
|
||||
|
||||
# Cache downscaled display copies of those images on our own origin (so the
|
||||
# hub/feed serve local files, not flaky third-party hotlinks), then enforce the
|
||||
# size ceiling with LRU eviction. Both bounded + non-fatal.
|
||||
try:
|
||||
warmed = newsimg.warm(conn)
|
||||
pr = newsimg.prune()
|
||||
print(f"images: cached={warmed} size={pr['after'] // 1048576}MB/"
|
||||
f"{pr['cap'] // 1048576}MB evicted={pr['removed']}")
|
||||
except Exception as exc:
|
||||
print(f"images: skipped ({exc})")
|
||||
|
||||
# Full-article read-times: count words for recent accepted articles so the
|
||||
# front door can show "Full story · ~N min" next to our gist (bounded per cycle).
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,187 @@
|
||||
"""Local image cache + downscale for news article images.
|
||||
|
||||
The hub, feed, and article pages used to hotlink each article's image_url straight
|
||||
from the source's server, so a slow / rate-limited / flaky third-party CDN left a
|
||||
blank graphic until a refresh. Instead we cache a downscaled display copy on our own
|
||||
origin (beside the DB, like art_cache) and serve that. The cache is bounded by a HARD
|
||||
size ceiling with LRU eviction (prune), so it can't grow without limit no matter the
|
||||
ingest rate. Network + Pillow calls are isolated so tests can monkeypatch them.
|
||||
|
||||
Keyed by a hash of the source URL: a given image_url always maps to the same file.
|
||||
The API resolves an article id -> its image_url (a tight allowlist — we only ever
|
||||
fetch URLs already in our own corpus, so it is not an open proxy)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import io
|
||||
import os
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
|
||||
_MIN_IMAGE_BYTES = 500
|
||||
_MAX_FETCH_BYTES = 20 * 1024 * 1024 # never pull an absurd original into memory
|
||||
DISPLAY_WIDTH = 800 # cards / feed never show wider than this
|
||||
WEBP_QUALITY = 80
|
||||
DEFAULT_CAP_BYTES = 1024 * 1024 * 1024 # 1 GB hard ceiling (override via env)
|
||||
|
||||
|
||||
def cache_dir() -> Path:
|
||||
"""Where cached images live — beside the DB, so the host cycle writes and the API
|
||||
container reads the same mounted volume (mirrors art.cache_dir)."""
|
||||
override = os.environ.get("GOODNEWS_IMG_CACHE")
|
||||
if override:
|
||||
d = Path(override)
|
||||
else:
|
||||
db = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
|
||||
d = db.parent / "img_cache"
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
return d
|
||||
|
||||
|
||||
def cap_bytes() -> int:
|
||||
try:
|
||||
return int(os.environ.get("GOODNEWS_IMG_CACHE_CAP", DEFAULT_CAP_BYTES))
|
||||
except ValueError:
|
||||
return DEFAULT_CAP_BYTES
|
||||
|
||||
|
||||
def _key(url: str) -> str:
|
||||
return hashlib.sha1(url.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _http_bytes(url: str, timeout: int = 12) -> tuple[bytes, str]:
|
||||
req = urllib.request.Request(url, headers=_UA)
|
||||
with urllib.request.urlopen(req, timeout=timeout) as r:
|
||||
return r.read(_MAX_FETCH_BYTES + 1), (r.headers.get("Content-Type") or "")
|
||||
|
||||
|
||||
def _encode(data: bytes) -> bytes | None:
|
||||
"""Downscale to DISPLAY_WIDTH and re-encode as WebP. None if it isn't a decodable
|
||||
raster image (e.g. SVG) — the caller then stores the original bytes as-is."""
|
||||
try:
|
||||
from PIL import Image
|
||||
im = Image.open(io.BytesIO(data))
|
||||
im.load()
|
||||
if im.mode not in ("RGB", "RGBA"):
|
||||
im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
|
||||
if im.width > DISPLAY_WIDTH:
|
||||
h = max(1, round(im.height * DISPLAY_WIDTH / im.width))
|
||||
im = im.resize((DISPLAY_WIDTH, h), Image.LANCZOS)
|
||||
out = io.BytesIO()
|
||||
im.save(out, format="WEBP", quality=WEBP_QUALITY, method=4)
|
||||
return out.getvalue()
|
||||
except Exception: # noqa: BLE001 — not a decodable raster image
|
||||
return None
|
||||
|
||||
|
||||
def _ext_for(ctype: str) -> str:
|
||||
c = ctype.lower()
|
||||
if "png" in c:
|
||||
return ".png"
|
||||
if "gif" in c:
|
||||
return ".gif"
|
||||
if "svg" in c:
|
||||
return ".svg"
|
||||
if "webp" in c:
|
||||
return ".webp"
|
||||
return ".jpg"
|
||||
|
||||
|
||||
def path_for(url: str) -> Path | None:
|
||||
"""The cached file for this URL if present (and bump its mtime, the LRU marker)."""
|
||||
for p in cache_dir().glob(_key(url) + ".*"):
|
||||
try:
|
||||
os.utime(p, None) # touch -> last-used time for LRU eviction
|
||||
except OSError:
|
||||
pass
|
||||
return p
|
||||
return None
|
||||
|
||||
|
||||
def get_or_fetch(url: str | None) -> Path | None:
|
||||
"""Cached display copy for a source image URL, fetching + caching on first miss.
|
||||
Atomic write (temp then rename) so a reader never sees a half-file. None on any
|
||||
failure — callers (endpoint 404 -> frontend retry/typo cover) degrade gracefully."""
|
||||
if not url or not url.startswith(("http://", "https://")):
|
||||
return None
|
||||
hit = path_for(url)
|
||||
if hit:
|
||||
return hit
|
||||
try:
|
||||
data, ctype = _http_bytes(url)
|
||||
except Exception: # noqa: BLE001 — source down/slow/blocked
|
||||
return None
|
||||
if len(data) < _MIN_IMAGE_BYTES or len(data) > _MAX_FETCH_BYTES:
|
||||
return None
|
||||
encoded = _encode(data)
|
||||
if encoded is not None:
|
||||
blob, ext = encoded, ".webp"
|
||||
elif ctype.startswith("image/"):
|
||||
blob, ext = data, _ext_for(ctype) # couldn't re-encode (e.g. SVG): keep original
|
||||
else:
|
||||
return None
|
||||
key = _key(url)
|
||||
cdir = cache_dir()
|
||||
tmp = cdir / f".{key}.tmp"
|
||||
dest = cdir / f"{key}{ext}"
|
||||
try:
|
||||
tmp.write_bytes(blob)
|
||||
os.replace(tmp, dest) # atomic
|
||||
except OSError:
|
||||
try:
|
||||
tmp.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
return None
|
||||
return dest
|
||||
|
||||
|
||||
def warm(conn, limit: int = 200) -> int:
|
||||
"""Pre-fetch display copies for the newest accepted articles that have an image, so
|
||||
the FIRST page view is already a local hit (no first-view flakiness). Bounded; skips
|
||||
already-cached. Returns how many it newly cached."""
|
||||
rows = conn.execute(
|
||||
"SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
|
||||
"WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
|
||||
"AND a.image_url != '' ORDER BY a.id DESC LIMIT ?",
|
||||
(limit,),
|
||||
).fetchall()
|
||||
made = 0
|
||||
for r in rows:
|
||||
url = r[0]
|
||||
if path_for(url):
|
||||
continue
|
||||
if get_or_fetch(url):
|
||||
made += 1
|
||||
return made
|
||||
|
||||
|
||||
def prune(cap: int | None = None) -> dict:
|
||||
"""Enforce the size ceiling: delete least-recently-used files (oldest mtime first)
|
||||
until the cache is under the cap. Returns {before, after, removed, cap}."""
|
||||
if cap is None:
|
||||
cap = cap_bytes()
|
||||
files, total = [], 0
|
||||
for p in cache_dir().iterdir():
|
||||
if not p.is_file() or p.name.startswith("."):
|
||||
continue
|
||||
try:
|
||||
st = p.stat()
|
||||
except OSError:
|
||||
continue
|
||||
files.append((st.st_mtime, st.st_size, p))
|
||||
total += st.st_size
|
||||
before, removed = total, 0
|
||||
if total > cap:
|
||||
files.sort() # oldest mtime first = least recently used
|
||||
for _mtime, size, p in files:
|
||||
if total <= cap:
|
||||
break
|
||||
try:
|
||||
p.unlink()
|
||||
total -= size
|
||||
removed += 1
|
||||
except OSError:
|
||||
pass
|
||||
return {"before": before, "after": total, "removed": removed, "cap": cap}
|
||||
+4
-1
@@ -51,8 +51,11 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None,
|
||||
_tag("twitter:image", image, attr="name"),
|
||||
]))
|
||||
|
||||
# The visible image is served from our cached/downscaled copy (not a hotlink), so a
|
||||
# flaky source CDN can't blank it. og:image/twitter:image above stay the source URL
|
||||
# so social crawlers fetch the canonical image directly.
|
||||
media = (
|
||||
f'<img class="media" src="{escape(image)}" alt="" referrerpolicy="no-referrer">'
|
||||
f'<img class="media" src="/api/img/{aid}" alt="" referrerpolicy="no-referrer">'
|
||||
if image else ""
|
||||
)
|
||||
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
"""News image cache: fetch + downscale to WebP, cache-hit (no refetch), graceful
|
||||
failure, LRU eviction under a size cap, cycle warm, and the /api/img endpoint
|
||||
(allowlisted to our own corpus)."""
|
||||
import io
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from goodnews import newsimg
|
||||
from goodnews.db import connect, init_db
|
||||
|
||||
|
||||
def _png(w=1600, h=1000, color=(40, 130, 173)):
|
||||
out = io.BytesIO()
|
||||
Image.new("RGB", (w, h), color).save(out, format="PNG")
|
||||
return out.getvalue()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cache(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache"))
|
||||
return tmp_path / "img_cache"
|
||||
|
||||
|
||||
def test_get_or_fetch_caches_and_downscales(cache, monkeypatch):
|
||||
calls = []
|
||||
monkeypatch.setattr(newsimg, "_http_bytes",
|
||||
lambda url, timeout=12: (calls.append(url), (_png(), "image/png"))[1])
|
||||
p = newsimg.get_or_fetch("https://example.com/big.png")
|
||||
assert p and p.exists() and p.suffix == ".webp"
|
||||
with Image.open(p) as im: # downscaled + re-encoded
|
||||
assert im.width == newsimg.DISPLAY_WIDTH and im.format == "WEBP"
|
||||
assert newsimg.get_or_fetch("https://example.com/big.png") == p # cache hit
|
||||
assert len(calls) == 1 # ...not refetched
|
||||
|
||||
|
||||
def test_get_or_fetch_rejects_non_image_and_bad_scheme(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_http_bytes",
|
||||
lambda url, timeout=12: (b"<html>nope</html>", "text/html"))
|
||||
assert newsimg.get_or_fetch("https://example.com/page.html") is None
|
||||
assert newsimg.get_or_fetch(None) is None
|
||||
assert newsimg.get_or_fetch("ftp://example.com/x.png") is None # http(s) only (no SSRF surface)
|
||||
|
||||
|
||||
def test_fetch_failure_returns_none(cache, monkeypatch):
|
||||
def boom(url, timeout=12):
|
||||
raise OSError("source down")
|
||||
monkeypatch.setattr(newsimg, "_http_bytes", boom)
|
||||
assert newsimg.get_or_fetch("https://example.com/x.jpg") is None
|
||||
|
||||
|
||||
def test_prune_evicts_least_recently_used_over_cap(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
|
||||
paths = [newsimg.get_or_fetch(f"https://example.com/{i}.png") for i in range(5)]
|
||||
for i, p in enumerate(paths): # 0 = oldest/LRU, 4 = newest
|
||||
os.utime(p, (1000 + i, 1000 + i))
|
||||
sizes = [p.stat().st_size for p in paths]
|
||||
cap = sum(sizes) - sizes[0] - sizes[1] + 1 # room for the 3 newest only
|
||||
r = newsimg.prune(cap)
|
||||
assert r["removed"] == 2 and r["after"] <= cap
|
||||
assert not paths[0].exists() and not paths[1].exists() # the two oldest evicted
|
||||
assert paths[2].exists() and paths[4].exists() # newer kept
|
||||
|
||||
|
||||
def test_warm_caches_recent_accepted_with_image(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
|
||||
conn = connect(":memory:"); init_db(conn)
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
|
||||
for aid, img in ((1, "https://x/1.jpg"), (2, "https://x/2.jpg"), (3, "")):
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
|
||||
"VALUES (?,1,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img))
|
||||
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,1)", (aid,))
|
||||
conn.commit()
|
||||
assert newsimg.warm(conn) == 2 # the two with an image
|
||||
assert newsimg.warm(conn) == 0 # idempotent (already cached)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("GOODNEWS_DB", str(tmp_path / "t.sqlite3"))
|
||||
monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache"))
|
||||
monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
|
||||
conn = connect(tmp_path / "t.sqlite3"); init_db(conn)
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
|
||||
"VALUES (1,1,'http://s/1','t1','h1','https://x/pic.jpg')")
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
|
||||
"VALUES (2,1,'http://s/2','t2','h2','')") # no image
|
||||
conn.commit(); conn.close()
|
||||
from goodnews.api import create_app
|
||||
return TestClient(create_app())
|
||||
|
||||
|
||||
def test_img_endpoint_serves_and_allowlists(client):
|
||||
r = client.get("/api/img/1")
|
||||
assert r.status_code == 200 and r.headers["content-type"] == "image/webp"
|
||||
assert "immutable" in r.headers.get("cache-control", "")
|
||||
assert client.get("/api/img/2").status_code == 404 # article has no image
|
||||
assert client.get("/api/img/999").status_code == 404 # unknown id (not in corpus)
|
||||
Reference in New Issue
Block a user