8a3c00db3b
Stop hotlinking news images from third-party CDNs (the source of the "blank until
you refresh a few times" graphic). New goodnews/newsimg.py caches a downscaled WebP
display copy (≤800px) beside the DB, like art_cache:
- GET/HEAD /api/img/{article_id} — resolves id→image_url (allowlisted to our corpus,
not an open proxy), fetch+cache on first miss, serve local after, immutable headers.
- cycle warms display copies for recent accepted-with-image articles (so the FIRST
view is already local) and prunes to a hard size cap (default 1 GB) by LRU eviction.
Frontend now points at /api/img/<id>: the hub lead, every ArticleCard (feed hero +
cards), and the /a/<id> share page's visible image. og:image/twitter:image stay the
source URL so social crawlers fetch the canonical image directly.
Storage is bounded by construction — over the cap, least-recently-used files are
evicted, so it can't grow without limit regardless of ingest rate. Tests cover
fetch/downscale, cache-hit (no refetch), bad-scheme/non-image rejection, fetch
failure, LRU prune, warm, and the endpoint allowlist.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
102 lines
4.8 KiB
Python
102 lines
4.8 KiB
Python
"""News image cache: fetch + downscale to WebP, cache-hit (no refetch), graceful
|
|
failure, LRU eviction under a size cap, cycle warm, and the /api/img endpoint
|
|
(allowlisted to our own corpus)."""
|
|
import io
|
|
import os
|
|
|
|
import pytest
|
|
from PIL import Image
|
|
from fastapi.testclient import TestClient
|
|
|
|
from goodnews import newsimg
|
|
from goodnews.db import connect, init_db
|
|
|
|
|
|
def _png(w=1600, h=1000, color=(40, 130, 173)):
    """Return the encoded bytes of a solid-colour RGB PNG of size ``w`` x ``h``."""
    buffer = io.BytesIO()
    image = Image.new("RGB", (w, h), color)
    image.save(buffer, format="PNG")
    return buffer.getvalue()
|
|
|
|
|
|
@pytest.fixture
def cache(tmp_path, monkeypatch):
    """Point ``GOODNEWS_IMG_CACHE`` at a per-test directory and return that path.

    The directory itself is not created here; only the environment variable is set.
    """
    cache_dir = tmp_path / "img_cache"
    monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(cache_dir))
    return cache_dir
|
|
|
|
|
|
def test_get_or_fetch_caches_and_downscales(cache, monkeypatch):
    """The first call fetches and stores a WebP copy; the second call reuses it.

    The stubbed source image is 1600px wide, and the stored copy is expected
    to be exactly ``newsimg.DISPLAY_WIDTH`` wide.
    """
    source_url = "https://example.com/big.png"
    fetched = []

    def recording_fetch(url, timeout=12):
        # Log every upstream request so a refetch on cache hit is detectable.
        fetched.append(url)
        return _png(), "image/png"

    monkeypatch.setattr(newsimg, "_http_bytes", recording_fetch)

    stored = newsimg.get_or_fetch(source_url)
    assert stored
    assert stored.exists()
    assert stored.suffix == ".webp"

    # The cached file must be downscaled and re-encoded, not the raw PNG.
    with Image.open(stored) as picture:
        assert picture.width == newsimg.DISPLAY_WIDTH
        assert picture.format == "WEBP"

    # Cache hit: same path comes back...
    assert newsimg.get_or_fetch(source_url) == stored
    # ...and the source was only contacted once.
    assert len(fetched) == 1
|
|
|
|
|
|
def test_get_or_fetch_rejects_non_image_and_bad_scheme(cache, monkeypatch):
    """``get_or_fetch`` returns None for an HTML payload, a missing URL, and a non-http(s) scheme."""

    def html_response(url, timeout=12):
        return b"<html>nope</html>", "text/html"

    monkeypatch.setattr(newsimg, "_http_bytes", html_response)

    rejected_inputs = (
        "https://example.com/page.html",  # body is HTML, not an image
        None,  # no URL at all
        "ftp://example.com/x.png",  # http(s) only (no SSRF surface)
    )
    for candidate in rejected_inputs:
        assert newsimg.get_or_fetch(candidate) is None
|
|
|
|
|
|
def test_fetch_failure_returns_none(cache, monkeypatch):
    """An ``OSError`` raised by the fetcher results in None instead of propagating."""

    def failing_fetch(url, timeout=12):
        raise OSError("source down")

    monkeypatch.setattr(newsimg, "_http_bytes", failing_fetch)

    outcome = newsimg.get_or_fetch("https://example.com/x.jpg")
    assert outcome is None
|
|
|
|
|
|
def test_prune_evicts_least_recently_used_over_cap(cache, monkeypatch):
    """``prune`` removes the oldest-stamped files until the total fits the cap.

    Five files are cached and stamped with ascending timestamps; the cap is
    sized to hold only the three newest, so the two oldest must be removed.
    """

    def png_response(url, timeout=12):
        return _png(), "image/png"

    monkeypatch.setattr(newsimg, "_http_bytes", png_response)

    cached = [
        newsimg.get_or_fetch(f"https://example.com/{index}.png")
        for index in range(5)
    ]

    # Index 0 gets the oldest timestamp (LRU), index 4 the newest.
    for age, path in enumerate(cached):
        stamp = 1000 + age
        os.utime(path, (stamp, stamp))

    sizes = [path.stat().st_size for path in cached]
    # Room for the 3 newest only: everything except the two oldest, plus one byte.
    budget = sum(sizes[2:]) + 1

    report = newsimg.prune(budget)
    assert report["removed"] == 2
    assert report["after"] <= budget

    # The two oldest are evicted...
    assert not cached[0].exists()
    assert not cached[1].exists()
    # ...while newer entries are kept.
    assert cached[2].exists()
    assert cached[4].exists()
|
|
|
|
|
|
def test_warm_caches_recent_accepted_with_image(cache, monkeypatch):
    """``warm`` caches every accepted article that has an image URL, once.

    Three accepted articles are seeded: two with an image URL and one with an
    empty string. The first ``warm`` call is expected to report 2; a second
    call reports 0 because both copies are already cached.
    """
    monkeypatch.setattr(newsimg, "_http_bytes", lambda url, timeout=12: (_png(), "image/png"))
    conn = connect(":memory:")
    try:
        init_db(conn)
        conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
        for aid, img in ((1, "https://x/1.jpg"), (2, "https://x/2.jpg"), (3, "")):
            conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
                         "VALUES (?,1,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img))
            conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,1)", (aid,))
        conn.commit()
        assert newsimg.warm(conn) == 2  # the two with an image
        assert newsimg.warm(conn) == 0  # idempotent (already cached)
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
|
|
|
|
|
|
@pytest.fixture
def client(tmp_path, monkeypatch):
    """Return a ``TestClient`` for the app, backed by a seeded per-test database.

    The database holds one source and two articles: id 1 with an image URL and
    id 2 with an empty one. Upstream image fetches are stubbed to return a PNG.
    """
    db_path = tmp_path / "t.sqlite3"
    monkeypatch.setenv("GOODNEWS_DB", str(db_path))
    monkeypatch.setenv("GOODNEWS_IMG_CACHE", str(tmp_path / "img_cache"))

    def png_response(url, timeout=12):
        return _png(), "image/png"

    monkeypatch.setattr(newsimg, "_http_bytes", png_response)

    conn = connect(db_path)
    init_db(conn)
    seed_statements = (
        "INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')",
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
        "VALUES (1,1,'http://s/1','t1','h1','https://x/pic.jpg')",
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
        "VALUES (2,1,'http://s/2','t2','h2','')",  # no image
    )
    for statement in seed_statements:
        conn.execute(statement)
    conn.commit()
    conn.close()

    # Imported only after the environment variables above are set
    # (presumably so the app reads the patched paths — confirm).
    from goodnews.api import create_app

    app = create_app()
    return TestClient(app)
|
|
|
|
|
|
def test_img_endpoint_serves_and_allowlists(client):
    """``/api/img/<id>`` serves WebP for a known article with an image, else 404."""
    served = client.get("/api/img/1")
    assert served.status_code == 200
    assert served.headers["content-type"] == "image/webp"
    cache_control = served.headers.get("cache-control", "")
    assert "immutable" in cache_control

    no_image = client.get("/api/img/2")
    assert no_image.status_code == 404  # article has no image

    unknown = client.get("/api/img/999")
    assert unknown.status_code == 404  # unknown id (not in corpus)
|