images: fix two fetcher bugs + add source-level image-rights policy (Codex)
Fetcher (the two remaining bugs Codex found):
- Real redirects are now followed. _NoRedirect makes urllib RAISE HTTPError on 3xx, so
the old status-branch was dead code (mocked tests masked it). Handle 301/302/303/307/308
HTTPError as redirects (re-validate the destination); classify 4xx≠429 as PERMANENT
(negative-cached), 429/5xx/network as transient. Real-opener redirect + 404/5xx tests.
- The megapixel ceiling is now enforced: explicit `w*h > _MAX_PIXELS` check BEFORE load()
(Pillow only warns at MAX_IMAGE_PIXELS). Test with a lowered ceiling.
Image-rights policy (per Codex + owner decision — only cache what's cleared):
- sources.image_policy: 'cache' (re-host a downscaled copy — license/permission/PD only),
'remote' (hotlink the publisher's image — the conservative DEFAULT), 'none' (no image).
- newsimg.display_url resolves the display URL per policy; applied in Article.from_row so
feed/brief/history return the right URL, and in share.py (og/twitter still reference the
publisher's own image, never re-hosted). warm() + /api/img both gated on 'cache'.
- Frontend uses the server-resolved image_url (reverted the hardcoded /api/img); the
graceful retry covers remote hotlinks too. Admin: per-source image-policy selector +
POST /api/admin/sources/{id}/image-policy. Default 'remote' → nothing re-hosted until
a source is explicitly cleared.
445 backend + 36 frontend tests pass.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -585,3 +585,21 @@ def test_source_paywall_override(tmp_path, monkeypatch):
|
||||
# validation + 404
|
||||
assert tc.post("/api/admin/sources/2/paywall", json={"override": "bogus"}).status_code == 422
|
||||
assert tc.post("/api/admin/sources/999/paywall", json={"override": "free"}).status_code == 404
|
||||
|
||||
|
||||
def test_source_image_policy(tmp_path, monkeypatch):
|
||||
import sqlite3, os
|
||||
app, api = _make(tmp_path, monkeypatch, admin_email="boss@x.com")
|
||||
c = sqlite3.connect(os.environ["GOODNEWS_DB"])
|
||||
c.execute("INSERT INTO sources (id,name,feed_url) VALUES (2,'Gov','http://g/f')")
|
||||
c.commit(); c.close()
|
||||
anon = TestClient(app)
|
||||
assert anon.post("/api/admin/sources/2/image-policy", json={"policy": "cache"}).status_code == 401 # gated
|
||||
tc = _signin(app, api, "boss@x.com")
|
||||
assert tc.post("/api/admin/sources/2/image-policy", json={"policy": "cache"}).json()["policy"] == "cache"
|
||||
c = sqlite3.connect(os.environ["GOODNEWS_DB"])
|
||||
assert c.execute("SELECT image_policy FROM sources WHERE id=2").fetchone()[0] == "cache"
|
||||
c.close()
|
||||
assert tc.post("/api/admin/sources/2/image-policy", json={"policy": "remote"}).json()["policy"] == "remote"
|
||||
assert tc.post("/api/admin/sources/2/image-policy", json={"policy": "bogus"}).status_code == 422
|
||||
assert tc.post("/api/admin/sources/999/image-policy", json={"policy": "cache"}).status_code == 404
|
||||
|
||||
+102
-27
@@ -16,6 +16,30 @@ from goodnews.db import connect, init_db
|
||||
_REAL_SAFE_FETCH = newsimg._safe_fetch
|
||||
|
||||
|
||||
class _Resp:
|
||||
"""Minimal non-redirect (2xx) response for a fake opener."""
|
||||
def __init__(self, headers, body=b""):
|
||||
self.status, self.headers, self._b = 200, headers, body
|
||||
def read(self, n=-1):
|
||||
return self._b
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
|
||||
def _http_error(url, code, location=None):
|
||||
"""A real urllib HTTPError, as _NoRedirect raises for 3xx (and urllib for 4xx/5xx)."""
|
||||
import urllib.error
|
||||
hdrs = {"Location": location} if location else {}
|
||||
return urllib.error.HTTPError(url, code, "x", hdrs, io.BytesIO(b""))
|
||||
|
||||
|
||||
def _fake_opener(handler):
|
||||
class _Op:
|
||||
def open(self, req, timeout=None):
|
||||
return handler(req.full_url)
|
||||
return lambda *a, **k: _Op()
|
||||
|
||||
|
||||
def _png(w=1600, h=1000, color=(40, 130, 173)):
|
||||
out = io.BytesIO()
|
||||
Image.new("RGB", (w, h), color).save(out, format="PNG")
|
||||
@@ -64,30 +88,64 @@ def test_private_host_refused_and_negative_cached(cache, monkeypatch):
|
||||
|
||||
|
||||
def test_redirect_to_private_is_refused(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH) # exercise real redirect re-validation
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH)
|
||||
hosts = {"good.example": True, "evil.internal": False}
|
||||
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: hosts.get(h, False))
|
||||
|
||||
class _Resp:
|
||||
def __init__(self, status, headers, body=b""):
|
||||
self.status, self.headers, self._b = status, headers, body
|
||||
def read(self, n=-1):
|
||||
return self._b
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
class _Opener:
|
||||
def open(self, req, timeout=None):
|
||||
if "good.example" in req.full_url: # public → 302 to private
|
||||
return _Resp(302, {"Location": "http://evil.internal/x.png"})
|
||||
return _Resp(200, {"Content-Type": "image/png"}, _png()) # should never be reached
|
||||
monkeypatch.setattr(newsimg.urllib.request, "build_opener", lambda *a: _Opener())
|
||||
def handle(url): # real 302 (HTTPError) → private dest
|
||||
if "good.example" in url:
|
||||
raise _http_error(url, 302, "http://evil.internal/x.png")
|
||||
return _Resp({"Content-Type": "image/png"}, _png()) # must never be reached
|
||||
monkeypatch.setattr(newsimg.urllib.request, "build_opener", _fake_opener(handle))
|
||||
|
||||
url = "http://good.example/p.png"
|
||||
assert newsimg.fetch_and_cache(url) is None # redirect hop re-validated → refused
|
||||
assert newsimg._failed_recently(url)
|
||||
|
||||
|
||||
def test_real_redirect_followed_to_public(cache, monkeypatch):
|
||||
"""Regression for the _NoRedirect bug: 3xx arrive as HTTPError, must be followed."""
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH)
|
||||
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True)
|
||||
seen = []
|
||||
|
||||
def handle(url):
|
||||
seen.append(url)
|
||||
if url == "http://a.example/p.png":
|
||||
raise _http_error(url, 302, "http://b.example/final.png")
|
||||
return _Resp({"Content-Type": "image/png"}, _png())
|
||||
monkeypatch.setattr(newsimg.urllib.request, "build_opener", _fake_opener(handle))
|
||||
|
||||
p = newsimg.fetch_and_cache("http://a.example/p.png")
|
||||
assert p and p.suffix == ".webp" # followed the redirect, cached the dest
|
||||
assert seen == ["http://a.example/p.png", "http://b.example/final.png"]
|
||||
|
||||
|
||||
def test_404_is_permanent_but_5xx_is_transient(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", _REAL_SAFE_FETCH)
|
||||
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True)
|
||||
monkeypatch.setattr(newsimg.urllib.request, "build_opener",
|
||||
_fake_opener(lambda url: (_ for _ in ()).throw(_http_error(url, 404))))
|
||||
gone = "http://x.example/missing.png"
|
||||
assert newsimg.fetch_and_cache(gone) is None
|
||||
assert newsimg._failed_recently(gone) # 404 → permanent, negative-cached
|
||||
|
||||
monkeypatch.setattr(newsimg.urllib.request, "build_opener",
|
||||
_fake_opener(lambda url: (_ for _ in ()).throw(_http_error(url, 503))))
|
||||
down = "http://x.example/down.png"
|
||||
assert newsimg.fetch_and_cache(down) is None
|
||||
assert not newsimg._failed_recently(down) # 5xx → transient, retried next cycle
|
||||
|
||||
|
||||
def test_megapixel_ceiling_enforced_before_decode(cache, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_MAX_PIXELS", 1000) # a 1600x1000 image (1.6M px) exceeds this
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", lambda url, timeout=12: (_png(1600, 1000), "image/png"))
|
||||
url = "https://x.example/huge.png"
|
||||
assert newsimg.fetch_and_cache(url) is None # rejected by the explicit w*h check
|
||||
assert not list(cache.glob("*.webp"))
|
||||
assert newsimg._failed_recently(url)
|
||||
|
||||
|
||||
def test_transient_failure_is_not_negative_cached(cache, monkeypatch):
|
||||
def boom(url, timeout=12):
|
||||
raise newsimg._FetchError("network down", permanent=False)
|
||||
@@ -109,16 +167,29 @@ def test_prune_evicts_least_recently_used_over_cap(cache):
|
||||
assert paths[2].exists() and paths[4].exists()
|
||||
|
||||
|
||||
def test_warm_skips_cached_and_failed(cache, monkeypatch):
|
||||
def test_display_url_resolves_per_policy():
|
||||
assert newsimg.display_url(7, "cache", "https://x/p.jpg") == "/api/img/7" # our copy
|
||||
assert newsimg.display_url(7, "remote", "https://x/p.jpg") == "https://x/p.jpg" # hotlink
|
||||
assert newsimg.display_url(7, "none", "https://x/p.jpg") is None # typographic cover
|
||||
assert newsimg.display_url(7, None, "https://x/p.jpg") == "https://x/p.jpg" # default = remote
|
||||
assert newsimg.display_url(7, "cache", None) is None # no image
|
||||
|
||||
|
||||
def test_warm_only_caches_cache_policy_sources(cache, monkeypatch):
|
||||
conn = connect(":memory:"); init_db(conn)
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
|
||||
for aid, img, acc, dup in ((1, "https://x/1.jpg", 1, None), (2, "https://x/2.jpg", 1, None),
|
||||
(3, "https://x/3.jpg", 0, None), (4, "https://x/4.jpg", 1, 1)):
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url,image_policy) VALUES (1,'C','http://c/f','cache')")
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url,image_policy) VALUES (2,'R','http://r/f','remote')")
|
||||
# source 1 (cache): two accepted+canonical+image; plus not-accepted + duplicate (skipped)
|
||||
# source 2 (remote): accepted+image but must NOT be cached (re-hosting not cleared)
|
||||
rows = ((1, 1, "https://x/1.jpg", 1, None), (2, 1, "https://x/2.jpg", 1, None),
|
||||
(3, 1, "https://x/3.jpg", 0, None), (4, 1, "https://x/4.jpg", 1, 1),
|
||||
(5, 2, "https://x/5.jpg", 1, None))
|
||||
for aid, sid, img, acc, dup in rows:
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of) "
|
||||
"VALUES (?,1,?,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup))
|
||||
"VALUES (?,?,?,?,?,?,?)", (aid, sid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup))
|
||||
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,?)", (aid, acc))
|
||||
conn.commit()
|
||||
assert newsimg.warm(conn) == 2 # only the 2 accepted, canonical, with image
|
||||
assert newsimg.warm(conn) == 2 # only source 1's accepted+canonical+image
|
||||
assert newsimg.warm(conn) == 0 # idempotent — already cached
|
||||
|
||||
|
||||
@@ -129,13 +200,16 @@ def client(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(newsimg, "_host_is_public", lambda h: True)
|
||||
monkeypatch.setattr(newsimg, "_safe_fetch", lambda url, timeout=12: (_png(), "image/png"))
|
||||
conn = connect(tmp_path / "t.sqlite3"); init_db(conn)
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url) VALUES (1,'S','http://s/f')")
|
||||
# 1 = accepted+canonical+image, 2 = no image, 3 = NOT accepted, 4 = duplicate
|
||||
rows = ((1, "https://x/pic.jpg", 1, None), (2, "", 1, None),
|
||||
(3, "https://x/p3.jpg", 0, None), (4, "https://x/p4.jpg", 1, 1))
|
||||
for aid, img, acc, dup in rows:
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url,image_policy) VALUES (1,'C','http://c/f','cache')")
|
||||
conn.execute("INSERT INTO sources (id,name,feed_url,image_policy) VALUES (2,'R','http://r/f','remote')")
|
||||
# src1 (cache): 1=accepted+image, 2=no image, 3=NOT accepted, 4=duplicate.
|
||||
# src2 (remote): 5=accepted+image but NOT cleared to cache → endpoint must 404.
|
||||
rows = ((1, 1, "https://x/pic.jpg", 1, None), (2, 1, "", 1, None),
|
||||
(3, 1, "https://x/p3.jpg", 0, None), (4, 1, "https://x/p4.jpg", 1, 1),
|
||||
(5, 2, "https://x/p5.jpg", 1, None))
|
||||
for aid, sid, img, acc, dup in rows:
|
||||
conn.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url,duplicate_of) "
|
||||
"VALUES (?,1,?,?,?,?,?)", (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup))
|
||||
"VALUES (?,?,?,?,?,?,?)", (aid, sid, f"http://s/{aid}", f"t{aid}", f"h{aid}", img, dup))
|
||||
conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?,?)", (aid, acc))
|
||||
conn.commit()
|
||||
# the cycle owns fetching — pre-warm so the endpoint has a hit to serve
|
||||
@@ -152,6 +226,7 @@ def test_img_endpoint_serves_cached_and_restricts(client):
|
||||
assert client.get("/api/img/2").status_code == 404 # no image
|
||||
assert client.get("/api/img/3").status_code == 404 # not accepted
|
||||
assert client.get("/api/img/4").status_code == 404 # duplicate (non-canonical)
|
||||
assert client.get("/api/img/5").status_code == 404 # remote-policy source (not cleared to cache)
|
||||
assert client.get("/api/img/999").status_code == 404 # unknown id
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user