images: fix two fetcher bugs + add source-level image-rights policy (Codex)

Fetcher (the two remaining bugs Codex found):
- Real redirects are now followed. _NoRedirect makes urllib RAISE HTTPError on 3xx, so
  the old status-branch was dead code (mocked tests masked it). Handle 301/302/303/307/308
  HTTPError as redirects (re-validate the destination); classify 4xx (except 429) as PERMANENT
  (negative-cached), 429/5xx/network as transient. Real-opener redirect + 404/5xx tests.
- The megapixel ceiling is now enforced: explicit `w*h > _MAX_PIXELS` check BEFORE load()
  (Pillow only warns at MAX_IMAGE_PIXELS). Test with a lowered ceiling.

Image-rights policy (per Codex + owner decision — only cache what's cleared):
- sources.image_policy: 'cache' (re-host a downscaled copy — license/permission/PD only),
  'remote' (hotlink the publisher's image — the conservative DEFAULT), 'none' (no image).
- newsimg.display_url resolves the display URL per policy; applied in Article.from_row so
  feed/brief/history return the right URL, and in share.py (og/twitter still reference the
  publisher's own image, never re-hosted). warm() + /api/img both gated on 'cache'.
- Frontend uses the server-resolved image_url (reverted the hardcoded /api/img); the
  graceful retry covers remote hotlinks too. Admin: per-source image-policy selector +
  POST /api/admin/sources/{id}/image-policy. Default 'remote' → nothing re-hosted until
  a source is explicitly cleared.

445 backend + 36 frontend tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-30 14:01:11 -04:00
parent a55ba185a8
commit 8a7606e20d
10 changed files with 238 additions and 62 deletions
+26 -3
View File
@@ -351,7 +351,9 @@ class Article(BaseModel):
title=row["title"],
description=row.get("description"),
url=row["canonical_url"],
image_url=row.get("image_url"),
# Resolve per the source's image policy: our cached copy, the publisher's URL
# (hotlink), or none — so we never re-host an image we haven't cleared.
image_url=newsimg.display_url(row["id"], row.get("image_policy"), row.get("image_url")),
published_at=row.get("published_at"),
source=row["source_name"],
source_id=row.get("source_id"),
@@ -591,6 +593,10 @@ class SourcePaywallBody(BaseModel):
override: str | None = None # None = use domain rule · 'free' · 'paywalled'
# Request body for POST /api/admin/sources/{sid}/image-policy. The field is a plain
# str, so this model does not restrict the value itself; the endpoint that receives
# it rejects anything other than 'cache', 'remote', or 'none'.
class SourceImagePolicyBody(BaseModel):
    # Requested image-rights policy; falls back to 'remote' when the field is omitted.
    policy: str = "remote" # 'cache' · 'remote' (default) · 'none'
class CandidateSuggestBody(BaseModel):
feed_url: str = ""
name: str | None = None
@@ -1111,7 +1117,7 @@ def create_app() -> FastAPI:
with get_conn() as conn:
row = conn.execute(
"SELECT a.id, a.title, a.description, a.image_url, a.canonical_url, "
"a.duplicate_of, a.source_id, src.name AS source_name, s.reason_text, s.accepted, "
"a.duplicate_of, a.source_id, src.name AS source_name, src.image_policy, s.reason_text, s.accepted, "
"(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags "
"FROM articles a JOIN sources src ON src.id = a.source_id "
"LEFT JOIN article_scores s ON s.article_id = a.id WHERE a.id = ?",
@@ -1416,6 +1422,22 @@ def create_app() -> FastAPI:
conn.commit()
return {"ok": True, "override": ov}
@app.post("/api/admin/sources/{sid}/image-policy")
def admin_source_image_policy(sid: int, body: SourceImagePolicyBody, request: Request) -> dict:
    """Set the image-rights policy of one source (admin only).

    Accepted policies:
      'cache'  - re-host a downscaled copy; only for sources we've cleared
                 (open license / permission / public-domain).
      'remote' - hotlink the publisher's image (the conservative default).
      'none'   - show no image.

    Raises HTTPException 422 for any other policy value and 404 when no source
    row has the given id. Returns ``{"ok": True, "policy": <policy>}`` on success.
    """
    allowed_policies = ("cache", "remote", "none")
    chosen = body.policy
    # The value is checked before the DB connection is opened and before the admin check.
    if chosen not in allowed_policies:
        raise HTTPException(status_code=422, detail="policy must be 'cache', 'remote', or 'none'")
    with get_conn() as conn:
        _require_admin(conn, request)
        result = conn.execute(
            "UPDATE sources SET image_policy = ? WHERE id = ?",
            (chosen, sid),
        )
        # No row touched means the id does not exist; bail out without committing.
        if result.rowcount == 0:
            raise HTTPException(status_code=404, detail="source not found")
        conn.commit()
    return {"ok": True, "policy": chosen}
# --- Source candidates (supervised add-a-source pipeline) ----------------
def _candidate_dict(row) -> dict:
@@ -2346,7 +2368,8 @@ def create_app() -> FastAPI:
with get_conn() as conn:
row = conn.execute(
"SELECT a.image_url FROM articles a JOIN article_scores s ON s.article_id = a.id "
"WHERE a.id = ? AND s.accepted = 1 AND a.duplicate_of IS NULL",
"JOIN sources src ON src.id = a.source_id "
"WHERE a.id = ? AND s.accepted = 1 AND a.duplicate_of IS NULL AND src.image_policy = 'cache'",
(article_id,),
).fetchone()
url = row["image_url"] if row else None
+5
View File
@@ -632,6 +632,11 @@ def _migrate(conn: sqlite3.Connection) -> None:
conn.execute("ALTER TABLE sources ADD COLUMN content_visible INTEGER NOT NULL DEFAULT 1")
if "retry_after_at" not in source_cols:
conn.execute("ALTER TABLE sources ADD COLUMN retry_after_at TEXT")
# Image rights policy per source: 'cache' (cleared to re-host a downscaled copy),
# 'remote' (hotlink the publisher's image — the conservative DEFAULT), 'none' (no
# image). Caching is opt-in; unknown/new sources are never re-hosted.
if "image_policy" not in source_cols:
conn.execute("ALTER TABLE sources ADD COLUMN image_policy TEXT NOT NULL DEFAULT 'remote'")
# Daily Art columns added after the tables first shipped.
pool_cols = {row["name"] for row in conn.execute("PRAGMA table_info(art_pool)")}
+45 -17
View File
@@ -63,6 +63,23 @@ def _key(url: str) -> str:
return hashlib.sha1(url.encode("utf-8")).hexdigest()
def display_url(article_id: int, image_policy: str | None, raw_url: str | None) -> str | None:
"""The image URL the frontend should use, honoring the SOURCE's image policy:
'cache' → our locally-cached copy (/api/img/<id>) — only for sources we've cleared
to re-host (open license / explicit permission / public-domain).
'remote' → the publisher's own URL (hotlinked + the frontend's graceful retry). The
conservative DEFAULT: we display but never re-host.
'none' → no image (typographic cover).
Returns None when there's no image or the policy is 'none'."""
if not raw_url:
return None
if image_policy == "cache":
return f"/api/img/{article_id}"
if image_policy == "none":
return None
return raw_url # 'remote' (default) — hotlink, never re-hosted
class _FetchError(Exception):
"""permanent=True → negative-cache (won't retry soon); False → transient, retry."""
def __init__(self, msg: str, permanent: bool):
@@ -83,16 +100,21 @@ def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
req = urllib.request.Request(current, headers=_UA)
try:
resp = opener.open(req, timeout=timeout)
except urllib.error.HTTPError as exc:
# _NoRedirect makes urllib RAISE on 3xx (rather than return a response), so
# redirects arrive here. Re-validate the destination on the next loop. 4xx
# (except 429) is a permanent miss → negative-cache; 429/5xx → transient.
if exc.code in (301, 302, 303, 307, 308):
loc = exc.headers.get("Location")
exc.close()
if not loc:
raise _FetchError("redirect without location", permanent=True) from exc
current = urljoin(current, loc)
continue
permanent = 400 <= exc.code < 500 and exc.code != 429
raise _FetchError(f"http {exc.code}", permanent=permanent) from exc
except (urllib.error.URLError, OSError, ValueError) as exc:
raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
status = getattr(resp, "status", 200) or 200
if status in (301, 302, 303, 307, 308):
loc = resp.headers.get("Location")
resp.close()
if not loc:
raise _FetchError("redirect without location", permanent=True)
current = urljoin(current, loc)
continue
try:
return resp.read(_MAX_FETCH_BYTES + 1), (resp.headers.get("Content-Type") or "")
finally:
@@ -106,11 +128,14 @@ def _encode(data: bytes) -> bytes | None:
dimensions — the caller then REJECTS it (never stores arbitrary bytes)."""
try:
from PIL import Image
Image.MAX_IMAGE_PIXELS = _MAX_PIXELS # raise DecompressionBombError past this
im = Image.open(io.BytesIO(data))
im.load() # forces decode → catches truncated/bomb here
if im.width > _MAX_DIM or im.height > _MAX_DIM or im.width < 1 or im.height < 1:
Image.MAX_IMAGE_PIXELS = _MAX_PIXELS # backstop; Pillow only WARNS at this, raises ~2x
im = Image.open(io.BytesIO(data)) # lazy: header (size) read without decoding pixels
# Enforce the pixel/dimension ceiling BEFORE load() so a decompression bomb is never
# actually decoded (Pillow's own MAX_IMAGE_PIXELS only warns at the threshold).
if (im.width * im.height > _MAX_PIXELS or im.width > _MAX_DIM or im.height > _MAX_DIM
or im.width < 1 or im.height < 1):
return None
im.load() # decode now (also catches truncated data)
if im.mode not in ("RGB", "RGBA"):
im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
if im.width > DISPLAY_WIDTH:
@@ -192,13 +217,16 @@ def fetch_and_cache(url: str | None) -> Path | None:
def warm(conn, limit: int = 200) -> int:
"""Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles that have an
image, so the API only ever serves cache hits. Bounded; skips already-cached and
recently-failed URLs. Returns how many it newly cached."""
"""Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles whose SOURCE
is cleared to cache (image_policy='cache'), so the API only ever serves cache hits.
Bounded; skips already-cached and recently-failed URLs. Returns how many it newly
cached. Sources default to 'remote' (hotlink, never re-hosted), so this caches
nothing until a source is explicitly set to 'cache'."""
rows = conn.execute(
"SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
"WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
"AND a.image_url != '' ORDER BY a.id DESC LIMIT ?",
"JOIN sources src ON src.id = a.source_id "
"WHERE s.accepted=1 AND a.duplicate_of IS NULL AND src.image_policy='cache' "
"AND a.image_url IS NOT NULL AND a.image_url != '' ORDER BY a.id DESC LIMIT ?",
(limit,),
).fetchall()
made = 0
+2
View File
@@ -55,6 +55,7 @@ _ARTICLE_COLUMNS = f"""
s.reason_text,
s.model_name,
src.paywall_override AS paywall_override,
src.image_policy AS image_policy,
a.source_words,
(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
{RANK_SCORE_SQL} AS rank_score
@@ -525,6 +526,7 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
s.id, s.name, s.feed_url, s.homepage_url, s.default_category AS category, s.active,
s.status, s.content_visible, s.retry_after_at,
s.consecutive_failures AS failures, s.review_flag, s.review_reason, s.paywall_override,
s.image_policy,
s.poll_interval_minutes AS interval_minutes,
s.last_success_at, s.last_error_at, substr(s.last_error, 1, 160) AS last_error,
(SELECT MAX(r.finished_at) FROM ingest_runs r
+16 -10
View File
@@ -10,6 +10,8 @@ from __future__ import annotations
from html import escape
from .newsimg import display_url
def _tag(name: str, content: str | None, attr: str = "property") -> str:
if not content:
@@ -149,11 +151,17 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None,
)
src_url = article.get("canonical_url") or base_url
image = article.get("image_url")
policy = article.get("image_policy")
# What WE show, honoring the source's image policy (cache → our copy; remote → the
# publisher's URL; none → nothing). og/twitter reference the publisher's own image
# (a link, not re-hosting) whenever we'd show anything; 'none' omits it entirely.
display = display_url(aid, policy, image)
og_image = image if (image and policy != "none") else None
page_url = f"{base_url}/a/{aid}"
# With an image: a large-image card. Without: a clean text unfurl (title +
# why + brand) — tidy, not a broken/muddy preview. (A branded fallback PNG
# can replace this later.)
twitter_card = "summary_large_image" if image else "summary"
twitter_card = "summary_large_image" if display else "summary"
meta = "\n".join(filter(None, [
_tag("og:site_name", "upbeatBytes"),
@@ -161,22 +169,20 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None,
_tag("og:title", title),
_tag("og:description", why),
_tag("og:url", page_url),
_tag("og:image", image),
_tag("og:image", og_image),
_tag("twitter:card", twitter_card, attr="name"),
_tag("twitter:title", title, attr="name"),
_tag("twitter:description", why, attr="name"),
_tag("twitter:image", image, attr="name"),
_tag("twitter:image", og_image, attr="name"),
]))
# The visible image is served from our cached/downscaled copy (not a hotlink), so a
# flaky source CDN can't blank it. og:image/twitter:image above stay the source URL
# so social crawlers fetch the canonical image directly.
# Served from our cache (/api/img/<id>); if it isn't cached yet / fails, drop the
# element so the page degrades to the clean text unfurl rather than a broken icon.
# The visible image is whatever the policy resolved to (our cached copy for 'cache'
# sources, else the publisher's URL for 'remote'). If it isn't cached yet / fails to
# load, drop the element so the page degrades to the clean text unfurl, not a broken icon.
media = (
f'<img class="media" src="/api/img/{aid}" alt="" referrerpolicy="no-referrer" '
f'<img class="media" src="{escape(display)}" alt="" referrerpolicy="no-referrer" '
f'onerror="this.remove()">'
if image else ""
if display else ""
)
raw_tags = (article.get("tags") or "")