images: fix two fetcher bugs + add source-level image-rights policy (Codex)
Fetcher (the two remaining bugs Codex found):
- Real redirects are now followed. _NoRedirect makes urllib RAISE HTTPError on 3xx, so
the old status-branch was dead code (mocked tests masked it). Handle 301/302/303/307/308
HTTPError as redirects (re-validate the destination); classify 4xx other than 429 as
PERMANENT (negative-cached), and 429/5xx/network errors as transient. Added tests that
exercise the real opener for redirects and for 404/5xx responses.
- The megapixel ceiling is now enforced: explicit `w*h > _MAX_PIXELS` check BEFORE load()
(Pillow only warns at MAX_IMAGE_PIXELS). Test with a lowered ceiling.
Image-rights policy (per Codex + owner decision — only cache what's cleared):
- sources.image_policy: 'cache' (re-host a downscaled copy — license/permission/PD only),
'remote' (hotlink the publisher's image — the conservative DEFAULT), 'none' (no image).
- newsimg.display_url resolves the display URL per policy; applied in Article.from_row so
feed/brief/history return the right URL, and in share.py (og/twitter still reference the
publisher's own image, never re-hosted). warm() + /api/img both gated on 'cache'.
- Frontend uses the server-resolved image_url (reverted the hardcoded /api/img); the
graceful retry covers remote hotlinks too. Admin: per-source image-policy selector +
POST /api/admin/sources/{id}/image-policy. Default 'remote' → nothing re-hosted until
a source is explicitly cleared.
445 backend + 36 frontend tests pass.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+26
-3
@@ -351,7 +351,9 @@ class Article(BaseModel):
|
||||
title=row["title"],
|
||||
description=row.get("description"),
|
||||
url=row["canonical_url"],
|
||||
image_url=row.get("image_url"),
|
||||
# Resolve per the source's image policy: our cached copy, the publisher's URL
|
||||
# (hotlink), or none — so we never re-host an image we haven't cleared.
|
||||
image_url=newsimg.display_url(row["id"], row.get("image_policy"), row.get("image_url")),
|
||||
published_at=row.get("published_at"),
|
||||
source=row["source_name"],
|
||||
source_id=row.get("source_id"),
|
||||
@@ -591,6 +593,10 @@ class SourcePaywallBody(BaseModel):
|
||||
override: str | None = None # None = use domain rule · 'free' · 'paywalled'
|
||||
|
||||
|
||||
class SourceImagePolicyBody(BaseModel):
    # Request body for the per-source image-policy admin endpoint.
    # Accepted values:
    #   'cache'  - re-host a downscaled copy
    #   'remote' - hotlink the publisher's image (the default)
    #   'none'   - show no image
    policy: str = "remote"
|
||||
|
||||
|
||||
class CandidateSuggestBody(BaseModel):
|
||||
feed_url: str = ""
|
||||
name: str | None = None
|
||||
@@ -1111,7 +1117,7 @@ def create_app() -> FastAPI:
|
||||
with get_conn() as conn:
|
||||
row = conn.execute(
|
||||
"SELECT a.id, a.title, a.description, a.image_url, a.canonical_url, "
|
||||
"a.duplicate_of, a.source_id, src.name AS source_name, s.reason_text, s.accepted, "
|
||||
"a.duplicate_of, a.source_id, src.name AS source_name, src.image_policy, s.reason_text, s.accepted, "
|
||||
"(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags "
|
||||
"FROM articles a JOIN sources src ON src.id = a.source_id "
|
||||
"LEFT JOIN article_scores s ON s.article_id = a.id WHERE a.id = ?",
|
||||
@@ -1416,6 +1422,22 @@ def create_app() -> FastAPI:
|
||||
conn.commit()
|
||||
return {"ok": True, "override": ov}
|
||||
|
||||
@app.post("/api/admin/sources/{sid}/image-policy")
def admin_source_image_policy(sid: int, body: SourceImagePolicyBody, request: Request) -> dict:
    """Set the image-rights policy for one source (admin only).

    Accepted policies:
      'cache'  - re-host a downscaled copy; meant only for sources that have been
                 cleared (open license / permission / public-domain).
      'remote' - hotlink the publisher's image; the conservative default.
      'none'   - no image.

    Responds 422 for any other value and 404 when no source row has id ``sid``.
    On success returns ``{"ok": True, "policy": <policy>}``.
    """
    allowed = ("cache", "remote", "none")
    requested = body.policy
    if requested not in allowed:
        raise HTTPException(status_code=422, detail="policy must be 'cache', 'remote', or 'none'")

    with get_conn() as conn:
        _require_admin(conn, request)
        result = conn.execute(
            "UPDATE sources SET image_policy = ? WHERE id = ?",
            (requested, sid),
        )
        # No row touched means the id does not exist; bail out before committing.
        if result.rowcount == 0:
            raise HTTPException(status_code=404, detail="source not found")
        conn.commit()
    return {"ok": True, "policy": requested}
|
||||
|
||||
# --- Source candidates (supervised add-a-source pipeline) ----------------
|
||||
|
||||
def _candidate_dict(row) -> dict:
|
||||
@@ -2346,7 +2368,8 @@ def create_app() -> FastAPI:
|
||||
with get_conn() as conn:
|
||||
row = conn.execute(
|
||||
"SELECT a.image_url FROM articles a JOIN article_scores s ON s.article_id = a.id "
|
||||
"WHERE a.id = ? AND s.accepted = 1 AND a.duplicate_of IS NULL",
|
||||
"JOIN sources src ON src.id = a.source_id "
|
||||
"WHERE a.id = ? AND s.accepted = 1 AND a.duplicate_of IS NULL AND src.image_policy = 'cache'",
|
||||
(article_id,),
|
||||
).fetchone()
|
||||
url = row["image_url"] if row else None
|
||||
|
||||
@@ -632,6 +632,11 @@ def _migrate(conn: sqlite3.Connection) -> None:
|
||||
conn.execute("ALTER TABLE sources ADD COLUMN content_visible INTEGER NOT NULL DEFAULT 1")
|
||||
if "retry_after_at" not in source_cols:
|
||||
conn.execute("ALTER TABLE sources ADD COLUMN retry_after_at TEXT")
|
||||
# Image rights policy per source: 'cache' (cleared to re-host a downscaled copy),
|
||||
# 'remote' (hotlink the publisher's image — the conservative DEFAULT), 'none' (no
|
||||
# image). Caching is opt-in; unknown/new sources are never re-hosted.
|
||||
if "image_policy" not in source_cols:
|
||||
conn.execute("ALTER TABLE sources ADD COLUMN image_policy TEXT NOT NULL DEFAULT 'remote'")
|
||||
|
||||
# Daily Art columns added after the tables first shipped.
|
||||
pool_cols = {row["name"] for row in conn.execute("PRAGMA table_info(art_pool)")}
|
||||
|
||||
+45
-17
@@ -63,6 +63,23 @@ def _key(url: str) -> str:
|
||||
return hashlib.sha1(url.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def display_url(article_id: int, image_policy: str | None, raw_url: str | None) -> str | None:
    """Resolve the image URL the frontend should render for an article.

    The result depends on the owning source's image policy:

    * ``'cache'``  - the locally served copy at ``/api/img/<article_id>``.
    * ``'none'``   - no image at all (``None``).
    * anything else, including ``'remote'`` and ``None`` - the publisher's own
      URL, returned unchanged (a hotlink; nothing is re-hosted).

    A missing or empty ``raw_url`` always yields ``None``, whatever the policy.
    """
    # No source image, or the source opted out of images entirely.
    if not raw_url or image_policy == "none":
        return None
    # Only an explicit 'cache' policy points at our own copy; every other value
    # falls back to hotlinking the publisher's URL.
    return f"/api/img/{article_id}" if image_policy == "cache" else raw_url
|
||||
|
||||
|
||||
class _FetchError(Exception):
|
||||
"""permanent=True → negative-cache (won't retry soon); False → transient, retry."""
|
||||
def __init__(self, msg: str, permanent: bool):
|
||||
@@ -83,16 +100,21 @@ def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
|
||||
req = urllib.request.Request(current, headers=_UA)
|
||||
try:
|
||||
resp = opener.open(req, timeout=timeout)
|
||||
except urllib.error.HTTPError as exc:
|
||||
# _NoRedirect makes urllib RAISE on 3xx (rather than return a response), so
|
||||
# redirects arrive here. Re-validate the destination on the next loop. 4xx
|
||||
# (except 429) is a permanent miss → negative-cache; 429/5xx → transient.
|
||||
if exc.code in (301, 302, 303, 307, 308):
|
||||
loc = exc.headers.get("Location")
|
||||
exc.close()
|
||||
if not loc:
|
||||
raise _FetchError("redirect without location", permanent=True) from exc
|
||||
current = urljoin(current, loc)
|
||||
continue
|
||||
permanent = 400 <= exc.code < 500 and exc.code != 429
|
||||
raise _FetchError(f"http {exc.code}", permanent=permanent) from exc
|
||||
except (urllib.error.URLError, OSError, ValueError) as exc:
|
||||
raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
|
||||
status = getattr(resp, "status", 200) or 200
|
||||
if status in (301, 302, 303, 307, 308):
|
||||
loc = resp.headers.get("Location")
|
||||
resp.close()
|
||||
if not loc:
|
||||
raise _FetchError("redirect without location", permanent=True)
|
||||
current = urljoin(current, loc)
|
||||
continue
|
||||
try:
|
||||
return resp.read(_MAX_FETCH_BYTES + 1), (resp.headers.get("Content-Type") or "")
|
||||
finally:
|
||||
@@ -106,11 +128,14 @@ def _encode(data: bytes) -> bytes | None:
|
||||
dimensions — the caller then REJECTS it (never stores arbitrary bytes)."""
|
||||
try:
|
||||
from PIL import Image
|
||||
Image.MAX_IMAGE_PIXELS = _MAX_PIXELS # raise DecompressionBombError past this
|
||||
im = Image.open(io.BytesIO(data))
|
||||
im.load() # forces decode → catches truncated/bomb here
|
||||
if im.width > _MAX_DIM or im.height > _MAX_DIM or im.width < 1 or im.height < 1:
|
||||
Image.MAX_IMAGE_PIXELS = _MAX_PIXELS # backstop; Pillow only WARNS at this, raises ~2x
|
||||
im = Image.open(io.BytesIO(data)) # lazy: header (size) read without decoding pixels
|
||||
# Enforce the pixel/dimension ceiling BEFORE load() so a decompression bomb is never
|
||||
# actually decoded (Pillow's own MAX_IMAGE_PIXELS only warns at the threshold).
|
||||
if (im.width * im.height > _MAX_PIXELS or im.width > _MAX_DIM or im.height > _MAX_DIM
|
||||
or im.width < 1 or im.height < 1):
|
||||
return None
|
||||
im.load() # decode now (also catches truncated data)
|
||||
if im.mode not in ("RGB", "RGBA"):
|
||||
im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
|
||||
if im.width > DISPLAY_WIDTH:
|
||||
@@ -192,13 +217,16 @@ def fetch_and_cache(url: str | None) -> Path | None:
|
||||
|
||||
|
||||
def warm(conn, limit: int = 200) -> int:
|
||||
"""Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles that have an
|
||||
image, so the API only ever serves cache hits. Bounded; skips already-cached and
|
||||
recently-failed URLs. Returns how many it newly cached."""
|
||||
"""Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles whose SOURCE
|
||||
is cleared to cache (image_policy='cache'), so the API only ever serves cache hits.
|
||||
Bounded; skips already-cached and recently-failed URLs. Returns how many it newly
|
||||
cached. Sources default to 'remote' (hotlink, never re-hosted), so this caches
|
||||
nothing until a source is explicitly set to 'cache'."""
|
||||
rows = conn.execute(
|
||||
"SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
|
||||
"WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
|
||||
"AND a.image_url != '' ORDER BY a.id DESC LIMIT ?",
|
||||
"JOIN sources src ON src.id = a.source_id "
|
||||
"WHERE s.accepted=1 AND a.duplicate_of IS NULL AND src.image_policy='cache' "
|
||||
"AND a.image_url IS NOT NULL AND a.image_url != '' ORDER BY a.id DESC LIMIT ?",
|
||||
(limit,),
|
||||
).fetchall()
|
||||
made = 0
|
||||
|
||||
@@ -55,6 +55,7 @@ _ARTICLE_COLUMNS = f"""
|
||||
s.reason_text,
|
||||
s.model_name,
|
||||
src.paywall_override AS paywall_override,
|
||||
src.image_policy AS image_policy,
|
||||
a.source_words,
|
||||
(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
|
||||
{RANK_SCORE_SQL} AS rank_score
|
||||
@@ -525,6 +526,7 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
|
||||
s.id, s.name, s.feed_url, s.homepage_url, s.default_category AS category, s.active,
|
||||
s.status, s.content_visible, s.retry_after_at,
|
||||
s.consecutive_failures AS failures, s.review_flag, s.review_reason, s.paywall_override,
|
||||
s.image_policy,
|
||||
s.poll_interval_minutes AS interval_minutes,
|
||||
s.last_success_at, s.last_error_at, substr(s.last_error, 1, 160) AS last_error,
|
||||
(SELECT MAX(r.finished_at) FROM ingest_runs r
|
||||
|
||||
+16
-10
@@ -10,6 +10,8 @@ from __future__ import annotations
|
||||
|
||||
from html import escape
|
||||
|
||||
from .newsimg import display_url
|
||||
|
||||
|
||||
def _tag(name: str, content: str | None, attr: str = "property") -> str:
|
||||
if not content:
|
||||
@@ -149,11 +151,17 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None,
|
||||
)
|
||||
src_url = article.get("canonical_url") or base_url
|
||||
image = article.get("image_url")
|
||||
policy = article.get("image_policy")
|
||||
# What WE show, honoring the source's image policy (cache → our copy; remote → the
|
||||
# publisher's URL; none → nothing). og/twitter reference the publisher's own image
|
||||
# (a link, not re-hosting) whenever we'd show anything; 'none' omits it entirely.
|
||||
display = display_url(aid, policy, image)
|
||||
og_image = image if (image and policy != "none") else None
|
||||
page_url = f"{base_url}/a/{aid}"
|
||||
# With an image: a large-image card. Without: a clean text unfurl (title +
|
||||
# why + brand) — tidy, not a broken/muddy preview. (A branded fallback PNG
|
||||
# can replace this later.)
|
||||
twitter_card = "summary_large_image" if image else "summary"
|
||||
twitter_card = "summary_large_image" if display else "summary"
|
||||
|
||||
meta = "\n".join(filter(None, [
|
||||
_tag("og:site_name", "upbeatBytes"),
|
||||
@@ -161,22 +169,20 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None,
|
||||
_tag("og:title", title),
|
||||
_tag("og:description", why),
|
||||
_tag("og:url", page_url),
|
||||
_tag("og:image", image),
|
||||
_tag("og:image", og_image),
|
||||
_tag("twitter:card", twitter_card, attr="name"),
|
||||
_tag("twitter:title", title, attr="name"),
|
||||
_tag("twitter:description", why, attr="name"),
|
||||
_tag("twitter:image", image, attr="name"),
|
||||
_tag("twitter:image", og_image, attr="name"),
|
||||
]))
|
||||
|
||||
# The visible image is served from our cached/downscaled copy (not a hotlink), so a
|
||||
# flaky source CDN can't blank it. og:image/twitter:image above stay the source URL
|
||||
# so social crawlers fetch the canonical image directly.
|
||||
# Served from our cache (/api/img/<id>); if it isn't cached yet / fails, drop the
|
||||
# element so the page degrades to the clean text unfurl rather than a broken icon.
|
||||
# The visible image is whatever the policy resolved to (our cached copy for 'cache'
|
||||
# sources, else the publisher's URL for 'remote'). If it isn't cached yet / fails to
|
||||
# load, drop the element so the page degrades to the clean text unfurl, not a broken icon.
|
||||
media = (
|
||||
f'<img class="media" src="/api/img/{aid}" alt="" referrerpolicy="no-referrer" '
|
||||
f'<img class="media" src="{escape(display)}" alt="" referrerpolicy="no-referrer" '
|
||||
f'onerror="this.remove()">'
|
||||
if image else ""
|
||||
if display else ""
|
||||
)
|
||||
|
||||
raw_tags = (article.get("tags") or "")
|
||||
|
||||
Reference in New Issue
Block a user