images: fix two fetcher bugs + add source-level image-rights policy (Codex)

Fetcher (the two remaining bugs Codex found):

- Real redirects are now followed. _NoRedirect makes urllib RAISE HTTPError on 3xx, so the old status-branch was dead code (mocked tests masked it). Handle 301/302/303/307/308 HTTPError as redirects (re-validate the destination); classify 4xx other than 429 as PERMANENT (negative-cached), and 429/5xx/network errors as transient. Added real-opener redirect + 404/5xx tests.
- The megapixel ceiling is now enforced: explicit `w*h > _MAX_PIXELS` check BEFORE load() (Pillow only warns at MAX_IMAGE_PIXELS and raises only at roughly twice that). Tested with a lowered ceiling.

Image-rights policy (per Codex + owner decision — only cache what's cleared):

- sources.image_policy: 'cache' (re-host a downscaled copy — license/permission/public-domain only), 'remote' (hotlink the publisher's image — the conservative DEFAULT), 'none' (no image).
- newsimg.display_url resolves the display URL per policy; applied in Article.from_row so feed/brief/history return the right URL, and in share.py (og/twitter still reference the publisher's own image, never re-hosted). warm() + /api/img are both gated on 'cache'.
- Frontend uses the server-resolved image_url (reverted the hardcoded /api/img); the graceful retry covers remote hotlinks too. Admin: per-source image-policy selector + POST /api/admin/sources/{id}/image-policy.

Default 'remote' → nothing is re-hosted until a source is explicitly cleared.

445 backend + 36 frontend tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 14:01:11 -04:00
parent a55ba185a8
commit 8a7606e20d
10 changed files with 238 additions and 62 deletions
@@ -351,7 +351,9 @@ class Article(BaseModel):
            title=row["title"],
            description=row.get("description"),
            url=row["canonical_url"],
-            image_url=row.get("image_url"),
+            # Resolve per the source's image policy: our cached copy, the publisher's URL
+            # (hotlink), or none — so we never re-host an image we haven't cleared.
+            image_url=newsimg.display_url(row["id"], row.get("image_policy"), row.get("image_url")),
            published_at=row.get("published_at"),
            source=row["source_name"],
            source_id=row.get("source_id"),
@@ -591,6 +593,10 @@ class SourcePaywallBody(BaseModel):
    override: str | None = None  # None = use domain rule · 'free' · 'paywalled'


+class SourceImagePolicyBody(BaseModel):
+    policy: str = "remote"       # 'cache' · 'remote' (default) · 'none'
+
+
 class CandidateSuggestBody(BaseModel):
    feed_url: str = ""
    name: str | None = None
@@ -1111,7 +1117,7 @@ def create_app() -> FastAPI:
        with get_conn() as conn:
            row = conn.execute(
                "SELECT a.id, a.title, a.description, a.image_url, a.canonical_url, "
-                "a.duplicate_of, a.source_id, src.name AS source_name, s.reason_text, s.accepted, "
+                "a.duplicate_of, a.source_id, src.name AS source_name, src.image_policy, s.reason_text, s.accepted, "
                "(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags "
                "FROM articles a JOIN sources src ON src.id = a.source_id "
                "LEFT JOIN article_scores s ON s.article_id = a.id WHERE a.id = ?",
@@ -1416,6 +1422,22 @@ def create_app() -> FastAPI:
            conn.commit()
        return {"ok": True, "override": ov}

+    @app.post("/api/admin/sources/{sid}/image-policy")
+    def admin_source_image_policy(sid: int, body: SourceImagePolicyBody, request: Request) -> dict:
+        # Image rights policy: 'cache' (re-host a downscaled copy — only for sources we've
+        # cleared: open license / permission / public-domain), 'remote' (hotlink the
+        # publisher's image), 'none' (no image). Default is the conservative 'remote'.
+        pol = body.policy
+        if pol not in ("cache", "remote", "none"):
+            raise HTTPException(status_code=422, detail="policy must be 'cache', 'remote', or 'none'")
+        with get_conn() as conn:
+            _require_admin(conn, request)
+            cur = conn.execute("UPDATE sources SET image_policy = ? WHERE id = ?", (pol, sid))
+            if cur.rowcount == 0:
+                raise HTTPException(status_code=404, detail="source not found")
+            conn.commit()
+        return {"ok": True, "policy": pol}
+
    # --- Source candidates (supervised add-a-source pipeline) ----------------

    def _candidate_dict(row) -> dict:
@@ -2346,7 +2368,8 @@ def create_app() -> FastAPI:
        with get_conn() as conn:
            row = conn.execute(
                "SELECT a.image_url FROM articles a JOIN article_scores s ON s.article_id = a.id "
-                "WHERE a.id = ? AND s.accepted = 1 AND a.duplicate_of IS NULL",
+                "JOIN sources src ON src.id = a.source_id "
+                "WHERE a.id = ? AND s.accepted = 1 AND a.duplicate_of IS NULL AND src.image_policy = 'cache'",
                (article_id,),
            ).fetchone()
        url = row["image_url"] if row else None
@@ -632,6 +632,11 @@ def _migrate(conn: sqlite3.Connection) -> None:
        conn.execute("ALTER TABLE sources ADD COLUMN content_visible INTEGER NOT NULL DEFAULT 1")
    if "retry_after_at" not in source_cols:
        conn.execute("ALTER TABLE sources ADD COLUMN retry_after_at TEXT")
+    # Image rights policy per source: 'cache' (cleared to re-host a downscaled copy),
+    # 'remote' (hotlink the publisher's image — the conservative DEFAULT), 'none' (no
+    # image). Caching is opt-in; unknown/new sources are never re-hosted.
+    if "image_policy" not in source_cols:
+        conn.execute("ALTER TABLE sources ADD COLUMN image_policy TEXT NOT NULL DEFAULT 'remote'")

    # Daily Art columns added after the tables first shipped.
    pool_cols = {row["name"] for row in conn.execute("PRAGMA table_info(art_pool)")}
@@ -63,6 +63,23 @@ def _key(url: str) -> str:
    return hashlib.sha1(url.encode("utf-8")).hexdigest()


+def display_url(article_id: int, image_policy: str | None, raw_url: str | None) -> str | None:
+    """The image URL the frontend should use, honoring the SOURCE's image policy:
+      'cache'  → our locally-cached copy (/api/img/<id>) — only for sources we've cleared
+                 to re-host (open license / explicit permission / public-domain).
+      'remote' → the publisher's own URL (hotlinked + the frontend's graceful retry). The
+                 conservative DEFAULT: we display but never re-host.
+      'none'   → no image (typographic cover).
+    Returns None when there's no image or the policy is 'none'."""
+    if not raw_url:
+        return None
+    if image_policy == "cache":
+        return f"/api/img/{article_id}"
+    if image_policy == "none":
+        return None
+    return raw_url   # 'remote' (default) — hotlink, never re-hosted
+
+
 class _FetchError(Exception):
    """permanent=True → negative-cache (won't retry soon); False → transient, retry."""
    def __init__(self, msg: str, permanent: bool):
@@ -83,16 +100,21 @@ def _safe_fetch(url: str, timeout: int = 12) -> tuple[bytes, str]:
        req = urllib.request.Request(current, headers=_UA)
        try:
            resp = opener.open(req, timeout=timeout)
+        except urllib.error.HTTPError as exc:
+            # _NoRedirect makes urllib RAISE on 3xx (rather than return a response), so
+            # redirects arrive here. Re-validate the destination on the next loop. 4xx
+            # (except 429) is a permanent miss → negative-cache; 429/5xx → transient.
+            if exc.code in (301, 302, 303, 307, 308):
+                loc = exc.headers.get("Location")
+                exc.close()
+                if not loc:
+                    raise _FetchError("redirect without location", permanent=True) from exc
+                current = urljoin(current, loc)
+                continue
+            permanent = 400 <= exc.code < 500 and exc.code != 429
+            raise _FetchError(f"http {exc.code}", permanent=permanent) from exc
        except (urllib.error.URLError, OSError, ValueError) as exc:
            raise _FetchError(f"fetch failed: {exc}", permanent=False) from exc
-        status = getattr(resp, "status", 200) or 200
-        if status in (301, 302, 303, 307, 308):
-            loc = resp.headers.get("Location")
-            resp.close()
-            if not loc:
-                raise _FetchError("redirect without location", permanent=True)
-            current = urljoin(current, loc)
-            continue
        try:
            return resp.read(_MAX_FETCH_BYTES + 1), (resp.headers.get("Content-Type") or "")
        finally:
@@ -106,11 +128,14 @@ def _encode(data: bytes) -> bytes | None:
    dimensions — the caller then REJECTS it (never stores arbitrary bytes)."""
    try:
        from PIL import Image
-        Image.MAX_IMAGE_PIXELS = _MAX_PIXELS   # raise DecompressionBombError past this
-        im = Image.open(io.BytesIO(data))
-        im.load()                              # forces decode → catches truncated/bomb here
-        if im.width > _MAX_DIM or im.height > _MAX_DIM or im.width < 1 or im.height < 1:
+        Image.MAX_IMAGE_PIXELS = _MAX_PIXELS   # backstop; Pillow only WARNS at this, raises ~2x
+        im = Image.open(io.BytesIO(data))      # lazy: header (size) read without decoding pixels
+        # Enforce the pixel/dimension ceiling BEFORE load() so a decompression bomb is never
+        # actually decoded (Pillow's own MAX_IMAGE_PIXELS only warns at the threshold).
+        if (im.width * im.height > _MAX_PIXELS or im.width > _MAX_DIM or im.height > _MAX_DIM
+                or im.width < 1 or im.height < 1):
            return None
+        im.load()                              # decode now (also catches truncated data)
        if im.mode not in ("RGB", "RGBA"):
            im = im.convert("RGBA" if ("A" in im.mode or im.mode == "P") else "RGB")
        if im.width > DISPLAY_WIDTH:
@@ -192,13 +217,16 @@ def fetch_and_cache(url: str | None) -> Path | None:


 def warm(conn, limit: int = 200) -> int:
-    """Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles that have an
-    image, so the API only ever serves cache hits. Bounded; skips already-cached and
-    recently-failed URLs. Returns how many it newly cached."""
+    """Pre-fetch display copies for the newest ACCEPTED, CANONICAL articles whose SOURCE
+    is cleared to cache (image_policy='cache'), so the API only ever serves cache hits.
+    Bounded; skips already-cached and recently-failed URLs. Returns how many it newly
+    cached. Sources default to 'remote' (hotlink, never re-hosted), so this caches
+    nothing until a source is explicitly set to 'cache'."""
    rows = conn.execute(
        "SELECT DISTINCT a.image_url FROM article_scores s JOIN articles a ON a.id = s.article_id "
-        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND a.image_url IS NOT NULL "
-        "AND a.image_url != '' ORDER BY a.id DESC LIMIT ?",
+        "JOIN sources src ON src.id = a.source_id "
+        "WHERE s.accepted=1 AND a.duplicate_of IS NULL AND src.image_policy='cache' "
+        "AND a.image_url IS NOT NULL AND a.image_url != '' ORDER BY a.id DESC LIMIT ?",
        (limit,),
    ).fetchall()
    made = 0
@@ -55,6 +55,7 @@ _ARTICLE_COLUMNS = f"""
    s.reason_text,
    s.model_name,
    src.paywall_override AS paywall_override,
+    src.image_policy AS image_policy,
    a.source_words,
    (SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
    {RANK_SCORE_SQL} AS rank_score
@@ -525,6 +526,7 @@ def source_health(conn: sqlite3.Connection) -> list[dict]:
            s.id, s.name, s.feed_url, s.homepage_url, s.default_category AS category, s.active,
            s.status, s.content_visible, s.retry_after_at,
            s.consecutive_failures AS failures, s.review_flag, s.review_reason, s.paywall_override,
+            s.image_policy,
            s.poll_interval_minutes AS interval_minutes,
            s.last_success_at, s.last_error_at, substr(s.last_error, 1, 160) AS last_error,
            (SELECT MAX(r.finished_at) FROM ingest_runs r
@@ -10,6 +10,8 @@ from __future__ import annotations

 from html import escape

+from .newsimg import display_url
+

 def _tag(name: str, content: str | None, attr: str = "property") -> str:
    if not content:
@@ -149,11 +151,17 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None,
    )
    src_url = article.get("canonical_url") or base_url
    image = article.get("image_url")
+    policy = article.get("image_policy")
+    # What WE show, honoring the source's image policy (cache → our copy; remote → the
+    # publisher's URL; none → nothing). og/twitter reference the publisher's own image
+    # (a link, not re-hosting) whenever we'd show anything; 'none' omits it entirely.
+    display = display_url(aid, policy, image)
+    og_image = image if (image and policy != "none") else None
    page_url = f"{base_url}/a/{aid}"
    # With an image: a large-image card. Without: a clean text unfurl (title +
    # why + brand) — tidy, not a broken/muddy preview. (A branded fallback PNG
    # can replace this later.)
-    twitter_card = "summary_large_image" if image else "summary"
+    twitter_card = "summary_large_image" if display else "summary"

    meta = "\n".join(filter(None, [
        _tag("og:site_name", "upbeatBytes"),
@@ -161,22 +169,20 @@ def render_share_page(article: dict, base_url: str, summary: str | None = None,
        _tag("og:title", title),
        _tag("og:description", why),
        _tag("og:url", page_url),
-        _tag("og:image", image),
+        _tag("og:image", og_image),
        _tag("twitter:card", twitter_card, attr="name"),
        _tag("twitter:title", title, attr="name"),
        _tag("twitter:description", why, attr="name"),
-        _tag("twitter:image", image, attr="name"),
+        _tag("twitter:image", og_image, attr="name"),
    ]))

-    # The visible image is served from our cached/downscaled copy (not a hotlink), so a
-    # flaky source CDN can't blank it. og:image/twitter:image above stay the source URL
-    # so social crawlers fetch the canonical image directly.
-    # Served from our cache (/api/img/<id>); if it isn't cached yet / fails, drop the
-    # element so the page degrades to the clean text unfurl rather than a broken icon.
+    # The visible image is whatever the policy resolved to (our cached copy for 'cache'
+    # sources, else the publisher's URL for 'remote'). If it isn't cached yet / fails to
+    # load, drop the element so the page degrades to the clean text unfurl, not a broken icon.
    media = (
-        f'<img class="media" src="/api/img/{aid}" alt="" referrerpolicy="no-referrer" '
+        f'<img class="media" src="{escape(display)}" alt="" referrerpolicy="no-referrer" '
        f'onerror="this.remove()">'
-        if image else ""
+        if display else ""
    )

    raw_tags = (article.get("tags") or "")