Bounded hero-image enrichment (og:image for brief items only)

The grid stays typographic; the hero is the one intentional visual slot. At brief-build time we fetch a hero-quality image for the daily five that lack one: - enrich.py reads ONLY a page's <head> og:image/twitter:image and stores just the URL (never the body). - SSRF-guarded: http(s) only, 6s timeout, 300KB cap, <=3 manual redirects each re-validated, and hosts rejected if any resolved address is private, loopback, link-local, multicast, reserved, or unspecified. - image_checked_at column caches success AND failure, so an article is never retried forever. - Wired into build-brief and cycle (brief items only, only if image missing and unchecked). Everything else stays metadata-only. - Verified live: today's five all carry images (feed + enriched). Tests: og:image parser, head-only scope, IP guard across internal ranges, and enrich success + failure-caching (85 total). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-31 12:37:41 +00:00
parent 3858380ffe
commit 9e8eddf46d
5 changed files with 230 additions and 1 deletions
@@ -129,6 +129,14 @@ If `frontend/build` is absent, the server falls back to the legacy single-page
 harness in `goodnews/static/`. The Docker image builds the frontend automatically
 (multi-stage), so deployment is just `docker build`.

+The secondary grid is intentionally typographic (no thumbnails). The **hero** is
+the one image slot: at brief-build time, `enrich.py` fetches a hero-quality image
+for the daily five that lack one — reading **only** a page's `<head>` og:image /
+twitter:image and storing just the URL (never the body). It's SSRF-guarded
+(http(s) only, short timeout, byte cap, capped redirects, and rejects hosts that
+resolve to private/loopback/link-local/multicast addresses), and failures are
+cached so an article is never retried. Everywhere else stays metadata-only.
+
 ## Calm Filters

 Personal, device-local controls so a reader can stay informed without subjects
@@ -9,6 +9,7 @@ from pathlib import Path
 from .briefs import build_daily_brief, show_brief
 from .db import connect, init_db
 from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
+from .enrich import enrich_brief_images
 from .feeds import (
    fetch_feed,
    parse_feed,
@@ -298,6 +299,10 @@ def main() -> None:
            replace=args.replace,
        )
        print(f"Built brief {brief_id}")
+        bdate = args.date or date.today().isoformat()
+        found = enrich_brief_images(conn, bdate)
+        if found:
+            print(f"Enriched {found} hero image(s)")
        print_brief(show_brief(conn, brief_date=args.date, limit=args.limit))
    elif args.command == "show-brief":
        print_brief(show_brief(conn, brief_date=args.date, limit=args.limit))
@@ -442,7 +447,8 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
        today = date.today().isoformat()
        try:
            brief_id = build_daily_brief(conn, brief_date=today, limit=5, replace=True)
-            print(f"brief: rebuilt {today} (id {brief_id})")
+            found = enrich_brief_images(conn, today)
+            print(f"brief: rebuilt {today} (id {brief_id}); {found} hero image(s) enriched")
        except Exception as exc:
            print(f"brief: skipped ({exc})")

@@ -44,6 +44,7 @@ CREATE TABLE IF NOT EXISTS articles (
    url_hash TEXT NOT NULL UNIQUE,
    title_hash TEXT,
    duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
+    image_checked_at TEXT,
    FOREIGN KEY (source_id) REFERENCES sources(id)
 );

@@ -152,6 +153,8 @@ def _migrate(conn: sqlite3.Connection) -> None:
        conn.execute(
            "ALTER TABLE articles ADD COLUMN duplicate_of INTEGER REFERENCES articles(id)"
        )
+    if "image_checked_at" not in article_cols:
+        conn.execute("ALTER TABLE articles ADD COLUMN image_checked_at TEXT")
    # Created here (not in SCHEMA) so it runs after the column exists on upgrades.
    conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")

@@ -0,0 +1,157 @@
+"""Bounded hero-image enrichment.
+
+The grid stays purely typographic; the daily brief's items are the one place we
+make an exception and fetch a real image — because the hero is the single
+intentional visual doorway. We fetch ONLY the article page's <head> metadata
+(og:image / twitter:image), store ONLY the resulting image URL, and never touch
+the body. This is opt-in, brief-only, once per build.
+
+Security (this is the one place we fetch user-/source-supplied pages):
+- http(s) only; short timeout; byte cap; redirects followed manually and capped;
+- every hop's host is DNS-resolved and rejected if ANY resolved address is
+  private / loopback / link-local / multicast / reserved / unspecified (SSRF).
+Failures are cached by the caller (image_checked_at) so an article is never
+retried forever.
+"""
+
+from __future__ import annotations
+
+import ipaddress
+import re
+import socket
+import sqlite3
+import urllib.error
+import urllib.request
+from urllib.parse import urljoin, urlsplit
+
+from .text import canonicalize_url
+
+USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
+TIMEOUT = 6
+MAX_BYTES = 300_000
+MAX_REDIRECTS = 3
+
+_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
+_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
+
+
+def _attr(tag: bytes, name: bytes) -> bytes | None:
+    m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
+    return m.group(1) if m else None
+
+
+def og_image_from_html(html: bytes) -> str | None:
+    """Extract og:image / twitter:image from a page's <head> bytes."""
+    head = html.split(b"</head>", 1)[0] if _HEAD_END_RE.search(html) else html
+    for tag in _META_RE.findall(head):
+        key = _attr(tag, b"property") or _attr(tag, b"name")
+        if key and key.lower() in (b"og:image", b"og:image:url", b"twitter:image"):
+            content = _attr(tag, b"content")
+            if content:
+                return canonicalize_url(content.decode("utf-8", "replace"))
+    return None
+
+
+def _host_is_public(host: str | None) -> bool:
+    """True only if the host resolves and ALL its addresses are public."""
+    if not host:
+        return False
+    try:
+        infos = socket.getaddrinfo(host, None)
+    except (socket.gaierror, UnicodeError, OSError):
+        return False
+    addrs = {info[4][0] for info in infos}
+    if not addrs:
+        return False
+    for addr in addrs:
+        try:
+            ip = ipaddress.ip_address(addr.split("%")[0])  # strip scope id
+        except ValueError:
+            return False
+        if (
+            ip.is_private or ip.is_loopback or ip.is_link_local
+            or ip.is_multicast or ip.is_reserved or ip.is_unspecified
+        ):
+            return False
+    return True
+
+
+class _NoRedirect(urllib.request.HTTPRedirectHandler):
+    # Don't auto-follow — we re-validate each hop's host ourselves.
+    def redirect_request(self, *args, **kwargs):
+        return None
+
+
+def fetch_og_image(url: str | None) -> str | None:
+    """Fetch a page's head metadata and return its og:image URL, or None.
+
+    Best-effort and safe: returns None on any error, bad scheme, redirect loop,
+    or a host that resolves to a non-public address.
+    """
+    opener = urllib.request.build_opener(_NoRedirect)
+    for _ in range(MAX_REDIRECTS + 1):
+        if not url:
+            return None
+        parts = urlsplit(url)
+        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
+            return None
+        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
+        try:
+            response = opener.open(request, timeout=TIMEOUT)
+        except (urllib.error.URLError, OSError, ValueError):
+            return None
+        status = getattr(response, "status", 200) or 200
+        if status in (301, 302, 303, 307, 308):
+            location = response.headers.get("Location")
+            response.close()
+            if not location:
+                return None
+            url = urljoin(url, location)
+            continue
+        ctype = response.headers.get("Content-Type", "")
+        if "html" not in ctype.lower():
+            response.close()
+            return None
+        try:
+            body = response.read(MAX_BYTES)
+        finally:
+            response.close()
+        return og_image_from_html(body)
+    return None  # too many redirects
+
+
+def enrich_brief_images(conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 5) -> int:
+    """Fetch a hero-quality image for brief items that lack one (once each).
+
+    Only touches the given date's brief items with no image and not yet checked;
+    stamps image_checked_at either way so failures are not retried forever.
+    Returns how many images were newly found.
+    """
+    rows = conn.execute(
+        """
+        SELECT a.id, a.canonical_url
+        FROM daily_briefs b
+        JOIN daily_brief_items bi ON bi.brief_id = b.id
+        JOIN articles a ON a.id = bi.article_id
+        WHERE b.brief_date = ? AND a.image_url IS NULL AND a.image_checked_at IS NULL
+        ORDER BY bi.rank
+        LIMIT ?
+        """,
+        (brief_date, limit),
+    ).fetchall()
+
+    found = 0
+    for row in rows:
+        try:
+            image = fetch(row["canonical_url"])
+        except Exception:
+            image = None
+        conn.execute(
+            "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
+            "WHERE id = ?",
+            (image, row["id"]),
+        )
+        if image:
+            found += 1
+    conn.commit()
+    return found
@@ -0,0 +1,55 @@
+import pytest
+
+from goodnews.db import connect, init_db
+from goodnews.enrich import og_image_from_html, _host_is_public, enrich_brief_images
+
+
+def test_og_image_parser_handles_attr_order_and_fallback():
+    assert og_image_from_html(b'<head><meta property="og:image" content="https://x.com/a.jpg"></head>') == "https://x.com/a.jpg"
+    assert og_image_from_html(b'<meta content="https://x.com/b.jpg" property="og:image">') == "https://x.com/b.jpg"
+    assert og_image_from_html(b'<meta name="twitter:image" content="https://x.com/c.jpg">') == "https://x.com/c.jpg"
+    assert og_image_from_html(b"<html><body>nope</body></html>") is None
+
+
+def test_og_image_only_reads_head():
+    # a meta after </head> must be ignored
+    assert og_image_from_html(b'<head></head><meta property="og:image" content="https://x.com/d.jpg">') is None
+
+
+def test_host_public_guard_blocks_internal_ranges():
+    assert _host_is_public("8.8.8.8")          # public
+    assert not _host_is_public("127.0.0.1")    # loopback
+    assert not _host_is_public("10.0.0.1")     # private
+    assert not _host_is_public("192.168.1.1")  # private
+    assert not _host_is_public("169.254.0.1")  # link-local
+    assert not _host_is_public("0.0.0.0")      # unspecified
+    assert not _host_is_public("")
+    assert not _host_is_public(None)
+
+
+@pytest.fixture
+def conn():
+    c = connect(":memory:"); init_db(c)
+    c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
+    c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'https://phys.org/x','t1','h1')")
+    c.execute("INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')")
+    c.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)")
+    c.commit(); yield c; c.close()
+
+
+def test_enrich_sets_image_and_stamps(conn):
+    calls = []
+    found = enrich_brief_images(conn, "2026-05-31", fetch=lambda u: calls.append(u) or "https://img.example/p.jpg")
+    assert found == 1 and calls == ["https://phys.org/x"]
+    r = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
+    assert r["image_url"] == "https://img.example/p.jpg" and r["image_checked_at"] is not None
+
+
+def test_enrich_caches_failure_and_does_not_retry(conn):
+    calls = []
+    fail = lambda u: calls.append(u) or None
+    assert enrich_brief_images(conn, "2026-05-31", fetch=fail) == 0
+    r = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
+    assert r["image_url"] is None and r["image_checked_at"] is not None  # checked, cached
+    assert enrich_brief_images(conn, "2026-05-31", fetch=fail) == 0
+    assert len(calls) == 1  # not retried once checked