Bounded hero-image enrichment (og:image for brief items only)

The grid stays typographic; the hero is the one intentional visual slot. At
brief-build time we fetch a hero-quality image for the daily five that lack one:
- enrich.py reads ONLY a page's <head> og:image/twitter:image and stores just
  the URL (never the body).
- SSRF-guarded: http(s) only, 6s timeout, 300KB cap, <=3 manual redirects each
  re-validated, and hosts rejected if any resolved address is private, loopback,
  link-local, multicast, reserved, or unspecified.
- image_checked_at column caches success AND failure, so an article is never
  retried forever.
- Wired into build-brief and cycle (brief items only, only if image missing and
  unchecked). Everything else stays metadata-only.
- Verified live: today's five all carry images (feed + enriched).

Tests: og:image parser, head-only scope, IP guard across internal ranges, and
enrich success + failure-caching (85 total).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-05-31 12:37:41 +00:00
parent 3858380ffe
commit 9e8eddf46d
5 changed files with 230 additions and 1 deletions
+8
View File
@@ -129,6 +129,14 @@ If `frontend/build` is absent, the server falls back to the legacy single-page
harness in `goodnews/static/`. The Docker image builds the frontend automatically
(multi-stage), so deployment is just `docker build`.
The secondary grid is intentionally typographic (no thumbnails). The **hero** is
the one image slot: at brief-build time, `enrich.py` fetches a hero-quality image
for the daily five that lack one — reading **only** a page's `<head>` og:image /
twitter:image and storing just the URL (never the body). It's SSRF-guarded
(http(s) only, short timeout, byte cap, capped redirects, and rejects hosts that
resolve to private/loopback/link-local/multicast addresses), and failures are
cached so an article is never retried. Everywhere else stays metadata-only.
## Calm Filters
Personal, device-local controls so a reader can stay informed without subjects
+7 -1
View File
@@ -9,6 +9,7 @@ from pathlib import Path
from .briefs import build_daily_brief, show_brief
from .db import connect, init_db
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
from .enrich import enrich_brief_images
from .feeds import (
fetch_feed,
parse_feed,
@@ -298,6 +299,10 @@ def main() -> None:
replace=args.replace,
)
print(f"Built brief {brief_id}")
bdate = args.date or date.today().isoformat()
found = enrich_brief_images(conn, bdate)
if found:
print(f"Enriched {found} hero image(s)")
print_brief(show_brief(conn, brief_date=args.date, limit=args.limit))
elif args.command == "show-brief":
print_brief(show_brief(conn, brief_date=args.date, limit=args.limit))
@@ -442,7 +447,8 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
today = date.today().isoformat()
try:
brief_id = build_daily_brief(conn, brief_date=today, limit=5, replace=True)
print(f"brief: rebuilt {today} (id {brief_id})")
found = enrich_brief_images(conn, today)
print(f"brief: rebuilt {today} (id {brief_id}); {found} hero image(s) enriched")
except Exception as exc:
print(f"brief: skipped ({exc})")
+3
View File
@@ -44,6 +44,7 @@ CREATE TABLE IF NOT EXISTS articles (
url_hash TEXT NOT NULL UNIQUE,
title_hash TEXT,
duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
image_checked_at TEXT,
FOREIGN KEY (source_id) REFERENCES sources(id)
);
@@ -152,6 +153,8 @@ def _migrate(conn: sqlite3.Connection) -> None:
conn.execute(
"ALTER TABLE articles ADD COLUMN duplicate_of INTEGER REFERENCES articles(id)"
)
if "image_checked_at" not in article_cols:
conn.execute("ALTER TABLE articles ADD COLUMN image_checked_at TEXT")
# Created here (not in SCHEMA) so it runs after the column exists on upgrades.
conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")
+157
View File
@@ -0,0 +1,157 @@
"""Bounded hero-image enrichment.
The grid stays purely typographic; the daily brief's items are the one place we
make an exception and fetch a real image — because the hero is the single
intentional visual doorway. We fetch ONLY the article page's <head> metadata
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
the body. This is opt-in, brief-only, once per build.
Security (this is the one place we fetch user-/source-supplied pages):
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
- every hop's host is DNS-resolved and rejected if ANY resolved address is
private / loopback / link-local / multicast / reserved / unspecified (SSRF).
Failures are cached by the caller (image_checked_at) so an article is never
retried forever.
"""
from __future__ import annotations
import ipaddress
import re
import socket
import sqlite3
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlsplit
from .text import canonicalize_url
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
TIMEOUT = 6
MAX_BYTES = 300_000
MAX_REDIRECTS = 3
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
def _attr(tag: bytes, name: bytes) -> bytes | None:
m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
return m.group(1) if m else None
def og_image_from_html(html: bytes) -> str | None:
"""Extract og:image / twitter:image from a page's <head> bytes."""
head = html.split(b"</head>", 1)[0] if _HEAD_END_RE.search(html) else html
for tag in _META_RE.findall(head):
key = _attr(tag, b"property") or _attr(tag, b"name")
if key and key.lower() in (b"og:image", b"og:image:url", b"twitter:image"):
content = _attr(tag, b"content")
if content:
return canonicalize_url(content.decode("utf-8", "replace"))
return None
def _host_is_public(host: str | None) -> bool:
"""True only if the host resolves and ALL its addresses are public."""
if not host:
return False
try:
infos = socket.getaddrinfo(host, None)
except (socket.gaierror, UnicodeError, OSError):
return False
addrs = {info[4][0] for info in infos}
if not addrs:
return False
for addr in addrs:
try:
ip = ipaddress.ip_address(addr.split("%")[0]) # strip scope id
except ValueError:
return False
if (
ip.is_private or ip.is_loopback or ip.is_link_local
or ip.is_multicast or ip.is_reserved or ip.is_unspecified
):
return False
return True
class _NoRedirect(urllib.request.HTTPRedirectHandler):
# Don't auto-follow — we re-validate each hop's host ourselves.
def redirect_request(self, *args, **kwargs):
return None
def fetch_og_image(url: str | None) -> str | None:
"""Fetch a page's head metadata and return its og:image URL, or None.
Best-effort and safe: returns None on any error, bad scheme, redirect loop,
or a host that resolves to a non-public address.
"""
opener = urllib.request.build_opener(_NoRedirect)
for _ in range(MAX_REDIRECTS + 1):
if not url:
return None
parts = urlsplit(url)
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
return None
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
try:
response = opener.open(request, timeout=TIMEOUT)
except (urllib.error.URLError, OSError, ValueError):
return None
status = getattr(response, "status", 200) or 200
if status in (301, 302, 303, 307, 308):
location = response.headers.get("Location")
response.close()
if not location:
return None
url = urljoin(url, location)
continue
ctype = response.headers.get("Content-Type", "")
if "html" not in ctype.lower():
response.close()
return None
try:
body = response.read(MAX_BYTES)
finally:
response.close()
return og_image_from_html(body)
return None # too many redirects
def enrich_brief_images(conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 5) -> int:
"""Fetch a hero-quality image for brief items that lack one (once each).
Only touches the given date's brief items with no image and not yet checked;
stamps image_checked_at either way so failures are not retried forever.
Returns how many images were newly found.
"""
rows = conn.execute(
"""
SELECT a.id, a.canonical_url
FROM daily_briefs b
JOIN daily_brief_items bi ON bi.brief_id = b.id
JOIN articles a ON a.id = bi.article_id
WHERE b.brief_date = ? AND a.image_url IS NULL AND a.image_checked_at IS NULL
ORDER BY bi.rank
LIMIT ?
""",
(brief_date, limit),
).fetchall()
found = 0
for row in rows:
try:
image = fetch(row["canonical_url"])
except Exception:
image = None
conn.execute(
"UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
"WHERE id = ?",
(image, row["id"]),
)
if image:
found += 1
conn.commit()
return found
+55
View File
@@ -0,0 +1,55 @@
import pytest
from goodnews.db import connect, init_db
from goodnews.enrich import og_image_from_html, _host_is_public, enrich_brief_images
def test_og_image_parser_handles_attr_order_and_fallback():
assert og_image_from_html(b'<head><meta property="og:image" content="https://x.com/a.jpg"></head>') == "https://x.com/a.jpg"
assert og_image_from_html(b'<meta content="https://x.com/b.jpg" property="og:image">') == "https://x.com/b.jpg"
assert og_image_from_html(b'<meta name="twitter:image" content="https://x.com/c.jpg">') == "https://x.com/c.jpg"
assert og_image_from_html(b"<html><body>nope</body></html>") is None
def test_og_image_only_reads_head():
# a meta after </head> must be ignored
assert og_image_from_html(b'<head></head><meta property="og:image" content="https://x.com/d.jpg">') is None
def test_host_public_guard_blocks_internal_ranges():
assert _host_is_public("8.8.8.8") # public
assert not _host_is_public("127.0.0.1") # loopback
assert not _host_is_public("10.0.0.1") # private
assert not _host_is_public("192.168.1.1") # private
assert not _host_is_public("169.254.0.1") # link-local
assert not _host_is_public("0.0.0.0") # unspecified
assert not _host_is_public("")
assert not _host_is_public(None)
@pytest.fixture
def conn():
c = connect(":memory:"); init_db(c)
c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'https://phys.org/x','t1','h1')")
c.execute("INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')")
c.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)")
c.commit(); yield c; c.close()
def test_enrich_sets_image_and_stamps(conn):
calls = []
found = enrich_brief_images(conn, "2026-05-31", fetch=lambda u: calls.append(u) or "https://img.example/p.jpg")
assert found == 1 and calls == ["https://phys.org/x"]
r = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
assert r["image_url"] == "https://img.example/p.jpg" and r["image_checked_at"] is not None
def test_enrich_caches_failure_and_does_not_retry(conn):
calls = []
fail = lambda u: calls.append(u) or None
assert enrich_brief_images(conn, "2026-05-31", fetch=fail) == 0
r = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
assert r["image_url"] is None and r["image_checked_at"] is not None # checked, cached
assert enrich_brief_images(conn, "2026-05-31", fetch=fail) == 0
assert len(calls) == 1 # not retried once checked