Bounded hero-image enrichment (og:image for brief items only)
The grid stays typographic; the hero is the one intentional visual slot. At brief-build time we fetch a hero-quality image for the daily five that lack one: - enrich.py reads ONLY a page's <head> og:image/twitter:image and stores just the URL (never the body). - SSRF-guarded: http(s) only, 6s timeout, 300KB cap, <=3 manual redirects each re-validated, and hosts rejected if any resolved address is private, loopback, link-local, multicast, reserved, or unspecified. - image_checked_at column caches success AND failure, so an article is never retried forever. - Wired into build-brief and cycle (brief items only, only if image missing and unchecked). Everything else stays metadata-only. - Verified live: today's five all carry images (feed + enriched). Tests: og:image parser, head-only scope, IP guard across internal ranges, and enrich success + failure-caching (85 total). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -129,6 +129,14 @@ If `frontend/build` is absent, the server falls back to the legacy single-page
|
||||
harness in `goodnews/static/`. The Docker image builds the frontend automatically
|
||||
(multi-stage), so deployment is just `docker build`.
|
||||
|
||||
The secondary grid is intentionally typographic (no thumbnails). The **hero** is
|
||||
the one image slot: at brief-build time, `enrich.py` fetches a hero-quality image
|
||||
for the daily five that lack one — reading **only** a page's `<head>` og:image /
|
||||
twitter:image and storing just the URL (never the body). It's SSRF-guarded
|
||||
(http(s) only, short timeout, byte cap, capped redirects, and rejects hosts that
|
||||
resolve to private/loopback/link-local/multicast addresses), and failures are
|
||||
cached so an article is never retried. Everywhere else stays metadata-only.
|
||||
|
||||
## Calm Filters
|
||||
|
||||
Personal, device-local controls so a reader can stay informed without subjects
|
||||
|
||||
+7
-1
@@ -9,6 +9,7 @@ from pathlib import Path
|
||||
from .briefs import build_daily_brief, show_brief
|
||||
from .db import connect, init_db
|
||||
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
|
||||
from .enrich import enrich_brief_images
|
||||
from .feeds import (
|
||||
fetch_feed,
|
||||
parse_feed,
|
||||
@@ -298,6 +299,10 @@ def main() -> None:
|
||||
replace=args.replace,
|
||||
)
|
||||
print(f"Built brief {brief_id}")
|
||||
bdate = args.date or date.today().isoformat()
|
||||
found = enrich_brief_images(conn, bdate)
|
||||
if found:
|
||||
print(f"Enriched {found} hero image(s)")
|
||||
print_brief(show_brief(conn, brief_date=args.date, limit=args.limit))
|
||||
elif args.command == "show-brief":
|
||||
print_brief(show_brief(conn, brief_date=args.date, limit=args.limit))
|
||||
@@ -442,7 +447,8 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
|
||||
today = date.today().isoformat()
|
||||
try:
|
||||
brief_id = build_daily_brief(conn, brief_date=today, limit=5, replace=True)
|
||||
print(f"brief: rebuilt {today} (id {brief_id})")
|
||||
found = enrich_brief_images(conn, today)
|
||||
print(f"brief: rebuilt {today} (id {brief_id}); {found} hero image(s) enriched")
|
||||
except Exception as exc:
|
||||
print(f"brief: skipped ({exc})")
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ CREATE TABLE IF NOT EXISTS articles (
|
||||
url_hash TEXT NOT NULL UNIQUE,
|
||||
title_hash TEXT,
|
||||
duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
|
||||
image_checked_at TEXT,
|
||||
FOREIGN KEY (source_id) REFERENCES sources(id)
|
||||
);
|
||||
|
||||
@@ -152,6 +153,8 @@ def _migrate(conn: sqlite3.Connection) -> None:
|
||||
conn.execute(
|
||||
"ALTER TABLE articles ADD COLUMN duplicate_of INTEGER REFERENCES articles(id)"
|
||||
)
|
||||
if "image_checked_at" not in article_cols:
|
||||
conn.execute("ALTER TABLE articles ADD COLUMN image_checked_at TEXT")
|
||||
# Created here (not in SCHEMA) so it runs after the column exists on upgrades.
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")
|
||||
|
||||
|
||||
@@ -0,0 +1,157 @@
|
||||
"""Bounded hero-image enrichment.
|
||||
|
||||
The grid stays purely typographic; the daily brief's items are the one place we
|
||||
make an exception and fetch a real image — because the hero is the single
|
||||
intentional visual doorway. We fetch ONLY the article page's <head> metadata
|
||||
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
|
||||
the body. This is opt-in, brief-only, once per build.
|
||||
|
||||
Security (this is the one place we fetch user-/source-supplied pages):
|
||||
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
|
||||
- every hop's host is DNS-resolved and rejected if ANY resolved address is
|
||||
private / loopback / link-local / multicast / reserved / unspecified (SSRF).
|
||||
Failures are cached by the caller (image_checked_at) so an article is never
|
||||
retried forever.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ipaddress
|
||||
import re
|
||||
import socket
|
||||
import sqlite3
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from urllib.parse import urljoin, urlsplit
|
||||
|
||||
from .text import canonicalize_url
|
||||
|
||||
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
|
||||
TIMEOUT = 6
|
||||
MAX_BYTES = 300_000
|
||||
MAX_REDIRECTS = 3
|
||||
|
||||
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
|
||||
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
|
||||
|
||||
|
||||
def _attr(tag: bytes, name: bytes) -> bytes | None:
|
||||
m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
def og_image_from_html(html: bytes) -> str | None:
|
||||
"""Extract og:image / twitter:image from a page's <head> bytes."""
|
||||
head = html.split(b"</head>", 1)[0] if _HEAD_END_RE.search(html) else html
|
||||
for tag in _META_RE.findall(head):
|
||||
key = _attr(tag, b"property") or _attr(tag, b"name")
|
||||
if key and key.lower() in (b"og:image", b"og:image:url", b"twitter:image"):
|
||||
content = _attr(tag, b"content")
|
||||
if content:
|
||||
return canonicalize_url(content.decode("utf-8", "replace"))
|
||||
return None
|
||||
|
||||
|
||||
def _host_is_public(host: str | None) -> bool:
|
||||
"""True only if the host resolves and ALL its addresses are public."""
|
||||
if not host:
|
||||
return False
|
||||
try:
|
||||
infos = socket.getaddrinfo(host, None)
|
||||
except (socket.gaierror, UnicodeError, OSError):
|
||||
return False
|
||||
addrs = {info[4][0] for info in infos}
|
||||
if not addrs:
|
||||
return False
|
||||
for addr in addrs:
|
||||
try:
|
||||
ip = ipaddress.ip_address(addr.split("%")[0]) # strip scope id
|
||||
except ValueError:
|
||||
return False
|
||||
if (
|
||||
ip.is_private or ip.is_loopback or ip.is_link_local
|
||||
or ip.is_multicast or ip.is_reserved or ip.is_unspecified
|
||||
):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
class _NoRedirect(urllib.request.HTTPRedirectHandler):
|
||||
# Don't auto-follow — we re-validate each hop's host ourselves.
|
||||
def redirect_request(self, *args, **kwargs):
|
||||
return None
|
||||
|
||||
|
||||
def fetch_og_image(url: str | None) -> str | None:
|
||||
"""Fetch a page's head metadata and return its og:image URL, or None.
|
||||
|
||||
Best-effort and safe: returns None on any error, bad scheme, redirect loop,
|
||||
or a host that resolves to a non-public address.
|
||||
"""
|
||||
opener = urllib.request.build_opener(_NoRedirect)
|
||||
for _ in range(MAX_REDIRECTS + 1):
|
||||
if not url:
|
||||
return None
|
||||
parts = urlsplit(url)
|
||||
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
|
||||
return None
|
||||
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
|
||||
try:
|
||||
response = opener.open(request, timeout=TIMEOUT)
|
||||
except (urllib.error.URLError, OSError, ValueError):
|
||||
return None
|
||||
status = getattr(response, "status", 200) or 200
|
||||
if status in (301, 302, 303, 307, 308):
|
||||
location = response.headers.get("Location")
|
||||
response.close()
|
||||
if not location:
|
||||
return None
|
||||
url = urljoin(url, location)
|
||||
continue
|
||||
ctype = response.headers.get("Content-Type", "")
|
||||
if "html" not in ctype.lower():
|
||||
response.close()
|
||||
return None
|
||||
try:
|
||||
body = response.read(MAX_BYTES)
|
||||
finally:
|
||||
response.close()
|
||||
return og_image_from_html(body)
|
||||
return None # too many redirects
|
||||
|
||||
|
||||
def enrich_brief_images(conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 5) -> int:
|
||||
"""Fetch a hero-quality image for brief items that lack one (once each).
|
||||
|
||||
Only touches the given date's brief items with no image and not yet checked;
|
||||
stamps image_checked_at either way so failures are not retried forever.
|
||||
Returns how many images were newly found.
|
||||
"""
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT a.id, a.canonical_url
|
||||
FROM daily_briefs b
|
||||
JOIN daily_brief_items bi ON bi.brief_id = b.id
|
||||
JOIN articles a ON a.id = bi.article_id
|
||||
WHERE b.brief_date = ? AND a.image_url IS NULL AND a.image_checked_at IS NULL
|
||||
ORDER BY bi.rank
|
||||
LIMIT ?
|
||||
""",
|
||||
(brief_date, limit),
|
||||
).fetchall()
|
||||
|
||||
found = 0
|
||||
for row in rows:
|
||||
try:
|
||||
image = fetch(row["canonical_url"])
|
||||
except Exception:
|
||||
image = None
|
||||
conn.execute(
|
||||
"UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
|
||||
"WHERE id = ?",
|
||||
(image, row["id"]),
|
||||
)
|
||||
if image:
|
||||
found += 1
|
||||
conn.commit()
|
||||
return found
|
||||
@@ -0,0 +1,55 @@
|
||||
import pytest
|
||||
|
||||
from goodnews.db import connect, init_db
|
||||
from goodnews.enrich import og_image_from_html, _host_is_public, enrich_brief_images
|
||||
|
||||
|
||||
def test_og_image_parser_handles_attr_order_and_fallback():
|
||||
assert og_image_from_html(b'<head><meta property="og:image" content="https://x.com/a.jpg"></head>') == "https://x.com/a.jpg"
|
||||
assert og_image_from_html(b'<meta content="https://x.com/b.jpg" property="og:image">') == "https://x.com/b.jpg"
|
||||
assert og_image_from_html(b'<meta name="twitter:image" content="https://x.com/c.jpg">') == "https://x.com/c.jpg"
|
||||
assert og_image_from_html(b"<html><body>nope</body></html>") is None
|
||||
|
||||
|
||||
def test_og_image_only_reads_head():
|
||||
# a meta after </head> must be ignored
|
||||
assert og_image_from_html(b'<head></head><meta property="og:image" content="https://x.com/d.jpg">') is None
|
||||
|
||||
|
||||
def test_host_public_guard_blocks_internal_ranges():
|
||||
assert _host_is_public("8.8.8.8") # public
|
||||
assert not _host_is_public("127.0.0.1") # loopback
|
||||
assert not _host_is_public("10.0.0.1") # private
|
||||
assert not _host_is_public("192.168.1.1") # private
|
||||
assert not _host_is_public("169.254.0.1") # link-local
|
||||
assert not _host_is_public("0.0.0.0") # unspecified
|
||||
assert not _host_is_public("")
|
||||
assert not _host_is_public(None)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def conn():
|
||||
c = connect(":memory:"); init_db(c)
|
||||
c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
|
||||
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'https://phys.org/x','t1','h1')")
|
||||
c.execute("INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')")
|
||||
c.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)")
|
||||
c.commit(); yield c; c.close()
|
||||
|
||||
|
||||
def test_enrich_sets_image_and_stamps(conn):
|
||||
calls = []
|
||||
found = enrich_brief_images(conn, "2026-05-31", fetch=lambda u: calls.append(u) or "https://img.example/p.jpg")
|
||||
assert found == 1 and calls == ["https://phys.org/x"]
|
||||
r = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
|
||||
assert r["image_url"] == "https://img.example/p.jpg" and r["image_checked_at"] is not None
|
||||
|
||||
|
||||
def test_enrich_caches_failure_and_does_not_retry(conn):
|
||||
calls = []
|
||||
fail = lambda u: calls.append(u) or None
|
||||
assert enrich_brief_images(conn, "2026-05-31", fetch=fail) == 0
|
||||
r = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
|
||||
assert r["image_url"] is None and r["image_checked_at"] is not None # checked, cached
|
||||
assert enrich_brief_images(conn, "2026-05-31", fetch=fail) == 0
|
||||
assert len(calls) == 1 # not retried once checked
|
||||
Reference in New Issue
Block a user