"""Bounded hero-image enrichment.
The grid stays purely typographic; the daily brief's items are the one place we
make an exception and fetch a real image — because the hero is the single
intentional visual doorway. We fetch ONLY the article page's <head> metadata
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
the body. This is opt-in, brief-only, once per build.
Security (this is the one place we fetch user-/source-supplied pages):
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
- every hop's host is DNS-resolved and rejected if ANY resolved address is
private / loopback / link-local / multicast / reserved / unspecified (SSRF).
Failures are cached by the caller (image_checked_at) so an article is never
retried forever.
"""
from __future__ import annotations

import http.client
import ipaddress
import re
import socket
import sqlite3
import struct
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlsplit

from .text import canonicalize_url
# User-Agent header sent with every outbound request this module makes.
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
# Timeout handed to opener.open() for each request, in seconds.
TIMEOUT = 6
# Most bytes read from an article page when looking for its share image.
MAX_BYTES = 300_000
# Redirect hops followed per fetch; each fetch loop makes at most MAX_REDIRECTS + 1 requests.
MAX_REDIRECTS = 3
# Below this, a feed thumbnail upscales to mush in the card banner. Real share
# images (og:image) are ~1200×630; tiny RSS thumbnails (~90px) are what we reject.
MIN_IMG_WIDTH = 450
MIN_IMG_HEIGHT = 250
_META_RE = re.compile(rb"]*>", re.IGNORECASE)
_HEAD_END_RE = re.compile(rb"", re.IGNORECASE)
# Substrings that mark a generic placeholder/default share image rather than the
# article's own picture (e.g. NPR's facebook-default). We'd rather show no image
# (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" —
# that's BBC's normal CDN path for real article photos, so rejecting it threw away
# every BBC hero image and fell back to the tiny RSS thumbnail.
_GENERIC_IMAGE_MARKERS = (
"facebook-default",
"default-wide",
"default-fb",
"og-default",
"default-og",
"twitter-default",
"default-image",
"/placeholder",
"share-default",
"social-default",
# tracking pixels / spacers / data-URIs — never a real share image
"data:image",
"/pixel",
"1x1",
"spacer",
"/blank.",
"transparent.",
)
def _is_generic_image(url: str) -> bool:
lowered = url.lower()
return any(marker in lowered for marker in _GENERIC_IMAGE_MARKERS)
def _prefer_unbranded(url: str) -> str:
"""Swap BBC's logo-branded image variant for its clean one.
BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
logo baked into the picture (it shows as "…EWS" once the hero crops it). The
identical photo is served under "cpsprodpb" with no logo, so prefer that — a
clean hero at the same full resolution.
"""
if "ichef.bbci.co.uk" in url and "/branded_news/" in url:
return url.replace("/branded_news/", "/cpsprodpb/")
return url
def _attr(tag: bytes, name: bytes) -> bytes | None:
m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
return m.group(1) if m else None
def og_image_from_html(html: bytes) -> str | None:
    """Extract the first usable og:image / twitter:image URL from a page's bytes.

    Only the part of the document before the closing head tag is scanned (the
    whole input when no such tag is found). Each <meta> tag whose `property`
    or `name` is og:image, og:image:url or twitter:image is considered in
    document order. Generic placeholder images are skipped so a later, real
    image can still win. Returns the canonicalized URL, or None.
    """
    # Function-local import: the `html` parameter shadows the stdlib module name here.
    from html import unescape

    # Slice at the regex match instead of splitting on a literal separator, so the
    # cut is as case-insensitive as the search that found it.
    head_end = _HEAD_END_RE.search(html)
    head = html[: head_end.start()] if head_end else html
    for tag in _META_RE.findall(head):
        key = _attr(tag, b"property") or _attr(tag, b"name")
        if not key or key.lower() not in (b"og:image", b"og:image:url", b"twitter:image"):
            continue
        content = _attr(tag, b"content")
        if not content:
            continue
        # Attribute values are HTML-escaped in the page ("&amp;" in query
        # strings); decode entities before treating the value as a URL.
        image = canonicalize_url(unescape(content.decode("utf-8", "replace")))
        # Skip generic placeholders; keep scanning for a real one.
        if image and not _is_generic_image(image):
            return _prefer_unbranded(image)
    return None
def _host_is_public(host: str | None) -> bool:
"""True only if the host resolves and ALL its addresses are public."""
if not host:
return False
try:
infos = socket.getaddrinfo(host, None)
except (socket.gaierror, UnicodeError, OSError):
return False
addrs = {info[4][0] for info in infos}
if not addrs:
return False
for addr in addrs:
try:
ip = ipaddress.ip_address(addr.split("%")[0]) # strip scope id
except ValueError:
return False
if (
ip.is_private or ip.is_loopback or ip.is_link_local
or ip.is_multicast or ip.is_reserved or ip.is_unspecified
):
return False
return True
class _NoRedirect(urllib.request.HTTPRedirectHandler):
# Don't auto-follow — we re-validate each hop's host ourselves.
def redirect_request(self, *args, **kwargs):
return None
def fetch_og_image(url: str | None) -> str | None:
    """Fetch a page's head metadata and return its og:image URL, or None.

    Best-effort and safe: returns None on any error, bad scheme, unparseable
    URL, too many redirects, non-HTML response, or a host that resolves to a
    non-public address. At most MAX_BYTES of the page are read. A found image
    is returned only if _image_loads() confirms it.
    """
    if not url:
        return None
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        try:
            parts = urlsplit(url)
            host = parts.hostname
        except ValueError:
            # e.g. unbalanced IPv6 brackets — an unparseable URL is just "no image".
            return None
        # Every hop, including redirect targets, is scheme- and host-checked.
        if parts.scheme not in ("http", "https") or not _host_is_public(host):
            return None
        try:
            request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
            response = opener.open(request, timeout=TIMEOUT)
        except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError):
            # HTTPException (e.g. a malformed status line) is not wrapped in
            # URLError by urllib, so it is caught explicitly.
            return None
        try:
            status = getattr(response, "status", 200) or 200
            if status in (301, 302, 303, 307, 308):
                location = response.headers.get("Location")
                if not location:
                    return None
                # Resolve relative targets, then loop to re-validate the new host.
                url = urljoin(url, location)
                continue
            ctype = response.headers.get("Content-Type", "")
            if "html" not in ctype.lower():
                return None
            body = response.read(MAX_BYTES)
        except (http.client.HTTPException, OSError, ValueError):
            # A timeout or broken connection mid-read is a failed fetch, not a crash.
            return None
        finally:
            response.close()
        image = og_image_from_html(body)
        # A stored URL is not proof it renders — confirm it actually loads.
        return image if (image and _image_loads(image)) else None
    return None  # too many redirects
# Word counting reads more of the body than image metadata (which only needs <head>).
_READ_MAX_BYTES = 900_000
def fetch_source_words(url: str | None) -> int | None:
"""Fetch a page and return its full-article word count (furniture stripped), or
None on any failure or a too-thin extraction (JS/video/paywall pages). Same SSRF
safety as fetch_og_image; we read the count only, never store the body."""
from .readtime import source_read_minutes, word_count_from_html
opener = urllib.request.build_opener(_NoRedirect)
for _ in range(MAX_REDIRECTS + 1):
if not url:
return None
parts = urlsplit(url)
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
return None
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
try:
response = opener.open(request, timeout=TIMEOUT)
except (urllib.error.URLError, OSError, ValueError):
return None
status = getattr(response, "status", 200) or 200
if status in (301, 302, 303, 307, 308):
location = response.headers.get("Location")
response.close()
if not location:
return None
url = urljoin(url, location)
continue
if "html" not in response.headers.get("Content-Type", "").lower():
response.close()
return None
try:
body = response.read(_READ_MAX_BYTES)
finally:
response.close()
words = word_count_from_html(body)
return words if source_read_minutes(words) is not None else None
return None # too many redirects
def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
"""Best-effort (width, height) from an image file's header bytes — PNG, GIF,
JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
if len(data) < 10:
return None
if len(data) >= 24 and data[:8] == b"\x89PNG\r\n\x1a\n" and data[12:16] == b"IHDR":
return struct.unpack(">II", data[16:24])
if data[:6] in (b"GIF87a", b"GIF89a"):
return struct.unpack("HH", data[i + 5:i + 9])
return (w, h)
if marker == 0xD8 or marker == 0xD9 or 0xD0 <= marker <= 0xD7:
i += 2
continue
i += 2 + struct.unpack(">H", data[i + 2:i + 4])[0]
return None
if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
fmt = data[12:16]
try:
if fmt == b"VP8 ":
return (struct.unpack(" bool:
"""Confirm an image URL returns a real, big-enough image (HTTP 200 + image/*
+ dimensions ≥ the minimum).
Two failure modes this guards against: signed/hotlink-protected URLs that
401/403 on a direct load (e.g. the Guardian's i.guim.co.uk), and tiny feed
thumbnails (~90px) that upscale to mush in the card banner. We request as the
browser does — no referrer — with the same per-hop host safety as the page
fetch. Images we can't measure (SVG/AVIF) pass on content-type alone.
"""
opener = urllib.request.build_opener(_NoRedirect)
for _ in range(MAX_REDIRECTS + 1):
if not url:
return False
parts = urlsplit(url)
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
return False
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "image/*,*/*"})
try:
response = opener.open(request, timeout=TIMEOUT)
except (urllib.error.URLError, OSError, ValueError):
return False
try:
status = getattr(response, "status", 200) or 200
if status in (301, 302, 303, 307, 308):
location = response.headers.get("Location")
if not location:
return False
url = urljoin(url, location)
continue
ctype = (response.headers.get("Content-Type") or "").lower()
if status != 200 or not ctype.startswith("image/"):
return False
head = response.read(200_000)
finally:
response.close()
dims = _image_dimensions(head)
if dims and (dims[0] < MIN_IMG_WIDTH or dims[1] < MIN_IMG_HEIGHT):
return False # too small — would upscale to mush
return True
return False
def prune_broken_images(conn: sqlite3.Connection, check=_image_loads, limit: int = 3000) -> int:
    """Drop stored image URLs that `check` says no longer load.

    Looks at the `limit` highest-id articles that have a non-empty image_url.
    For each one where `check(image_url)` is falsy, image_url is set to NULL and
    image_checked_at is stamped with the current time. Everything is committed
    once at the end. Returns the number of rows cleared.

    Rows are read by column name, so `conn` presumably uses sqlite3.Row as its
    row factory — confirm against the caller.
    """
    select_sql = (
        "SELECT id, image_url FROM articles WHERE image_url IS NOT NULL AND image_url != '' "
        "ORDER BY id DESC LIMIT ?"
    )
    clear_sql = "UPDATE articles SET image_url = NULL, image_checked_at = CURRENT_TIMESTAMP WHERE id = ?"
    removed = 0
    for candidate in conn.execute(select_sql, (limit,)).fetchall():
        if check(candidate["image_url"]):
            continue  # still loads — keep it
        conn.execute(clear_sql, (candidate["id"],))
        removed += 1
    conn.commit()
    return removed
def enrich_brief_images(
    conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 7, retry_days: int = 2
) -> int:
    """Look up a hero-quality image for the items of one day's brief.

    Up to `limit` items of the brief dated `brief_date` are taken in rank
    order. An item qualifies if it has never been checked (even when a feed
    image is already stored — feed thumbnails are often too small for the
    large hero), or if it has no image and its last check is older than
    `retry_days` days. For each, `fetch(canonical_url)` is called; an
    exception from it counts as "no image". A found image replaces the stored
    one, a miss leaves it untouched, and image_checked_at is stamped either
    way. One commit at the end. Returns how many fetches produced an image.
    """
    candidates_sql = (
        "\n"
        "SELECT a.id, a.canonical_url\n"
        "FROM daily_briefs b\n"
        "JOIN daily_brief_items bi ON bi.brief_id = b.id\n"
        "JOIN articles a ON a.id = bi.article_id\n"
        "WHERE b.brief_date = ?\n"
        "AND (\n"
        "a.image_checked_at IS NULL\n"
        "OR ((a.image_url IS NULL OR a.image_url = '')\n"
        "AND a.image_checked_at < datetime('now', ?))\n"
        ")\n"
        "ORDER BY bi.rank\n"
        "LIMIT ?\n"
    )
    store_sql = (
        "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
        "WHERE id = ?"
    )
    retry_window = f"-{retry_days} days"
    candidates = conn.execute(candidates_sql, (brief_date, retry_window, limit)).fetchall()
    newly_found = 0
    for candidate in candidates:
        try:
            image = fetch(candidate["canonical_url"])
        except Exception:
            image = None
        # COALESCE keeps whatever is stored when this fetch came back empty.
        conn.execute(store_sql, (image, candidate["id"]))
        newly_found += 1 if image else 0
    conn.commit()
    return newly_found
def enrich_article_image(
    conn: sqlite3.Connection, article_id: int, fetch=fetch_og_image, retry_days: int = 7
) -> bool:
    """Attention-triggered image lookup for a single article.

    Does nothing and returns False unless the article has no image and was
    either never checked or last checked more than `retry_days` days ago.
    Otherwise `fetch(canonical_url)` is called (an exception from it counts as
    "no image"), the result is stored without overwriting an image with NULL,
    image_checked_at is stamped, and the change is committed. Returns True
    only when the fetch produced an image.
    """
    eligible_sql = (
        "\n"
        "SELECT id, canonical_url FROM articles\n"
        "WHERE id = ?\n"
        "AND (image_url IS NULL OR image_url = '')\n"
        "AND (image_checked_at IS NULL OR image_checked_at < datetime('now', ?))\n"
    )
    store_sql = (
        "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
        "WHERE id = ?"
    )
    article = conn.execute(eligible_sql, (article_id, f"-{retry_days} days")).fetchone()
    if not article:
        # Already has an image, or was checked too recently.
        return False
    try:
        image = fetch(article["canonical_url"])
    except Exception:
        image = None
    conn.execute(store_sql, (image, article_id))
    conn.commit()
    return bool(image)
def enrich_recent_images(
    conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 40, retry_days: int = 7
) -> int:
    """Look up images for the newest accepted, non-duplicate articles lacking one.

    Selects up to `limit` such articles, newest first by published (else
    discovered) time, that were never checked or last checked more than
    `retry_days` days ago, and runs enrich_article_image() on each. Returns
    how many of those calls stored a new image.
    """
    newest_sql = (
        "\n"
        "SELECT a.id FROM articles a\n"
        "JOIN article_scores s ON s.article_id = a.id\n"
        "WHERE s.accepted = 1 AND a.duplicate_of IS NULL\n"
        "AND (a.image_url IS NULL OR a.image_url = '')\n"
        "AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))\n"
        "ORDER BY COALESCE(a.published_at, a.discovered_at) DESC\n"
        "LIMIT ?\n"
    )
    targets = conn.execute(newest_sql, (f"-{retry_days} days", limit)).fetchall()
    return sum(
        1
        for target in targets
        if enrich_article_image(conn, target["id"], fetch=fetch, retry_days=retry_days)
    )
def enrich_summarized_images(
    conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7
) -> int:
    """Slow backfill of images for accepted articles that already have a summary.

    Selects up to `limit` accepted, non-duplicate, summarized articles without
    an image (highest id first) that were never checked or last checked more
    than `retry_days` days ago, and runs enrich_article_image() on each. The
    per-run cap keeps the load on publishers modest. Returns how many of those
    calls stored a new image.
    """
    backlog_sql = (
        "\n"
        "SELECT a.id FROM articles a\n"
        "JOIN article_summaries m ON m.article_id = a.id\n"
        "JOIN article_scores s ON s.article_id = a.id\n"
        "WHERE s.accepted = 1 AND a.duplicate_of IS NULL\n"
        "AND (a.image_url IS NULL OR a.image_url = '')\n"
        "AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))\n"
        "ORDER BY a.id DESC\n"
        "LIMIT ?\n"
    )
    targets = conn.execute(backlog_sql, (f"-{retry_days} days", limit)).fetchall()
    return sum(
        1
        for target in targets
        if enrich_article_image(conn, target["id"], fetch=fetch, retry_days=retry_days)
    )
def enrich_read_times(
    conn: sqlite3.Connection, fetch=fetch_source_words, limit: int = 40, retry_days: int = 14
) -> int:
    """Store a full-article word count for recent accepted articles lacking one.

    Selects up to `limit` accepted, non-duplicate articles with no
    source_words, newest first by published (else discovered) time, that were
    never checked or last checked more than `retry_days` days ago. For each,
    `fetch(canonical_url)` is called; an exception from it counts as "no
    count". A truthy count is written to source_words; read_checked_at is
    stamped in every case so failed or thin pages wait out the retry window.
    One commit at the end. Returns how many real counts were stored.
    """
    pending_sql = (
        "\n"
        "SELECT a.id, a.canonical_url FROM articles a\n"
        "JOIN article_scores s ON s.article_id = a.id\n"
        "WHERE s.accepted = 1 AND a.duplicate_of IS NULL\n"
        "AND a.source_words IS NULL\n"
        "AND (a.read_checked_at IS NULL OR a.read_checked_at < datetime('now', ?))\n"
        "ORDER BY COALESCE(a.published_at, a.discovered_at) DESC\n"
        "LIMIT ?\n"
    )
    store_count_sql = "UPDATE articles SET source_words = ?, read_checked_at = CURRENT_TIMESTAMP WHERE id = ?"
    stamp_only_sql = "UPDATE articles SET read_checked_at = CURRENT_TIMESTAMP WHERE id = ?"
    pending = conn.execute(pending_sql, (f"-{retry_days} days", limit)).fetchall()
    stored = 0
    for article in pending:
        try:
            words = fetch(article["canonical_url"])
        except Exception:
            words = None
        if not words:
            # Nothing usable: record the attempt only, never write a null/zero count.
            conn.execute(stamp_only_sql, (article["id"],))
            continue
        conn.execute(store_count_sql, (words, article["id"]))
        stored += 1
    conn.commit()
    return stored