"""Bounded hero-image enrichment. The grid stays purely typographic; the daily brief's items are the one place we make an exception and fetch a real image — because the hero is the single intentional visual doorway. We fetch ONLY the article page's metadata (og:image / twitter:image), store ONLY the resulting image URL, and never touch the body. This is opt-in, brief-only, once per build. Security (this is the one place we fetch user-/source-supplied pages): - http(s) only; short timeout; byte cap; redirects followed manually and capped; - every hop's host is DNS-resolved and rejected if ANY resolved address is private / loopback / link-local / multicast / reserved / unspecified (SSRF). Failures are cached by the caller (image_checked_at) so an article is never retried forever. """ from __future__ import annotations import ipaddress import re import socket import sqlite3 import struct import urllib.error import urllib.request from urllib.parse import urljoin, urlsplit from .text import canonicalize_url USER_AGENT = "goodNews/0.1 (+local constructive news prototype)" TIMEOUT = 6 MAX_BYTES = 300_000 MAX_REDIRECTS = 3 # Below this, a feed thumbnail upscales to mush in the card banner. Real share # images (og:image) are ~1200×630; tiny RSS thumbnails (~90px) are what we reject. MIN_IMG_WIDTH = 450 MIN_IMG_HEIGHT = 250 _META_RE = re.compile(rb"]*>", re.IGNORECASE) _HEAD_END_RE = re.compile(rb"", re.IGNORECASE) # Substrings that mark a generic placeholder/default share image rather than the # article's own picture (e.g. NPR's facebook-default). We'd rather show no image # (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" — # that's BBC's normal CDN path for real article photos, so rejecting it threw away # every BBC hero image and fell back to the tiny RSS thumbnail. 
_GENERIC_IMAGE_MARKERS = (
    "facebook-default",
    "default-wide",
    "default-fb",
    "og-default",
    "default-og",
    "twitter-default",
    "default-image",
    "/placeholder",
    "share-default",
    "social-default",
    # tracking pixels / spacers / data-URIs — never a real share image
    "data:image",
    "/pixel",
    "1x1",
    "spacer",
    "/blank.",
    "transparent.",
)

# <meta> keys (property= or name=) that carry a share-image URL.
_IMAGE_META_KEYS = (b"og:image", b"og:image:url", b"twitter:image")

# HTTP statuses that mean "follow the Location header".
_REDIRECT_CODES = (301, 302, 303, 307, 308)

# How much of an image response is read to measure its dimensions.
_IMAGE_HEAD_BYTES = 200_000


def _is_generic_image(url: str) -> bool:
    """Return True if *url* contains any known placeholder/tracking marker."""
    lowered = url.lower()
    return any(marker in lowered for marker in _GENERIC_IMAGE_MARKERS)


def _prefer_unbranded(url: str) -> str:
    """Swap BBC's logo-branded image variant for its clean one.

    BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
    logo baked into the picture (it shows as "…EWS" once the hero crops it). The
    identical photo is served under "cpsprodpb" with no logo, so prefer that —
    a clean hero at the same full resolution.
    """
    if "ichef.bbci.co.uk" in url and "/branded_news/" in url:
        return url.replace("/branded_news/", "/cpsprodpb/")
    return url


def _attr(tag: bytes, name: bytes) -> bytes | None:
    """Return the quoted value of attribute *name* inside one HTML tag, or None.

    The name must not be the tail of a longer attribute (``data-content`` is not
    ``content``), and the closing quote must match the opening one so a value
    may contain the other quote character.
    """
    pattern = (
        rb"(?<![\w:-])"
        + re.escape(name)
        + rb"""\s*=\s*(?:"([^"]*)"|'([^']*)')"""
    )
    m = re.search(pattern, tag, re.IGNORECASE)
    if not m:
        return None
    return m.group(1) if m.group(1) is not None else m.group(2)


def og_image_from_html(html: bytes) -> str | None:
    """Extract og:image / twitter:image from a page's bytes.

    Only the part before the closing head tag is scanned (the whole input when
    no such tag is found). Returns the first non-placeholder image URL after
    canonicalisation, or None.
    """
    # Slice at the regex match so the cut is case-insensitive like the search.
    head_end = _HEAD_END_RE.search(html)
    head = html[: head_end.start()] if head_end else html
    for tag in _META_RE.findall(head):
        key = _attr(tag, b"property") or _attr(tag, b"name")
        if not key or key.lower() not in _IMAGE_META_KEYS:
            continue
        content = _attr(tag, b"content")
        if not content:
            continue
        image = canonicalize_url(content.decode("utf-8", "replace"))
        # Skip generic placeholders; keep scanning for a real one.
        if image and not _is_generic_image(image):
            return _prefer_unbranded(image)
    return None


def _host_is_public(host: str | None) -> bool:
    """True only if the host resolves and ALL its addresses are public.

    Besides the private / loopback / link-local / multicast / reserved /
    unspecified classes, an address must also be globally routable; this
    additionally rejects ranges such as 100.64.0.0/10 (shared address space),
    which is not flagged as private.
    """
    if not host:
        return False
    try:
        infos = socket.getaddrinfo(host, None)
    except (socket.gaierror, UnicodeError, OSError):
        return False
    addrs = {info[4][0] for info in infos}
    if not addrs:
        return False
    for addr in addrs:
        try:
            ip = ipaddress.ip_address(addr.split("%")[0])  # strip scope id
        except ValueError:
            return False
        if (
            ip.is_private
            or ip.is_loopback
            or ip.is_link_local
            or ip.is_multicast
            or ip.is_reserved
            or ip.is_unspecified
            or not ip.is_global
        ):
            return False
    return True


class _NoRedirect(urllib.request.HTTPRedirectHandler):
    # Don't auto-follow — we re-validate each hop's host ourselves.
    # Declining here makes urllib raise HTTPError for the 3xx response; the
    # caller must catch that and read its Location header.
    def redirect_request(self, *args, **kwargs):
        return None


def _open_following_redirects(url: str | None, accept: str):
    """Open *url*, following up to MAX_REDIRECTS redirects by hand.

    Every hop must be http(s) and must resolve only to public addresses.
    Returns the open response of the final (non-redirect, non-error) hop — the
    caller is responsible for closing it — or None on any failure.
    """
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        if not url:
            return None
        try:
            parts = urlsplit(url)
        except ValueError:  # e.g. malformed IPv6 literal
            return None
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            return None
        request = urllib.request.Request(
            url, headers={"User-Agent": USER_AGENT, "Accept": accept}
        )
        try:
            response = opener.open(request, timeout=TIMEOUT)
        except urllib.error.HTTPError as err:
            # Because _NoRedirect declines redirects, a 3xx arrives here as an
            # HTTPError; it carries the status and headers of the response.
            response = err
        except (urllib.error.URLError, OSError, ValueError):
            return None
        status = getattr(response, "status", None) or getattr(response, "code", None) or 200
        if status in _REDIRECT_CODES:
            location = response.headers.get("Location")
            response.close()
            if not location:
                return None
            url = urljoin(url, location)
            continue
        if isinstance(response, urllib.error.HTTPError):
            # Any other HTTP error status is a plain failure.
            response.close()
            return None
        return response
    return None  # too many redirects


def fetch_og_image(url: str | None) -> str | None:
    """Fetch a page's head metadata and return its og:image URL, or None.

    Best-effort and safe: returns None on any error, bad scheme, redirect loop,
    or a host that resolves to a non-public address.
    """
    response = _open_following_redirects(url, "text/html")
    if response is None:
        return None
    try:
        ctype = response.headers.get("Content-Type", "")
        if "html" not in ctype.lower():
            return None
        body = response.read(MAX_BYTES)
    except OSError:
        return None
    finally:
        response.close()
    image = og_image_from_html(body)
    # A stored URL is not proof it renders — confirm it actually loads.
    return image if (image and _image_loads(image)) else None


# Word counting reads more of the body than image metadata (which only needs <head>).
_READ_MAX_BYTES = 900_000


def fetch_source_words(url: str | None) -> int | None:
    """Fetch a page and return its full-article word count (furniture stripped),
    or None on any failure or a too-thin extraction (JS/video/paywall pages).

    Same SSRF safety as fetch_og_image; we read the count only, never store the body.
    """
    from .readtime import source_read_minutes, word_count_from_html

    response = _open_following_redirects(url, "text/html")
    if response is None:
        return None
    try:
        if "html" not in response.headers.get("Content-Type", "").lower():
            return None
        body = response.read(_READ_MAX_BYTES)
    except OSError:
        return None
    finally:
        response.close()
    words = word_count_from_html(body)
    return words if source_read_minutes(words) is not None else None


def _jpeg_dimensions(data: bytes) -> "tuple[int, int] | None":
    """Walk JPEG segments until a start-of-frame marker; return (width, height)."""
    size = len(data)
    i = 2  # skip the SOI marker
    while i + 4 <= size:
        if data[i] != 0xFF:
            i += 1
            continue
        marker = data[i + 1]
        if marker == 0xFF:  # fill byte before a marker
            i += 1
            continue
        # SOF0..SOF15, excluding DHT (C4), JPG (C8) and DAC (CC).
        if 0xC0 <= marker <= 0xCF and marker not in (0xC4, 0xC8, 0xCC):
            if i + 9 > size:
                return None
            h, w = struct.unpack(">HH", data[i + 5:i + 9])
            return (w, h)
        # Standalone markers carry no length field.
        if marker == 0xD8 or marker == 0xD9 or 0xD0 <= marker <= 0xD7:
            i += 2
            continue
        i += 2 + struct.unpack(">H", data[i + 2:i + 4])[0]
    return None


def _webp_dimensions(data: bytes) -> "tuple[int, int] | None":
    """Read (width, height) from the first chunk of a RIFF/WEBP file."""
    fmt = data[12:16]
    if fmt == b"VP8 " and len(data) >= 30:
        # Lossy: 14-bit width/height follow the 3-byte tag and 3-byte start code.
        w, h = struct.unpack("<HH", data[26:30])
        return (w & 0x3FFF, h & 0x3FFF)
    if fmt == b"VP8L" and len(data) >= 25:
        # Lossless: after the signature byte, 14 bits width-1 then 14 bits height-1.
        bits = struct.unpack("<I", data[21:25])[0]
        return ((bits & 0x3FFF) + 1, ((bits >> 14) & 0x3FFF) + 1)
    if fmt == b"VP8X" and len(data) >= 30:
        # Extended: 24-bit canvas width-1 and height-1 after flags + reserved.
        w = int.from_bytes(data[24:27], "little") + 1
        h = int.from_bytes(data[27:30], "little") + 1
        return (w, h)
    return None


def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
    """Best-effort (width, height) from an image file's header bytes — PNG, GIF,
    JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
    if len(data) < 10:
        return None
    if len(data) >= 24 and data[:8] == b"\x89PNG\r\n\x1a\n" and data[12:16] == b"IHDR":
        return struct.unpack(">II", data[16:24])
    if data[:6] in (b"GIF87a", b"GIF89a"):
        return struct.unpack("<HH", data[6:10])
    if data[:2] == b"\xff\xd8":
        return _jpeg_dimensions(data)
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return _webp_dimensions(data)
    return None


def _image_loads(url: str | None) -> bool:
    """Confirm an image URL returns a real, big-enough image (HTTP 200 + image/*
    + dimensions ≥ the minimum).

    Two failure modes this guards against: signed/hotlink-protected URLs that
    401/403 on a direct load (e.g. the Guardian's i.guim.co.uk), and tiny feed
    thumbnails (~90px) that upscale to mush in the card banner. We request as
    the browser does — no referrer — with the same per-hop host safety as the
    page fetch. Images we can't measure (SVG/AVIF) pass on content-type alone.
    """
    response = _open_following_redirects(url, "image/*,*/*")
    if response is None:
        return False
    try:
        status = getattr(response, "status", 200) or 200
        ctype = (response.headers.get("Content-Type") or "").lower()
        if status != 200 or not ctype.startswith("image/"):
            return False
        head = response.read(_IMAGE_HEAD_BYTES)
    except OSError:
        return False
    finally:
        response.close()
    dims = _image_dimensions(head)
    if dims and (dims[0] < MIN_IMG_WIDTH or dims[1] < MIN_IMG_HEIGHT):
        return False  # too small — would upscale to mush
    return True


def _try_fetch(fetch, url):
    """Call an injected fetcher; any exception counts as "nothing found"."""
    try:
        return fetch(url)
    except Exception:  # best-effort boundary: one bad page must not abort a batch
        return None


def _record_image_check(conn: sqlite3.Connection, article_id: int, image: str | None) -> None:
    """Stamp the check time and store *image* when one was found.

    COALESCE keeps any existing image_url when *image* is None.
    """
    conn.execute(
        "UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
        "WHERE id = ?",
        (image, article_id),
    )


def prune_broken_images(conn: sqlite3.Connection, check=_image_loads, limit: int = 3000) -> int:
    """Clear stored image URLs that no longer load (signed/expired/hotlink-
    protected), so coverage is honest and those cards fall back to the calm
    placeholder cleanly instead of attempting a doomed fetch.

    Examines up to *limit* articles, newest id first; rows must support access
    by column name. Returns count cleared.
    """
    rows = conn.execute(
        "SELECT id, image_url FROM articles WHERE image_url IS NOT NULL AND image_url != '' "
        "ORDER BY id DESC LIMIT ?",
        (limit,),
    ).fetchall()
    cleared = 0
    for row in rows:
        if not check(row["image_url"]):
            conn.execute(
                "UPDATE articles SET image_url = NULL, image_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
                (row["id"],),
            )
            cleared += 1
    conn.commit()
    return cleared


def enrich_brief_images(
    conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 7, retry_days: int = 2
) -> int:
    """Fetch a hero-quality image for brief items that need one.

    Any of the brief's items can become the hero (via the client's fallback or a
    replace), so this covers the whole brief (limit defaults to the brief size,
    7), not just the top few. An item is fetched when it has never been checked
    (even if it already carries an image), or when it is still imageless and its
    last check is older than `retry_days` — so a transient fetch failure or a
    weaker earlier extractor doesn't mark an article imageless forever. An
    existing image is only replaced when the fetch finds one.
    Returns how many images were newly found.
    """
    # Fetch even when a feed image exists, because feed thumbnails are often tiny
    # and the hero is shown large — a page's og:image is the better hero visual.
    rows = conn.execute(
        """
        SELECT a.id, a.canonical_url
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        WHERE b.brief_date = ?
          AND (
            a.image_checked_at IS NULL
            OR ((a.image_url IS NULL OR a.image_url = '')
                AND a.image_checked_at < datetime('now', ?))
          )
        ORDER BY bi.rank
        LIMIT ?
        """,
        (brief_date, f"-{retry_days} days", limit),
    ).fetchall()
    found = 0
    for row in rows:
        image = _try_fetch(fetch, row["canonical_url"])
        _record_image_check(conn, row["id"], image)
        if image:
            found += 1
    conn.commit()
    return found


def enrich_article_image(
    conn: sqlite3.Connection, article_id: int, fetch=fetch_og_image, retry_days: int = 7
) -> bool:
    """Attention-triggered: fetch an og:image for ONE article that lacks one.

    Called when an article earns a summary (i.e. it's actually being read), so
    we only spend a fetch on articles a reader has reached. Leaves an existing
    image alone; retries a still-imageless article only after `retry_days`.
    Returns True if a new image was stored. Fetch errors are swallowed.
    """
    row = conn.execute(
        """
        SELECT id, canonical_url FROM articles
        WHERE id = ?
          AND (image_url IS NULL OR image_url = '')
          AND (image_checked_at IS NULL OR image_checked_at < datetime('now', ?))
        """,
        (article_id, f"-{retry_days} days"),
    ).fetchone()
    if not row:
        return False  # has an image already, or checked too recently
    image = _try_fetch(fetch, row["canonical_url"])
    _record_image_check(conn, article_id, image)
    conn.commit()
    return bool(image)


def enrich_recent_images(
    conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 40, retry_days: int = 7
) -> int:
    """Keep the Latest feed photo-rich: fetch a quality og:image for the newest
    accepted, non-duplicate articles that lack one. Bounded per run, so it
    tracks fresh content without blanket-fetching the archive.
    Returns newly-found count.
    """
    rows = conn.execute(
        """
        SELECT a.id
        FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1
          AND a.duplicate_of IS NULL
          AND (a.image_url IS NULL OR a.image_url = '')
          AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """,
        (f"-{retry_days} days", limit),
    ).fetchall()
    found = 0
    for row in rows:
        if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
            found += 1
    return found


def enrich_summarized_images(
    conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7
) -> int:
    """Slow backfill: give already-summarized, accepted articles an image if
    they lack one. Run in modest batches so we never blast publishers.
    Returns count of newly-found images.
    """
    rows = conn.execute(
        """
        SELECT a.id
        FROM articles a
        JOIN article_summaries m ON m.article_id = a.id
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1
          AND a.duplicate_of IS NULL
          AND (a.image_url IS NULL OR a.image_url = '')
          AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
        ORDER BY a.id DESC
        LIMIT ?
        """,
        (f"-{retry_days} days", limit),
    ).fetchall()
    found = 0
    for row in rows:
        if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
            found += 1
    return found


def enrich_read_times(
    conn: sqlite3.Connection, fetch=fetch_source_words, limit: int = 40, retry_days: int = 14
) -> int:
    """Give recent accepted articles a full-article word count, so the front
    door can show "Full story · ~N min" next to our one-minute gist.

    Bounded per run (mirrors the image enrichers); fetches each article once,
    retrying a failed/too-thin extraction only after `retry_days`.
    Returns how many real counts were stored.
    """
    rows = conn.execute(
        """
        SELECT a.id, a.canonical_url
        FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1
          AND a.duplicate_of IS NULL
          AND a.source_words IS NULL
          AND (a.read_checked_at IS NULL OR a.read_checked_at < datetime('now', ?))
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """,
        (f"-{retry_days} days", limit),
    ).fetchall()
    found = 0
    for row in rows:
        words = _try_fetch(fetch, row["canonical_url"])
        # Only ever write a REAL count; never overwrite a good value with null/zero.
        # Always stamp the check time so failed/thin pages aren't re-fetched until retry.
        if words:
            conn.execute(
                "UPDATE articles SET source_words = ?, read_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
                (words, row["id"]),
            )
            found += 1
        else:
            conn.execute(
                "UPDATE articles SET read_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
                (row["id"],),
            )
    conn.commit()
    return found