dc23277b38
Replaces the gist-based read-time with the SOURCE article's full read time — the
contrast that sells the gist ("calm 1-min version here; ~10 min for the deep dive").
- goodnews/readtime.py: word_count_from_html (strips script/style/nav/header/
footer/form/button/aside furniture before counting) + source_read_minutes
(~225 wpm, 200-word floor, None when extraction looks failed/too thin).
- articles.source_words + read_checked_at columns (count only, never the body;
fits the privacy posture). Idempotent migration.
- enrich.fetch_source_words + enrich_read_times: a bounded, retry-guarded cycle
step (mirrors the image enrichers) that counts words for recent accepted
articles. Only ever writes a real count; never overwrites good with zero. Wired
into the cycle after recent-image enrichment.
- queries: source_words flows through _ARTICLE_COLUMNS; api exposes
source_read_minutes on Article (null when unknown).
- home3: News card shows "Full story · ~N min", hidden entirely when null (no
misleading "1 min").
- Tests: furniture stripping, threshold/rounding, enrich idempotency + no
zero-overwrite, API null handling. 412 backend.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
494 lines
19 KiB
Python
494 lines
19 KiB
Python
"""Bounded hero-image enrichment.
|
||
|
||
The grid stays purely typographic; the daily brief's items are the one place we
|
||
make an exception and fetch a real image — because the hero is the single
|
||
intentional visual doorway. We fetch ONLY the article page's <head> metadata
|
||
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
|
||
the body. This is opt-in, brief-only, once per build.
|
||
|
||
Security (this is the one place we fetch user-/source-supplied pages):
|
||
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
|
||
- every hop's host is DNS-resolved and rejected if ANY resolved address is
|
||
private / loopback / link-local / multicast / reserved / unspecified (SSRF).
|
||
Failures are cached by the caller (image_checked_at) so an article is never
|
||
retried forever.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import ipaddress
|
||
import re
|
||
import socket
|
||
import sqlite3
|
||
import struct
|
||
import urllib.error
|
||
import urllib.request
|
||
from urllib.parse import urljoin, urlsplit
|
||
|
||
from .text import canonicalize_url
|
||
|
||
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
|
||
TIMEOUT = 6
|
||
MAX_BYTES = 300_000
|
||
MAX_REDIRECTS = 3
|
||
# Below this, a feed thumbnail upscales to mush in the card banner. Real share
|
||
# images (og:image) are ~1200×630; tiny RSS thumbnails (~90px) are what we reject.
|
||
MIN_IMG_WIDTH = 450
|
||
MIN_IMG_HEIGHT = 250
|
||
|
||
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
|
||
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
|
||
|
||
# Substrings that mark a generic placeholder/default share image rather than the
|
||
# article's own picture (e.g. NPR's facebook-default). We'd rather show no image
|
||
# (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" —
|
||
# that's BBC's normal CDN path for real article photos, so rejecting it threw away
|
||
# every BBC hero image and fell back to the tiny RSS thumbnail.
|
||
_GENERIC_IMAGE_MARKERS = (
|
||
"facebook-default",
|
||
"default-wide",
|
||
"default-fb",
|
||
"og-default",
|
||
"default-og",
|
||
"twitter-default",
|
||
"default-image",
|
||
"/placeholder",
|
||
"share-default",
|
||
"social-default",
|
||
# tracking pixels / spacers / data-URIs — never a real share image
|
||
"data:image",
|
||
"/pixel",
|
||
"1x1",
|
||
"spacer",
|
||
"/blank.",
|
||
"transparent.",
|
||
)
|
||
|
||
|
||
def _is_generic_image(url: str) -> bool:
|
||
lowered = url.lower()
|
||
return any(marker in lowered for marker in _GENERIC_IMAGE_MARKERS)
|
||
|
||
|
||
def _prefer_unbranded(url: str) -> str:
|
||
"""Swap BBC's logo-branded image variant for its clean one.
|
||
|
||
BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
|
||
logo baked into the picture (it shows as "…EWS" once the hero crops it). The
|
||
identical photo is served under "cpsprodpb" with no logo, so prefer that — a
|
||
clean hero at the same full resolution.
|
||
"""
|
||
if "ichef.bbci.co.uk" in url and "/branded_news/" in url:
|
||
return url.replace("/branded_news/", "/cpsprodpb/")
|
||
return url
|
||
|
||
|
||
def _attr(tag: bytes, name: bytes) -> bytes | None:
|
||
m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
|
||
return m.group(1) if m else None
|
||
|
||
|
||
def og_image_from_html(html: bytes) -> str | None:
|
||
"""Extract og:image / twitter:image from a page's <head> bytes."""
|
||
head = html.split(b"</head>", 1)[0] if _HEAD_END_RE.search(html) else html
|
||
for tag in _META_RE.findall(head):
|
||
key = _attr(tag, b"property") or _attr(tag, b"name")
|
||
if key and key.lower() in (b"og:image", b"og:image:url", b"twitter:image"):
|
||
content = _attr(tag, b"content")
|
||
if not content:
|
||
continue
|
||
image = canonicalize_url(content.decode("utf-8", "replace"))
|
||
# Skip generic placeholders; keep scanning for a real one.
|
||
if image and not _is_generic_image(image):
|
||
return _prefer_unbranded(image)
|
||
return None
|
||
|
||
|
||
def _host_is_public(host: str | None) -> bool:
|
||
"""True only if the host resolves and ALL its addresses are public."""
|
||
if not host:
|
||
return False
|
||
try:
|
||
infos = socket.getaddrinfo(host, None)
|
||
except (socket.gaierror, UnicodeError, OSError):
|
||
return False
|
||
addrs = {info[4][0] for info in infos}
|
||
if not addrs:
|
||
return False
|
||
for addr in addrs:
|
||
try:
|
||
ip = ipaddress.ip_address(addr.split("%")[0]) # strip scope id
|
||
except ValueError:
|
||
return False
|
||
if (
|
||
ip.is_private or ip.is_loopback or ip.is_link_local
|
||
or ip.is_multicast or ip.is_reserved or ip.is_unspecified
|
||
):
|
||
return False
|
||
return True
|
||
|
||
|
||
class _NoRedirect(urllib.request.HTTPRedirectHandler):
|
||
# Don't auto-follow — we re-validate each hop's host ourselves.
|
||
def redirect_request(self, *args, **kwargs):
|
||
return None
|
||
|
||
|
||
def fetch_og_image(url: str | None) -> str | None:
|
||
"""Fetch a page's head metadata and return its og:image URL, or None.
|
||
|
||
Best-effort and safe: returns None on any error, bad scheme, redirect loop,
|
||
or a host that resolves to a non-public address.
|
||
"""
|
||
opener = urllib.request.build_opener(_NoRedirect)
|
||
for _ in range(MAX_REDIRECTS + 1):
|
||
if not url:
|
||
return None
|
||
parts = urlsplit(url)
|
||
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
|
||
return None
|
||
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
|
||
try:
|
||
response = opener.open(request, timeout=TIMEOUT)
|
||
except (urllib.error.URLError, OSError, ValueError):
|
||
return None
|
||
status = getattr(response, "status", 200) or 200
|
||
if status in (301, 302, 303, 307, 308):
|
||
location = response.headers.get("Location")
|
||
response.close()
|
||
if not location:
|
||
return None
|
||
url = urljoin(url, location)
|
||
continue
|
||
ctype = response.headers.get("Content-Type", "")
|
||
if "html" not in ctype.lower():
|
||
response.close()
|
||
return None
|
||
try:
|
||
body = response.read(MAX_BYTES)
|
||
finally:
|
||
response.close()
|
||
image = og_image_from_html(body)
|
||
# A stored URL is not proof it renders — confirm it actually loads.
|
||
return image if (image and _image_loads(image)) else None
|
||
return None # too many redirects
|
||
|
||
|
||
# Word counting reads more of the body than image metadata (which only needs <head>).
|
||
_READ_MAX_BYTES = 900_000
|
||
|
||
|
||
def fetch_source_words(url: str | None) -> int | None:
|
||
"""Fetch a page and return its full-article word count (furniture stripped), or
|
||
None on any failure or a too-thin extraction (JS/video/paywall pages). Same SSRF
|
||
safety as fetch_og_image; we read the count only, never store the body."""
|
||
from .readtime import source_read_minutes, word_count_from_html
|
||
opener = urllib.request.build_opener(_NoRedirect)
|
||
for _ in range(MAX_REDIRECTS + 1):
|
||
if not url:
|
||
return None
|
||
parts = urlsplit(url)
|
||
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
|
||
return None
|
||
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
|
||
try:
|
||
response = opener.open(request, timeout=TIMEOUT)
|
||
except (urllib.error.URLError, OSError, ValueError):
|
||
return None
|
||
status = getattr(response, "status", 200) or 200
|
||
if status in (301, 302, 303, 307, 308):
|
||
location = response.headers.get("Location")
|
||
response.close()
|
||
if not location:
|
||
return None
|
||
url = urljoin(url, location)
|
||
continue
|
||
if "html" not in response.headers.get("Content-Type", "").lower():
|
||
response.close()
|
||
return None
|
||
try:
|
||
body = response.read(_READ_MAX_BYTES)
|
||
finally:
|
||
response.close()
|
||
words = word_count_from_html(body)
|
||
return words if source_read_minutes(words) is not None else None
|
||
return None # too many redirects
|
||
|
||
|
||
def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
|
||
"""Best-effort (width, height) from an image file's header bytes — PNG, GIF,
|
||
JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
|
||
if len(data) < 10:
|
||
return None
|
||
if len(data) >= 24 and data[:8] == b"\x89PNG\r\n\x1a\n" and data[12:16] == b"IHDR":
|
||
return struct.unpack(">II", data[16:24])
|
||
if data[:6] in (b"GIF87a", b"GIF89a"):
|
||
return struct.unpack("<HH", data[6:10])
|
||
if data[:2] == b"\xff\xd8": # JPEG: scan for a Start-Of-Frame marker
|
||
i, n = 2, len(data)
|
||
while i < n - 9:
|
||
if data[i] != 0xFF:
|
||
i += 1
|
||
continue
|
||
marker = data[i + 1]
|
||
if marker in (0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7, 0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF):
|
||
h, w = struct.unpack(">HH", data[i + 5:i + 9])
|
||
return (w, h)
|
||
if marker == 0xD8 or marker == 0xD9 or 0xD0 <= marker <= 0xD7:
|
||
i += 2
|
||
continue
|
||
i += 2 + struct.unpack(">H", data[i + 2:i + 4])[0]
|
||
return None
|
||
if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
|
||
fmt = data[12:16]
|
||
try:
|
||
if fmt == b"VP8 ":
|
||
return (struct.unpack("<H", data[26:28])[0] & 0x3FFF,
|
||
struct.unpack("<H", data[28:30])[0] & 0x3FFF)
|
||
if fmt == b"VP8X":
|
||
return ((data[24] | data[25] << 8 | data[26] << 16) + 1,
|
||
(data[27] | data[28] << 8 | data[29] << 16) + 1)
|
||
except (struct.error, IndexError):
|
||
return None
|
||
return None
|
||
|
||
|
||
def _image_loads(url: str) -> bool:
|
||
"""Confirm an image URL returns a real, big-enough image (HTTP 200 + image/*
|
||
+ dimensions ≥ the minimum).
|
||
|
||
Two failure modes this guards against: signed/hotlink-protected URLs that
|
||
401/403 on a direct load (e.g. the Guardian's i.guim.co.uk), and tiny feed
|
||
thumbnails (~90px) that upscale to mush in the card banner. We request as the
|
||
browser does — no referrer — with the same per-hop host safety as the page
|
||
fetch. Images we can't measure (SVG/AVIF) pass on content-type alone.
|
||
"""
|
||
opener = urllib.request.build_opener(_NoRedirect)
|
||
for _ in range(MAX_REDIRECTS + 1):
|
||
if not url:
|
||
return False
|
||
parts = urlsplit(url)
|
||
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
|
||
return False
|
||
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "image/*,*/*"})
|
||
try:
|
||
response = opener.open(request, timeout=TIMEOUT)
|
||
except (urllib.error.URLError, OSError, ValueError):
|
||
return False
|
||
try:
|
||
status = getattr(response, "status", 200) or 200
|
||
if status in (301, 302, 303, 307, 308):
|
||
location = response.headers.get("Location")
|
||
if not location:
|
||
return False
|
||
url = urljoin(url, location)
|
||
continue
|
||
ctype = (response.headers.get("Content-Type") or "").lower()
|
||
if status != 200 or not ctype.startswith("image/"):
|
||
return False
|
||
head = response.read(200_000)
|
||
finally:
|
||
response.close()
|
||
dims = _image_dimensions(head)
|
||
if dims and (dims[0] < MIN_IMG_WIDTH or dims[1] < MIN_IMG_HEIGHT):
|
||
return False # too small — would upscale to mush
|
||
return True
|
||
return False
|
||
|
||
|
||
def prune_broken_images(conn: sqlite3.Connection, check=_image_loads, limit: int = 3000) -> int:
|
||
"""Clear stored image URLs that no longer load (signed/expired/hotlink-
|
||
protected), so coverage is honest and those cards fall back to the calm
|
||
placeholder cleanly instead of attempting a doomed fetch. Returns count cleared.
|
||
"""
|
||
rows = conn.execute(
|
||
"SELECT id, image_url FROM articles WHERE image_url IS NOT NULL AND image_url != '' "
|
||
"ORDER BY id DESC LIMIT ?",
|
||
(limit,),
|
||
).fetchall()
|
||
cleared = 0
|
||
for row in rows:
|
||
if not check(row["image_url"]):
|
||
conn.execute(
|
||
"UPDATE articles SET image_url = NULL, image_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
|
||
(row["id"],),
|
||
)
|
||
cleared += 1
|
||
conn.commit()
|
||
return cleared
|
||
|
||
|
||
def enrich_brief_images(
|
||
conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 7, retry_days: int = 2
|
||
) -> int:
|
||
"""Fetch a hero-quality image for brief items that lack one.
|
||
|
||
Any of the brief's items can become the hero (via the client's fallback or a
|
||
replace), so this covers the whole brief (limit defaults to the brief size, 7),
|
||
not just the top few. Items already carrying an image are left alone; items
|
||
still without one are retried after `retry_days` so a transient fetch failure
|
||
or a weaker earlier extractor doesn't mark an article imageless forever.
|
||
Returns how many images were newly found.
|
||
"""
|
||
# Fetch even when a feed image exists, because feed thumbnails are often tiny
|
||
# and the hero is shown large — a page's og:image is the better hero visual.
|
||
rows = conn.execute(
|
||
"""
|
||
SELECT a.id, a.canonical_url
|
||
FROM daily_briefs b
|
||
JOIN daily_brief_items bi ON bi.brief_id = b.id
|
||
JOIN articles a ON a.id = bi.article_id
|
||
WHERE b.brief_date = ?
|
||
AND (
|
||
a.image_checked_at IS NULL
|
||
OR ((a.image_url IS NULL OR a.image_url = '')
|
||
AND a.image_checked_at < datetime('now', ?))
|
||
)
|
||
ORDER BY bi.rank
|
||
LIMIT ?
|
||
""",
|
||
(brief_date, f"-{retry_days} days", limit),
|
||
).fetchall()
|
||
|
||
found = 0
|
||
for row in rows:
|
||
try:
|
||
image = fetch(row["canonical_url"])
|
||
except Exception:
|
||
image = None
|
||
conn.execute(
|
||
"UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
|
||
"WHERE id = ?",
|
||
(image, row["id"]),
|
||
)
|
||
if image:
|
||
found += 1
|
||
conn.commit()
|
||
return found
|
||
|
||
|
||
def enrich_article_image(
|
||
conn: sqlite3.Connection, article_id: int, fetch=fetch_og_image, retry_days: int = 7
|
||
) -> bool:
|
||
"""Attention-triggered: fetch an og:image for ONE article that lacks one.
|
||
|
||
Called when an article earns a summary (i.e. it's actually being read), so we
|
||
only spend a fetch on articles a reader has reached. Leaves an existing image
|
||
alone; retries a still-imageless article only after `retry_days`. Returns True
|
||
if a new image was stored. Best-effort — never raises.
|
||
"""
|
||
row = conn.execute(
|
||
"""
|
||
SELECT id, canonical_url FROM articles
|
||
WHERE id = ?
|
||
AND (image_url IS NULL OR image_url = '')
|
||
AND (image_checked_at IS NULL OR image_checked_at < datetime('now', ?))
|
||
""",
|
||
(article_id, f"-{retry_days} days"),
|
||
).fetchone()
|
||
if not row:
|
||
return False # has an image already, or checked too recently
|
||
try:
|
||
image = fetch(row["canonical_url"])
|
||
except Exception:
|
||
image = None
|
||
conn.execute(
|
||
"UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
|
||
"WHERE id = ?",
|
||
(image, article_id),
|
||
)
|
||
conn.commit()
|
||
return bool(image)
|
||
|
||
|
||
def enrich_recent_images(
|
||
conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 40, retry_days: int = 7
|
||
) -> int:
|
||
"""Keep the Latest feed photo-rich: fetch a quality og:image for the newest
|
||
accepted, non-duplicate articles that lack one. Bounded per run, so it tracks
|
||
fresh content without blanket-fetching the archive. Returns newly-found count.
|
||
"""
|
||
rows = conn.execute(
|
||
"""
|
||
SELECT a.id FROM articles a
|
||
JOIN article_scores s ON s.article_id = a.id
|
||
WHERE s.accepted = 1 AND a.duplicate_of IS NULL
|
||
AND (a.image_url IS NULL OR a.image_url = '')
|
||
AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
|
||
ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
|
||
LIMIT ?
|
||
""",
|
||
(f"-{retry_days} days", limit),
|
||
).fetchall()
|
||
found = 0
|
||
for row in rows:
|
||
if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
|
||
found += 1
|
||
return found
|
||
|
||
|
||
def enrich_summarized_images(
|
||
conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7
|
||
) -> int:
|
||
"""Slow backfill: give already-summarized, accepted articles an image if they
|
||
lack one. Run in modest batches so we never blast publishers. Returns count
|
||
of newly-found images.
|
||
"""
|
||
rows = conn.execute(
|
||
"""
|
||
SELECT a.id FROM articles a
|
||
JOIN article_summaries m ON m.article_id = a.id
|
||
JOIN article_scores s ON s.article_id = a.id
|
||
WHERE s.accepted = 1 AND a.duplicate_of IS NULL
|
||
AND (a.image_url IS NULL OR a.image_url = '')
|
||
AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
|
||
ORDER BY a.id DESC
|
||
LIMIT ?
|
||
""",
|
||
(f"-{retry_days} days", limit),
|
||
).fetchall()
|
||
found = 0
|
||
for row in rows:
|
||
if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
|
||
found += 1
|
||
return found
|
||
|
||
|
||
def enrich_read_times(
|
||
conn: sqlite3.Connection, fetch=fetch_source_words, limit: int = 40, retry_days: int = 14
|
||
) -> int:
|
||
"""Give recent accepted articles a full-article word count, so the front door can
|
||
show "Full story · ~N min" next to our one-minute gist. Bounded per run (mirrors
|
||
the image enrichers); fetches each article once, retrying a failed/too-thin
|
||
extraction only after `retry_days`. Returns how many real counts were stored."""
|
||
rows = conn.execute(
|
||
"""
|
||
SELECT a.id, a.canonical_url FROM articles a
|
||
JOIN article_scores s ON s.article_id = a.id
|
||
WHERE s.accepted = 1 AND a.duplicate_of IS NULL
|
||
AND a.source_words IS NULL
|
||
AND (a.read_checked_at IS NULL OR a.read_checked_at < datetime('now', ?))
|
||
ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
|
||
LIMIT ?
|
||
""",
|
||
(f"-{retry_days} days", limit),
|
||
).fetchall()
|
||
found = 0
|
||
for row in rows:
|
||
try:
|
||
words = fetch(row["canonical_url"])
|
||
except Exception:
|
||
words = None
|
||
# Only ever write a REAL count; never overwrite a good value with null/zero.
|
||
# Always stamp the check time so failed/thin pages aren't re-fetched until retry.
|
||
if words:
|
||
conn.execute(
|
||
"UPDATE articles SET source_words = ?, read_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
|
||
(words, row["id"]),
|
||
)
|
||
found += 1
|
||
else:
|
||
conn.execute("UPDATE articles SET read_checked_at = CURRENT_TIMESTAMP WHERE id = ?", (row["id"],))
|
||
conn.commit()
|
||
return found
|