Files
thejayman77 dc23277b38 Read-time: full-article "Full story · ~N min" badge (Option B)
Replaces the gist-based read-time with the SOURCE article's full read time — the
contrast that sells the gist ("calm 1-min version here; ~10 min for the deep dive").

- goodnews/readtime.py: word_count_from_html (strips script/style/nav/header/
  footer/form/button/aside furniture before counting) + source_read_minutes
  (~225 wpm, 200-word floor, None when extraction looks failed/too thin).
- articles.source_words + read_checked_at columns (count only, never the body;
  fits the privacy posture). Idempotent migration.
- enrich.fetch_source_words + enrich_read_times: a bounded, retry-guarded cycle
  step (mirrors the image enrichers) that counts words for recent accepted
  articles. Only ever writes a real count; never overwrites good with zero. Wired
  into the cycle after recent-image enrichment.
- queries: source_words flows through _ARTICLE_COLUMNS; api exposes
  source_read_minutes on Article (null when unknown).
- home3: News card shows "Full story · ~N min", hidden entirely when null (no
  misleading "1 min").
- Tests: furniture stripping, threshold/rounding, enrich idempotency + no
  zero-overwrite, API null handling. 412 backend.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-23 08:09:00 -04:00

494 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Bounded hero-image enrichment.
The grid stays purely typographic; the daily brief's items are the one place we
make an exception and fetch a real image — because the hero is the single
intentional visual doorway. We fetch ONLY the article page's <head> metadata
(og:image / twitter:image), store ONLY the resulting image URL, and never touch
the body. This is opt-in, brief-only, once per build.
Security (this is the one place we fetch user-/source-supplied pages):
- http(s) only; short timeout; byte cap; redirects followed manually and capped;
- every hop's host is DNS-resolved and rejected if ANY resolved address is
private / loopback / link-local / multicast / reserved / unspecified (SSRF).
Failures are cached by the caller (image_checked_at) so an article is never
retried forever.
"""
from __future__ import annotations
import ipaddress
import re
import socket
import sqlite3
import struct
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlsplit
from .text import canonicalize_url
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
TIMEOUT = 6
MAX_BYTES = 300_000
MAX_REDIRECTS = 3
# Below this, a feed thumbnail upscales to mush in the card banner. Real share
# images (og:image) are ~1200×630; tiny RSS thumbnails (~90px) are what we reject.
MIN_IMG_WIDTH = 450
MIN_IMG_HEIGHT = 250
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
# Substrings that mark a generic placeholder/default share image rather than the
# article's own picture (e.g. NPR's facebook-default). We'd rather show no image
# (typographic hero) than a generic logo card. NOTE: do NOT add "branded_news" —
# that's BBC's normal CDN path for real article photos, so rejecting it threw away
# every BBC hero image and fell back to the tiny RSS thumbnail.
_GENERIC_IMAGE_MARKERS = (
"facebook-default",
"default-wide",
"default-fb",
"og-default",
"default-og",
"twitter-default",
"default-image",
"/placeholder",
"share-default",
"social-default",
# tracking pixels / spacers / data-URIs — never a real share image
"data:image",
"/pixel",
"1x1",
"spacer",
"/blank.",
"transparent.",
)
def _is_generic_image(url: str) -> bool:
lowered = url.lower()
return any(marker in lowered for marker in _GENERIC_IMAGE_MARKERS)
def _prefer_unbranded(url: str) -> str:
"""Swap BBC's logo-branded image variant for its clean one.
BBC's og:image is served from the "branded_news" CDN path with a "BBC NEWS"
logo baked into the picture (it shows as "…EWS" once the hero crops it). The
identical photo is served under "cpsprodpb" with no logo, so prefer that — a
clean hero at the same full resolution.
"""
if "ichef.bbci.co.uk" in url and "/branded_news/" in url:
return url.replace("/branded_news/", "/cpsprodpb/")
return url
def _attr(tag: bytes, name: bytes) -> bytes | None:
m = re.search(name + rb"""\s*=\s*["']([^"']*)["']""", tag, re.IGNORECASE)
return m.group(1) if m else None
def og_image_from_html(html: bytes) -> str | None:
"""Extract og:image / twitter:image from a page's <head> bytes."""
head = html.split(b"</head>", 1)[0] if _HEAD_END_RE.search(html) else html
for tag in _META_RE.findall(head):
key = _attr(tag, b"property") or _attr(tag, b"name")
if key and key.lower() in (b"og:image", b"og:image:url", b"twitter:image"):
content = _attr(tag, b"content")
if not content:
continue
image = canonicalize_url(content.decode("utf-8", "replace"))
# Skip generic placeholders; keep scanning for a real one.
if image and not _is_generic_image(image):
return _prefer_unbranded(image)
return None
def _host_is_public(host: str | None) -> bool:
"""True only if the host resolves and ALL its addresses are public."""
if not host:
return False
try:
infos = socket.getaddrinfo(host, None)
except (socket.gaierror, UnicodeError, OSError):
return False
addrs = {info[4][0] for info in infos}
if not addrs:
return False
for addr in addrs:
try:
ip = ipaddress.ip_address(addr.split("%")[0]) # strip scope id
except ValueError:
return False
if (
ip.is_private or ip.is_loopback or ip.is_link_local
or ip.is_multicast or ip.is_reserved or ip.is_unspecified
):
return False
return True
class _NoRedirect(urllib.request.HTTPRedirectHandler):
# Don't auto-follow — we re-validate each hop's host ourselves.
def redirect_request(self, *args, **kwargs):
return None
def fetch_og_image(url: str | None) -> str | None:
"""Fetch a page's head metadata and return its og:image URL, or None.
Best-effort and safe: returns None on any error, bad scheme, redirect loop,
or a host that resolves to a non-public address.
"""
opener = urllib.request.build_opener(_NoRedirect)
for _ in range(MAX_REDIRECTS + 1):
if not url:
return None
parts = urlsplit(url)
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
return None
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
try:
response = opener.open(request, timeout=TIMEOUT)
except (urllib.error.URLError, OSError, ValueError):
return None
status = getattr(response, "status", 200) or 200
if status in (301, 302, 303, 307, 308):
location = response.headers.get("Location")
response.close()
if not location:
return None
url = urljoin(url, location)
continue
ctype = response.headers.get("Content-Type", "")
if "html" not in ctype.lower():
response.close()
return None
try:
body = response.read(MAX_BYTES)
finally:
response.close()
image = og_image_from_html(body)
# A stored URL is not proof it renders — confirm it actually loads.
return image if (image and _image_loads(image)) else None
return None # too many redirects
# Word counting reads more of the body than image metadata (which only needs <head>).
_READ_MAX_BYTES = 900_000
def fetch_source_words(url: str | None) -> int | None:
"""Fetch a page and return its full-article word count (furniture stripped), or
None on any failure or a too-thin extraction (JS/video/paywall pages). Same SSRF
safety as fetch_og_image; we read the count only, never store the body."""
from .readtime import source_read_minutes, word_count_from_html
opener = urllib.request.build_opener(_NoRedirect)
for _ in range(MAX_REDIRECTS + 1):
if not url:
return None
parts = urlsplit(url)
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
return None
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
try:
response = opener.open(request, timeout=TIMEOUT)
except (urllib.error.URLError, OSError, ValueError):
return None
status = getattr(response, "status", 200) or 200
if status in (301, 302, 303, 307, 308):
location = response.headers.get("Location")
response.close()
if not location:
return None
url = urljoin(url, location)
continue
if "html" not in response.headers.get("Content-Type", "").lower():
response.close()
return None
try:
body = response.read(_READ_MAX_BYTES)
finally:
response.close()
words = word_count_from_html(body)
return words if source_read_minutes(words) is not None else None
return None # too many redirects
def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
"""Best-effort (width, height) from an image file's header bytes — PNG, GIF,
JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
if len(data) < 10:
return None
if len(data) >= 24 and data[:8] == b"\x89PNG\r\n\x1a\n" and data[12:16] == b"IHDR":
return struct.unpack(">II", data[16:24])
if data[:6] in (b"GIF87a", b"GIF89a"):
return struct.unpack("<HH", data[6:10])
if data[:2] == b"\xff\xd8": # JPEG: scan for a Start-Of-Frame marker
i, n = 2, len(data)
while i < n - 9:
if data[i] != 0xFF:
i += 1
continue
marker = data[i + 1]
if marker in (0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7, 0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF):
h, w = struct.unpack(">HH", data[i + 5:i + 9])
return (w, h)
if marker == 0xD8 or marker == 0xD9 or 0xD0 <= marker <= 0xD7:
i += 2
continue
i += 2 + struct.unpack(">H", data[i + 2:i + 4])[0]
return None
if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
fmt = data[12:16]
try:
if fmt == b"VP8 ":
return (struct.unpack("<H", data[26:28])[0] & 0x3FFF,
struct.unpack("<H", data[28:30])[0] & 0x3FFF)
if fmt == b"VP8X":
return ((data[24] | data[25] << 8 | data[26] << 16) + 1,
(data[27] | data[28] << 8 | data[29] << 16) + 1)
except (struct.error, IndexError):
return None
return None
def _image_loads(url: str) -> bool:
"""Confirm an image URL returns a real, big-enough image (HTTP 200 + image/*
+ dimensions ≥ the minimum).
Two failure modes this guards against: signed/hotlink-protected URLs that
401/403 on a direct load (e.g. the Guardian's i.guim.co.uk), and tiny feed
thumbnails (~90px) that upscale to mush in the card banner. We request as the
browser does — no referrer — with the same per-hop host safety as the page
fetch. Images we can't measure (SVG/AVIF) pass on content-type alone.
"""
opener = urllib.request.build_opener(_NoRedirect)
for _ in range(MAX_REDIRECTS + 1):
if not url:
return False
parts = urlsplit(url)
if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
return False
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "image/*,*/*"})
try:
response = opener.open(request, timeout=TIMEOUT)
except (urllib.error.URLError, OSError, ValueError):
return False
try:
status = getattr(response, "status", 200) or 200
if status in (301, 302, 303, 307, 308):
location = response.headers.get("Location")
if not location:
return False
url = urljoin(url, location)
continue
ctype = (response.headers.get("Content-Type") or "").lower()
if status != 200 or not ctype.startswith("image/"):
return False
head = response.read(200_000)
finally:
response.close()
dims = _image_dimensions(head)
if dims and (dims[0] < MIN_IMG_WIDTH or dims[1] < MIN_IMG_HEIGHT):
return False # too small — would upscale to mush
return True
return False
def prune_broken_images(conn: sqlite3.Connection, check=_image_loads, limit: int = 3000) -> int:
"""Clear stored image URLs that no longer load (signed/expired/hotlink-
protected), so coverage is honest and those cards fall back to the calm
placeholder cleanly instead of attempting a doomed fetch. Returns count cleared.
"""
rows = conn.execute(
"SELECT id, image_url FROM articles WHERE image_url IS NOT NULL AND image_url != '' "
"ORDER BY id DESC LIMIT ?",
(limit,),
).fetchall()
cleared = 0
for row in rows:
if not check(row["image_url"]):
conn.execute(
"UPDATE articles SET image_url = NULL, image_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
(row["id"],),
)
cleared += 1
conn.commit()
return cleared
def enrich_brief_images(
conn: sqlite3.Connection, brief_date: str, fetch=fetch_og_image, limit: int = 7, retry_days: int = 2
) -> int:
"""Fetch a hero-quality image for brief items that lack one.
Any of the brief's items can become the hero (via the client's fallback or a
replace), so this covers the whole brief (limit defaults to the brief size, 7),
not just the top few. Items already carrying an image are left alone; items
still without one are retried after `retry_days` so a transient fetch failure
or a weaker earlier extractor doesn't mark an article imageless forever.
Returns how many images were newly found.
"""
# Fetch even when a feed image exists, because feed thumbnails are often tiny
# and the hero is shown large — a page's og:image is the better hero visual.
rows = conn.execute(
"""
SELECT a.id, a.canonical_url
FROM daily_briefs b
JOIN daily_brief_items bi ON bi.brief_id = b.id
JOIN articles a ON a.id = bi.article_id
WHERE b.brief_date = ?
AND (
a.image_checked_at IS NULL
OR ((a.image_url IS NULL OR a.image_url = '')
AND a.image_checked_at < datetime('now', ?))
)
ORDER BY bi.rank
LIMIT ?
""",
(brief_date, f"-{retry_days} days", limit),
).fetchall()
found = 0
for row in rows:
try:
image = fetch(row["canonical_url"])
except Exception:
image = None
conn.execute(
"UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
"WHERE id = ?",
(image, row["id"]),
)
if image:
found += 1
conn.commit()
return found
def enrich_article_image(
conn: sqlite3.Connection, article_id: int, fetch=fetch_og_image, retry_days: int = 7
) -> bool:
"""Attention-triggered: fetch an og:image for ONE article that lacks one.
Called when an article earns a summary (i.e. it's actually being read), so we
only spend a fetch on articles a reader has reached. Leaves an existing image
alone; retries a still-imageless article only after `retry_days`. Returns True
if a new image was stored. Best-effort — never raises.
"""
row = conn.execute(
"""
SELECT id, canonical_url FROM articles
WHERE id = ?
AND (image_url IS NULL OR image_url = '')
AND (image_checked_at IS NULL OR image_checked_at < datetime('now', ?))
""",
(article_id, f"-{retry_days} days"),
).fetchone()
if not row:
return False # has an image already, or checked too recently
try:
image = fetch(row["canonical_url"])
except Exception:
image = None
conn.execute(
"UPDATE articles SET image_url = COALESCE(?, image_url), image_checked_at = CURRENT_TIMESTAMP "
"WHERE id = ?",
(image, article_id),
)
conn.commit()
return bool(image)
def enrich_recent_images(
conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 40, retry_days: int = 7
) -> int:
"""Keep the Latest feed photo-rich: fetch a quality og:image for the newest
accepted, non-duplicate articles that lack one. Bounded per run, so it tracks
fresh content without blanket-fetching the archive. Returns newly-found count.
"""
rows = conn.execute(
"""
SELECT a.id FROM articles a
JOIN article_scores s ON s.article_id = a.id
WHERE s.accepted = 1 AND a.duplicate_of IS NULL
AND (a.image_url IS NULL OR a.image_url = '')
AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
LIMIT ?
""",
(f"-{retry_days} days", limit),
).fetchall()
found = 0
for row in rows:
if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
found += 1
return found
def enrich_summarized_images(
conn: sqlite3.Connection, fetch=fetch_og_image, limit: int = 50, retry_days: int = 7
) -> int:
"""Slow backfill: give already-summarized, accepted articles an image if they
lack one. Run in modest batches so we never blast publishers. Returns count
of newly-found images.
"""
rows = conn.execute(
"""
SELECT a.id FROM articles a
JOIN article_summaries m ON m.article_id = a.id
JOIN article_scores s ON s.article_id = a.id
WHERE s.accepted = 1 AND a.duplicate_of IS NULL
AND (a.image_url IS NULL OR a.image_url = '')
AND (a.image_checked_at IS NULL OR a.image_checked_at < datetime('now', ?))
ORDER BY a.id DESC
LIMIT ?
""",
(f"-{retry_days} days", limit),
).fetchall()
found = 0
for row in rows:
if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
found += 1
return found
def enrich_read_times(
conn: sqlite3.Connection, fetch=fetch_source_words, limit: int = 40, retry_days: int = 14
) -> int:
"""Give recent accepted articles a full-article word count, so the front door can
show "Full story · ~N min" next to our one-minute gist. Bounded per run (mirrors
the image enrichers); fetches each article once, retrying a failed/too-thin
extraction only after `retry_days`. Returns how many real counts were stored."""
rows = conn.execute(
"""
SELECT a.id, a.canonical_url FROM articles a
JOIN article_scores s ON s.article_id = a.id
WHERE s.accepted = 1 AND a.duplicate_of IS NULL
AND a.source_words IS NULL
AND (a.read_checked_at IS NULL OR a.read_checked_at < datetime('now', ?))
ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
LIMIT ?
""",
(f"-{retry_days} days", limit),
).fetchall()
found = 0
for row in rows:
try:
words = fetch(row["canonical_url"])
except Exception:
words = None
# Only ever write a REAL count; never overwrite a good value with null/zero.
# Always stamp the check time so failed/thin pages aren't re-fetched until retry.
if words:
conn.execute(
"UPDATE articles SET source_words = ?, read_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
(words, row["id"]),
)
found += 1
else:
conn.execute("UPDATE articles SET read_checked_at = CURRENT_TIMESTAMP WHERE id = ?", (row["id"],))
conn.commit()
return found