Read-time: full-article "Full story · ~N min" badge (Option B)

Replaces the gist-based read-time with the SOURCE article's full read time — the contrast that sells the gist ("calm 1-min version here; ~10 min for the deep dive").

- goodnews/readtime.py: word_count_from_html (strips script/style/nav/header/footer/form/button/aside furniture before counting) + source_read_minutes (~225 wpm, 200-word floor, None when extraction looks failed/too thin).
- articles.source_words + read_checked_at columns (count only, never the body; fits the privacy posture). Idempotent migration.
- enrich.fetch_source_words + enrich_read_times: a bounded, retry-guarded cycle step (mirrors the image enrichers) that counts words for recent accepted articles. Only ever writes a real count; never overwrites good with zero. Wired into the cycle after recent-image enrichment.
- queries: source_words flows through _ARTICLE_COLUMNS; api exposes source_read_minutes on Article (null when unknown).
- home3: News card shows "Full story · ~N min", hidden entirely when null (no misleading "1 min").
- Tests: furniture stripping, threshold/rounding, enrich idempotency + no zero-overwrite, API null handling. 412 backend.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-23 08:09:00 -04:00
parent bdf3b1f47b
commit dc23277b38
8 changed files with 230 additions and 7 deletions
@@ -36,7 +36,7 @@ from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel

-from . import art, auth, bloom, daily, email_send, feeds, games, oauth_google, onthisday, publishing, queries, quote, share, sources, summarize, wotd
+from . import art, auth, bloom, daily, email_send, feeds, games, oauth_google, onthisday, publishing, queries, quote, readtime, share, sources, summarize, wotd
 from .localtime import local_today
 from .markup import reply_html_to_text, sanitize_reply_html
 from .db import connect
@@ -322,6 +322,7 @@ class Article(BaseModel):
    paywalled: bool = False
    tags: list[str] = []
    summary: str | None = None  # our own cached summary (present on the brief)
+    source_read_minutes: int | None = None  # ~minutes to read the FULL source article (null = unknown)
    # Subject geography (present on feed rows; absent/empty on the brief). breadth is
    # locality|regional|national|multinational|global|unknown; places are ISO codes.
    geo_breadth: str | None = None
@@ -345,6 +346,7 @@ class Article(BaseModel):
            geo_confidence=row.get("geo_confidence"),
            geo_places=places,
            summary=row.get("summary"),
+            source_read_minutes=readtime.source_read_minutes(row.get("source_words")),
            id=row["id"],
            title=row["title"],
            description=row.get("description"),
@@ -14,7 +14,7 @@ from .localtime import local_today
 from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
 from .geo import tag_articles as tag_geo
 from . import art, onthisday, quote, wotd
-from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
+from .enrich import enrich_brief_images, enrich_read_times, enrich_recent_images, enrich_summarized_images
 from .summarize import generate_summary, get_summary
 from .feeds import (
    fetch_feed,
@@ -599,6 +599,15 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
        except Exception as exc:
            print(f"recent images: skipped ({exc})")

+        # Full-article read-times: count words for recent accepted articles so the
+        # front door can show "Full story · ~N min" next to our gist (bounded per cycle).
+        try:
+            reads = enrich_read_times(conn)
+            if reads:
+                print(f"read-times: {reads} counted")
+        except Exception as exc:
+            print(f"read-times: skipped ({exc})")
+
        # Pre-warm summaries for today's brief so Today reads as a calm briefing.
        # Idempotent: cached items are skipped, so this only hits the LLM for new ones.
        try:
@@ -49,6 +49,8 @@ CREATE TABLE IF NOT EXISTS articles (
    title_hash TEXT,
    duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
    image_checked_at TEXT,
+    source_words INTEGER,          -- full-article word count (metadata only; never the body)
+    read_checked_at TEXT,          -- when we last tried to count words (retry guard)
    FOREIGN KEY (source_id) REFERENCES sources(id)
 );

@@ -595,6 +597,10 @@ def _migrate(conn: sqlite3.Connection) -> None:
        )
    if "image_checked_at" not in article_cols:
        conn.execute("ALTER TABLE articles ADD COLUMN image_checked_at TEXT")
+    if "source_words" not in article_cols:        # full-article read-time (count only, no body)
+        conn.execute("ALTER TABLE articles ADD COLUMN source_words INTEGER")
+    if "read_checked_at" not in article_cols:
+        conn.execute("ALTER TABLE articles ADD COLUMN read_checked_at TEXT")
    # Created here (not in SCHEMA) so it runs after the column exists on upgrades.
    conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")

@@ -174,6 +174,47 @@ def fetch_og_image(url: str | None) -> str | None:
    return None  # too many redirects


+# Word counting reads more of the body than image metadata (which only needs <head>).
+_READ_MAX_BYTES = 900_000
+
+
+def fetch_source_words(url: str | None) -> int | None:
+    """Fetch a page and return its full-article word count, or None.
+
+    Redirects are followed by hand (at most ``MAX_REDIRECTS`` hops) so that every
+    hop is re-validated — http/https scheme and a public host — before it is
+    requested. At most ``_READ_MAX_BYTES`` of the body are read and only the
+    count is returned; the body itself is never stored.
+
+    Returns None when the URL is missing or rejected, when opening the request
+    or reading the body fails, when the response is not HTML, when the redirect
+    limit is exceeded, or when the count is too thin for ``source_read_minutes``
+    to yield an estimate (JS/video/paywall pages).
+    """
+    import http.client
+
+    from .readtime import source_read_minutes, word_count_from_html
+    opener = urllib.request.build_opener(_NoRedirect)
+    for _ in range(MAX_REDIRECTS + 1):
+        if not url:
+            return None
+        parts = urlsplit(url)
+        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
+            return None
+        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
+        try:
+            response = opener.open(request, timeout=TIMEOUT)
+        except (urllib.error.URLError, OSError, ValueError):
+            return None
+        status = getattr(response, "status", 200) or 200
+        if status in (301, 302, 303, 307, 308):
+            location = response.headers.get("Location")
+            response.close()
+            if not location:
+                return None
+            # Resolve relative Location headers against the URL we just requested.
+            url = urljoin(url, location)
+            continue
+        if "html" not in response.headers.get("Content-Type", "").lower():
+            response.close()
+            return None
+        try:
+            body = response.read(_READ_MAX_BYTES)
+        except (OSError, http.client.HTTPException):
+            # A timeout, reset or truncated transfer mid-body is a failed fetch
+            # too: report "unknown" instead of raising out of the helper.
+            return None
+        finally:
+            response.close()
+        words = word_count_from_html(body)
+        # Only report counts that are large enough to produce a read-time estimate.
+        return words if source_read_minutes(words) is not None else None
+    return None  # too many redirects
+
+
 def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
    """Best-effort (width, height) from an image file's header bytes — PNG, GIF,
    JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
@@ -411,3 +452,42 @@ def enrich_summarized_images(
        if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
            found += 1
    return found
+
+
+def enrich_read_times(
+    conn: sqlite3.Connection, fetch=fetch_source_words, limit: int = 40, retry_days: int = 14
+) -> int:
+    """Store full-article word counts for articles that do not have one yet.
+
+    Picks up to `limit` accepted, non-duplicate articles whose `source_words` is
+    still NULL and that were either never checked or last checked more than
+    `retry_days` ago, newest first. `fetch` is called with each article's
+    canonical URL; an exception from it is treated the same as "no count".
+
+    A truthy count is written together with a fresh `read_checked_at`; otherwise
+    only `read_checked_at` is stamped, so the page is left alone until the retry
+    window passes. Commits once after the loop and returns how many counts were
+    stored.
+    """
+    retry_window = f"-{retry_days} days"
+    candidates = conn.execute(
+        """
+        SELECT a.id, a.canonical_url FROM articles a
+        JOIN article_scores s ON s.article_id = a.id
+        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
+          AND a.source_words IS NULL
+          AND (a.read_checked_at IS NULL OR a.read_checked_at < datetime('now', ?))
+        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
+        LIMIT ?
+        """,
+        (retry_window, limit),
+    ).fetchall()
+
+    stored = 0
+    for article in candidates:
+        try:
+            count = fetch(article["canonical_url"])
+        except Exception:
+            # Best-effort step: a failing fetch just means "no count this time".
+            count = None
+        if not count:
+            # Nothing usable: stamp the attempt only, so we wait out the retry window.
+            conn.execute("UPDATE articles SET read_checked_at = CURRENT_TIMESTAMP WHERE id = ?", (article["id"],))
+            continue
+        # A real count: record it along with the time of the check.
+        conn.execute(
+            "UPDATE articles SET source_words = ?, read_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
+            (count, article["id"]),
+        )
+        stored += 1
+    conn.commit()
+    return stored
@@ -55,6 +55,7 @@ _ARTICLE_COLUMNS = f"""
    s.reason_text,
    s.model_name,
    src.paywall_override AS paywall_override,
+    a.source_words,
    (SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
    {RANK_SCORE_SQL} AS rank_score
 """
@@ -0,0 +1,43 @@
+"""Estimate a SOURCE article's full read time from its fetched HTML.
+
+We never store the publisher's body — only a word COUNT (metadata) — to derive a
+"Full story · ~N min" hint that contrasts with our one-minute gist. That tiny
+detail sells the value: the calm summary now, the deep dive only if you want it.
+
+Extraction is deliberately light (no readability parser yet): drop the obvious
+non-article furniture (scripts, styles, nav, header, footer, forms, buttons,
+asides), strip tags, count words. ~225 wpm offsets the boilerplate that still
+slips through. Below a floor we assume failed/blocked extraction and return None
+so the UI shows NO badge rather than a misleading "1 min".
+"""
+from __future__ import annotations
+
+import re
+
+_WPM = 225                # words-per-minute divisor for the read-time estimate
+_MIN_WORDS = 200          # below this → assume failed/too-thin extraction → no badge
+
+# Blocks whose CONTENT is furniture, removed wholesale before counting.
+_FURNITURE = re.compile(
+    rb"<(script|style|noscript|template|svg|nav|header|footer|form|button|aside|select|option)\b[^>]*>.*?</\1>",
+    re.IGNORECASE | re.DOTALL,
+)
+_TAGS = re.compile(rb"<[^>]+>")
+_WS = re.compile(r"\s+")
+
+
+def word_count_from_html(raw: bytes | None) -> int:
+    """Count whitespace-separated words in raw HTML bytes, furniture removed.
+
+    Furniture elements are dropped together with their content, remaining tags
+    are replaced by spaces, and the rest is decoded as UTF-8 (undecodable bytes
+    replaced) before counting. Empty or missing input counts as 0.
+    """
+    if raw:
+        without_furniture = _FURNITURE.sub(b" ", raw)
+        without_tags = _TAGS.sub(b" ", without_furniture)
+        decoded = without_tags.decode("utf-8", "replace")
+        return len(_WS.sub(" ", decoded).split())
+    return 0
+
+
+def source_read_minutes(words: int | None) -> int | None:
+    """Estimate whole minutes to read the FULL article from its word count.
+
+    Returns None for a missing/zero count or one under `_MIN_WORDS`, so callers
+    can omit the badge instead of showing a wrong number. Otherwise the count is
+    divided by `_WPM` and rounded, never reporting less than 2 minutes.
+    """
+    if words and words >= _MIN_WORDS:
+        return max(2, round(words / _WPM))
+    return None