Read-time: full-article "Full story · ~N min" badge (Option B)
Replaces the gist-based read-time with the SOURCE article's full read time — the
contrast that sells the gist ("calm 1-min version here; ~10 min for the deep dive").
- goodnews/readtime.py: word_count_from_html (strips script/style/nav/header/
footer/form/button/aside furniture before counting) + source_read_minutes
(~225 wpm, 200-word floor, 2-minute minimum; None when extraction appears to have failed or is too thin).
- articles.source_words + read_checked_at columns (count only, never the body;
fits the privacy posture). Idempotent migration.
- enrich.fetch_source_words + enrich_read_times: a bounded, retry-guarded cycle
step (mirrors the image enrichers) that counts words for recent accepted
articles. Only ever writes a real count; never overwrites good with zero. Wired
into the cycle after recent-image enrichment.
- queries: source_words flows through _ARTICLE_COLUMNS; api exposes
source_read_minutes on Article (null when unknown).
- home3: News card shows "Full story · ~N min", hidden entirely when null (no
misleading "1 min").
- Tests: furniture stripping, threshold/rounding, enrich idempotency + no
zero-overwrite, API null handling. 412 backend.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -26,10 +26,10 @@
|
|||||||
// truncation handled by CSS (-webkit-line-clamp:2) — breaks on whole words, fills 2 full lines
|
// truncation handled by CSS (-webkit-line-clamp:2) — breaks on whole words, fills 2 full lines
|
||||||
let headline = $derived(news?.title ?? 'What went right this week: the good news that actually matters');
|
let headline = $derived(news?.title ?? 'What went right this week: the good news that actually matters');
|
||||||
|
|
||||||
// Honest read-time from our own gist (~200 wpm, floor 1). We summarize, so this is
|
// The badge shows how long the FULL source article takes — the contrast that sells
|
||||||
// usually "1 min read" — a feature, not a bug: the good news in about a minute.
|
// the gist ("the calm 1-min version here; ~10 min if you want the deep dive"). Computed
|
||||||
const readMins = (t) => Math.max(1, Math.round((t || '').trim().split(/\s+/).filter(Boolean).length / 200));
|
// server-side from the source word count; hidden entirely when we couldn't measure it.
|
||||||
let readTime = $derived(`${readMins(news?.summary)} min read`);
|
let fullRead = $derived(news?.source_read_minutes ? `Full story · ~${news.source_read_minutes} min` : '');
|
||||||
|
|
||||||
// small-joys shelf: 3 cells shown two at a time, rotated by the reader (no auto-motion)
|
// small-joys shelf: 3 cells shown two at a time, rotated by the reader (no auto-motion)
|
||||||
const JOY_ACCENTS = ['#4f7da8', '#b06a86', '#b06a45'];
|
const JOY_ACCENTS = ['#4f7da8', '#b06a86', '#b06a45'];
|
||||||
@@ -130,7 +130,7 @@
|
|||||||
<p class="summary">{news?.summary || "We read the week so you don't have to doomscroll it. Five quietly hopeful stories, summarised to the gist."}</p>
|
<p class="summary">{news?.summary || "We read the week so you don't have to doomscroll it. Five quietly hopeful stories, summarised to the gist."}</p>
|
||||||
</a>
|
</a>
|
||||||
<div class="news-foot">
|
<div class="news-foot">
|
||||||
<span class="meta">{readTime}</span>
|
{#if fullRead}<span class="meta">{fullRead}</span>{/if}
|
||||||
</div>
|
</div>
|
||||||
<hr class="news-div" />
|
<hr class="news-div" />
|
||||||
<a class="news-more" href="/">Read more good news →</a>
|
<a class="news-more" href="/">Read more good news →</a>
|
||||||
|
|||||||
+3
-1
@@ -36,7 +36,7 @@ from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
|
|||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from . import art, auth, bloom, daily, email_send, feeds, games, oauth_google, onthisday, publishing, queries, quote, share, sources, summarize, wotd
|
from . import art, auth, bloom, daily, email_send, feeds, games, oauth_google, onthisday, publishing, queries, quote, readtime, share, sources, summarize, wotd
|
||||||
from .localtime import local_today
|
from .localtime import local_today
|
||||||
from .markup import reply_html_to_text, sanitize_reply_html
|
from .markup import reply_html_to_text, sanitize_reply_html
|
||||||
from .db import connect
|
from .db import connect
|
||||||
@@ -322,6 +322,7 @@ class Article(BaseModel):
|
|||||||
paywalled: bool = False
|
paywalled: bool = False
|
||||||
tags: list[str] = []
|
tags: list[str] = []
|
||||||
summary: str | None = None # our own cached summary (present on the brief)
|
summary: str | None = None # our own cached summary (present on the brief)
|
||||||
|
source_read_minutes: int | None = None # ~minutes to read the FULL source article (null = unknown)
|
||||||
# Subject geography (present on feed rows; absent/empty on the brief). breadth is
|
# Subject geography (present on feed rows; absent/empty on the brief). breadth is
|
||||||
# locality|regional|national|multinational|global|unknown; places are ISO codes.
|
# locality|regional|national|multinational|global|unknown; places are ISO codes.
|
||||||
geo_breadth: str | None = None
|
geo_breadth: str | None = None
|
||||||
@@ -345,6 +346,7 @@ class Article(BaseModel):
|
|||||||
geo_confidence=row.get("geo_confidence"),
|
geo_confidence=row.get("geo_confidence"),
|
||||||
geo_places=places,
|
geo_places=places,
|
||||||
summary=row.get("summary"),
|
summary=row.get("summary"),
|
||||||
|
source_read_minutes=readtime.source_read_minutes(row.get("source_words")),
|
||||||
id=row["id"],
|
id=row["id"],
|
||||||
title=row["title"],
|
title=row["title"],
|
||||||
description=row.get("description"),
|
description=row.get("description"),
|
||||||
|
|||||||
+10
-1
@@ -14,7 +14,7 @@ from .localtime import local_today
|
|||||||
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
|
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
|
||||||
from .geo import tag_articles as tag_geo
|
from .geo import tag_articles as tag_geo
|
||||||
from . import art, onthisday, quote, wotd
|
from . import art, onthisday, quote, wotd
|
||||||
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
|
from .enrich import enrich_brief_images, enrich_read_times, enrich_recent_images, enrich_summarized_images
|
||||||
from .summarize import generate_summary, get_summary
|
from .summarize import generate_summary, get_summary
|
||||||
from .feeds import (
|
from .feeds import (
|
||||||
fetch_feed,
|
fetch_feed,
|
||||||
@@ -599,6 +599,15 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
|
|||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"recent images: skipped ({exc})")
|
print(f"recent images: skipped ({exc})")
|
||||||
|
|
||||||
|
# Full-article read-times: count words for recent accepted articles so the
|
||||||
|
# front door can show "Full story · ~N min" next to our gist (bounded per cycle).
|
||||||
|
try:
|
||||||
|
reads = enrich_read_times(conn)
|
||||||
|
if reads:
|
||||||
|
print(f"read-times: {reads} counted")
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"read-times: skipped ({exc})")
|
||||||
|
|
||||||
# Pre-warm summaries for today's brief so Today reads as a calm briefing.
|
# Pre-warm summaries for today's brief so Today reads as a calm briefing.
|
||||||
# Idempotent: cached items are skipped, so this only hits the LLM for new ones.
|
# Idempotent: cached items are skipped, so this only hits the LLM for new ones.
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -49,6 +49,8 @@ CREATE TABLE IF NOT EXISTS articles (
|
|||||||
title_hash TEXT,
|
title_hash TEXT,
|
||||||
duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
|
duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
|
||||||
image_checked_at TEXT,
|
image_checked_at TEXT,
|
||||||
|
source_words INTEGER, -- full-article word count (metadata only; never the body)
|
||||||
|
read_checked_at TEXT, -- when we last tried to count words (retry guard)
|
||||||
FOREIGN KEY (source_id) REFERENCES sources(id)
|
FOREIGN KEY (source_id) REFERENCES sources(id)
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -595,6 +597,10 @@ def _migrate(conn: sqlite3.Connection) -> None:
|
|||||||
)
|
)
|
||||||
if "image_checked_at" not in article_cols:
|
if "image_checked_at" not in article_cols:
|
||||||
conn.execute("ALTER TABLE articles ADD COLUMN image_checked_at TEXT")
|
conn.execute("ALTER TABLE articles ADD COLUMN image_checked_at TEXT")
|
||||||
|
if "source_words" not in article_cols: # full-article read-time (count only, no body)
|
||||||
|
conn.execute("ALTER TABLE articles ADD COLUMN source_words INTEGER")
|
||||||
|
if "read_checked_at" not in article_cols:
|
||||||
|
conn.execute("ALTER TABLE articles ADD COLUMN read_checked_at TEXT")
|
||||||
# Created here (not in SCHEMA) so it runs after the column exists on upgrades.
|
# Created here (not in SCHEMA) so it runs after the column exists on upgrades.
|
||||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")
|
||||||
|
|
||||||
|
|||||||
@@ -174,6 +174,47 @@ def fetch_og_image(url: str | None) -> str | None:
|
|||||||
return None # too many redirects
|
return None # too many redirects
|
||||||
|
|
||||||
|
|
||||||
|
# Word counting reads more of the body than image metadata (which only needs <head>).
|
||||||
|
_READ_MAX_BYTES = 900_000
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_source_words(url: str | None) -> int | None:
    """Fetch a page and return its full-article word count, or None.

    Redirects are followed by hand so that every hop is validated before it is
    requested: only http/https URLs whose host passes ``_host_is_public`` are
    fetched, and at most ``MAX_REDIRECTS`` redirects are followed. At most
    ``_READ_MAX_BYTES`` of the body is read; only the count is returned and the
    body is discarded.

    Returns None when the URL is missing or rejected, the request or the body
    read fails, the response is not HTML, the redirect chain is too long, or the
    count is too thin for ``source_read_minutes`` to produce an estimate.
    """
    import http.client

    from .readtime import source_read_minutes, word_count_from_html

    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(MAX_REDIRECTS + 1):
        if not url:
            return None
        parts = urlsplit(url)
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            return None
        request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
        try:
            response = opener.open(request, timeout=TIMEOUT)
        except (urllib.error.URLError, OSError, ValueError):
            return None
        # One finally closes the response on every path (redirect, wrong type, read error).
        try:
            status = getattr(response, "status", 200) or 200
            if status in (301, 302, 303, 307, 308):
                location = response.headers.get("Location")
                if not location:
                    return None
                url = urljoin(url, location)
                continue
            if "html" not in response.headers.get("Content-Type", "").lower():
                return None
            try:
                body = response.read(_READ_MAX_BYTES)
            except (OSError, http.client.HTTPException):
                # Truncated/timed-out body: honour the "None on any failure" contract.
                return None
        finally:
            response.close()
        words = word_count_from_html(body)
        # Too-thin counts are reported as None so callers never store them.
        return words if source_read_minutes(words) is not None else None
    return None  # too many redirects
|
||||||
|
|
||||||
|
|
||||||
def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
|
def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
|
||||||
"""Best-effort (width, height) from an image file's header bytes — PNG, GIF,
|
"""Best-effort (width, height) from an image file's header bytes — PNG, GIF,
|
||||||
JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
|
JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
|
||||||
@@ -411,3 +452,42 @@ def enrich_summarized_images(
|
|||||||
if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
|
if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days):
|
||||||
found += 1
|
found += 1
|
||||||
return found
|
return found
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_read_times(
    conn: sqlite3.Connection, fetch=fetch_source_words, limit: int = 40, retry_days: int = 14
) -> int:
    """Store full-article word counts for recent accepted articles.

    Selects accepted, non-duplicate articles that have no ``source_words`` yet and
    were either never checked or last checked more than ``retry_days`` days ago,
    newest first, and calls ``fetch`` on each article's canonical URL.

    Args:
        conn: Open database connection. Rows are read by column name, so the
            connection must yield mapping-style rows.
        fetch: Callable taking a URL and returning a word count, or a falsy value
            when the page could not be measured. Any exception it raises is
            treated as a failed measurement.
        limit: Maximum number of articles to fetch in this run. Negative values
            are clamped to zero, because SQLite treats a negative LIMIT as
            "no limit", which would make the run unbounded.
        retry_days: Days to wait before re-trying an article whose previous
            attempt produced no count.

    Returns:
        The number of articles that received a real count in this run.
    """
    bounded_limit = max(0, limit)
    rows = conn.execute(
        """
        SELECT a.id, a.canonical_url FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
        AND a.source_words IS NULL
        AND (a.read_checked_at IS NULL OR a.read_checked_at < datetime('now', ?))
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """,
        (f"-{retry_days} days", bounded_limit),
    ).fetchall()
    found = 0
    for row in rows:
        try:
            words = fetch(row["canonical_url"])
        except Exception:  # best-effort: one bad page must not abort the batch
            words = None
        # Only ever write a REAL count; a failed/thin page leaves source_words NULL.
        # Always stamp the check time so such pages aren't re-fetched until retry.
        if words:
            conn.execute(
                "UPDATE articles SET source_words = ?, read_checked_at = CURRENT_TIMESTAMP WHERE id = ?",
                (words, row["id"]),
            )
            found += 1
        else:
            conn.execute("UPDATE articles SET read_checked_at = CURRENT_TIMESTAMP WHERE id = ?", (row["id"],))
    conn.commit()
    return found
|
||||||
|
|||||||
@@ -55,6 +55,7 @@ _ARTICLE_COLUMNS = f"""
|
|||||||
s.reason_text,
|
s.reason_text,
|
||||||
s.model_name,
|
s.model_name,
|
||||||
src.paywall_override AS paywall_override,
|
src.paywall_override AS paywall_override,
|
||||||
|
a.source_words,
|
||||||
(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
|
(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
|
||||||
{RANK_SCORE_SQL} AS rank_score
|
{RANK_SCORE_SQL} AS rank_score
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
"""Estimate a SOURCE article's full read time from its fetched HTML.
|
||||||
|
|
||||||
|
We never store the publisher's body — only a word COUNT (metadata) — to derive a
|
||||||
|
"Full story · ~N min" hint that contrasts with our one-minute gist. That tiny
|
||||||
|
detail sells the value: the calm summary now, the deep dive only if you want it.
|
||||||
|
|
||||||
|
Extraction is deliberately light (no readability parser yet): drop the obvious
|
||||||
|
non-article furniture (scripts, styles, nav, header, footer, forms, buttons,
|
||||||
|
asides), strip tags, count words. ~225 wpm offsets the boilerplate that still
|
||||||
|
slips through. Below a floor we assume failed/blocked extraction and return None
|
||||||
|
so the UI shows NO badge rather than a misleading "1 min".
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
_WPM = 225
_MIN_WORDS = 200  # below this → assume failed/too-thin extraction → no badge

# HTML comments go first so commented-out markup (which may contain ">") is never counted.
_COMMENTS = re.compile(rb"<!--.*?-->", re.DOTALL)
# Blocks whose CONTENT is furniture, removed wholesale before counting. The tag name
# must be followed by whitespace, "/" or ">" so custom elements such as <nav-bar> or
# <header-x> are not mistaken for <nav>/<header> (which would delete everything up to
# the next real closing tag). Closing tags may carry trailing whitespace.
_FURNITURE = re.compile(
    rb"<(script|style|noscript|template|svg|nav|header|footer|form|button|aside|select|option)"
    rb"(?=[\s/>])[^>]*>.*?</\1\s*>",
    re.IGNORECASE | re.DOTALL,
)
_TAGS = re.compile(rb"<[^>]+>")
_WS = re.compile(r"\s+")  # no longer needed for counting; kept so the module's names stay stable


def word_count_from_html(raw: bytes | None) -> int:
    """Rough article word count from raw HTML bytes, furniture stripped.

    Steps: drop HTML comments, drop furniture blocks together with their content,
    replace every remaining tag with a space, decode as UTF-8 (undecodable bytes
    are replaced), resolve character references so that ``&nbsp;`` separates
    words, then count whitespace-separated tokens.

    Returns 0 for ``None`` or empty input.
    """
    if not raw:
        return 0
    from html import unescape

    without_comments = _COMMENTS.sub(b" ", raw)
    without_furniture = _FURNITURE.sub(b" ", without_comments)
    text = _TAGS.sub(b" ", without_furniture).decode("utf-8", "replace")
    # str.split() with no argument already collapses any run of whitespace.
    return len(unescape(text).split())


def source_read_minutes(words: int | None) -> int | None:
    """Whole-minute estimate for the FULL article at ``_WPM`` words per minute.

    Returns None when ``words`` is missing, zero, or below ``_MIN_WORDS`` (the
    count looks failed/too thin), so callers omit the badge instead of showing a
    wrong number. Otherwise the estimate is rounded and never less than 2.
    """
    if not words or words < _MIN_WORDS:
        return None
    return max(2, round(words / _WPM))
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
"""Full-article read-time: word counting strips furniture, threshold/None handling,
|
||||||
|
and the bounded enrich pass is idempotent + never overwrites a good count with zero."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from goodnews import readtime
|
||||||
|
from goodnews.db import connect, init_db
|
||||||
|
from goodnews.enrich import enrich_read_times
|
||||||
|
|
||||||
|
|
||||||
|
def test_word_count_strips_furniture():
    """Only the article body should be counted; page furniture adds (almost) nothing."""
    article_body = b"<article><p>" + b"word " * 300 + b"</p></article>"
    page = b"".join(
        [
            b"<html><head><style>.x{color:red}</style></head><body>",
            b"<nav>Home About Contact Subscribe Login Search Menu</nav>",
            b"<header>Site Name Sections Newsletter</header>",
            article_body,
            b"<footer>Copyright cookie consent terms privacy policy</footer>",
            b"<script>var a = 1; trackEverything(); analytics();</script></body></html>",
        ]
    )
    counted = readtime.word_count_from_html(page)
    # ~300 article words; nav/header/footer/script/style excluded → only a small overcount
    assert 300 <= counted <= 320
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_minutes_threshold_and_rounding():
    """Counts under the floor give None; real counts round to minutes, never under 2."""
    expectations = [
        (None, None),
        (0, None),
        (150, None),  # below the 200-word floor
        (220, 2),  # clamped to a 2-min minimum
        (450, 2),  # 450/225 = 2
        (2250, 10),
    ]
    for words, expected in expectations:
        minutes = readtime.source_read_minutes(words)
        if expected is None:
            assert minutes is None
        else:
            assert minutes == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_word_count_empty_or_none():
    """Missing or empty input counts as zero words; a bare shell page yields no estimate."""
    for empty in (None, b""):
        assert readtime.word_count_from_html(empty) == 0
    shell_words = readtime.word_count_from_html(b"<html></html>")
    assert readtime.source_read_minutes(shell_words) is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def conn(tmp_path):
    """Fresh database holding one source and two accepted articles (ids 1 and 2)."""
    db = connect(str(tmp_path / "t.sqlite3"))
    init_db(db)
    db.execute("INSERT INTO sources (id, name, feed_url) VALUES (1, 'S', 'http://s/f')")
    for article_id in (1, 2):
        article = (article_id, f"https://ex.com/{article_id}", f"T{article_id}", f"h{article_id}")
        db.execute(
            "INSERT INTO articles (id, source_id, canonical_url, title, url_hash) VALUES (?,1,?,?,?)",
            article,
        )
        db.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?, 1)", (article_id,))
    db.commit()
    yield db
    db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_stores_then_skips_already_counted(conn):
    """The first pass stores a count for both articles; a second pass fetches nothing."""
    fetched_urls = []

    def fake(url):
        fetched_urls.append(url)
        return 900  # ~4 min

    first_pass = enrich_read_times(conn, fetch=fake)
    assert first_pass == 2
    stored = conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0]
    assert stored == 900

    fetched_urls.clear()
    second_pass = enrich_read_times(conn, fetch=fake)
    assert second_pass == 0  # both counted → re-fetches nothing
    assert fetched_urls == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_failure_stamps_but_never_overwrites(conn):
    """A failed fetch stamps the check time, leaves the count NULL, and spares good counts."""

    def words_for_article_1():
        return conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0]

    enrich_read_times(conn, fetch=lambda u: 600)  # both get a good count
    assert words_for_article_1() == 600

    # a fresh article whose extraction fails: stamped (so we don't hammer it), left null
    conn.execute(
        "INSERT INTO articles (id, source_id, canonical_url, title, url_hash) "
        "VALUES (3, 1, 'https://ex.com/3', 'T3', 'h3')"
    )
    conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (3, 1)")
    conn.commit()

    assert enrich_read_times(conn, fetch=lambda u: None) == 0
    words, checked_at = conn.execute(
        "SELECT source_words, read_checked_at FROM articles WHERE id=3"
    ).fetchone()
    assert words is None and checked_at is not None

    # the good counts are untouched by a later failing pass
    assert words_for_article_1() == 600
|
||||||
|
|
||||||
|
|
||||||
|
def test_api_exposes_source_read_minutes_only_when_known():
    """The API model reports minutes for a real count and null for a thin or absent one."""
    from goodnews.api import Article

    base = {"id": 1, "title": "T", "canonical_url": "https://ex.com/a", "source_name": "S", "accepted": 1}

    def minutes_for(**extra):
        return Article.from_row({**base, **extra}).source_read_minutes

    assert minutes_for(source_words=2000) == 9
    assert minutes_for(source_words=120) is None  # too thin
    assert minutes_for() is None  # absent → null
|
||||||
Reference in New Issue
Block a user