diff --git a/frontend/src/routes/home3/+page.svelte b/frontend/src/routes/home3/+page.svelte index a41bb74..5850bcd 100644 --- a/frontend/src/routes/home3/+page.svelte +++ b/frontend/src/routes/home3/+page.svelte @@ -26,10 +26,10 @@ // truncation handled by CSS (-webkit-line-clamp:2) — breaks on whole words, fills 2 full lines let headline = $derived(news?.title ?? 'What went right this week: the good news that actually matters'); - // Honest read-time from our own gist (~200 wpm, floor 1). We summarize, so this is - // usually "1 min read" — a feature, not a bug: the good news in about a minute. - const readMins = (t) => Math.max(1, Math.round((t || '').trim().split(/\s+/).filter(Boolean).length / 200)); - let readTime = $derived(`${readMins(news?.summary)} min read`); + // The badge shows how long the FULL source article takes — the contrast that sells + // the gist ("the calm 1-min version here; ~10 min if you want the deep dive"). Computed + // server-side from the source word count; hidden entirely when we couldn't measure it. + let fullRead = $derived(news?.source_read_minutes ? `Full story · ~${news.source_read_minutes} min` : ''); // small-joys shelf: 3 cells shown two at a time, rotated by the reader (no auto-motion) const JOY_ACCENTS = ['#4f7da8', '#b06a86', '#b06a45']; @@ -130,7 +130,7 @@

{news?.summary || "We read the week so you don't have to doomscroll it. Five quietly hopeful stories, summarised to the gist."}

- {readTime} + {#if fullRead}{fullRead}{/if}

Read more good news → diff --git a/goodnews/api.py b/goodnews/api.py index 7833efb..b0fc1a9 100644 --- a/goodnews/api.py +++ b/goodnews/api.py @@ -36,7 +36,7 @@ from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel -from . import art, auth, bloom, daily, email_send, feeds, games, oauth_google, onthisday, publishing, queries, quote, share, sources, summarize, wotd +from . import art, auth, bloom, daily, email_send, feeds, games, oauth_google, onthisday, publishing, queries, quote, readtime, share, sources, summarize, wotd from .localtime import local_today from .markup import reply_html_to_text, sanitize_reply_html from .db import connect @@ -322,6 +322,7 @@ class Article(BaseModel): paywalled: bool = False tags: list[str] = [] summary: str | None = None # our own cached summary (present on the brief) + source_read_minutes: int | None = None # ~minutes to read the FULL source article (null = unknown) # Subject geography (present on feed rows; absent/empty on the brief). breadth is # locality|regional|national|multinational|global|unknown; places are ISO codes. geo_breadth: str | None = None @@ -345,6 +346,7 @@ class Article(BaseModel): geo_confidence=row.get("geo_confidence"), geo_places=places, summary=row.get("summary"), + source_read_minutes=readtime.source_read_minutes(row.get("source_words")), id=row["id"], title=row["title"], description=row.get("description"), diff --git a/goodnews/cli.py b/goodnews/cli.py index 6df997e..41c8972 100644 --- a/goodnews/cli.py +++ b/goodnews/cli.py @@ -14,7 +14,7 @@ from .localtime import local_today from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup from .geo import tag_articles as tag_geo from . 
import art, onthisday, quote, wotd -from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images +from .enrich import enrich_brief_images, enrich_read_times, enrich_recent_images, enrich_summarized_images from .summarize import generate_summary, get_summary from .feeds import ( fetch_feed, @@ -599,6 +599,15 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non except Exception as exc: print(f"recent images: skipped ({exc})") + # Full-article read-times: count words for recent accepted articles so the + # front door can show "Full story · ~N min" next to our gist (bounded per cycle). + try: + reads = enrich_read_times(conn) + if reads: + print(f"read-times: {reads} counted") + except Exception as exc: + print(f"read-times: skipped ({exc})") + # Pre-warm summaries for today's brief so Today reads as a calm briefing. # Idempotent: cached items are skipped, so this only hits the LLM for new ones. try: diff --git a/goodnews/db.py b/goodnews/db.py index ff1e6c7..121cea2 100644 --- a/goodnews/db.py +++ b/goodnews/db.py @@ -49,6 +49,8 @@ CREATE TABLE IF NOT EXISTS articles ( title_hash TEXT, duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL, image_checked_at TEXT, + source_words INTEGER, -- full-article word count (metadata only; never the body) + read_checked_at TEXT, -- when we last tried to count words (retry guard) FOREIGN KEY (source_id) REFERENCES sources(id) ); @@ -595,6 +597,10 @@ def _migrate(conn: sqlite3.Connection) -> None: ) if "image_checked_at" not in article_cols: conn.execute("ALTER TABLE articles ADD COLUMN image_checked_at TEXT") + if "source_words" not in article_cols: # full-article read-time (count only, no body) + conn.execute("ALTER TABLE articles ADD COLUMN source_words INTEGER") + if "read_checked_at" not in article_cols: + conn.execute("ALTER TABLE articles ADD COLUMN read_checked_at TEXT") # Created here (not in SCHEMA) so it runs after the column exists on upgrades. 
conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)") diff --git a/goodnews/enrich.py b/goodnews/enrich.py index 0cd7a66..5918d31 100644 --- a/goodnews/enrich.py +++ b/goodnews/enrich.py @@ -174,6 +174,47 @@ def fetch_og_image(url: str | None) -> str | None: return None # too many redirects +# Word counting reads more of the body than image metadata (which only needs <head>). +_READ_MAX_BYTES = 900_000 + + +def fetch_source_words(url: str | None) -> int | None: + """Fetch a page and return its full-article word count (furniture stripped), or + None on any failure or a too-thin extraction (JS/video/paywall pages). Same SSRF + safety as fetch_og_image; we read the count only, never store the body.""" + from .readtime import source_read_minutes, word_count_from_html + opener = urllib.request.build_opener(_NoRedirect) + for _ in range(MAX_REDIRECTS + 1): + if not url: + return None + parts = urlsplit(url) + if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname): + return None + request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"}) + try: + response = opener.open(request, timeout=TIMEOUT) + except (urllib.error.URLError, OSError, ValueError): + return None + status = getattr(response, "status", 200) or 200 + if status in (301, 302, 303, 307, 308): + location = response.headers.get("Location") + response.close() + if not location: + return None + url = urljoin(url, location) + continue + if "html" not in response.headers.get("Content-Type", "").lower(): + response.close() + return None + try: + body = response.read(_READ_MAX_BYTES) + finally: + response.close() + words = word_count_from_html(body) + return words if source_read_minutes(words) is not None else None + return None # too many redirects + + def _image_dimensions(data: bytes) -> "tuple[int, int] | None": """Best-effort (width, height) from an image file's header bytes — PNG, GIF, JPEG, WebP. 
Returns None for formats we can't cheaply measure (e.g. SVG).""" @@ -411,3 +452,42 @@ def enrich_summarized_images( if enrich_article_image(conn, row["id"], fetch=fetch, retry_days=retry_days): found += 1 return found + + +def enrich_read_times( + conn: sqlite3.Connection, fetch=fetch_source_words, limit: int = 40, retry_days: int = 14 +) -> int: + """Give recent accepted articles a full-article word count, so the front door can + show "Full story · ~N min" next to our one-minute gist. Bounded per run (mirrors + the image enrichers); fetches each article once, retrying a failed/too-thin + extraction only after `retry_days`. Returns how many real counts were stored.""" + rows = conn.execute( + """ + SELECT a.id, a.canonical_url FROM articles a + JOIN article_scores s ON s.article_id = a.id + WHERE s.accepted = 1 AND a.duplicate_of IS NULL + AND a.source_words IS NULL + AND (a.read_checked_at IS NULL OR a.read_checked_at < datetime('now', ?)) + ORDER BY COALESCE(a.published_at, a.discovered_at) DESC + LIMIT ? + """, + (f"-{retry_days} days", limit), + ).fetchall() + found = 0 + for row in rows: + try: + words = fetch(row["canonical_url"]) + except Exception: + words = None + # Only ever write a REAL count; never overwrite a good value with null/zero. + # Always stamp the check time so failed/thin pages aren't re-fetched until retry. 
+ if words: + conn.execute( + "UPDATE articles SET source_words = ?, read_checked_at = CURRENT_TIMESTAMP WHERE id = ?", + (words, row["id"]), + ) + found += 1 + else: + conn.execute("UPDATE articles SET read_checked_at = CURRENT_TIMESTAMP WHERE id = ?", (row["id"],)) + conn.commit() + return found diff --git a/goodnews/queries.py b/goodnews/queries.py index 07650ba..4664b5c 100644 --- a/goodnews/queries.py +++ b/goodnews/queries.py @@ -55,6 +55,7 @@ _ARTICLE_COLUMNS = f""" s.reason_text, s.model_name, src.paywall_override AS paywall_override, + a.source_words, (SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags, {RANK_SCORE_SQL} AS rank_score """ diff --git a/goodnews/readtime.py b/goodnews/readtime.py new file mode 100644 index 0000000..157e0b2 --- /dev/null +++ b/goodnews/readtime.py @@ -0,0 +1,43 @@ +"""Estimate a SOURCE article's full read time from its fetched HTML. + +We never store the publisher's body — only a word COUNT (metadata) — to derive a +"Full story · ~N min" hint that contrasts with our one-minute gist. That tiny +detail sells the value: the calm summary now, the deep dive only if you want it. + +Extraction is deliberately light (no readability parser yet): drop the obvious +non-article furniture (scripts, styles, nav, header, footer, forms, buttons, +asides), strip tags, count words. ~225 wpm offsets the boilerplate that still +slips through. Below a floor we assume failed/blocked extraction and return None +so the UI shows NO badge rather than a misleading "1 min". +""" +from __future__ import annotations + +import re + +_WPM = 225 +_MIN_WORDS = 200 # below this → assume failed/too-thin extraction → no badge + +# Blocks whose CONTENT is furniture, removed wholesale before counting. 
+_FURNITURE = re.compile( + rb"<(script|style|noscript|template|svg|nav|header|footer|form|button|aside|select|option)\b[^>]*>.*?</\1\s*>", # lazy: up to the element's own closing tag + re.IGNORECASE | re.DOTALL, +) +_TAGS = re.compile(rb"<[^>]+>") +_WS = re.compile(r"\s+") + + +def word_count_from_html(raw: bytes | None) -> int: + """Rough article word count from raw HTML bytes, furniture stripped.""" + if not raw: + return 0 + cleaned = _FURNITURE.sub(b" ", raw) + text = _TAGS.sub(b" ", cleaned).decode("utf-8", "replace") + return len(_WS.sub(" ", text).split()) + + +def source_read_minutes(words: int | None) -> int | None: + """Whole-minute estimate for the FULL article, or None when the count looks + failed/too thin (so callers omit the badge instead of showing a wrong number).""" + if not words or words < _MIN_WORDS: + return None + return max(2, round(words / _WPM)) diff --git a/tests/test_readtime.py b/tests/test_readtime.py new file mode 100644 index 0000000..f955ba4 --- /dev/null +++ b/tests/test_readtime.py @@ -0,0 +1,82 @@ +"""Full-article read-time: word counting strips furniture, threshold/None handling, +and the bounded enrich pass is idempotent + never overwrites a good count with zero.""" +import pytest + +from goodnews import readtime +from goodnews.db import connect, init_db +from goodnews.enrich import enrich_read_times + + +def test_word_count_strips_furniture(): + html = (b"" + b"" + b"
Site Name Sections Newsletter
" + b"

" + b"word " * 300 + b"

" + b"" + b"") + n = readtime.word_count_from_html(html) + # ~300 article words; nav/header/footer/script/style excluded → only a small overcount + assert 300 <= n <= 320 + + +def test_read_minutes_threshold_and_rounding(): + assert readtime.source_read_minutes(None) is None + assert readtime.source_read_minutes(0) is None + assert readtime.source_read_minutes(150) is None # below the 200-word floor + assert readtime.source_read_minutes(220) == 2 # clamped to a 2-min minimum + assert readtime.source_read_minutes(450) == 2 # 450/225 = 2 + assert readtime.source_read_minutes(2250) == 10 + + +def test_word_count_empty_or_none(): + assert readtime.word_count_from_html(None) == 0 + assert readtime.word_count_from_html(b"") == 0 + assert readtime.source_read_minutes(readtime.word_count_from_html(b"")) is None + + +@pytest.fixture +def conn(tmp_path): + c = connect(str(tmp_path / "t.sqlite3")); init_db(c) + c.execute("INSERT INTO sources (id, name, feed_url) VALUES (1, 'S', 'http://s/f')") + for i in (1, 2): + c.execute("INSERT INTO articles (id, source_id, canonical_url, title, url_hash) VALUES (?,1,?,?,?)", + (i, f"https://ex.com/{i}", f"T{i}", f"h{i}")) + c.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?, 1)", (i,)) + c.commit() + yield c + c.close() + + +def test_enrich_stores_then_skips_already_counted(conn): + calls = [] + def fake(url): + calls.append(url) + return 900 # ~4 min + assert enrich_read_times(conn, fetch=fake) == 2 + assert conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0] == 900 + calls.clear() + assert enrich_read_times(conn, fetch=fake) == 0 # both counted → re-fetches nothing + assert calls == [] + + +def test_enrich_failure_stamps_but_never_overwrites(conn): + enrich_read_times(conn, fetch=lambda u: 600) # both get a good count + assert conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0] == 600 + # a fresh article whose extraction fails: stamped (so we don't hammer it), left null 
+ conn.execute("INSERT INTO articles (id, source_id, canonical_url, title, url_hash) " + "VALUES (3, 1, 'https://ex.com/3', 'T3', 'h3')") + conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (3, 1)") + conn.commit() + assert enrich_read_times(conn, fetch=lambda u: None) == 0 + row = conn.execute("SELECT source_words, read_checked_at FROM articles WHERE id=3").fetchone() + assert row[0] is None and row[1] is not None + # the good counts are untouched by a later failing pass + assert conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0] == 600 + + +def test_api_exposes_source_read_minutes_only_when_known(): + from goodnews.api import Article + base = {"id": 1, "title": "T", "canonical_url": "https://ex.com/a", "source_name": "S", "accepted": 1} + assert Article.from_row({**base, "source_words": 2000}).source_read_minutes == 9 + assert Article.from_row({**base, "source_words": 120}).source_read_minutes is None # too thin + assert Article.from_row({**base}).source_read_minutes is None # absent → null