dc23277b38
Replaces the gist-based read-time with the SOURCE article's full read time — the
contrast that sells the gist ("calm 1-min version here; ~10 min for the deep dive").
- goodnews/readtime.py: word_count_from_html (strips script/style/nav/header/
footer/form/button/aside furniture before counting) + source_read_minutes
(~225 wpm, 200-word floor, None when extraction appears to have failed or the text is too thin).
- articles.source_words + read_checked_at columns (count only, never the body;
fits the privacy posture). Idempotent migration.
- enrich.fetch_source_words + enrich_read_times: a bounded, retry-guarded cycle
step (mirrors the image enrichers) that counts words for recent accepted
articles. Only ever writes a real count; never overwrites good with zero. Wired
into the cycle after recent-image enrichment.
- queries: source_words flows through _ARTICLE_COLUMNS; api exposes
source_read_minutes on Article (null when unknown).
- home3: News card shows "Full story · ~N min", hidden entirely when null (no
misleading "1 min").
- Tests: furniture stripping, threshold/rounding, enrich idempotency + no
zero-overwrite, API null handling. 412 backend.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
83 lines
4.0 KiB
Python
83 lines
4.0 KiB
Python
"""Full-article read-time: word counting strips furniture, threshold/None handling,
|
|
and the bounded enrich pass is idempotent + never overwrites a good count with zero."""
|
|
import pytest
|
|
|
|
from goodnews import readtime
|
|
from goodnews.db import connect, init_db
|
|
from goodnews.enrich import enrich_read_times
|
|
|
|
|
|
def test_word_count_strips_furniture():
    """Count words in a page made of 300 article words wrapped in site furniture.

    The page carries style, nav, header, footer and script content around the
    article; the count must land between 300 and 320.
    """
    article_words = 300
    before_article = (
        b"<html><head><style>.x{color:red}</style></head><body>"
        b"<nav>Home About Contact Subscribe Login Search Menu</nav>"
        b"<header>Site Name Sections Newsletter</header>"
    )
    article = b"<article><p>" + b"word " * article_words + b"</p></article>"
    after_article = (
        b"<footer>Copyright cookie consent terms privacy policy</footer>"
        b"<script>var a = 1; trackEverything(); analytics();</script></body></html>"
    )
    page = b"".join((before_article, article, after_article))

    counted = readtime.word_count_from_html(page)

    # ~300 article words; nav/header/footer/script/style excluded → only a small overcount
    assert article_words <= counted <= 320
|
|
|
|
|
|
def test_read_minutes_threshold_and_rounding():
    """Check source_read_minutes for missing, too-thin and ordinary word counts."""
    # Missing, zero, and below-the-200-word-floor counts give no estimate.
    for too_thin in (None, 0, 150):
        assert readtime.source_read_minutes(too_thin) is None

    expected_minutes = {
        220: 2,     # clamped to a 2-min minimum
        450: 2,     # 450/225 = 2
        2250: 10,
    }
    for words, minutes in expected_minutes.items():
        assert readtime.source_read_minutes(words) == minutes
|
|
|
|
|
|
def test_word_count_empty_or_none():
    """None and empty bytes count as zero words; an empty document yields no read time."""
    for blank in (None, b""):
        assert readtime.word_count_from_html(blank) == 0

    skeleton_words = readtime.word_count_from_html(b"<html></html>")
    assert readtime.source_read_minutes(skeleton_words) is None
|
|
|
|
|
|
@pytest.fixture
def conn(tmp_path):
    """Yield a database connection seeded with one source and two accepted articles.

    The database file is created under pytest's ``tmp_path``. Source 1 owns
    articles 1 and 2, and each article has an ``article_scores`` row with
    ``accepted = 1``. The connection is closed when the test finishes.
    """
    c = connect(str(tmp_path / "t.sqlite3"))
    # try/finally rather than plain post-yield teardown: pytest skips the code
    # after ``yield`` when a fixture raises during setup, which would leak the
    # connection if schema creation or seeding failed.
    try:
        init_db(c)
        c.execute("INSERT INTO sources (id, name, feed_url) VALUES (1, 'S', 'http://s/f')")
        for i in (1, 2):
            c.execute(
                "INSERT INTO articles (id, source_id, canonical_url, title, url_hash) VALUES (?,1,?,?,?)",
                (i, f"https://ex.com/{i}", f"T{i}", f"h{i}"),
            )
            c.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?, 1)", (i,))
        c.commit()
        yield c
    finally:
        c.close()
|
|
|
|
|
|
def test_enrich_stores_then_skips_already_counted(conn):
    """First pass reports 2 and stores the count; a second pass reports 0 and fetches nothing."""
    fetched_urls = []

    def counting_fetch(url):
        # Record every URL the enricher asks for so the second pass can be checked.
        fetched_urls.append(url)
        return 900  # ~4 min

    first_pass = enrich_read_times(conn, fetch=counting_fetch)
    assert first_pass == 2
    stored = conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0]
    assert stored == 900

    fetched_urls.clear()
    second_pass = enrich_read_times(conn, fetch=counting_fetch)
    assert second_pass == 0  # both counted → re-fetches nothing
    assert fetched_urls == []
|
|
|
|
|
|
def test_enrich_failure_stamps_but_never_overwrites(conn):
    """A failed fetch stamps read_checked_at, leaves source_words null, and keeps earlier counts."""
    first_article_words = "SELECT source_words FROM articles WHERE id=1"

    def good_fetch(url):
        return 600

    def failing_fetch(url):
        return None

    enrich_read_times(conn, fetch=good_fetch)  # both get a good count
    assert conn.execute(first_article_words).fetchone()[0] == 600

    # a fresh article whose extraction fails: stamped (so we don't hammer it), left null
    conn.execute(
        "INSERT INTO articles (id, source_id, canonical_url, title, url_hash) "
        "VALUES (3, 1, 'https://ex.com/3', 'T3', 'h3')"
    )
    conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (3, 1)")
    conn.commit()

    assert enrich_read_times(conn, fetch=failing_fetch) == 0
    third = conn.execute(
        "SELECT source_words, read_checked_at FROM articles WHERE id=3"
    ).fetchone()
    words, checked_at = third[0], third[1]
    assert words is None and checked_at is not None

    # the good counts are untouched by a later failing pass
    assert conn.execute(first_article_words).fetchone()[0] == 600
|
|
|
|
|
|
def test_api_exposes_source_read_minutes_only_when_known():
    """Article.from_row gives source_read_minutes for a usable count and None otherwise."""
    from goodnews.api import Article

    row = {
        "id": 1,
        "title": "T",
        "canonical_url": "https://ex.com/a",
        "source_name": "S",
        "accepted": 1,
    }

    def minutes_for(**extra):
        # Build an Article from the base row plus any extra columns.
        return Article.from_row({**row, **extra}).source_read_minutes

    assert minutes_for(source_words=2000) == 9
    assert minutes_for(source_words=120) is None  # too thin
    assert minutes_for() is None  # absent → null
|