"""Full-article read-time: word counting strips furniture, threshold/None handling, and the bounded enrich pass is idempotent + never overwrites a good count with zero.""" import pytest from goodnews import readtime from goodnews.db import connect, init_db from goodnews.enrich import enrich_read_times def test_word_count_strips_furniture(): html = (b"" b"" b"
Site Name Sections Newsletter
" b"

" + b"word " * 300 + b"

" b"" b"") n = readtime.word_count_from_html(html) # ~300 article words; nav/header/footer/script/style excluded → only a small overcount assert 300 <= n <= 320 def test_read_minutes_threshold_and_rounding(): assert readtime.source_read_minutes(None) is None assert readtime.source_read_minutes(0) is None assert readtime.source_read_minutes(150) is None # below the 200-word floor assert readtime.source_read_minutes(220) == 2 # clamped to a 2-min minimum assert readtime.source_read_minutes(450) == 2 # 450/225 = 2 assert readtime.source_read_minutes(2250) == 10 def test_word_count_empty_or_none(): assert readtime.word_count_from_html(None) == 0 assert readtime.word_count_from_html(b"") == 0 assert readtime.source_read_minutes(readtime.word_count_from_html(b"")) is None @pytest.fixture def conn(tmp_path): c = connect(str(tmp_path / "t.sqlite3")); init_db(c) c.execute("INSERT INTO sources (id, name, feed_url) VALUES (1, 'S', 'http://s/f')") for i in (1, 2): c.execute("INSERT INTO articles (id, source_id, canonical_url, title, url_hash) VALUES (?,1,?,?,?)", (i, f"https://ex.com/{i}", f"T{i}", f"h{i}")) c.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?, 1)", (i,)) c.commit() yield c c.close() def test_enrich_stores_then_skips_already_counted(conn): calls = [] def fake(url): calls.append(url) return 900 # ~4 min assert enrich_read_times(conn, fetch=fake) == 2 assert conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0] == 900 calls.clear() assert enrich_read_times(conn, fetch=fake) == 0 # both counted → re-fetches nothing assert calls == [] def test_enrich_failure_stamps_but_never_overwrites(conn): enrich_read_times(conn, fetch=lambda u: 600) # both get a good count assert conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0] == 600 # a fresh article whose extraction fails: stamped (so we don't hammer it), left null conn.execute("INSERT INTO articles (id, source_id, canonical_url, title, url_hash) " "VALUES (3, 1, 'https://ex.com/3', 'T3', 'h3')") conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (3, 1)") conn.commit() assert enrich_read_times(conn, fetch=lambda u: None) == 0 row = conn.execute("SELECT source_words, read_checked_at FROM articles WHERE id=3").fetchone() assert row[0] is None and row[1] is not None # the good counts are untouched by a later failing pass assert conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0] == 600 def test_api_exposes_source_read_minutes_only_when_known(): from goodnews.api import Article base = {"id": 1, "title": "T", "canonical_url": "https://ex.com/a", "source_name": "S", "accepted": 1} assert Article.from_row({**base, "source_words": 2000}).source_read_minutes == 9 assert Article.from_row({**base, "source_words": 120}).source_read_minutes is None # too thin assert Article.from_row({**base}).source_read_minutes is None # absent → null