# upbeatBytes/tests/test_readtime.py

"""Full-article read-time: word counting strips furniture, threshold/None handling,
and the bounded enrich pass is idempotent + never overwrites a good count with zero."""
import pytest

from goodnews import readtime
from goodnews.db import connect, init_db
from goodnews.enrich import enrich_read_times


def test_word_count_strips_furniture():
    """The word count should come (almost) entirely from the article body.

    The page wraps 300 article words in style/nav/header/footer/script
    markup; the count is expected to land between 300 and 320.
    """
    article_words = b"word " * 300
    page = b"".join([
        b"<html><head><style>.x{color:red}</style></head><body>",
        b"<nav>Home About Contact Subscribe Login Search Menu</nav>",
        b"<header>Site Name Sections Newsletter</header>",
        b"<article><p>", article_words, b"</p></article>",
        b"<footer>Copyright cookie consent terms privacy policy</footer>",
        b"<script>var a = 1; trackEverything(); analytics();</script></body></html>",
    ])
    counted = readtime.word_count_from_html(page)
    # Furniture (nav/header/footer/script/style) must not inflate the total;
    # a small overcount above the 300 article words is tolerated.
    assert 300 <= counted <= 320


def test_read_minutes_threshold_and_rounding():
    """Check when ``source_read_minutes`` yields None and what it rounds to."""
    # No estimate: missing count, zero, or below the 200-word floor (150).
    for too_few in (None, 0, 150):
        assert readtime.source_read_minutes(too_few) is None

    expected_minutes = (
        (220, 2),      # clamped to a 2-min minimum
        (450, 2),      # 450/225 = 2
        (2250, 10),
    )
    for words, minutes in expected_minutes:
        assert readtime.source_read_minutes(words) == minutes


def test_word_count_empty_or_none():
    """Missing or empty HTML counts as zero words; a bare page has no read time."""
    for blank in (None, b""):
        assert readtime.word_count_from_html(blank) == 0

    bare_page_words = readtime.word_count_from_html(b"<html></html>")
    assert readtime.source_read_minutes(bare_page_words) is None


@pytest.fixture
def conn(tmp_path):
    """Yield a seeded database connection backed by a file under ``tmp_path``.

    Seeds one source (id 1) and two articles (ids 1 and 2), each with an
    ``article_scores`` row marked accepted. The connection is closed on
    teardown, and also when schema creation or seeding raises, so a broken
    setup does not leave an open handle behind.
    """
    c = connect(str(tmp_path / "t.sqlite3"))
    try:
        init_db(c)
        c.execute("INSERT INTO sources (id, name, feed_url) VALUES (1, 'S', 'http://s/f')")
        for i in (1, 2):
            c.execute(
                "INSERT INTO articles (id, source_id, canonical_url, title, url_hash) VALUES (?,1,?,?,?)",
                (i, f"https://ex.com/{i}", f"T{i}", f"h{i}"),
            )
            c.execute("INSERT INTO article_scores (article_id, accepted) VALUES (?, 1)", (i,))
        c.commit()
        yield c
    finally:
        # Runs after the test finishes and also if setup fails before the yield.
        c.close()


def test_enrich_stores_then_skips_already_counted(conn):
    """A first pass stores counts for both articles; a second pass fetches nothing."""
    fetched_urls = []

    def fake(url):
        # Record every fetch so the second pass can be shown to make none.
        fetched_urls.append(url)
        return 900  # ~4 min

    first_pass = enrich_read_times(conn, fetch=fake)
    assert first_pass == 2
    stored = conn.execute("SELECT source_words FROM articles WHERE id=1").fetchone()[0]
    assert stored == 900

    fetched_urls.clear()
    second_pass = enrich_read_times(conn, fetch=fake)
    assert second_pass == 0  # both counted → re-fetches nothing
    assert fetched_urls == []


def test_enrich_failure_stamps_but_never_overwrites(conn):
    """A failed fetch stamps the article but leaves its count null and others intact."""
    first_article_words = "SELECT source_words FROM articles WHERE id=1"

    # Both seeded articles get a good count.
    enrich_read_times(conn, fetch=lambda u: 600)
    assert conn.execute(first_article_words).fetchone()[0] == 600

    # Add a fresh article whose extraction will fail.
    conn.execute(
        "INSERT INTO articles (id, source_id, canonical_url, title, url_hash) "
        "VALUES (3, 1, 'https://ex.com/3', 'T3', 'h3')"
    )
    conn.execute("INSERT INTO article_scores (article_id, accepted) VALUES (3, 1)")
    conn.commit()

    stored_on_failure = enrich_read_times(conn, fetch=lambda u: None)
    assert stored_on_failure == 0

    # Stamped (so we don't hammer it), but the word count stays null.
    row = conn.execute("SELECT source_words, read_checked_at FROM articles WHERE id=3").fetchone()
    assert row[0] is None
    assert row[1] is not None

    # The good counts are untouched by the later failing pass.
    assert conn.execute(first_article_words).fetchone()[0] == 600


def test_api_exposes_source_read_minutes_only_when_known():
    """``Article.source_read_minutes`` is set only for a usable word count."""
    from goodnews.api import Article

    base = {
        "id": 1,
        "title": "T",
        "canonical_url": "https://ex.com/a",
        "source_name": "S",
        "accepted": 1,
    }

    def minutes_for(**extra):
        # Build a row from the base fields plus any overrides.
        return Article.from_row(dict(base, **extra)).source_read_minutes

    assert minutes_for(source_words=2000) == 9
    assert minutes_for(source_words=120) is None  # too thin
    assert minutes_for() is None                  # absent → null