# upbeatBytes/goodnews/readtime.py

"""Estimate a SOURCE article's full read time from its fetched HTML.

We never store the publisher's body — only a word COUNT (metadata) — to derive a
"Full story · ~N min" hint that contrasts with our one-minute gist. That tiny
detail sells the value: the calm summary now, the deep dive only if you want it.

Extraction is deliberately light (no readability parser yet): drop the obvious
non-article furniture (scripts, styles, nav, header, footer, forms, buttons,
asides), strip tags, count words. ~225 wpm offsets the boilerplate that still
slips through. Below a floor we assume failed/blocked extraction and return None
so the UI shows NO badge rather than a misleading "1 min".
"""
from __future__ import annotations

import html
import re

# Assumed reading speed in words per minute: the divisor that turns a word
# count into a minute estimate in source_read_minutes().
_WPM = 225
# Minimum word count accepted as a real article; source_read_minutes() returns
# None for anything smaller.
_MIN_WORDS = 200          # below this → assume failed/too-thin extraction → no badge

# Blocks whose CONTENT is furniture, removed wholesale before counting.
_FURNITURE = re.compile(
    rb"<(script|style|noscript|template|svg|nav|header|footer|form|button|aside|select|option)\b[^>]*>.*?</\1>",
    re.IGNORECASE | re.DOTALL,
)
_TAGS = re.compile(rb"<[^>]+>")
_WS = re.compile(r"\s+")


def word_count_from_html(raw: bytes | None) -> int:
    """Rough article word count from raw HTML bytes, furniture stripped."""
    if not raw:
        return 0
    cleaned = _FURNITURE.sub(b" ", raw)
    text = _TAGS.sub(b" ", cleaned).decode("utf-8", "replace")
    return len(_WS.sub(" ", text).split())


def source_read_minutes(words: int | None) -> int | None:
    """Turn a source word count into a whole-minute "full story" estimate.

    Returns None when ``words`` is None, zero, or below ``_MIN_WORDS``; such a
    count is treated as a failed or too-thin extraction, so callers can omit
    the badge rather than show a wrong number. Otherwise the count is divided
    by ``_WPM``, rounded to the nearest minute, and never reported below 2.
    """
    too_thin = not words or words < _MIN_WORDS
    if too_thin:
        return None
    estimate = round(words / _WPM)
    return max(estimate, 2)