# upbeatBytes/goodnews/filters.py

"""Calm Filters — the canonical preference model and pure matching engine.

Everything (localStorage today, query params on the API, a user_preferences row
later) speaks this one shape, so the surfaces never drift. The functions here are
deliberately pure and side-effect-free so they are easy to test and reuse from
both the API and the CLI.

The humane surface ("Not today" / "Less like this" / "Always hide this") maps onto
this machinery: a pause is a topic/flavor muted *until* a timestamp; a mute is a
standing exclusion; avoid-terms drop anything mentioning a phrase the reader would
rather not see.
"""

from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass, field
from datetime import datetime, timezone

# Split on any run of non-alphanumerics so matching is punctuation- and
# case-insensitive, and anchored to whole words/phrases (no substring surprises:
# "pan" must not match "pandemic", and "stock market" matches as a phrase).
# The class is Unicode-aware ([\W_] rather than an ASCII-only [^a-z0-9]) so
# accented and non-Latin letters stay inside their word: "sum" must not match
# "résumé", and a Cyrillic avoid-term must not normalize to an empty string.
_NONWORD = re.compile(r"[\W_]+")


def _normalize(text: str) -> str:
    """Fold text into a canonical, space-delimited form for whole-word matching.

    The text is NFKC-normalized (so composed and decomposed accents compare
    equal), case-folded, and every run of non-alphanumeric characters is
    collapsed to a single space. The result is padded with one space on each
    side so a caller can look for " word " without special-casing the ends.
    """
    folded = unicodedata.normalize("NFKC", text).casefold()
    return " " + _NONWORD.sub(" ", folded).strip() + " "


def text_matches_avoid_terms(text: str | None, terms: list[str]) -> bool:
    """Return True if *text* mentions any avoid term as a whole word or phrase.

    Matching ignores case and punctuation on both sides. A term only matches
    when it lines up with word boundaries in the text, so a short term never
    matches inside a longer word.

    Args:
        text: The text to scan; ``None`` or an empty string never matches.
        terms: Words or phrases the reader wants to avoid. Terms that are not
            strings, or that contain no alphanumeric characters, are skipped.

    Returns:
        True if at least one term occurs in the text, otherwise False.
    """
    if not text or not terms:
        return False
    haystack = _normalize(text)
    for term in terms:
        if not isinstance(term, str):
            # Tolerate malformed entries (e.g. null/number from stored JSON).
            continue
        # _normalize already pads with spaces, giving the " term " needle.
        needle = _normalize(term)
        # A term with no alphanumerics normalizes to bare padding; skip it,
        # otherwise it would match any text containing a run of two spaces.
        if needle.strip() and needle in haystack:
            return True
    return False


@dataclass
class Pause:
    """A temporary mute of one topic or flavor that lapses at a set time."""

    kind: str  # "topic" or "flavor"
    value: str  # the topic or flavor name being paused
    until: str  # ISO 8601 UTC timestamp

    def active(self, now: datetime) -> bool:
        """Return True while the pause has not yet expired.

        Args:
            now: The current time. A naive datetime is interpreted as UTC.

        Returns:
            True if ``until`` parses and lies strictly after ``now``. An
            ``until`` that is missing, not a string, or not valid ISO 8601 is
            treated as already expired, so a corrupt pause never hides content.
        """
        try:
            # fromisoformat() only accepts a trailing "Z" on newer Pythons, so
            # spell the UTC designator as an explicit offset first.
            until = datetime.fromisoformat(self.until.replace("Z", "+00:00"))
        except (ValueError, AttributeError, TypeError):
            return False
        # Ordering a naive datetime against an aware one raises TypeError.
        # Timestamps are documented as UTC, so read naive values as UTC on
        # both sides instead of crashing on mixed input.
        if until.utcoffset() is None:
            until = until.replace(tzinfo=timezone.utc)
        if now.utcoffset() is None:
            now = now.replace(tzinfo=timezone.utc)
        return until > now


@dataclass
class FilterPrefs:
    """The canonical preference shape shared by every surface.

    All fields default to empty lists, and an instance with every field empty
    filters nothing (see ``is_empty``).
    """

    # Allow-lists: when non-empty, `allows` rejects articles not listed here.
    include_topics: list[str] = field(default_factory=list)
    include_flavors: list[str] = field(default_factory=list)
    # Standing exclusions.
    mute_topics: list[str] = field(default_factory=list)
    mute_flavors: list[str] = field(default_factory=list)
    # Words/phrases whose mention drops an article.
    avoid_terms: list[str] = field(default_factory=list)
    # Time-limited mutes; each contributes only while `Pause.active` is true.
    pauses: list[Pause] = field(default_factory=list)

    @staticmethod
    def _as_list(value: object) -> list:
        """Copy a raw field value into a fresh list.

        Falsy values become ``[]``. A bare string becomes a one-element list
        rather than being exploded into characters, which is what ``list()``
        would do to a single un-wrapped value such as one query parameter.
        """
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @classmethod
    def from_dict(cls, data: dict | None) -> "FilterPrefs":
        """Build preferences from a plain dict, tolerating missing pieces.

        Args:
            data: Mapping with any of the field names as keys, or ``None``.
                Missing or null fields become empty lists.

        Returns:
            A new ``FilterPrefs`` that shares no list objects with ``data``.
            Pause entries that are not dicts, or that lack ``kind``, ``value``
            or ``until``, are skipped; unknown keys on a pause are ignored, so
            one malformed or newer-format entry cannot invalidate the rest.
        """
        data = data or {}
        pauses = []
        for entry in data.get("pauses") or []:
            if not isinstance(entry, dict):
                continue
            try:
                kind, value, until = entry["kind"], entry["value"], entry["until"]
            except KeyError:
                continue
            pauses.append(Pause(kind=kind, value=value, until=until))
        return cls(
            include_topics=cls._as_list(data.get("include_topics")),
            include_flavors=cls._as_list(data.get("include_flavors")),
            mute_topics=cls._as_list(data.get("mute_topics")),
            mute_flavors=cls._as_list(data.get("mute_flavors")),
            avoid_terms=cls._as_list(data.get("avoid_terms")),
            pauses=pauses,
        )

    def _muted(self, kind: str, standing: list[str], now: datetime) -> set[str]:
        """Union the standing mutes with values of active pauses of *kind*."""
        muted = set(standing)
        muted.update(p.value for p in self.pauses if p.kind == kind and p.active(now))
        return muted

    def muted_topics(self, now: datetime) -> set[str]:
        """Standing mutes plus any topic currently paused."""
        return self._muted("topic", self.mute_topics, now)

    def muted_flavors(self, now: datetime) -> set[str]:
        """Standing mutes plus any flavor currently paused."""
        return self._muted("flavor", self.mute_flavors, now)

    def is_empty(self) -> bool:
        """Return True when no field holds any entry.

        Pauses count as entries whether or not they are still active.
        """
        return not (
            self.include_topics
            or self.include_flavors
            or self.mute_topics
            or self.mute_flavors
            or self.avoid_terms
            or self.pauses
        )


def allows(article: dict, prefs: FilterPrefs, now: datetime) -> bool:
    """Decide whether one article row passes the reader's preferences.

    Checks run in a fixed order and the first failing one rejects the article:
    topic allow-list, flavor allow-list, muted/paused topics, muted/paused
    flavors, and finally avoid-terms against the title and description.

    Args:
        article: A feed/brief row dict; the keys read are ``topic``,
            ``flavor``, ``title`` and ``description`` (all optional).
        prefs: The preferences to apply.
        now: The moment used to decide which pauses are still active.

    Returns:
        True if the article survives every check, otherwise False.
    """
    article_topic = article.get("topic")
    article_flavor = article.get("flavor")

    # An empty allow-list means "no restriction", not "allow nothing".
    wanted_topics = prefs.include_topics
    if wanted_topics and article_topic not in wanted_topics:
        return False
    wanted_flavors = prefs.include_flavors
    if wanted_flavors and article_flavor not in wanted_flavors:
        return False

    # Standing mutes and active pauses; the flavor set is only built when the
    # topic check did not already reject the article.
    if article_topic in prefs.muted_topics(now) or article_flavor in prefs.muted_flavors(now):
        return False

    # Missing or null title/description contribute an empty string.
    title = article.get("title") or ""
    description = article.get("description") or ""
    return not text_matches_avoid_terms(f"{title} {description}", prefs.avoid_terms)


def filter_articles(articles: list[dict], prefs: FilterPrefs, now: datetime) -> list[dict]:
    """Apply preferences to a list of article rows, preserving order.

    Args:
        articles: Article rows to filter; the input is never modified.
        prefs: The preferences to apply.
        now: The moment used to decide which pauses are still active.

    Returns:
        A new list holding the rows that ``allows`` accepts, in their original
        order. With empty preferences every row is kept.
    """
    if prefs.is_empty():
        # Fast path: nothing to check. Return a copy rather than the caller's
        # own list so the result is a fresh list on every path and mutating it
        # can never alter the input.
        return list(articles)
    return [row for row in articles if allows(row, prefs, now)]