Files
upbeatBytes/goodnews/filters.py
T
thejayman77 9cdcda5e02 Durability pass: tests, clearer diversity/classify behavior, Calm Filters foundation
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering +
  representative selection + time window, brief source/category diversity,
  avoid-term phrase matching, and text canonicalization/truncation.
- Rewrite _select_diverse with an explicit, tested contract (best-first, one
  per source, backfill, then inject a second category by evicting the
  lowest-ranked pick).
- classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so
  silent model failures are visible in both the cycle and classify output.
- Fix clean_text truncation to stay within max_len (ellipsis no longer
  overshoots).
- New filters.py: canonical FilterPrefs shape (include/mute topics+flavors,
  avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding
  Calm Filters. Not yet wired into the API.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 19:07:31 +00:00

124 lines
4.4 KiB
Python

"""Calm Filters — the canonical preference model and pure matching engine.
Everything (localStorage today, query params on the API, a user_preferences row
later) speaks this one shape, so the surfaces never drift. The functions here are
deliberately pure and side-effect-free so they are easy to test and reuse from
both the API and the CLI.
The humane surface ("Not today" / "Less like this" / "Always hide this") maps onto
this machinery: a pause is a topic/flavor muted *until* a timestamp; a mute is a
standing exclusion; avoid-terms drop anything mentioning a phrase the reader would
rather not see.
"""
from __future__ import annotations

import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
# Split on any run of non-alphanumerics so matching is punctuation- and
# case-insensitive, and anchored to whole words/phrases (no substring surprises:
# "pan" must not match "pandemic", and "stock market" matches as a phrase).
# ``\W`` is Unicode-aware for ``str`` patterns, so accented and non-Latin letters
# count as word characters; ``_`` is added so underscores still act as separators.
_NONWORD = re.compile(r"[\W_]+")


def _normalize(text: str) -> str:
    """Return *text* lowercased with separator runs collapsed, padded by spaces.

    Every run of characters that are neither letters nor digits (punctuation,
    whitespace, underscores) becomes a single space, and the result is wrapped
    in one leading and one trailing space so that ``f" {needle} " in haystack``
    is a whole-word / whole-phrase test.

    Letters and digits are recognised in any script, so ``"café"`` stays one
    word instead of being cut down to ``"caf"`` (which would let the avoid-term
    ``"caf"`` match it). For pure-ASCII input the result is the same as with an
    ASCII-only ``[^a-z0-9]+`` separator class.
    """
    return " " + _NONWORD.sub(" ", text.lower()).strip() + " "
def text_matches_avoid_terms(text: str | None, terms: list[str]) -> bool:
    """Report whether *text* mentions any of *terms* as a whole word or phrase.

    Both the text and each term are run through ``_normalize`` first, so the
    comparison ignores case and punctuation. Returns False when *text* or
    *terms* is empty or None; terms that normalize to nothing are skipped.
    """
    if not (text and terms):
        return False
    padded_text = _normalize(text)
    # Lazy on purpose: terms after the first hit are never normalized.
    phrases = (_normalize(term).strip() for term in terms)
    return any(phrase and f" {phrase} " in padded_text for phrase in phrases)
@dataclass
class Pause:
    """A topic or flavor muted until a given timestamp."""

    kind: str  # "topic" or "flavor"
    value: str
    until: str  # ISO 8601 UTC timestamp

    def active(self, now: datetime) -> bool:
        """Return True while ``until`` is strictly later than *now*.

        A trailing ``Z`` in ``until`` is accepted as UTC. A datetime without
        timezone information, on either side of the comparison, is interpreted
        as UTC, so mixing naive and aware values cannot raise ``TypeError``.
        If ``until`` is not a parseable ISO 8601 string the pause is treated as
        inactive rather than raising.
        """
        try:
            # Rewrite "Z" as an explicit offset before handing the string to
            # fromisoformat.
            until = datetime.fromisoformat(self.until.replace("Z", "+00:00"))
        except (ValueError, AttributeError, TypeError):
            return False
        if until.tzinfo is None:
            until = until.replace(tzinfo=timezone.utc)
        if now.tzinfo is None:
            now = now.replace(tzinfo=timezone.utc)
        return until > now
@dataclass
class FilterPrefs:
    """The single preference shape shared by every Calm Filters surface."""

    # When non-empty, only articles whose topic / flavor is listed survive.
    include_topics: list[str] = field(default_factory=list)
    include_flavors: list[str] = field(default_factory=list)
    # Standing exclusions by topic / flavor.
    mute_topics: list[str] = field(default_factory=list)
    mute_flavors: list[str] = field(default_factory=list)
    # Words or phrases; an article mentioning any of them is dropped.
    avoid_terms: list[str] = field(default_factory=list)
    # Time-limited mutes; each counts only while ``Pause.active(now)`` is true.
    pauses: list[Pause] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict | None) -> "FilterPrefs":
        """Build preferences from a plain dict, tolerating missing pieces.

        ``None``, missing keys and ``None`` values all become empty lists, and
        list values are copied so the result does not alias *data*.

        Each entry of ``data["pauses"]`` must be a dict carrying ``kind``,
        ``value`` and ``until``; unknown extra keys are ignored. Entries that
        are not dicts or lack one of those keys are skipped, so a single
        malformed pause does not make the whole preference set unloadable.
        """
        data = data or {}
        pauses = []
        for raw in data.get("pauses") or []:
            if not isinstance(raw, dict):
                continue
            if any(key not in raw for key in ("kind", "value", "until")):
                continue
            pauses.append(
                Pause(kind=raw["kind"], value=raw["value"], until=raw["until"])
            )
        return cls(
            include_topics=list(data.get("include_topics") or []),
            include_flavors=list(data.get("include_flavors") or []),
            mute_topics=list(data.get("mute_topics") or []),
            mute_flavors=list(data.get("mute_flavors") or []),
            avoid_terms=list(data.get("avoid_terms") or []),
            pauses=pauses,
        )

    def _paused_values(self, kind: str, now: datetime) -> set[str]:
        """Values of the pauses of the given *kind* that are active at *now*."""
        return {p.value for p in self.pauses if p.kind == kind and p.active(now)}

    def muted_topics(self, now: datetime) -> set[str]:
        """Standing mutes plus any topic currently paused."""
        return set(self.mute_topics) | self._paused_values("topic", now)

    def muted_flavors(self, now: datetime) -> set[str]:
        """Standing mutes plus any flavor currently paused."""
        return set(self.mute_flavors) | self._paused_values("flavor", now)

    def is_empty(self) -> bool:
        """True when no field holds any entry (expired pauses still count)."""
        return not any(
            (
                self.include_topics,
                self.include_flavors,
                self.mute_topics,
                self.mute_flavors,
                self.avoid_terms,
                self.pauses,
            )
        )
def allows(article: dict, prefs: FilterPrefs, now: datetime) -> bool:
    """Decide whether one article row passes the reader's preferences.

    Checks run in this order, and the first failure rejects the article:

    1. Include lists: when ``include_topics`` / ``include_flavors`` is
       non-empty, the article's ``topic`` / ``flavor`` must be listed.
    2. Mutes: the topic or flavor must not be in the muted sets for *now*
       (standing mutes plus currently active pauses).
    3. Avoid terms: the ``title`` and ``description`` together must not mention
       any avoid term. Missing or None fields count as empty text.
    """
    topic, flavor = article.get("topic"), article.get("flavor")

    outside_includes = (
        (prefs.include_topics and topic not in prefs.include_topics)
        or (prefs.include_flavors and flavor not in prefs.include_flavors)
    )
    if outside_includes:
        return False
    if topic in prefs.muted_topics(now) or flavor in prefs.muted_flavors(now):
        return False

    title = article.get("title") or ""
    description = article.get("description") or ""
    return not text_matches_avoid_terms(f"{title} {description}", prefs.avoid_terms)
def filter_articles(articles: list[dict], prefs: FilterPrefs, now: datetime) -> list[dict]:
    """Return the article rows that pass *prefs*, in their original order.

    When *prefs* is empty the input list itself is returned (no copy is made);
    otherwise the result is a new list of the rows for which ``allows`` is true.
    """
    if not prefs.is_empty():
        return [row for row in articles if allows(row, prefs, now)]
    # Nothing to filter on: hand the caller's list straight back.
    return articles