9cdcda5e02
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering + representative selection + time window, brief source/category diversity, avoid-term phrase matching, and text canonicalization/truncation. - Rewrite _select_diverse with an explicit, tested contract (best-first, one per source, backfill, then inject a second category by evicting the lowest-ranked pick). - classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so silent model failures are visible in both the cycle and classify output. - Fix clean_text truncation to stay within max_len (ellipsis no longer overshoots). - New filters.py: canonical FilterPrefs shape (include/mute topics+flavors, avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding Calm Filters. Not yet wired into the API. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
124 lines
4.4 KiB
Python
124 lines
4.4 KiB
Python
"""Calm Filters — the canonical preference model and pure matching engine.

Everything (localStorage today, query params on the API, a user_preferences row
later) speaks this one shape, so the surfaces never drift. The functions here are
deliberately pure and side-effect-free so they are easy to test and reuse from
both the API and the CLI.

The humane surface ("Not today" / "Less like this" / "Always hide this") maps onto
this machinery: a pause is a topic/flavor muted *until* a timestamp; a mute is a
standing exclusion; avoid-terms drop anything mentioning a phrase the reader would
rather not see.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
|
|
# Split on any run of non-alphanumerics so matching is punctuation- and
# case-insensitive, and anchored to whole words/phrases (no substring surprises:
# "pan" must not match "pandemic", and "stock market" matches as a phrase).
# ``\W`` is Unicode-aware for str patterns, so accented and non-Latin letters
# stay inside their word; an ASCII-only class would cut "Perú" down to "per"
# (a false hit on "per capita") and erase non-Latin terms entirely. ``_`` is
# listed explicitly because ``\W`` counts it as a word character.
_NONWORD = re.compile(r"[\W_]+")


def _normalize(text: str) -> str:
    """Casefold, collapse non-alphanumerics to single spaces, pad with spaces.

    The leading and trailing space let callers test for a whole word or phrase
    with a plain substring check against ``" <needle> "``.
    """
    # casefold() rather than lower(): identical on ASCII, but also equates
    # caseless variants such as "ß" and "ss".
    return " " + _NONWORD.sub(" ", text.casefold()).strip() + " "


def text_matches_avoid_terms(text: str | None, terms: list[str]) -> bool:
    """Return True if ``text`` contains any avoid term as a whole word or phrase.

    Args:
        text: The text to scan; ``None`` or an empty string never matches.
        terms: Words or phrases to look for. Terms that normalize to nothing
            (empty strings, pure punctuation) are ignored.

    Returns:
        True on the first term found on word boundaries, otherwise False.
    """
    if not text or not terms:
        return False
    haystack = _normalize(text)
    # Strip the padding from each term, then re-pad with exactly one space on
    # each side so the substring test is anchored to word boundaries.
    needles = (_normalize(term).strip() for term in terms)
    return any(needle and f" {needle} " in haystack for needle in needles)
|
|
|
|
|
|
@dataclass
class Pause:
    """A temporary mute: one topic or flavor hidden until a timestamp passes."""

    kind: str  # "topic" or "flavor"
    value: str
    until: str  # ISO 8601 UTC timestamp

    def active(self, now: datetime) -> bool:
        """Return True while ``until`` is still strictly later than ``now``.

        Args:
            now: The reference instant. A naive value is treated as UTC.

        Returns:
            False when ``until`` is missing, not a string, or not parseable as
            ISO 8601; otherwise whether the pause has not yet expired.
        """
        # Function-scope import: the module top only imports ``datetime``.
        from datetime import timezone

        try:
            # fromisoformat() only accepts a trailing "Z" on newer Pythons,
            # so spell the UTC offset out explicitly.
            until = datetime.fromisoformat(self.until.replace("Z", "+00:00"))
        except (ValueError, AttributeError, TypeError):
            return False
        # Comparing a naive datetime with an aware one raises TypeError. The
        # field is documented as UTC, so a timestamp stored without an offset
        # is read as UTC; a naive ``now`` gets the same treatment.
        if until.utcoffset() is None:
            until = until.replace(tzinfo=timezone.utc)
        if now.utcoffset() is None:
            now = now.replace(tzinfo=timezone.utc)
        return until > now
|
|
|
|
|
|
@dataclass
class FilterPrefs:
    """The canonical preference shape: allow-lists, mutes, avoid terms, pauses.

    Every field defaults to its own fresh empty list.
    """

    include_topics: list[str] = field(default_factory=list)
    include_flavors: list[str] = field(default_factory=list)
    mute_topics: list[str] = field(default_factory=list)
    mute_flavors: list[str] = field(default_factory=list)
    avoid_terms: list[str] = field(default_factory=list)
    pauses: list[Pause] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict | None) -> "FilterPrefs":
        """Build preferences from a plain dict; missing or falsy keys become empty.

        Each entry under ``"pauses"`` is expanded as keyword arguments to
        ``Pause``, so it must carry exactly that class's fields.
        """
        raw = data if data else {}

        def copied(key: str) -> list:
            # Copy so the instance never shares a list with the input dict.
            return list(raw.get(key) or [])

        pause_rows = raw.get("pauses") or []
        return cls(
            include_topics=copied("include_topics"),
            include_flavors=copied("include_flavors"),
            mute_topics=copied("mute_topics"),
            mute_flavors=copied("mute_flavors"),
            avoid_terms=copied("avoid_terms"),
            pauses=[Pause(**row) for row in pause_rows],
        )

    def muted_topics(self, now: datetime) -> set[str]:
        """Return standing topic mutes plus every topic whose pause is active at ``now``."""
        standing = set(self.mute_topics)
        return standing.union(
            pause.value
            for pause in self.pauses
            if pause.kind == "topic" and pause.active(now)
        )

    def muted_flavors(self, now: datetime) -> set[str]:
        """Return standing flavor mutes plus every flavor whose pause is active at ``now``."""
        standing = set(self.mute_flavors)
        return standing.union(
            pause.value
            for pause in self.pauses
            if pause.kind == "flavor" and pause.active(now)
        )

    def is_empty(self) -> bool:
        """Return True when no field holds any entry (expired pauses still count)."""
        populated = (
            self.include_topics,
            self.include_flavors,
            self.mute_topics,
            self.mute_flavors,
            self.avoid_terms,
            self.pauses,
        )
        return not any(populated)
|
|
|
|
|
|
def allows(article: dict, prefs: FilterPrefs, now: datetime) -> bool:
    """Decide whether one article row passes the reader's preferences.

    Reads the ``topic``, ``flavor``, ``title`` and ``description`` keys of
    ``article``; absent keys read as ``None``. Checks run in order: topic
    allow-list, flavor allow-list, muted or paused topic, muted or paused
    flavor, then avoid terms against the title and description.
    """
    topic, flavor = article.get("topic"), article.get("flavor")

    # An empty allow-list means "no restriction", not "allow nothing".
    wanted_topics = prefs.include_topics
    if wanted_topics and topic not in wanted_topics:
        return False
    wanted_flavors = prefs.include_flavors
    if wanted_flavors and flavor not in wanted_flavors:
        return False

    # Standing mutes and currently active pauses are folded together by prefs.
    if topic in prefs.muted_topics(now) or flavor in prefs.muted_flavors(now):
        return False

    title = article.get("title") or ""
    description = article.get("description") or ""
    return not text_matches_avoid_terms(f"{title} {description}", prefs.avoid_terms)
|
|
|
|
|
|
def filter_articles(articles: list[dict], prefs: FilterPrefs, now: datetime) -> list[dict]:
    """Keep the article rows that pass ``prefs``, in their original order.

    When ``prefs`` is empty the input list object itself is returned (not a
    copy); otherwise a new list of the surviving rows is built.
    """
    if not prefs.is_empty():
        return [row for row in articles if allows(row, prefs, now)]
    # Nothing to filter on: hand the caller's list back untouched.
    return articles
|