9cdcda5e02
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering + representative selection + time window, brief source/category diversity, avoid-term phrase matching, and text canonicalization/truncation. - Rewrite _select_diverse with an explicit, tested contract (best-first, one per source, backfill, then inject a second category by evicting the lowest-ranked pick). - classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so silent model failures are visible in both the cycle and classify output. - Fix clean_text truncation to stay within max_len (ellipsis no longer overshoots). - New filters.py: canonical FilterPrefs shape (include/mute topics+flavors, avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding Calm Filters. Not yet wired into the API. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
89 lines
3.1 KiB
Python
89 lines
3.1 KiB
Python
from datetime import datetime, timezone
|
|
|
|
from goodnews.filters import (
|
|
FilterPrefs,
|
|
Pause,
|
|
filter_articles,
|
|
text_matches_avoid_terms,
|
|
)
|
|
|
|
# Fixed, timezone-aware reference instant passed as "now" to the filter calls
# below, so pause-expiry checks never depend on the real wall clock.
NOW = datetime(year=2026, month=6, day=1, tzinfo=timezone.utc)
|
|
|
|
|
|
def art(topic="science", flavor="discovery", title="A calm discovery", description=""):
    """Build a minimal article dict for the filter tests.

    Every field has a benign default, so a test only spells out the
    field(s) it actually cares about.
    """
    article = dict(topic=topic, flavor=flavor, title=title, description=description)
    return article
|
|
|
|
|
|
# --- avoid-term matching: the trust-critical pure function ---
|
|
|
|
def test_single_word_matches_whole_word_only():
    """A one-word avoid term hits only when it appears as a whole word."""
    whole_word_hit = text_matches_avoid_terms("New cancer drug approved", ["cancer"])
    assert whole_word_hit

    # "Cancerous" merely starts with the term; that must not count as a hit.
    longer_word_hit = text_matches_avoid_terms("Cancerous growth studied", ["cancer"])
    assert not longer_word_hit
|
|
|
|
|
|
def test_substring_does_not_match():
    """A term that is only a prefix of a longer word is not a match."""
    headline = "Pandemic preparedness improves"
    # "pan" is a substring of "pandemic" but never appears as its own word.
    matched = text_matches_avoid_terms(headline, ["pan"])
    assert not matched
|
|
|
|
|
|
def test_phrase_matches_as_phrase():
    """A multi-word term matches as a contiguous phrase, not as loose words."""
    contiguous = text_matches_avoid_terms("The stock market crashed today", ["stock market"])
    assert contiguous

    # Related words ("Stocks", "markets") that are not the exact phrase must not match.
    scattered = text_matches_avoid_terms("Stocks and other markets", ["stock market"])
    assert not scattered
|
|
|
|
|
|
def test_punctuation_and_case_normalized():
    """Matching ignores letter case and treats a hyphen like a word separator."""
    # Hyphenated, title-cased text vs. a lowercase, space-separated term.
    hyphenated_hit = text_matches_avoid_terms("An Anti-Aging breakthrough", ["anti aging"])
    assert hyphenated_hit

    # All-caps text vs. a lowercase term.
    uppercase_hit = text_matches_avoid_terms("ELECTION results", ["election"])
    assert uppercase_hit
|
|
|
|
|
|
def test_empty_inputs_are_safe():
    """Empty text, an empty term list, and None text all report 'no match'."""
    degenerate_cases = (
        ("", ["cancer"]),
        ("anything", []),
        (None, ["cancer"]),
    )
    for text, terms in degenerate_cases:
        assert not text_matches_avoid_terms(text, terms)
|
|
|
|
|
|
# --- filter_articles over the canonical prefs ---
|
|
|
|
def test_empty_prefs_pass_everything_through():
    """Default-constructed prefs filter nothing: output equals input."""
    articles = [art(), art(topic="health")]
    result = filter_articles(articles, FilterPrefs(), NOW)
    assert result == articles
|
|
|
|
|
|
def test_mute_topic_drops_matching_articles():
    """Articles whose topic is listed in mute_topics are removed."""
    articles = [art(topic=name) for name in ("science", "health")]
    muted_health = FilterPrefs.from_dict({"mute_topics": ["health"]})

    kept = filter_articles(articles, muted_health, NOW)

    kept_topics = [article["topic"] for article in kept]
    assert kept_topics == ["science"]
|
|
|
|
|
|
def test_include_topics_keeps_only_those():
    """With include_topics set, only articles on those topics survive."""
    articles = [art(topic=name) for name in ("science", "animals", "health")]
    only_listed = FilterPrefs.from_dict({"include_topics": ["science", "animals"]})

    kept = filter_articles(articles, only_listed, NOW)

    # Set comparison: this test checks which topics survive, not their order.
    surviving_topics = {article["topic"] for article in kept}
    assert surviving_topics == {"science", "animals"}
|
|
|
|
|
|
def test_avoid_terms_match_title_and_description():
    """An avoid term removes articles that mention it in title or description.

    Two of the three articles mention "election" (one in the title, one in
    the description); only the default article mentions it nowhere.
    """
    clean = art()
    items = [
        art(title="Update on the election"),
        art(description="about an election too"),
        clean,
    ]
    prefs = FilterPrefs.from_dict({"avoid_terms": ["election"]})

    out = filter_articles(items, prefs, NOW)

    # Check *which* article survived, not just the count: a bare length check
    # would still pass if the filter kept an "election" article and dropped
    # the clean one.
    assert out == [clean]
|
|
|
|
|
|
def test_active_pause_hides_topic_but_expired_does_not():
    """A topic pause hides articles only while its 'until' is still ahead of NOW."""

    def health_paused_until(until):
        # Prefs containing a single topic pause on "health" ending at `until`.
        return FilterPrefs.from_dict(
            {"pauses": [{"kind": "topic", "value": "health", "until": until}]}
        )

    items = [art(topic="health")]
    active = health_paused_until("2026-06-02T00:00:00Z")  # one day after NOW
    expired = health_paused_until("2026-05-01T00:00:00Z")  # a month before NOW

    hidden = filter_articles(items, active, NOW)
    assert hidden == []

    shown = filter_articles(items, expired, NOW)
    assert shown == items
|
|
|
|
|
|
def test_pause_active_helper():
    """Pause.active(now) is true only for an 'until' later than now."""

    def is_active(until):
        # Same kind/value every time; only the deadline varies.
        return Pause("topic", "health", until).active(NOW)

    assert is_active("2026-06-02T00:00:00Z")  # ends after NOW
    assert not is_active("2026-05-01T00:00:00Z")  # ended before NOW
    assert not is_active("garbage")  # non-timestamp 'until' counts as inactive
|