9cdcda5e02
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering + representative selection + time window, brief source/category diversity, avoid-term phrase matching, and text canonicalization/truncation. - Rewrite _select_diverse with an explicit, tested contract (best-first, one per source, backfill, then inject a second category by evicting the lowest-ranked pick). - classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so silent model failures are visible in both the cycle and classify output. - Fix clean_text truncation to stay within max_len (ellipsis no longer overshoots). - New filters.py: canonical FilterPrefs shape (include/mute topics+flavors, avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding Calm Filters. Not yet wired into the API. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
51 lines
1.8 KiB
Python
51 lines
1.8 KiB
Python
from goodnews.briefs import _select_diverse
|
|
|
|
|
|
def row(id, source, category):
    """Build a minimal article row for the ``_select_diverse`` tests.

    Returns a plain dict with the keys ``id``, ``source_name`` and
    ``default_category`` (in that order). Per the original note in this
    file, those are the only keys ``_select_diverse`` reads, and a plain
    dict is enough because it supports ``[]`` access.
    """
    keys = ("id", "source_name", "default_category")
    values = (id, source, category)
    return dict(zip(keys, values))
|
|
|
|
|
|
def test_prefers_distinct_sources_best_first():
    """With limit=3, the second source-A row is passed over for B and C."""
    # (id, source, category), listed in ranked order.
    specs = [
        (1, "A", "science"),
        (2, "A", "science"),  # repeats source A; skipped while other sources remain
        (3, "B", "science"),
        (4, "C", "environment"),
    ]
    ranked = [row(*spec) for spec in specs]

    picked = _select_diverse(ranked, limit=3)

    # One pick per source, in the same relative order as the input.
    picked_ids = [item["id"] for item in picked]
    assert picked_ids == [1, 3, 4]
|
|
|
|
|
|
def test_backfills_when_sources_exhausted():
    """When every row shares one source, the limit is still filled."""
    single_source = [row(n, "A", "science") for n in (1, 2, 3)]

    picked = _select_diverse(single_source, limit=2)

    # Source A has to repeat here: there is no other source to draw from.
    assert len(picked) == 2
|
|
|
|
|
|
def test_injects_second_category_without_shrinking():
    """A lone lower-ranked 'environment' row must appear; size stays at limit."""
    candidates = [
        row(1, "A", "science"),
        row(2, "B", "science"),
        row(3, "C", "science"),
        row(4, "D", "environment"),  # lowest ranked, and the only non-science row
    ]

    picked = _select_diverse(candidates, limit=3)
    categories = [item["default_category"] for item in picked]

    assert len(picked) == 3
    # At least two distinct categories, one of them being 'environment'.
    assert len(set(categories)) >= 2
    assert "environment" in categories
|
|
|
|
|
|
def test_keeps_single_category_when_no_alternative_exists():
    """All-science input yields three science picks; nothing is dropped."""
    all_science = [
        row(n, source, "science") for n, source in enumerate("ABC", start=1)
    ]

    picked = _select_diverse(all_science, limit=3)

    assert len(picked) == 3
    seen_categories = {item["default_category"] for item in picked}
    assert seen_categories == {"science"}
|
|
|
|
|
|
def test_never_returns_more_than_limit():
    """Ten rows from ten distinct sources with limit=5 give exactly five picks."""
    limit = 5
    pool = [row(n, f"S{n}", "science") for n in range(10)]

    picked = _select_diverse(pool, limit=limit)

    assert len(picked) == limit
|