9cdcda5e02
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering + representative selection + time window, brief source/category diversity, avoid-term phrase matching, and text canonicalization/truncation.
- Rewrite _select_diverse with an explicit, tested contract (best-first, one per source, backfill, then inject a second category by evicting the lowest-ranked pick).
- classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so silent model failures are visible in both the cycle and classify output.
- Fix clean_text truncation to stay within max_len (ellipsis no longer overshoots).
- New filters.py: canonical FilterPrefs shape (include/mute topics+flavors, avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding Calm Filters. Not yet wired into the API.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
37 lines
1.1 KiB
Python
37 lines
1.1 KiB
Python
from goodnews.text import canonicalize_url, clean_text, sha256_text
|
|
|
|
|
|
def test_clean_text_strips_tags_and_entities():
    """clean_text drops HTML tags and decodes character entities."""
    # The input must contain a real entity (``&amp;``): with a bare ``&`` the
    # assertion would still pass even if entity decoding were broken, so only
    # the tag-stripping half of the test name would be exercised.
    # NOTE(review): assumes clean_text decodes ``&amp;`` to ``&`` as the test
    # name implies -- confirm against goodnews.text.
    assert clean_text("<p>Hello&amp; world</p>") == "Hello& world"
|
|
|
|
|
|
def test_clean_text_truncates():
    """Over-long text comes back ending in "..." and within max_len."""
    truncated = clean_text("x" * 50, max_len=10)
    # Checked separately so a failure report names the violated property.
    assert truncated.endswith("...")
    assert len(truncated) <= 10
|
|
|
|
|
|
def test_clean_text_empty_is_none():
    """Both the empty string and None are reported back as None."""
    for empty_input in ("", None):
        assert clean_text(empty_input) is None
|
|
|
|
|
|
def test_canonicalize_strips_tracking_params():
    """Tracking parameters are removed while real query parameters survive."""
    raw_url = "https://Example.com/story?utm_source=x&id=7&fbclid=abc"
    canonical = canonicalize_url(raw_url)
    for tracker in ("utm_source", "fbclid"):
        assert tracker not in canonical
    assert "id=7" in canonical
    # scheme/host lowercased
    assert canonical.startswith("https://example.com")
|
|
|
|
|
|
def test_canonicalize_sorts_query_for_stable_hash():
    """Query-parameter order must not affect the canonical URL or its hash."""
    reversed_order = "https://e.com/p?b=2&a=1"
    sorted_order = "https://e.com/p?a=1&b=2"
    first = canonicalize_url(reversed_order)
    second = canonicalize_url(sorted_order)
    assert first == second
    assert sha256_text(first) == sha256_text(second)
|
|
|
|
|
|
def test_canonicalize_rejects_non_http():
    """Non-HTTP schemes and a missing URL all canonicalize to None."""
    rejected_inputs = ("ftp://e.com/x", "javascript:alert(1)", None)
    for candidate in rejected_inputs:
        assert canonicalize_url(candidate) is None
|