Files
upbeatBytes/tests/test_text.py
T
thejayman77 9cdcda5e02 Durability pass: tests, clearer diversity/classify behavior, Calm Filters foundation
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering +
  representative selection + time window, brief source/category diversity,
  avoid-term phrase matching, and text canonicalization/truncation.
- Rewrite _select_diverse with an explicit, tested contract (best-first, one
  per source, backfill, then inject a second category by evicting the
  lowest-ranked pick).
- classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so
  silent model failures are visible in both the cycle and classify output.
- Fix clean_text truncation to stay within max_len (ellipsis no longer
  overshoots).
- New filters.py: canonical FilterPrefs shape (include/mute topics+flavors,
  avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding
  Calm Filters. Not yet wired into the API.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 19:07:31 +00:00

37 lines
1.1 KiB
Python

from goodnews.text import canonicalize_url, clean_text, sha256_text
def test_clean_text_strips_tags_and_entities():
    """Check clean_text on a snippet mixing a tag with HTML entities.

    The expected value has the <p> wrapper removed, ``&amp;`` turned into
    ``&``, and the space + ``&nbsp;`` run reduced to one space.
    """
    raw = "<p>Hello&amp; &nbsp;world</p>"
    expected = "Hello& world"
    assert clean_text(raw) == expected
def test_clean_text_truncates():
    """Check that over-long input ends in "..." and fits within max_len."""
    limit = 10
    truncated = clean_text("x" * 50, max_len=limit)
    # Separate asserts so a failure shows which property broke.
    assert truncated.endswith("...")
    # The ellipsis counts toward the budget; the result must not exceed it.
    assert len(truncated) <= limit
def test_clean_text_empty_is_none():
    """Check that clean_text returns None for both "" and None."""
    for blank in ("", None):
        assert clean_text(blank) is None
def test_canonicalize_strips_tracking_params():
    """Check tracking params are dropped while real params and host survive.

    ``utm_source`` and ``fbclid`` must be absent from the result, ``id=7``
    must remain, and the mixed-case host must come back lowercased.
    """
    canonical = canonicalize_url(
        "https://Example.com/story?utm_source=x&id=7&fbclid=abc"
    )
    for tracker in ("utm_source", "fbclid"):
        assert tracker not in canonical
    assert "id=7" in canonical
    # scheme/host lowercased
    assert canonical.startswith("https://example.com")
def test_canonicalize_sorts_query_for_stable_hash():
    """Check that query-parameter order does not affect the canonical URL.

    Two URLs differing only in parameter order must canonicalize to the
    same string, and therefore to the same sha256_text digest.
    """
    reversed_order = canonicalize_url("https://e.com/p?b=2&a=1")
    sorted_order = canonicalize_url("https://e.com/p?a=1&b=2")
    assert reversed_order == sorted_order
    assert sha256_text(reversed_order) == sha256_text(sorted_order)
def test_canonicalize_rejects_non_http():
    """Check canonicalize_url returns None for ftp:, javascript: and None."""
    rejected = ("ftp://e.com/x", "javascript:alert(1)", None)
    for candidate in rejected:
        assert canonicalize_url(candidate) is None