"""Tests for the clean_text, canonicalize_url and sha256_text helpers."""

from goodnews.text import canonicalize_url, clean_text, sha256_text


def test_clean_text_strips_tags_and_entities():
    """clean_text reduces the raw input to the plain text "Hello& world"."""
    # NOTE(review): the test name implies the input carried HTML tags and
    # entities, but none are visible in this literal -- only blank lines and
    # a doubled space. Presumably the markup was lost somewhere upstream;
    # confirm against version control and restore it if so.
    raw = "\n\nHello&  world\n\n"
    assert clean_text(raw) == "Hello& world"


def test_clean_text_truncates():
    """With max_len=10, a 50-char input comes back ending in "..." and no longer than 10."""
    out = clean_text("x" * 50, max_len=10)
    # Two separate asserts so a failure shows which property broke.
    assert out.endswith("...")
    assert len(out) <= 10


def test_clean_text_empty_is_none():
    """Both the empty string and None are expected to yield None."""
    assert clean_text("") is None
    assert clean_text(None) is None


def test_canonicalize_strips_tracking_params():
    """utm_source and fbclid are dropped, id=7 is kept, and the host is lowercased."""
    url = "https://Example.com/story?utm_source=x&id=7&fbclid=abc"
    out = canonicalize_url(url)
    assert "utm_source" not in out
    assert "fbclid" not in out
    assert "id=7" in out
    assert out.startswith("https://example.com")  # scheme/host lowercased


def test_canonicalize_sorts_query_for_stable_hash():
    """URLs differing only in query-parameter order canonicalize and hash identically."""
    a = canonicalize_url("https://e.com/p?b=2&a=1")
    b = canonicalize_url("https://e.com/p?a=1&b=2")
    assert a == b
    assert sha256_text(a) == sha256_text(b)


def test_canonicalize_rejects_non_http():
    """ftp: and javascript: URLs, as well as None, are expected to yield None."""
    assert canonicalize_url("ftp://e.com/x") is None
    assert canonicalize_url("javascript:alert(1)") is None
    assert canonicalize_url(None) is None