Files
upbeatBytes/goodnews/text.py
T
thejayman77 9cdcda5e02 Durability pass: tests, clearer diversity/classify behavior, Calm Filters foundation
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering +
  representative selection + time window, brief source/category diversity,
  avoid-term phrase matching, and text canonicalization/truncation.
- Rewrite _select_diverse with an explicit, tested contract (best-first, one
  per source, backfill, then inject a second category by evicting the
  lowest-ranked pick).
- classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so
  silent model failures are visible in both the cycle and classify output.
- Fix clean_text truncation to stay within max_len (ellipsis no longer
  overshoots).
- New filters.py: canonical FilterPrefs shape (include/mute topics+flavors,
  avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding
  Calm Filters. Not yet wired into the API.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 19:07:31 +00:00

64 lines
1.6 KiB
Python

from __future__ import annotations
import hashlib
import html
import re
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
# --- Text cleanup patterns -------------------------------------------------
# Anything that looks like a markup tag: "<", one or more non-">" chars, ">".
TAG_RE = re.compile(r"<[^>]+>")
# Any run of whitespace (spaces, tabs, newlines), collapsed to one space later.
WHITESPACE_RE = re.compile(r"\s+")

# --- URL tracking parameters -----------------------------------------------
# Exact query-parameter names (compared lowercased) dropped on canonicalization.
TRACKING_PARAMS = {
    "fbclid",
    "gclid",
    "igshid",
    "mc_cid",
    "mc_eid",
    "ref",
}
# Query-parameter name prefixes (compared lowercased) dropped the same way.
# Kept as a tuple so it can be handed directly to ``str.startswith``.
TRACKING_PREFIXES = ("utm_",)
def clean_text(value: str | None, max_len: int = 1000) -> str | None:
    """Strip markup from *value* and return tidy, length-capped plain text.

    Tags are replaced with spaces, HTML entities are decoded, and every run
    of whitespace is collapsed to a single space with the ends trimmed.

    Args:
        value: Raw text, possibly containing HTML tags and entities.
        max_len: Upper bound on the length of the returned string.

    Returns:
        The cleaned text, or ``None`` when *value* is empty/``None`` or when
        nothing is left after cleaning. Text longer than *max_len* is cut and
        suffixed with ``"..."`` such that the result still fits in *max_len*;
        when *max_len* is below 3 there is no room for the ellipsis, so the
        text is simply cut to *max_len* characters.
    """
    if not value:
        return None

    text = TAG_RE.sub(" ", value)
    # Entities are decoded after tag removal, so escaped markup such as
    # "&lt;b&gt;" survives as the literal text "<b>".
    text = html.unescape(text)
    text = WHITESPACE_RE.sub(" ", text).strip()

    if len(text) <= max_len:
        return text or None

    if max_len < 3:
        # The ellipsis itself would not fit. A negative slice end would count
        # from the END of the string and overshoot max_len, so hard-cut.
        return text[: max(max_len, 0)].rstrip() or None

    # Keep the ellipsis inside max_len rather than overshooting by 3.
    return text[: max_len - 3].rstrip() + "..."
def canonicalize_url(url: str | None) -> str | None:
    """Return a normalized form of *url*, or ``None`` if it is unusable.

    Normalization steps:

    * HTML entities are decoded and surrounding whitespace is trimmed.
    * Only ``http``/``https`` URLs with a network location are accepted.
    * Scheme and network location are lowercased.
    * Query parameters whose lowercased name is in ``TRACKING_PARAMS`` or
      starts with one of ``TRACKING_PREFIXES`` are removed; the remaining
      pairs are sorted and re-encoded.
    * The fragment is dropped.

    Args:
        url: The raw URL, possibly ``None`` or empty.

    Returns:
        The canonical URL string, or ``None`` when *url* is empty, cannot be
        parsed, is not http(s), or has no network location.
    """
    if not url:
        return None
    url = html.unescape(url).strip()
    if not url:
        return None

    try:
        parts = urlsplit(url)
    except ValueError:
        # urlsplit rejects malformed authorities (e.g. unbalanced IPv6
        # brackets such as "http://[::1"). Treat those like any other
        # unusable URL instead of letting the exception escape.
        return None

    if parts.scheme not in {"http", "https"} or not parts.netloc:
        return None

    query = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        lowered = key.lower()
        if lowered in TRACKING_PARAMS or lowered.startswith(TRACKING_PREFIXES):
            continue
        query.append((key, value))

    normalized = parts._replace(
        scheme=parts.scheme.lower(),
        netloc=parts.netloc.lower(),
        # Sorting makes URLs that differ only in parameter order compare equal.
        query=urlencode(sorted(query), doseq=True),
        fragment="",
    )
    return urlunsplit(normalized)
def sha256_text(value: str | None) -> str:
    """Return the hex SHA-256 digest of *value* after light normalization.

    ``None`` and empty input are treated as the empty string. The text is
    stripped of surrounding whitespace and lowercased before being hashed as
    UTF-8, so inputs differing only in case or outer whitespace hash alike.
    """
    folded = value if value else ""
    folded = folded.strip().lower()

    hasher = hashlib.sha256()
    hasher.update(folded.encode("utf-8"))
    return hasher.hexdigest()