from __future__ import annotations

import hashlib
import html
import re
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

# Anything shaped like a markup tag: "<", one or more non-">" chars, ">".
TAG_RE = re.compile(r"<[^>]+>")
# Any run of whitespace; used to collapse runs into a single space.
WHITESPACE_RE = re.compile(r"\s+")

# Query parameters dropped by canonicalize_url(). Keys are compared
# case-insensitively, either by prefix or by exact name.
TRACKING_PREFIXES = ("utm_",)
TRACKING_PARAMS = {
    "fbclid",
    "gclid",
    "mc_cid",
    "mc_eid",
    "igshid",
    "ref",
}

# Marker appended to text truncated by clean_text().
_ELLIPSIS = "..."


def clean_text(value: str | None, max_len: int = 1000) -> str | None:
    """Return *value* as plain, single-spaced text capped at *max_len* chars.

    Tag-like spans are replaced by a space, HTML entities are unescaped, and
    whitespace runs are collapsed and trimmed. Text longer than *max_len* is
    cut and suffixed with ``...`` so the result, ellipsis included, never
    exceeds *max_len*.

    Args:
        value: Raw text, possibly containing markup and entities.
        max_len: Maximum length of the returned string.

    Returns:
        The cleaned text, or ``None`` when the input is empty/``None`` or
        nothing is left after cleaning.
    """
    if not value:
        return None

    # Tags are stripped before entities are decoded, so an escaped "&lt;b&gt;"
    # survives as the literal text "<b>".
    text = TAG_RE.sub(" ", value)
    text = html.unescape(text)
    text = WHITESPACE_RE.sub(" ", text).strip()

    if len(text) > max_len:
        if max_len < len(_ELLIPSIS):
            # No room for the ellipsis: a negative slice bound would return
            # almost the whole text, so hard-truncate instead.
            return text[: max(max_len, 0)].rstrip() or None
        # Keep the ellipsis inside max_len rather than overshooting by 3.
        return text[: max_len - len(_ELLIPSIS)].rstrip() + _ELLIPSIS
    return text or None


def _is_tracking_param(key: str) -> bool:
    """Return True if query parameter *key* is a known tracking parameter."""
    lowered = key.lower()
    return lowered in TRACKING_PARAMS or lowered.startswith(TRACKING_PREFIXES)


def _lowercase_host(netloc: str) -> str:
    """Lowercase the host[:port] part of *netloc*, leaving userinfo intact."""
    # Host names are case-insensitive, but "user:password@" is not.
    userinfo, at, hostport = netloc.rpartition("@")
    return f"{userinfo}{at}{hostport.lower()}"


def canonicalize_url(url: str | None) -> str | None:
    """Return a normalized form of an http(s) URL, or ``None`` if unusable.

    Normalization unescapes HTML entities, lowercases the scheme and host,
    drops tracking query parameters (``TRACKING_PARAMS`` and anything starting
    with ``TRACKING_PREFIXES``), sorts the remaining parameters, and removes
    the fragment.

    Args:
        url: Raw URL, possibly HTML-escaped or padded with whitespace.

    Returns:
        The canonical URL, or ``None`` when the input is empty, cannot be
        parsed, is not http/https, or has no network location.
    """
    if not url:
        return None
    url = html.unescape(url).strip()
    if not url:
        return None

    try:
        parts = urlsplit(url)
    except ValueError:
        # urlsplit rejects malformed authorities (e.g. unbalanced IPv6
        # brackets); treat those like any other invalid URL.
        return None

    if parts.scheme not in {"http", "https"} or not parts.netloc:
        return None

    query = sorted(
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if not _is_tracking_param(key)
    )

    normalized = parts._replace(
        scheme=parts.scheme.lower(),
        netloc=_lowercase_host(parts.netloc),
        query=urlencode(query, doseq=True),
        fragment="",
    )
    return urlunsplit(normalized)


def sha256_text(value: str | None) -> str:
    """Return the hex SHA-256 digest of *value*, stripped and lowercased.

    ``None`` is hashed as the empty string. The text is UTF-8 encoded before
    hashing.
    """
    normalized = (value or "").strip().lower()
    return hashlib.sha256(normalized.encode("utf-8")).hexdigest()