068073423f
Local-first RSS/Atom ingestion pipeline with metadata-only storage, heuristic + local-LLM scoring, and daily brief builder. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
63 lines
1.5 KiB
Python
63 lines
1.5 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import html
|
|
import re
|
|
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
|
|
|
|
|
|
# Matches a single markup tag: "<", one or more non-">" characters, then ">".
TAG_RE = re.compile(r"<[^>]+>")
# Matches a run of one or more whitespace characters.
WHITESPACE_RE = re.compile(r"\s+")

# Query-parameter name prefixes treated as tracking noise (tuple so it can be
# handed straight to ``str.startswith``).
TRACKING_PREFIXES = ("utm_",)
# Exact (lowercase) query-parameter names treated as tracking noise.
TRACKING_PARAMS = {"fbclid", "gclid", "igshid", "mc_cid", "mc_eid", "ref"}
|
|
|
|
|
|
def clean_text(value: str | None, max_len: int = 1000) -> str | None:
    """Return *value* as plain, single-spaced text of at most *max_len* chars.

    Markup tags are replaced by a space, HTML entities are unescaped, and
    every run of whitespace is collapsed to one space before the result is
    stripped. Entities are decoded after tag removal, so escaped markup such
    as ``&lt;b&gt;`` survives as the literal text ``<b>``.

    Args:
        value: Raw text, possibly containing markup; may be ``None``.
        max_len: Upper bound on the length of the returned string, including
            the ``"..."`` marker appended when the text is shortened.

    Returns:
        The cleaned text, or ``None`` when *value* is empty/``None`` or
        nothing is left after cleaning.
    """
    if not value:
        return None

    text = TAG_RE.sub(" ", value)
    text = html.unescape(text)
    text = WHITESPACE_RE.sub(" ", text).strip()

    if len(text) > max_len:
        suffix = "..."
        # Reserve room for the marker so the result never exceeds max_len.
        keep = max_len - len(suffix)
        if keep <= 0:
            # Budget too small for any text plus the marker: hard cut instead.
            return text[: max(max_len, 0)] or None
        return text[:keep].rstrip() + suffix
    return text or None
|
|
|
|
|
|
def canonicalize_url(url: str | None) -> str | None:
    """Return a normalized form of *url*, or ``None`` if it is unusable.

    Normalization steps:

    * HTML entities are unescaped and surrounding whitespace is stripped.
    * Only ``http``/``https`` URLs with a non-empty network location are
      accepted; anything else, including input ``urlsplit`` cannot parse,
      yields ``None``.
    * The scheme and the host/port part of the netloc are lowercased; a
      ``user:password@`` prefix keeps its original case.
    * Query parameters whose lowercased name is in ``TRACKING_PARAMS`` or
      starts with one of ``TRACKING_PREFIXES`` are dropped. The remaining
      pairs (blank values kept) are sorted and re-encoded.
    * The fragment is removed.
    """
    if not url:
        return None
    url = html.unescape(url).strip()
    if not url:
        return None

    try:
        parts = urlsplit(url)
    except ValueError:
        # urlsplit rejects malformed netlocs (e.g. an unbalanced "[" in an
        # IPv6 literal); treat those like any other invalid URL.
        return None
    if parts.scheme not in {"http", "https"} or not parts.netloc:
        return None

    query = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        lowered = key.lower()
        if lowered in TRACKING_PARAMS or lowered.startswith(TRACKING_PREFIXES):
            continue
        query.append((key, value))

    # Only the host is case-insensitive; credentials before "@" must be
    # preserved verbatim.
    userinfo, at, hostport = parts.netloc.rpartition("@")

    normalized = parts._replace(
        scheme=parts.scheme.lower(),
        netloc=userinfo + at + hostport.lower(),
        # Sorting makes the result independent of parameter order.
        query=urlencode(sorted(query), doseq=True),
        fragment="",
    )
    return urlunsplit(normalized)
|
|
|
|
|
|
def sha256_text(value: str | None) -> str:
    """Return the hex SHA-256 digest of *value*, trimmed and lowercased.

    ``None`` and the empty string are both hashed as the empty string. The
    text is encoded as UTF-8 before hashing.
    """
    text = value if value else ""
    folded = text.strip().lower()
    digest = hashlib.sha256()
    digest.update(folded.encode("utf-8"))
    return digest.hexdigest()
|
|
|