068073423f
Local-first RSS/Atom ingestion pipeline with metadata-only storage, heuristic + local-LLM scoring, and daily brief builder. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
170 lines
3.8 KiB
Python
170 lines
3.8 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
|
|
# Vocabulary counted by score_article() as a positive signal: each distinct
# hit adds 2 points to both the constructive and the human-benefit score.
# Entries are lowercase; "clean energy" is the only multi-word phrase.
POSITIVE_TERMS = {
    "breakthrough", "progress", "improve", "improves", "improved",
    "solution", "solutions", "recovery", "restore", "restores",
    "rescued", "rescue", "volunteer", "community", "donate", "donation",
    "cure", "treatment", "therapy", "clean energy", "renewable",
    "conservation", "protect", "protects", "restoration", "kindness",
    "hope", "first", "record",
}
|
|
|
|
# Vocabulary counted by score_article() as an agency signal: each distinct
# hit adds 2 points to the agency score and 1 point to the constructive and
# human-benefit scores. "protect"/"protects" also appear in POSITIVE_TERMS.
AGENCY_TERMS = {
    "how", "helps", "helping", "protect", "protects", "builds",
    "creates", "launches", "teaches", "learn", "guide", "tool",
    "program", "initiative", "effort", "plan", "rebuild",
}
|
|
|
|
# Vocabulary counted by score_article() towards the cortisol score
# (3 points per distinct hit, so two hits exceed the acceptance limit of 5).
CORTISOL_TERMS = {
    "war", "killed", "dead", "death", "murder", "shooting", "attack",
    "crisis", "catastrophe", "disaster", "collapse", "panic", "warning",
    "threat", "fear", "fears", "lawsuit", "scandal",
}
|
|
|
|
# Vocabulary counted by score_article() towards the ragebait score
# (4 points per distinct hit, so a single hit exceeds the acceptance limit of 3).
# Multi-word entries are matched as phrases by _count_terms().
RAGEBAIT_TERMS = {
    "slams", "blasts", "furious", "outrage", "rage", "shocking",
    "you won't believe", "sparks backlash", "destroyed", "humiliates",
}
|
|
|
|
# Vocabulary counted by score_article() towards the PR-risk score
# (2 points per distinct hit, added on top of the source's own PR risk).
PR_TERMS = {
    "announces", "unveils", "funding round", "raises",
    "partnership", "brand", "sponsored", "press release",
}
|
|
|
|
# Tokenizer for single-word lookups: runs of ASCII lowercase letters, digits
# and straight apostrophes. It is applied to text that is already lowercased.
WORD_RE = re.compile(r"[a-z0-9']+")


def _count_terms(text: str, terms: set[str]) -> int:
    """Return how many distinct entries of *terms* occur in *text*.

    Matching is case-insensitive and every term counts at most once, no
    matter how often it appears.

    * A term without a space must equal a whole token found by ``WORD_RE``
      (so "improve" does not match "improved").
    * A term containing a space is treated as a phrase and searched as a
      substring of the text.

    Args:
        text: Headline/snippet text to scan.
        terms: Lowercase words and phrases to look for.

    Returns:
        The number of terms that were found.
    """
    lowered = text.lower()
    words = set(WORD_RE.findall(lowered))

    # Feed text routinely carries typographic apostrophes and line breaks,
    # non-breaking spaces or doubled spaces between words. Fold those to the
    # plain forms used in the term lists so phrases such as "you won't believe"
    # or "clean energy" are still recognised. Only the phrase haystack is
    # normalised; single-word tokenisation above is left untouched.
    straightened = lowered.replace("\u2019", "'").replace("\u2018", "'")
    phrase_text = " ".join(straightened.split())

    return sum(
        1
        for term in terms
        if (term in phrase_text if " " in term else term in words)
    )
|
|
|
|
|
|
def score_article(title: str, description: str | None, source_pr_risk: int) -> dict:
    """Score one article with keyword heuristics and decide on acceptance.

    The title and description (if any) are joined into a single text and
    scanned against the module's term lists. Each resulting score is capped
    at 10.

    Args:
        title: Article headline.
        description: Optional snippet/summary; a falsy value is treated as "".
        source_pr_risk: Baseline PR risk of the source, added to the PR score.

    Returns:
        A dict with the individual scores, ``accepted`` (1 or 0), a
        ``reason_code``/``reason_text`` pair explaining the verdict, and
        ``model_name`` set to "heuristic-v0".
    """
    haystack = f"{title}. {description or ''}"

    hits = {
        label: _count_terms(haystack, vocabulary)
        for label, vocabulary in (
            ("positive", POSITIVE_TERMS),
            ("agency", AGENCY_TERMS),
            ("cortisol", CORTISOL_TERMS),
            ("ragebait", RAGEBAIT_TERMS),
            ("pr", PR_TERMS),
        )
    }

    def capped(raw):
        # Every score shares the same upper bound of 10.
        return min(10, raw)

    benefit_points = hits["positive"] * 2 + hits["agency"]
    human_benefit_score = capped(benefit_points)
    # The constructive score is the benefit points plus a baseline of 2.
    constructive_score = capped(2 + benefit_points)
    agency_score = capped(1 + hits["agency"] * 2)
    cortisol_score = capped(hits["cortisol"] * 3)
    ragebait_score = capped(hits["ragebait"] * 4)
    pr_risk_score = capped(source_pr_risk + hits["pr"] * 2)
    novelty_score = 5  # fixed placeholder value in this heuristic

    # Rejection rules in priority order; the first one that fires supplies
    # the reason reported to the caller.
    rejection_rules = (
        (
            ragebait_score > 3,
            "heuristic_reject_ragebait_language",
            "Headline or snippet contains outrage-oriented language.",
        ),
        (
            cortisol_score > 5,
            "heuristic_reject_cortisol_heavy",
            "Headline or snippet appears tragedy, threat, conflict, or crisis centered.",
        ),
        (
            pr_risk_score > 7,
            "heuristic_reject_pr_risk",
            "Headline or source has signs of corporate PR framing.",
        ),
    )
    rejection = next(
        ((code, message) for fired, code, message in rejection_rules if fired),
        None,
    )

    if rejection is not None:
        accepted = False
        reason_code, reason_text = rejection
    elif constructive_score >= 5:
        accepted = True
        reason_code = "heuristic_constructive_candidate"
        reason_text = "Constructive or agency-oriented language with low obvious cortisol/ragebait signals."
    else:
        accepted = False
        reason_code = "heuristic_needs_review"
        reason_text = "Not enough constructive signal for automatic acceptance."

    return {
        "constructive_score": constructive_score,
        "cortisol_score": cortisol_score,
        "ragebait_score": ragebait_score,
        "agency_score": agency_score,
        "human_benefit_score": human_benefit_score,
        "novelty_score": novelty_score,
        "pr_risk_score": pr_risk_score,
        "accepted": 1 if accepted else 0,
        "reason_code": reason_code,
        "reason_text": reason_text,
        "model_name": "heuristic-v0",
    }
|