# upbeatBytes/goodnews/scoring.py

from __future__ import annotations

import re


# Keyword vocabularies consumed by score_article() through _count_terms().
# All entries are lowercase; an entry containing a space is a phrase.

# Language counted toward the "positive" signal.
POSITIVE_TERMS = {
    # advances and fixes
    "breakthrough", "progress", "improve", "improves", "improved",
    "solution", "solutions", "first", "record",
    # recovery and rescue
    "recovery", "restore", "restores", "restoration", "rescue", "rescued",
    # community and giving
    "volunteer", "community", "donate", "donation", "kindness", "hope",
    # health
    "cure", "treatment", "therapy",
    # environment
    "clean energy", "renewable", "conservation", "protect", "protects",
}

# Language counted toward the "agency" signal.
AGENCY_TERMS = {
    # explanatory / instructive framing
    "how", "learn", "guide", "teaches",
    # action verbs
    "helps", "helping", "protect", "protects", "builds", "creates",
    "launches", "rebuild",
    # organised efforts
    "tool", "program", "initiative", "effort", "plan",
}

# Language counted toward the "cortisol" signal.
CORTISOL_TERMS = {
    # violence and death
    "war", "killed", "dead", "death", "murder", "shooting", "attack",
    # emergencies
    "crisis", "catastrophe", "disaster", "collapse", "panic",
    # alarm
    "warning", "threat", "fear", "fears",
    # controversy
    "lawsuit", "scandal",
}

# Language counted toward the "ragebait" signal.
RAGEBAIT_TERMS = {
    "slams", "blasts", "destroyed", "humiliates",
    "furious", "outrage", "rage", "shocking",
    "you won't believe", "sparks backlash",
}

# Language counted toward the "PR risk" signal.
PR_TERMS = {
    "announces", "unveils", "raises",
    "funding round", "partnership", "press release",
    "brand", "sponsored",
}

WORD_RE = re.compile(r"[a-z0-9']+")


def _count_terms(text: str, terms: set[str]) -> int:
    lowered = text.lower()
    words = set(WORD_RE.findall(lowered))
    count = 0
    for term in terms:
        if " " in term:
            count += 1 if term in lowered else 0
        elif term in words:
            count += 1
    return count


def score_article(title: str, description: str | None, source_pr_risk: int) -> dict:
    """Score an article's headline and snippet with keyword heuristics.

    The title and (optional) description are joined into one text, and the
    number of distinct hits from each vocabulary is turned into a score
    capped at 10.

    Args:
        title: Article headline.
        description: Optional snippet; a falsy value is treated as empty.
        source_pr_risk: Baseline PR risk for the source, added to the
            PR-term contribution before capping.

    Returns:
        A dict with the individual ``*_score`` values, ``accepted`` (1 or 0),
        a ``reason_code`` / ``reason_text`` pair explaining the outcome, and
        ``model_name``. ``novelty_score`` is fixed at 5 by this heuristic.
    """
    snippet = description if description else ""
    combined = f"{title}. {snippet}"

    n_positive, n_agency, n_cortisol, n_ragebait, n_pr = (
        _count_terms(combined, vocabulary)
        for vocabulary in (
            POSITIVE_TERMS,
            AGENCY_TERMS,
            CORTISOL_TERMS,
            RAGEBAIT_TERMS,
            PR_TERMS,
        )
    )

    # Positive hits weigh double; agency hits add one each.
    uplift = n_positive * 2 + n_agency
    scores = {
        "constructive_score": min(10, 2 + uplift),
        "cortisol_score": min(10, n_cortisol * 3),
        "ragebait_score": min(10, n_ragebait * 4),
        "agency_score": min(10, 1 + n_agency * 2),
        "human_benefit_score": min(10, uplift),
        "novelty_score": 5,
        "pr_risk_score": min(10, source_pr_risk + n_pr * 2),
    }

    # Rejection reasons are checked in priority order; an article is accepted
    # only when none applies and the constructive score reaches 5.
    if scores["ragebait_score"] > 3:
        verdict = (
            0,
            "heuristic_reject_ragebait_language",
            "Headline or snippet contains outrage-oriented language.",
        )
    elif scores["cortisol_score"] > 5:
        verdict = (
            0,
            "heuristic_reject_cortisol_heavy",
            "Headline or snippet appears tragedy, threat, conflict, or crisis centered.",
        )
    elif scores["pr_risk_score"] > 7:
        verdict = (
            0,
            "heuristic_reject_pr_risk",
            "Headline or source has signs of corporate PR framing.",
        )
    elif scores["constructive_score"] >= 5:
        verdict = (
            1,
            "heuristic_constructive_candidate",
            "Constructive or agency-oriented language with low obvious cortisol/ragebait signals.",
        )
    else:
        verdict = (
            0,
            "heuristic_needs_review",
            "Not enough constructive signal for automatic acceptance.",
        )

    accepted_flag, reason_code, reason_text = verdict
    scores["accepted"] = accepted_flag
    scores["reason_code"] = reason_code
    scores["reason_text"] = reason_text
    scores["model_name"] = "heuristic-v0"
    return scores