# upbeatBytes/goodnews/llm.py

from __future__ import annotations

import json
import os
import sqlite3
import urllib.error
import urllib.request
from collections.abc import Callable
from dataclasses import dataclass

from .taxonomy import (
    FLAVORS,
    TOPICS,
    coerce_flavor,
    coerce_topic,
    flavors_prompt_block,
    topics_prompt_block,
)


# Fallbacks used by LocalModelClient.from_env() when the matching GOODNEWS_*
# environment variable is not set.
DEFAULT_BASE_URL = "http://127.0.0.1:1234/v1"  # API root; endpoint paths such as /chat/completions are appended to it
DEFAULT_MODEL = "gpt-oss"  # model id sent with chat-completion requests
DEFAULT_EMBED_MODEL = "text-embedding-nomic-embed-text-v1.5"  # model id sent with /embeddings requests
DEFAULT_TIMEOUT = 180  # handed to urlopen(timeout=...), i.e. seconds


# JSON Schema describing the classifier's reply. Servers that support
# json_schema response formats receive it so the output is constrained; servers
# that only know json_object (or nothing) are handled by the fallback list of
# response formats defined below.
_SCORE_FIELD = {"type": "integer", "minimum": 0, "maximum": 10}

# The 0-10 integer scores, in the order they appear in the schema.
_SCORE_KEYS = (
    "constructive_score",
    "cortisol_score",
    "ragebait_score",
    "agency_score",
    "human_benefit_score",
    "novelty_score",
    "pr_risk_score",
)

# Every non-score property, again in schema order.
_LABEL_FIELDS = {
    "accepted": {"type": "boolean"},
    "topic": {"type": "string", "enum": list(TOPICS)},
    "flavor": {"type": "string", "enum": list(FLAVORS)},
    "reason_code": {"type": "string"},
    "reason_text": {"type": "string"},
}

CLASSIFICATION_SCHEMA = {
    "type": "object",
    "additionalProperties": False,
    # Every property is mandatory: scores first, then the labels.
    "required": [*_SCORE_KEYS, *_LABEL_FIELDS],
    # All score properties share the single _SCORE_FIELD definition.
    "properties": {**dict.fromkeys(_SCORE_KEYS, _SCORE_FIELD), **_LABEL_FIELDS},
}

# Candidate response_format values, most structured first. A client walks this
# list until the server accepts one and then remembers that index, so later
# calls do not repeat the rejected attempts. None means "send no
# response_format field at all".
_RESPONSE_FORMATS = (
    {
        "type": "json_schema",
        "json_schema": {
            "name": "classification",
            "strict": True,
            "schema": CLASSIFICATION_SCHEMA,
        },
    },
    {"type": "json_object"},
    None,
)


# System message sent with every classification request. The literal is a
# str.format template: {topics} and {flavors} are filled in once, at import
# time, from the taxonomy prompt-block helpers, and the doubled braces {{ }}
# are format escapes that render as the literal braces of the example JSON.
SYSTEM_PROMPT = """You classify article metadata for a calm constructive-news digest.

Judge emotional aftertaste, not simple positivity. Accept stories that leave a reader informed without feeling drained, especially when they include repair, progress, agency, resilience, human benefit, scientific discovery, environmental improvement, community action, or useful perspective.

Reject stories centered on fear, outrage, partisan conflict, crime, tragedy, disaster repetition, celebrity drama, market panic, or corporate PR without clear public benefit.

Also assign one topic and one flavor, choosing the single best fit.

Topic (what the story is about):
{topics}

Flavor (why it belongs in a calm, uplifting digest):
{flavors}

Return only JSON with this exact shape:
{{
  "constructive_score": 0,
  "cortisol_score": 0,
  "ragebait_score": 0,
  "agency_score": 0,
  "human_benefit_score": 0,
  "novelty_score": 0,
  "pr_risk_score": 0,
  "accepted": false,
  "topic": "one_of_the_allowed_topics",
  "flavor": "one_of_the_allowed_flavors",
  "reason_code": "short_snake_case",
  "reason_text": "one concise sentence"
}}
""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block())


@dataclass
class LocalModelClient:
    """Client for an OpenAI-compatible HTTP API: chat completions, embeddings, model list.

    Network failures, HTTP error statuses and malformed replies are all raised
    as ``RuntimeError`` so callers have a single exception type to handle.
    """

    base_url: str
    model: str
    api_key: str | None = None
    timeout: int = DEFAULT_TIMEOUT
    embed_model: str = DEFAULT_EMBED_MODEL
    # Index into _RESPONSE_FORMATS that the server accepts; discovered lazily.
    _response_format_idx: int | None = None

    @classmethod
    def from_env(cls) -> "LocalModelClient":
        """Build a client from GOODNEWS_* environment variables.

        Unset variables fall back to the module-level DEFAULT_* values. A
        trailing slash on the base URL is dropped so endpoint paths can be
        appended directly.
        """
        return cls(
            base_url=os.environ.get("GOODNEWS_LLM_BASE_URL", DEFAULT_BASE_URL).rstrip("/"),
            model=os.environ.get("GOODNEWS_LLM_MODEL", DEFAULT_MODEL),
            api_key=os.environ.get("GOODNEWS_LLM_API_KEY"),
            timeout=int(os.environ.get("GOODNEWS_LLM_TIMEOUT", DEFAULT_TIMEOUT)),
            embed_model=os.environ.get("GOODNEWS_EMBED_MODEL", DEFAULT_EMBED_MODEL),
        )

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return embedding vectors for a batch of texts via /embeddings.

        An empty batch returns ``[]`` without contacting the server.

        Raises:
            RuntimeError: on network/HTTP failure or an unexpected reply shape.
        """
        if not texts:
            return []
        data = self._request_json(
            "/embeddings",
            {"model": self.embed_model, "input": texts},
            label="embeddings",
        )
        try:
            return [item["embedding"] for item in data["data"]]
        except (KeyError, TypeError) as exc:
            raise RuntimeError(f"unexpected embeddings response: {data}") from exc

    def classify(self, article: sqlite3.Row) -> dict:
        """Ask the chat model to classify one article and return its parsed JSON reply.

        The first successful call determines which entry of _RESPONSE_FORMATS
        the server accepts; that index is stored on the instance and reused.

        Raises:
            RuntimeError: on network/HTTP failure, an unexpected reply shape,
                or a reply that is not a JSON object.
        """
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": _article_prompt(article)},
        ]
        # If we already learned which response_format the server accepts, use it.
        if self._response_format_idx is not None:
            return self._chat(self._build_payload(messages, _RESPONSE_FORMATS[self._response_format_idx]))

        # Otherwise escalate through the variants, pinning the first that works.
        last_exc: RuntimeError | None = None
        for idx, fmt in enumerate(_RESPONSE_FORMATS):
            try:
                result = self._chat(self._build_payload(messages, fmt))
            except RuntimeError as exc:
                # Only an HTTP 400 is taken to mean "this response_format is
                # not supported"; anything else is a real failure.
                if "HTTP 400" not in str(exc):
                    raise
                last_exc = exc
            else:
                self._response_format_idx = idx
                return result
        raise last_exc if last_exc else RuntimeError("no usable response_format")

    def _build_payload(self, messages: list[dict], response_format: dict | None) -> dict:
        """Assemble the chat-completions request body; ``None`` omits response_format."""
        payload = {"model": self.model, "temperature": 0.1, "messages": messages}
        if response_format is not None:
            payload["response_format"] = response_format
        return payload

    def list_models(self) -> list[str]:
        """Return the ids listed by the server's /models endpoint.

        Uses a fixed 10-second timeout rather than ``self.timeout``.

        Raises:
            RuntimeError: on network/HTTP failure or a reply that is not a JSON object.
        """
        data = self._request_json("/models", label="local model", timeout=10)
        if not isinstance(data, dict):
            raise RuntimeError(f"unexpected local model response: {data}")
        models = data.get("data", [])
        if not isinstance(models, list):
            models = []
        return [
            str(model["id"])
            for model in models
            if isinstance(model, dict) and model.get("id")
        ]

    def _chat(self, payload: dict) -> dict:
        """POST ``payload`` to /chat/completions and parse the first choice as classifier JSON."""
        data = self._request_json("/chat/completions", payload, label="local model")
        try:
            content = data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as exc:
            raise RuntimeError(f"unexpected local model response: {data}") from exc
        # A message may carry no text (content null); report it instead of
        # letting the parser fail with AttributeError.
        if not isinstance(content, str):
            raise RuntimeError(f"unexpected local model response: {data}")
        try:
            return parse_classifier_json(content)
        except ValueError as exc:
            # json.JSONDecodeError is a ValueError; keep the single-exception contract.
            raise RuntimeError(f"model did not return JSON: {content}") from exc

    def _request_json(
        self,
        path: str,
        payload: dict | None = None,
        *,
        label: str = "local model",
        timeout: float | None = None,
    ) -> object:
        """Send one request to ``base_url + path`` and return the decoded JSON body.

        Args:
            path: endpoint path starting with "/".
            payload: JSON body to POST; ``None`` sends a body-less GET.
            label: name of the remote service used in error messages.
            timeout: per-request timeout in seconds; defaults to ``self.timeout``.

        Raises:
            RuntimeError: if the server cannot be reached, answers with an HTTP
                error status, times out, or returns a body that is not JSON.
        """
        headers: dict[str, str] = {}
        body = None
        if payload is not None:
            body = json.dumps(payload).encode("utf-8")
            headers["Content-Type"] = "application/json"
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        request = urllib.request.Request(
            f"{self.base_url}{path}",
            data=body,
            headers=headers,
            method="GET" if body is None else "POST",
        )
        effective_timeout = self.timeout if timeout is None else timeout
        try:
            with urllib.request.urlopen(request, timeout=effective_timeout) as response:
                raw = response.read()
        except urllib.error.HTTPError as exc:
            detail = exc.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"HTTP {exc.code} from {label}: {detail}") from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(f"could not reach {label} at {self.base_url}: {exc.reason}") from exc
        except OSError as exc:
            # urllib wraps only errors raised while *sending* the request in
            # URLError. A timeout or connection reset while waiting for or
            # reading the response arrives as a bare TimeoutError/OSError.
            raise RuntimeError(f"could not reach {label} at {self.base_url}: {exc}") from exc
        try:
            return json.loads(raw.decode("utf-8"))
        except ValueError as exc:
            # Covers both UnicodeDecodeError and json.JSONDecodeError.
            raise RuntimeError(f"unexpected {label} response: {raw[:500]!r}") from exc


@dataclass
class ClassifyReport:
    """Summary returned by classify_articles() for one batch run."""

    # (article id, normalized score dict) for each article that was classified.
    results: list[tuple[int, dict]]
    # Number of candidate rows selected for the run.
    attempted: int
    # Number of entries in ``results``.
    succeeded: int
    # Number of candidates dropped because their classify call failed.
    skipped: int


def classify_articles(
    conn: sqlite3.Connection,
    client: LocalModelClient,
    limit: int,
    include_rejected: bool = False,
    dry_run: bool = False,
    only_unclassified: bool = False,
    progress: "Callable[[int, int, int], None] | None" = None,
) -> ClassifyReport:
    """Classify up to ``limit`` candidate articles with the model and store the scores.

    Args:
        conn: database connection; each stored score is committed immediately.
        client: model client whose ``classify`` method is called per article.
        limit: maximum number of candidate rows to process.
        include_rejected: forwarded to the candidate query.
        dry_run: when true, scores are computed but nothing is written.
        only_unclassified: forwarded to the candidate query.
        progress: optional callback invoked as ``progress(index, total, article_id)``
            after each successfully classified article.

    Returns:
        A ClassifyReport with the per-article results and attempted/succeeded/
        skipped counts.
    """
    rows = _classification_candidates(
        conn, limit=limit, include_rejected=include_rejected, only_unclassified=only_unclassified
    )
    total = len(rows)
    results = []
    skipped = 0
    for index, row in enumerate(rows, start=1):
        article_id = row["id"]
        try:
            raw_scores = client.classify(row)
            if not isinstance(raw_scores, dict):
                raise RuntimeError(f"model returned non-object JSON: {raw_scores!r}")
        except (RuntimeError, OSError, json.JSONDecodeError) as exc:
            # One slow/failed article (timeout, bad response) shouldn't sink the
            # whole batch or discard work already committed. Skip and continue.
            # OSError covers socket timeouts that urllib does not wrap in
            # URLError; JSONDecodeError covers an unparseable model reply.
            skipped += 1
            print(f"[{article_id}] skipped: {exc}")
            continue
        scores = normalize_scores(raw_scores, model_name=client.model)
        results.append((article_id, scores))
        if not dry_run:
            # Commit per article so an interruption keeps the work done so far.
            upsert_article_score(conn, article_id, scores)
            conn.commit()
        if progress is not None:
            progress(index, total, article_id)
    return ClassifyReport(results=results, attempted=total, succeeded=len(results), skipped=skipped)


def parse_classifier_json(content: str) -> dict:
    """Extract the classifier's JSON object from the model's reply text.

    The text is parsed as JSON directly. If that fails, the span from the first
    "{" to the last "}" is tried, which tolerates replies wrapped in prose or
    Markdown code fences.

    Args:
        content: raw message text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        RuntimeError: if ``content`` is not text, contains no parseable JSON,
            or the JSON it contains is not an object.
    """
    if not isinstance(content, str):
        raise RuntimeError(f"model did not return JSON: {content!r}")
    text = content.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end <= start:
            raise RuntimeError(f"model did not return JSON: {text}")
        try:
            parsed = json.loads(text[start : end + 1])
        except json.JSONDecodeError as exc:
            # Braces were present but what lies between them is still not JSON.
            raise RuntimeError(f"model did not return JSON: {text}") from exc
    if not isinstance(parsed, dict):
        # Valid JSON, but a list/number/string cannot carry the score fields.
        raise RuntimeError(f"model did not return a JSON object: {text}")
    return parsed


# Spellings of "yes" a model may emit as a string instead of a JSON boolean.
_TRUE_STRINGS = frozenset({"true", "yes", "y", "1", "accept", "accepted"})


def _accepted_flag(value: object) -> bool:
    """Interpret the model's ``accepted`` value as a boolean.

    Strings are matched against known affirmative spellings, because
    ``bool("false")`` is True and would silently accept a rejected article.
    Any other value uses ordinary truthiness.
    """
    if isinstance(value, str):
        return value.strip().lower() in _TRUE_STRINGS
    return bool(value)


def normalize_scores(data: dict, model_name: str) -> dict:
    """Coerce a raw classifier reply into the row shape stored in article_scores.

    Args:
        data: decoded JSON object returned by the model; missing keys are tolerated.
        model_name: recorded in the result as ``model_name``.

    Returns:
        A dict with the seven scores clamped to integers in 0..10, ``accepted``
        as 0/1, ``topic`` and ``flavor`` passed through the taxonomy coercers,
        ``reason_code`` (at most 120 characters, default "model_no_reason"),
        ``reason_text`` (at most 1000 characters) and ``model_name``.
    """
    return {
        "constructive_score": _bounded_int(data.get("constructive_score")),
        "cortisol_score": _bounded_int(data.get("cortisol_score")),
        "ragebait_score": _bounded_int(data.get("ragebait_score")),
        "agency_score": _bounded_int(data.get("agency_score")),
        "human_benefit_score": _bounded_int(data.get("human_benefit_score")),
        "novelty_score": _bounded_int(data.get("novelty_score")),
        "pr_risk_score": _bounded_int(data.get("pr_risk_score")),
        "accepted": 1 if _accepted_flag(data.get("accepted")) else 0,
        "topic": coerce_topic(data.get("topic")),
        "flavor": coerce_flavor(data.get("flavor")),
        "reason_code": str(data.get("reason_code") or "model_no_reason")[:120],
        "reason_text": str(data.get("reason_text") or "")[:1000],
        "model_name": model_name,
    }


def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict) -> None:
    """Insert the score row for ``article_id``, or overwrite the existing one.

    ``scores`` must contain every key listed in ``value_keys`` below. The
    ``scored_at`` column is set to the current timestamp on both insert and
    update. The statement is executed but not committed here.
    """
    # Keys of ``scores`` in the same order as the INSERT column list
    # (after article_id).
    value_keys = (
        "constructive_score",
        "cortisol_score",
        "ragebait_score",
        "agency_score",
        "human_benefit_score",
        "novelty_score",
        "pr_risk_score",
        "accepted",
        "topic",
        "flavor",
        "reason_code",
        "reason_text",
        "model_name",
    )
    statement = """
        INSERT INTO article_scores (
            article_id, constructive_score, cortisol_score, ragebait_score,
            agency_score, human_benefit_score, novelty_score, pr_risk_score,
            accepted, topic, flavor, reason_code, reason_text, model_name, scored_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(article_id) DO UPDATE SET
            constructive_score = excluded.constructive_score,
            cortisol_score = excluded.cortisol_score,
            ragebait_score = excluded.ragebait_score,
            agency_score = excluded.agency_score,
            human_benefit_score = excluded.human_benefit_score,
            novelty_score = excluded.novelty_score,
            pr_risk_score = excluded.pr_risk_score,
            accepted = excluded.accepted,
            topic = excluded.topic,
            flavor = excluded.flavor,
            reason_code = excluded.reason_code,
            reason_text = excluded.reason_text,
            model_name = excluded.model_name,
            scored_at = CURRENT_TIMESTAMP
        """
    parameters = (article_id, *[scores[key] for key in value_keys])
    conn.execute(statement, parameters)


def _classification_candidates(
    conn: sqlite3.Connection,
    limit: int,
    include_rejected: bool,
    only_unclassified: bool = False,
) -> list[sqlite3.Row]:
    """Select the articles to send to the model, joined with source and current score.

    Rows whose current score has a ``heuristic-`` model name sort first; within
    each group the newest article (by published date, else discovered date)
    comes first. At most ``limit`` rows are returned.

    Args:
        conn: database connection to query.
        limit: value bound to the query's LIMIT.
        include_rejected: when false, keep only rows whose current score is
            accepted or has constructive_score >= 4.
        only_unclassified: when true, keep only rows still carrying a
            ``heuristic-`` model name, i.e. not yet judged by the model.
    """
    optional_clauses = (
        (not include_rejected, "(s.accepted = 1 OR s.constructive_score >= 4)"),
        # Articles still carrying the fast heuristic score, i.e. not yet judged
        # by the model. Lets a scheduled cycle only spend the LLM on new items.
        (only_unclassified, "s.model_name LIKE 'heuristic-%'"),
    )
    active = [clause for enabled, clause in optional_clauses if enabled]
    where = "WHERE " + " AND ".join(active) if active else ""
    query = f"""
        SELECT
            a.id,
            a.title,
            a.description,
            a.published_at,
            a.canonical_url,
            src.name AS source_name,
            src.default_category,
            src.trust_score AS source_trust_score,
            src.pr_risk_score AS source_pr_risk_score,
            s.constructive_score,
            s.cortisol_score,
            s.ragebait_score,
            s.agency_score,
            s.human_benefit_score,
            s.pr_risk_score,
            s.accepted,
            s.reason_code
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        {where}
        ORDER BY
            CASE WHEN s.model_name LIKE 'heuristic-%' THEN 0 ELSE 1 END,
            COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """
    cursor = conn.execute(query, (limit,))
    return cursor.fetchall()


def _article_prompt(article: sqlite3.Row) -> str:
    """Render one candidate row as the user message: one "Label: value" line per field.

    A missing source category or published date is shown as "unknown"; a
    missing description becomes an empty snippet.
    """
    source = article["source_name"]
    category = article["default_category"] or "unknown"
    trust = article["source_trust_score"]
    pr_risk = article["source_pr_risk_score"]
    published = article["published_at"] or "unknown"
    title = article["title"]
    snippet = article["description"] or ""
    url = article["canonical_url"]
    lines = (
        f"Source: {source}",
        f"Source category: {category}",
        f"Source trust score: {trust}/10",
        f"Source PR risk score: {pr_risk}/10",
        f"Published: {published}",
        f"Title: {title}",
        f"Snippet: {snippet}",
        f"URL: {url}",
    )
    return "\n".join(lines)


def _bounded_int(value: object) -> int:
    """Convert ``value`` to an integer clamped to the range 0..10.

    Anything ``int()`` cannot convert yields 0. That includes ``None``,
    non-numeric strings, NaN (ValueError) and infinities (OverflowError);
    Python's JSON decoder accepts ``NaN`` and ``Infinity``, so those can
    arrive in a model reply.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        parsed = 0
    return max(0, min(10, parsed))