Files
upbeatBytes/goodnews/llm.py
T
thejayman77 9cdcda5e02 Durability pass: tests, clearer diversity/classify behavior, Calm Filters foundation
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering +
  representative selection + time window, brief source/category diversity,
  avoid-term phrase matching, and text canonicalization/truncation.
- Rewrite _select_diverse with an explicit, tested contract (best-first, one
  per source, backfill, then inject a second category by evicting the
  lowest-ranked pick).
- classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so
  silent model failures are visible in both the cycle and classify output.
- Fix clean_text truncation to stay within max_len (ellipsis no longer
  overshoots).
- New filters.py: canonical FilterPrefs shape (include/mute topics+flavors,
  avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding
  Calm Filters. Not yet wired into the API.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 19:07:31 +00:00

406 lines
15 KiB
Python

from __future__ import annotations
import json
import os
import sqlite3
import urllib.error
import urllib.request
from collections.abc import Callable
from dataclasses import dataclass
from .taxonomy import (
FLAVORS,
TOPICS,
coerce_flavor,
coerce_topic,
flavors_prompt_block,
topics_prompt_block,
)
DEFAULT_BASE_URL = "http://127.0.0.1:1234/v1"
DEFAULT_MODEL = "gpt-oss"
DEFAULT_EMBED_MODEL = "text-embedding-nomic-embed-text-v1.5"
DEFAULT_TIMEOUT = 180
# Structured-output schema. Newer LM Studio / OpenAI-compatible servers want a
# json_schema response_format (older ones took json_object); we try schema first
# and fall back gracefully so the client works across server versions.
_SCORE_FIELD = {"type": "integer", "minimum": 0, "maximum": 10}
CLASSIFICATION_SCHEMA = {
"type": "object",
"additionalProperties": False,
"required": [
"constructive_score",
"cortisol_score",
"ragebait_score",
"agency_score",
"human_benefit_score",
"novelty_score",
"pr_risk_score",
"accepted",
"topic",
"flavor",
"reason_code",
"reason_text",
],
"properties": {
"constructive_score": _SCORE_FIELD,
"cortisol_score": _SCORE_FIELD,
"ragebait_score": _SCORE_FIELD,
"agency_score": _SCORE_FIELD,
"human_benefit_score": _SCORE_FIELD,
"novelty_score": _SCORE_FIELD,
"pr_risk_score": _SCORE_FIELD,
"accepted": {"type": "boolean"},
"topic": {"type": "string", "enum": list(TOPICS)},
"flavor": {"type": "string", "enum": list(FLAVORS)},
"reason_code": {"type": "string"},
"reason_text": {"type": "string"},
},
}
# Response-format variants tried in order. Once one succeeds for a client, it is
# pinned so we stop paying failed round-trips on every subsequent call.
_RESPONSE_FORMATS = (
{"type": "json_schema", "json_schema": {"name": "classification", "strict": True, "schema": CLASSIFICATION_SCHEMA}},
{"type": "json_object"},
None,
)
SYSTEM_PROMPT = """You classify article metadata for a calm constructive-news digest.
Judge emotional aftertaste, not simple positivity. Accept stories that leave a reader informed without feeling drained, especially when they include repair, progress, agency, resilience, human benefit, scientific discovery, environmental improvement, community action, or useful perspective.
Reject stories centered on fear, outrage, partisan conflict, crime, tragedy, disaster repetition, celebrity drama, market panic, or corporate PR without clear public benefit.
Also assign one topic and one flavor, choosing the single best fit.
Topic (what the story is about):
{topics}
Flavor (why it belongs in a calm, uplifting digest):
{flavors}
Return only JSON with this exact shape:
{{
"constructive_score": 0,
"cortisol_score": 0,
"ragebait_score": 0,
"agency_score": 0,
"human_benefit_score": 0,
"novelty_score": 0,
"pr_risk_score": 0,
"accepted": false,
"topic": "one_of_the_allowed_topics",
"flavor": "one_of_the_allowed_flavors",
"reason_code": "short_snake_case",
"reason_text": "one concise sentence"
}}
""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block())
@dataclass
class LocalModelClient:
base_url: str
model: str
api_key: str | None = None
timeout: int = DEFAULT_TIMEOUT
embed_model: str = DEFAULT_EMBED_MODEL
# Index into _RESPONSE_FORMATS that the server accepts; discovered lazily.
_response_format_idx: int | None = None
@classmethod
def from_env(cls) -> "LocalModelClient":
return cls(
base_url=os.environ.get("GOODNEWS_LLM_BASE_URL", DEFAULT_BASE_URL).rstrip("/"),
model=os.environ.get("GOODNEWS_LLM_MODEL", DEFAULT_MODEL),
api_key=os.environ.get("GOODNEWS_LLM_API_KEY"),
timeout=int(os.environ.get("GOODNEWS_LLM_TIMEOUT", DEFAULT_TIMEOUT)),
embed_model=os.environ.get("GOODNEWS_EMBED_MODEL", DEFAULT_EMBED_MODEL),
)
def embed(self, texts: list[str]) -> list[list[float]]:
"""Return embedding vectors for a batch of texts via /embeddings."""
body = json.dumps({"model": self.embed_model, "input": texts}).encode("utf-8")
headers = {"Content-Type": "application/json"}
if self.api_key:
headers["Authorization"] = f"Bearer {self.api_key}"
request = urllib.request.Request(
f"{self.base_url}/embeddings", data=body, headers=headers, method="POST"
)
try:
with urllib.request.urlopen(request, timeout=self.timeout) as response:
data = json.loads(response.read().decode("utf-8"))
except urllib.error.HTTPError as exc:
detail = exc.read().decode("utf-8", errors="replace")
raise RuntimeError(f"HTTP {exc.code} from embeddings: {detail}") from exc
except urllib.error.URLError as exc:
raise RuntimeError(f"could not reach embeddings at {self.base_url}: {exc.reason}") from exc
try:
return [item["embedding"] for item in data["data"]]
except (KeyError, TypeError) as exc:
raise RuntimeError(f"unexpected embeddings response: {data}") from exc
def classify(self, article: sqlite3.Row) -> dict:
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": _article_prompt(article)},
]
# If we already learned which response_format the server accepts, use it.
if self._response_format_idx is not None:
return self._chat(self._build_payload(messages, _RESPONSE_FORMATS[self._response_format_idx]))
# Otherwise escalate through the variants, pinning the first that works.
last_exc: RuntimeError | None = None
for idx, fmt in enumerate(_RESPONSE_FORMATS):
try:
result = self._chat(self._build_payload(messages, fmt))
self._response_format_idx = idx
return result
except RuntimeError as exc:
if "HTTP 400" not in str(exc):
raise
last_exc = exc
raise last_exc if last_exc else RuntimeError("no usable response_format")
def _build_payload(self, messages: list[dict], response_format: dict | None) -> dict:
payload = {"model": self.model, "temperature": 0.1, "messages": messages}
if response_format is not None:
payload["response_format"] = response_format
return payload
def list_models(self) -> list[str]:
headers = {}
if self.api_key:
headers["Authorization"] = f"Bearer {self.api_key}"
request = urllib.request.Request(f"{self.base_url}/models", headers=headers)
try:
with urllib.request.urlopen(request, timeout=10) as response:
data = json.loads(response.read().decode("utf-8"))
except urllib.error.HTTPError as exc:
detail = exc.read().decode("utf-8", errors="replace")
raise RuntimeError(f"HTTP {exc.code} from local model: {detail}") from exc
except urllib.error.URLError as exc:
raise RuntimeError(f"could not reach local model at {self.base_url}: {exc.reason}") from exc
models = data.get("data", [])
names = []
for model in models:
if isinstance(model, dict) and model.get("id"):
names.append(str(model["id"]))
return names
def _chat(self, payload: dict) -> dict:
body = json.dumps(payload).encode("utf-8")
headers = {"Content-Type": "application/json"}
if self.api_key:
headers["Authorization"] = f"Bearer {self.api_key}"
request = urllib.request.Request(
f"{self.base_url}/chat/completions",
data=body,
headers=headers,
method="POST",
)
try:
with urllib.request.urlopen(request, timeout=self.timeout) as response:
data = json.loads(response.read().decode("utf-8"))
except urllib.error.HTTPError as exc:
detail = exc.read().decode("utf-8", errors="replace")
raise RuntimeError(f"HTTP {exc.code} from local model: {detail}") from exc
except urllib.error.URLError as exc:
raise RuntimeError(f"could not reach local model at {self.base_url}: {exc.reason}") from exc
try:
content = data["choices"][0]["message"]["content"]
except (KeyError, IndexError, TypeError) as exc:
raise RuntimeError(f"unexpected local model response: {data}") from exc
return parse_classifier_json(content)
@dataclass
class ClassifyReport:
results: list[tuple[int, dict]]
attempted: int
succeeded: int
skipped: int
def classify_articles(
conn: sqlite3.Connection,
client: LocalModelClient,
limit: int,
include_rejected: bool = False,
dry_run: bool = False,
only_unclassified: bool = False,
progress: "Callable[[int, int, int], None] | None" = None,
) -> ClassifyReport:
rows = _classification_candidates(
conn, limit=limit, include_rejected=include_rejected, only_unclassified=only_unclassified
)
results = []
skipped = 0
for index, row in enumerate(rows, start=1):
try:
scores = client.classify(row)
except RuntimeError as exc:
# One slow/failed article (timeout, bad response) shouldn't sink the
# whole batch or discard work already committed. Skip and continue.
skipped += 1
print(f"[{row['id']}] skipped: {exc}")
continue
scores = normalize_scores(scores, model_name=client.model)
results.append((row["id"], scores))
if not dry_run:
upsert_article_score(conn, row["id"], scores)
conn.commit()
if progress is not None:
progress(index, len(rows), row["id"])
return ClassifyReport(results=results, attempted=len(rows), succeeded=len(results), skipped=skipped)
def parse_classifier_json(content: str) -> dict:
content = content.strip()
try:
return json.loads(content)
except json.JSONDecodeError:
start = content.find("{")
end = content.rfind("}")
if start == -1 or end == -1 or end <= start:
raise RuntimeError(f"model did not return JSON: {content}")
return json.loads(content[start : end + 1])
def normalize_scores(data: dict, model_name: str) -> dict:
return {
"constructive_score": _bounded_int(data.get("constructive_score")),
"cortisol_score": _bounded_int(data.get("cortisol_score")),
"ragebait_score": _bounded_int(data.get("ragebait_score")),
"agency_score": _bounded_int(data.get("agency_score")),
"human_benefit_score": _bounded_int(data.get("human_benefit_score")),
"novelty_score": _bounded_int(data.get("novelty_score")),
"pr_risk_score": _bounded_int(data.get("pr_risk_score")),
"accepted": 1 if bool(data.get("accepted")) else 0,
"topic": coerce_topic(data.get("topic")),
"flavor": coerce_flavor(data.get("flavor")),
"reason_code": str(data.get("reason_code") or "model_no_reason")[:120],
"reason_text": str(data.get("reason_text") or "")[:1000],
"model_name": model_name,
}
def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict) -> None:
conn.execute(
"""
INSERT INTO article_scores (
article_id, constructive_score, cortisol_score, ragebait_score,
agency_score, human_benefit_score, novelty_score, pr_risk_score,
accepted, topic, flavor, reason_code, reason_text, model_name, scored_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
ON CONFLICT(article_id) DO UPDATE SET
constructive_score = excluded.constructive_score,
cortisol_score = excluded.cortisol_score,
ragebait_score = excluded.ragebait_score,
agency_score = excluded.agency_score,
human_benefit_score = excluded.human_benefit_score,
novelty_score = excluded.novelty_score,
pr_risk_score = excluded.pr_risk_score,
accepted = excluded.accepted,
topic = excluded.topic,
flavor = excluded.flavor,
reason_code = excluded.reason_code,
reason_text = excluded.reason_text,
model_name = excluded.model_name,
scored_at = CURRENT_TIMESTAMP
""",
(
article_id,
scores["constructive_score"],
scores["cortisol_score"],
scores["ragebait_score"],
scores["agency_score"],
scores["human_benefit_score"],
scores["novelty_score"],
scores["pr_risk_score"],
scores["accepted"],
scores["topic"],
scores["flavor"],
scores["reason_code"],
scores["reason_text"],
scores["model_name"],
),
)
def _classification_candidates(
conn: sqlite3.Connection,
limit: int,
include_rejected: bool,
only_unclassified: bool = False,
) -> list[sqlite3.Row]:
filters = []
if not include_rejected:
filters.append("(s.accepted = 1 OR s.constructive_score >= 4)")
if only_unclassified:
# Articles still carrying the fast heuristic score, i.e. not yet judged
# by the model. Lets a scheduled cycle only spend the LLM on new items.
filters.append("s.model_name LIKE 'heuristic-%'")
where = ("WHERE " + " AND ".join(filters)) if filters else ""
return conn.execute(
f"""
SELECT
a.id,
a.title,
a.description,
a.published_at,
a.canonical_url,
src.name AS source_name,
src.default_category,
src.trust_score AS source_trust_score,
src.pr_risk_score AS source_pr_risk_score,
s.constructive_score,
s.cortisol_score,
s.ragebait_score,
s.agency_score,
s.human_benefit_score,
s.pr_risk_score,
s.accepted,
s.reason_code
FROM articles a
JOIN sources src ON src.id = a.source_id
LEFT JOIN article_scores s ON s.article_id = a.id
{where}
ORDER BY
CASE WHEN s.model_name LIKE 'heuristic-%' THEN 0 ELSE 1 END,
COALESCE(a.published_at, a.discovered_at) DESC
LIMIT ?
""",
(limit,),
).fetchall()
def _article_prompt(article: sqlite3.Row) -> str:
return "\n".join(
[
f"Source: {article['source_name']}",
f"Source category: {article['default_category'] or 'unknown'}",
f"Source trust score: {article['source_trust_score']}/10",
f"Source PR risk score: {article['source_pr_risk_score']}/10",
f"Published: {article['published_at'] or 'unknown'}",
f"Title: {article['title']}",
f"Snippet: {article['description'] or ''}",
f"URL: {article['canonical_url']}",
]
)
def _bounded_int(value: object) -> int:
try:
parsed = int(value)
except (TypeError, ValueError):
parsed = 0
return max(0, min(10, parsed))