667b1a82c3
Per the logo + brand: the name is upbeatBytes (camelCase). Swept all user-facing strings — titles/og:site_name/og:title, logo alt text, share pages (share.py), emails (email_send), classifier prompt (llm), digest/unsubscribe (api), PWA manifest, game share text, sign-in, the SPA shell + patch-static-heads (play title) — plus README/publish.sh and the email test fixture. (SMTP From env was already upbeatBytes.) Domains (upbeatbytes.com) unchanged. 425 BE + 36 FE green. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
531 lines
24 KiB
Python
531 lines
24 KiB
Python
from __future__ import annotations
|
||
|
||
import json
|
||
import os
|
||
import sqlite3
|
||
import urllib.error
|
||
import urllib.request
|
||
from collections.abc import Callable
|
||
from dataclasses import dataclass
|
||
|
||
from .taxonomy import (
|
||
ALLOWED_TAGS,
|
||
FLAVORS,
|
||
MAX_TAGS,
|
||
TOPICS,
|
||
coerce_flavor,
|
||
coerce_tags,
|
||
coerce_topic,
|
||
flavors_prompt_block,
|
||
tags_prompt_block,
|
||
topics_prompt_block,
|
||
)
|
||
|
||
|
||
# Fallback connection settings, used when the matching environment variable is unset.
DEFAULT_BASE_URL = "http://127.0.0.1:1234/v1"  # OpenAI-compatible API root
DEFAULT_MODEL = "gpt-oss"  # chat model used for classification
DEFAULT_EMBED_MODEL = "text-embedding-nomic-embed-text-v1.5"  # model sent to /embeddings
DEFAULT_TIMEOUT = 180  # passed to urlopen(timeout=...) for chat and embedding calls
|
||
|
||
|
||
# Structured-output schema. Newer LM Studio / OpenAI-compatible servers want a
# json_schema response_format (older ones took json_object); we try schema first
# and fall back gracefully so the client works across server versions.
_SCORE_FIELD = {"type": "integer", "minimum": 0, "maximum": 10}

# The seven 0-10 score fields, in the order they appear in the schema.
_SCORE_KEYS = (
    "constructive_score",
    "cortisol_score",
    "ragebait_score",
    "agency_score",
    "human_benefit_score",
    "novelty_score",
    "pr_risk_score",
)

# Every non-score property; insertion order is also the order used for "required".
_NON_SCORE_PROPERTIES = {
    "accepted": {"type": "boolean"},
    "topic": {"type": "string", "enum": list(TOPICS)},
    "flavor": {"type": "string", "enum": list(FLAVORS)},
    "tags": {
        "type": "array",
        "items": {"type": "string", "enum": list(ALLOWED_TAGS)},
        "maxItems": MAX_TAGS,
    },
    "reason_code": {"type": "string"},
    "reason_text": {"type": "string"},
    "language": {"type": "string"},  # ISO 639-1 of the article's own text (en, de, es…)
}

CLASSIFICATION_SCHEMA = {
    "type": "object",
    "additionalProperties": False,
    # All properties are mandatory: scores first, then the remaining fields.
    "required": [*_SCORE_KEYS, *_NON_SCORE_PROPERTIES],
    "properties": {
        **{key: _SCORE_FIELD for key in _SCORE_KEYS},
        **_NON_SCORE_PROPERTIES,
    },
}
|
||
|
||
# Response-format variants tried in order. Once one succeeds for a client, it is
# pinned so we stop paying failed round-trips on every subsequent call.
_JSON_SCHEMA_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "classification",
        "strict": True,
        "schema": CLASSIFICATION_SCHEMA,
    },
}
_JSON_OBJECT_FORMAT = {"type": "json_object"}

# The trailing None means "send no response_format at all".
_RESPONSE_FORMATS = (_JSON_SCHEMA_FORMAT, _JSON_OBJECT_FORMAT, None)
|
||
|
||
|
||
# Classifier system-prompt template. The {topics}, {flavors} and {tags}
# placeholders are filled from the taxonomy prompt blocks below; doubled braces
# render as the literal braces of the JSON example.
_SYSTEM_PROMPT_TEMPLATE = """You classify article metadata for upbeatBytes, a calm news digest.

The bar is NOT "is this happy?" — it is "will a reader finish this calm or a little better, never worse?" ACCEPT stories that are calm, neutral, insightful, or uplifting: they inform, teach, delight, or show progress or benefit. Neutral-but-absorbing is welcome — a discovery, a clear explainer, a clever build or gadget, a fascinating bit of science, space, nature, design, or culture, a genuinely useful insight — even when it isn't "feel-good."

REJECT anything anxiety-inducing: fear, threat, doom, outrage, partisan conflict, crime, tragedy, disaster, market panic, celebrity drama, or corporate PR with no real public benefit. Also reject visceral-threat and body-horror hooks — disease outbreaks, parasites, infestations, contamination, recalls, poisonings, deadly or "flesh-eating" infections — EVEN when the piece is calmly written or framed as "monitoring," "surveillance," "awareness," or "public health." A measured, factual telling of an alarming subject still leaves a worse aftertaste. ESPECIALLY reject the comparison traps — anything that would make a reader feel inferior, behind, inadequate, envious, or pressured (status flexing, FOMO, hustle-grind, "you're falling behind"). When unsure, judge the emotional aftertaste, not the topic.

Health and public-health stories ARE welcome when the subject itself is benign or hopeful — a treatment that helps, a disease in decline, prevention, recovery, caregiving, fitness, mental wellbeing, or a genuine medical advance. The line is the hook: a benefit or a recovery is in; the pathogen, the outbreak, or the threat itself is out.

Score cortisol_score by the reader's personal, visceral, or public-health threat — NOT by dramatic vocabulary or the grandeur of the subject. Distant astronomy and cosmology (black holes, stars, cosmic events), engineering or equipment hazards, geological forces, scientific self-correction and measurement quirks, natural-history mechanisms, predator–prey biology, and historical discoveries are LOW cortisol (0–3) even when written with words like "deadly," "lethal," "destructive," "shocking," or "dangerous." A black hole winking across the cosmos, harsh lunar regolith that shreds equipment, a venomous snake's biology, or an ancient extinction is wonder, not dread — accept it. Reserve high cortisol for disease, contamination, outbreak, parasites, violence, or immediate human or animal suffering — that is what the reader's gut actually flinches from.

On AI specifically: this is NOT "no AI" — it is "no AI dread." ACCEPT AI stories about practical tools, accessibility, medical/scientific/educational benefit, creative or maker use, environmental or resource gains, open research, humane design, or a specific bounded innovation. REJECT AI stories whose main frame is loss of human control, cognitive decline or "brain rot," job-displacement panic, surveillance panic, existential doom, harm-to-children or social-fabric panic, "you're falling behind" productivity anxiety, or adversarial arms-race framing.

Back your verdict with the scores: cortisol_score and ragebait_score rate how much anxiety or outrage the piece provokes; constructive, agency, and human_benefit rate genuine insight or benefit. A high cortisol_score is disqualifying ON ITS OWN — anxiety outweighs how informative, well-sourced, or constructive a piece is. Do not let "informative" or "public health" rescue an unsettling subject.

Also assign one primary topic and one flavor (the single best fit), plus 1-4 grouping tags.

Primary topic (what the story is mainly about):
{topics}

Flavor (why it belongs in a calm, uplifting digest):
{flavors}

Grouping tags — choose ONLY from this controlled vocabulary:
{tags}

Tag discipline: assign 1-4 tags; prefer fewer, stronger ones; never tag by weak
association; pick tags a reader would reasonably use to find this story later.

Also report `language`: the ISO 639-1 code of the article's OWN text (the title and
description), e.g. "en", "de", "es", "fr". Judge the language of the words, not the
subject. This is detection only — score and accept the story on its merits as usual;
the site decides separately what to do with non-English items.

Return only JSON with this exact shape:
{{
"constructive_score": 0,
"cortisol_score": 0,
"ragebait_score": 0,
"agency_score": 0,
"human_benefit_score": 0,
"novelty_score": 0,
"pr_risk_score": 0,
"accepted": false,
"topic": "one_of_the_allowed_topics",
"flavor": "one_of_the_allowed_flavors",
"tags": ["one_to_four_allowed_tags"],
"reason_code": "short_snake_case",
"reason_text": "one concise sentence",
"language": "en"
}}
"""

# Rendered once at import time and sent as the system message of every classify call.
SYSTEM_PROMPT = _SYSTEM_PROMPT_TEMPLATE.format(
    topics=topics_prompt_block(),
    flavors=flavors_prompt_block(),
    tags=tags_prompt_block(),
)
|
||
|
||
|
||
@dataclass
class LocalModelClient:
    """Small client for an OpenAI-compatible server: chat, embeddings, model list.

    Every transport or response-shape failure is raised as ``RuntimeError`` so
    callers only need to handle one exception type.
    """

    base_url: str
    model: str
    api_key: str | None = None
    timeout: int = DEFAULT_TIMEOUT
    embed_model: str = DEFAULT_EMBED_MODEL
    # Index into _RESPONSE_FORMATS that the server accepts; discovered lazily.
    _response_format_idx: int | None = None

    @classmethod
    def from_env(cls) -> "LocalModelClient":
        """Build a client from the GOODNEWS_* environment variables.

        Unset variables fall back to the module defaults; the API key defaults
        to ``None``. A trailing slash on the base URL is removed.
        """
        return cls(
            base_url=os.environ.get("GOODNEWS_LLM_BASE_URL", DEFAULT_BASE_URL).rstrip("/"),
            model=os.environ.get("GOODNEWS_LLM_MODEL", DEFAULT_MODEL),
            api_key=os.environ.get("GOODNEWS_LLM_API_KEY"),
            timeout=int(os.environ.get("GOODNEWS_LLM_TIMEOUT", DEFAULT_TIMEOUT)),
            embed_model=os.environ.get("GOODNEWS_EMBED_MODEL", DEFAULT_EMBED_MODEL),
        )

    def _request_json(
        self,
        path: str,
        payload: dict | None = None,
        *,
        label: str = "local model",
        timeout: int | None = None,
    ) -> object:
        """Send one request to ``base_url + path`` and return the decoded JSON body.

        ``payload`` is POSTed as JSON when given; otherwise a plain GET is sent.
        ``label`` names the endpoint in error messages and ``timeout`` overrides
        ``self.timeout`` for this call.

        Raises:
            RuntimeError: on an HTTP error status, any network failure
                (including a timeout while waiting for or reading the
                response), or a body that is not valid JSON.
        """
        headers = {}
        body = None
        if payload is not None:
            body = json.dumps(payload).encode("utf-8")
            headers["Content-Type"] = "application/json"
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        request = urllib.request.Request(
            f"{self.base_url}{path}",
            data=body,
            headers=headers,
            method="GET" if payload is None else "POST",
        )
        effective_timeout = self.timeout if timeout is None else timeout
        try:
            with urllib.request.urlopen(request, timeout=effective_timeout) as response:
                raw = response.read()
        except urllib.error.HTTPError as exc:
            detail = exc.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"HTTP {exc.code} from {label}: {detail}") from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(f"could not reach {label} at {self.base_url}: {exc.reason}") from exc
        except OSError as exc:
            # urlopen wraps only connect/send failures in URLError. A timeout or
            # reset while waiting for the response, or while reading its body,
            # surfaces as a bare TimeoutError/ConnectionError.
            raise RuntimeError(f"could not reach {label} at {self.base_url}: {exc}") from exc
        try:
            return json.loads(raw.decode("utf-8"))
        except ValueError as exc:
            # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            raise RuntimeError(f"invalid JSON from {label}: {raw[:200]!r}") from exc

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return embedding vectors for a batch of texts via /embeddings."""
        data = self._request_json(
            "/embeddings",
            {"model": self.embed_model, "input": texts},
            label="embeddings",
        )
        try:
            return [item["embedding"] for item in data["data"]]
        except (KeyError, TypeError) as exc:
            raise RuntimeError(f"unexpected embeddings response: {data}") from exc

    def classify(self, article: sqlite3.Row) -> dict:
        """Classify one article row and return the model's parsed JSON verdict.

        The first call walks _RESPONSE_FORMATS in order and remembers the first
        variant the server accepts; later calls reuse that variant directly.

        Raises:
            RuntimeError: on any failure other than an HTTP 400 during format
                discovery, or the last HTTP 400 error if every variant is rejected.
        """
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": _article_prompt(article)},
        ]
        # If we already learned which response_format the server accepts, use it.
        if self._response_format_idx is not None:
            pinned = _RESPONSE_FORMATS[self._response_format_idx]
            return self._chat(self._build_payload(messages, pinned))

        # Otherwise escalate through the variants, pinning the first that works.
        last_exc: RuntimeError | None = None
        for idx, fmt in enumerate(_RESPONSE_FORMATS):
            try:
                result = self._chat(self._build_payload(messages, fmt))
            except RuntimeError as exc:
                # Only an HTTP 400 is treated as "format not supported, try the next".
                if "HTTP 400" not in str(exc):
                    raise
                last_exc = exc
            else:
                self._response_format_idx = idx
                return result
        raise last_exc if last_exc else RuntimeError("no usable response_format")

    def _build_payload(self, messages: list[dict], response_format: dict | None) -> dict:
        """Assemble a chat-completions body; response_format is omitted when None."""
        payload = {"model": self.model, "temperature": 0.1, "messages": messages}
        if response_format is not None:
            payload["response_format"] = response_format
        return payload

    def list_models(self) -> list[str]:
        """Return the model ids reported by GET /models (10-second timeout).

        Entries that are not objects or have no truthy ``id`` are ignored; a
        body without a ``data`` list yields an empty list.
        """
        data = self._request_json("/models", timeout=10)
        entries = data.get("data") if isinstance(data, dict) else None
        if not isinstance(entries, list):
            return []
        return [
            str(entry["id"])
            for entry in entries
            if isinstance(entry, dict) and entry.get("id")
        ]

    def chat_text(self, messages: list[dict]) -> str:
        """Plain chat completion → the raw message text (no JSON parsing).

        Used for free-form output like summaries; classification uses _chat,
        which JSON-parses the same content.
        """
        return self._raw_content(self._build_payload(messages, None))

    def rank_for_social(self, candidates: list[dict]) -> list[dict]:
        """ONE bounded COMPARATIVE pass over a small candidate set (not N calls).

        Returns a best-first list of {id, social_score 0-10, why, talking_points,
        angle, entities}. Bounded by self.timeout; callers fall back to deterministic
        ranking on ANY failure, so the Publishing Desk always works.
        """
        if not candidates:
            return []
        lines = []
        for c in candidates:
            # Collapse whitespace and cap the summary so the prompt stays small.
            summ = " ".join((c.get("summary") or "").split())[:280]
            lines.append(f'- id={int(c["id"])} | topic={c.get("topic")} | {c["title"]} :: {summ}')
        user = (
            "These are constructive-news articles. Compare them as candidates for a SHORT X "
            "(Twitter) post from a calm good-news account, and rank best-first by SOCIAL "
            "share-worthiness — would someone stop scrolling? That differs from how 'good' the "
            "article is.\n\n" + "\n".join(lines) + "\n\n"
            'Reply with JSON only, exactly this shape:\n'
            '{"ranked": [{"id": <one of the ids above>, "social_score": <0-10>, '
            '"why": "one sentence: why it stops the scroll", '
            '"talking_points": ["3 short factual points a writer could use"], '
            '"angle": "a possible conversational angle", '
            '"entities": ["real org/person names mentioned, for tagging"]}]}\n'
            "Only use ids from the list above. Order best-first."
        )
        messages = [
            {"role": "system", "content": "You rank constructive news for social sharing. Reply with JSON only."},
            {"role": "user", "content": user},
        ]
        data = parse_classifier_json(self.chat_text(messages))
        ranked = data.get("ranked") if isinstance(data, dict) else None
        if not isinstance(ranked, list):
            raise RuntimeError("rank_for_social: missing 'ranked' list")
        out = []
        for r in ranked:
            if not isinstance(r, dict):
                continue
            try:
                rid = int(r.get("id"))
            except (TypeError, ValueError, OverflowError):
                # Missing, non-numeric, or infinite id: drop the entry.
                continue
            # Require ACTUAL lists — a model that returns a bare string must not be
            # iterated into characters ("fact" → ["f","a","c","t"]).
            tp = r.get("talking_points")
            ents = r.get("entities")
            out.append({
                "id": rid,
                "social_score": _bounded_int(r.get("social_score")),
                "why": str(r.get("why") or "")[:300],
                "talking_points": [str(p)[:200] for p in tp][:4] if isinstance(tp, list) else [],
                "angle": str(r.get("angle") or "")[:300],
                "entities": [str(e)[:80] for e in ents][:8] if isinstance(ents, list) else [],
            })
        return out

    def _raw_content(self, payload: dict) -> str:
        """POST a chat-completions payload and return the first choice's message content.

        Raises:
            RuntimeError: on transport failure or when the response lacks
                ``choices[0].message.content``.
        """
        data = self._request_json("/chat/completions", payload)
        try:
            return data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as exc:
            raise RuntimeError(f"unexpected local model response: {data}") from exc

    def _chat(self, payload: dict) -> dict:
        """Run a chat completion and JSON-parse its content.

        Raises:
            RuntimeError: on transport failure or unparseable model output.
        """
        try:
            return parse_classifier_json(self._raw_content(payload))
        except ValueError as exc:
            # Keep the single-exception contract even if the parser leaks a decode error.
            raise RuntimeError(f"model returned malformed JSON: {exc}") from exc
|
||
|
||
|
||
@dataclass
class ClassifyReport:
    """Summary of one classification batch."""

    # (article id, normalized scores) for each article that was classified.
    results: list[tuple[int, dict]]
    # Number of candidate rows selected for the run.
    attempted: int
    # Number of entries in ``results``.
    succeeded: int
    # Number of candidates whose classification failed and were passed over.
    skipped: int
|
||
|
||
|
||
def classify_articles(
    conn: sqlite3.Connection,
    client: LocalModelClient,
    limit: int,
    include_rejected: bool = False,
    dry_run: bool = False,
    only_unclassified: bool = False,
    progress: "Callable[[int, int, int], None] | None" = None,
) -> ClassifyReport:
    """Classify up to ``limit`` candidate articles with the model and store the scores.

    Args:
        conn: Database connection used to select candidates and write scores.
        client: Model client whose ``classify`` is called once per article.
        limit: Maximum number of candidates to select.
        include_rejected: Passed through to candidate selection.
        dry_run: When true, nothing is written or committed.
        only_unclassified: Passed through to candidate selection.
        progress: Optional callback invoked as ``progress(index, total, article_id)``
            after each successfully classified article.

    Returns:
        A ClassifyReport with the per-article results and the attempted,
        succeeded and skipped counts.
    """
    rows = _classification_candidates(
        conn, limit=limit, include_rejected=include_rejected, only_unclassified=only_unclassified
    )
    total = len(rows)
    results = []
    skipped = 0
    for index, row in enumerate(rows, start=1):
        try:
            raw_scores = client.classify(row)
            if not isinstance(raw_scores, dict):
                # A JSON array or scalar would crash normalize_scores on .get().
                raise RuntimeError(
                    f"model returned {type(raw_scores).__name__}, expected a JSON object"
                )
        except (RuntimeError, OSError, ValueError) as exc:
            # One slow/failed article (timeout, bad response) shouldn't sink the
            # whole batch or discard work already committed. Skip and continue.
            # OSError covers read timeouts that urlopen does not wrap; ValueError
            # covers a JSON decode error escaping the parser.
            skipped += 1
            print(f"[{row['id']}] skipped: {exc}")
            continue
        scores = normalize_scores(raw_scores, model_name=client.model)
        results.append((row["id"], scores))
        if not dry_run:
            # Commit per article so earlier results survive a later failure.
            upsert_article_score(conn, row["id"], scores)
            conn.commit()
        if progress is not None:
            progress(index, total, row["id"])
    return ClassifyReport(results=results, attempted=total, succeeded=len(results), skipped=skipped)
|
||
|
||
|
||
def parse_classifier_json(content: str) -> dict:
    """Parse model output as JSON, tolerating text wrapped around the object.

    The stripped content is parsed as-is first. If that fails, the span from
    the first ``{`` to the last ``}`` is parsed instead, which handles code
    fences and chatty prefixes or suffixes.

    Raises:
        RuntimeError: if ``content`` is not a string, contains no brace-delimited
            span, or that span is not valid JSON.
    """
    if not isinstance(content, str):
        # e.g. a null message content; report it like any other non-JSON reply.
        raise RuntimeError(f"model did not return JSON: {content!r}")
    text = content.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass  # fall through to the brace-span rescue below

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end <= start:
        raise RuntimeError(f"model did not return JSON: {text}")
    try:
        return json.loads(text[start : end + 1])
    except json.JSONDecodeError as exc:
        # Callers handle RuntimeError only; do not let the decode error escape.
        raise RuntimeError(f"model returned malformed JSON: {text}") from exc
|
||
|
||
|
||
def _is_english(language: str) -> bool:
|
||
"""Conservative: HOLD only when the model clearly reports a non-English language.
|
||
Missing/blank/undetermined → treated as English, so a model hiccup never silently
|
||
drops genuine English content (the corpus is ~all English today)."""
|
||
lang = (language or "").strip().lower()
|
||
if not lang or lang in ("und", "unknown", "mul", "zxx"):
|
||
return True
|
||
return lang == "en" or lang.startswith("en-") or lang.startswith("en_")
|
||
|
||
|
||
def normalize_scores(data: dict, model_name: str) -> dict:
    """Coerce a raw model verdict into the dict stored for an article.

    Scores are clamped with ``_bounded_int``; topic, flavor and tags go through
    the taxonomy coercers; reason fields are truncated to 120 and 1000
    characters; ``language`` is lower-cased and cut to 16 characters.

    Args:
        data: Parsed JSON object returned by the model.
        model_name: Stored alongside the scores as ``model_name``.

    Returns:
        A dict with the seven scores, ``accepted`` (0 or 1), ``topic``,
        ``flavor``, ``tags``, ``reason_code``, ``reason_text``, ``language``
        and ``model_name``.
    """
    language = str(data.get("language") or "").strip().lower()[:16]
    verdict = data.get("accepted")
    if isinstance(verdict, str):
        # Without schema enforcement a model may answer "false" as a string, and
        # bool("false") is True. Accept only explicit affirmative strings.
        verdict = verdict.strip().lower() in ("true", "1", "yes")
    accepted = 1 if verdict else 0
    reason_code = str(data.get("reason_code") or "model_no_reason")[:120]
    reason_text = str(data.get("reason_text") or "")[:1000]
    # Language gate (code disposes): the public feed is English-only for now. A
    # non-English article is HELD — never shown — but PRESERVED with a distinct
    # reason so it isn't counted as a calm-filter rejection or a source failure, and
    # can be revisited when translation support lands (Phase 4 / GDELT).
    if not _is_english(language):
        accepted = 0
        reason_code = "non_english"
        reason_text = f"Held — non-English ({language}); awaiting translation support."
    return {
        "constructive_score": _bounded_int(data.get("constructive_score")),
        "cortisol_score": _bounded_int(data.get("cortisol_score")),
        "ragebait_score": _bounded_int(data.get("ragebait_score")),
        "agency_score": _bounded_int(data.get("agency_score")),
        "human_benefit_score": _bounded_int(data.get("human_benefit_score")),
        "novelty_score": _bounded_int(data.get("novelty_score")),
        "pr_risk_score": _bounded_int(data.get("pr_risk_score")),
        "accepted": accepted,
        "topic": coerce_topic(data.get("topic")),
        "flavor": coerce_flavor(data.get("flavor")),
        "tags": coerce_tags(data.get("tags")),
        "reason_code": reason_code,
        "reason_text": reason_text,
        "language": language,
        "model_name": model_name,
    }
|
||
|
||
|
||
def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict) -> None:
    """Insert or overwrite the score row for an article and replace its tags.

    The ``article_scores`` row is upserted on ``article_id`` with ``scored_at``
    set to the current timestamp. All existing ``article_tags`` rows for the
    article are then deleted and the tags in ``scores`` are inserted. The
    caller is responsible for committing.
    """
    # Columns read with a hard lookup, in the order of the SQL placeholders.
    leading_columns = (
        "constructive_score",
        "cortisol_score",
        "ragebait_score",
        "agency_score",
        "human_benefit_score",
        "novelty_score",
        "pr_risk_score",
        "accepted",
        "topic",
        "flavor",
        "reason_code",
        "reason_text",
    )
    params = (
        article_id,
        *(scores[column] for column in leading_columns),
        scores.get("language"),  # optional: stored as NULL when absent
        scores["model_name"],
    )
    conn.execute(
        """
        INSERT INTO article_scores (
            article_id, constructive_score, cortisol_score, ragebait_score,
            agency_score, human_benefit_score, novelty_score, pr_risk_score,
            accepted, topic, flavor, reason_code, reason_text, language, model_name, scored_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(article_id) DO UPDATE SET
            constructive_score = excluded.constructive_score,
            cortisol_score = excluded.cortisol_score,
            ragebait_score = excluded.ragebait_score,
            agency_score = excluded.agency_score,
            human_benefit_score = excluded.human_benefit_score,
            novelty_score = excluded.novelty_score,
            pr_risk_score = excluded.pr_risk_score,
            accepted = excluded.accepted,
            topic = excluded.topic,
            flavor = excluded.flavor,
            reason_code = excluded.reason_code,
            reason_text = excluded.reason_text,
            language = excluded.language,
            model_name = excluded.model_name,
            scored_at = CURRENT_TIMESTAMP
        """,
        params,
    )
    # Replace this article's grouping tags (controlled vocabulary, 0-4).
    conn.execute("DELETE FROM article_tags WHERE article_id = ?", (article_id,))
    tag_rows = [(article_id, tag) for tag in scores.get("tags") or []]
    conn.executemany(
        "INSERT OR IGNORE INTO article_tags (article_id, tag) VALUES (?, ?)", tag_rows
    )
|
||
|
||
|
||
def _classification_candidates(
|
||
conn: sqlite3.Connection,
|
||
limit: int,
|
||
include_rejected: bool,
|
||
only_unclassified: bool = False,
|
||
) -> list[sqlite3.Row]:
|
||
filters = []
|
||
if not include_rejected:
|
||
filters.append("(s.accepted = 1 OR s.constructive_score >= 4)")
|
||
if only_unclassified:
|
||
# Articles still carrying the fast heuristic score, i.e. not yet judged
|
||
# by the model. Lets a scheduled cycle only spend the LLM on new items.
|
||
filters.append("s.model_name LIKE 'heuristic-%'")
|
||
where = ("WHERE " + " AND ".join(filters)) if filters else ""
|
||
return conn.execute(
|
||
f"""
|
||
SELECT
|
||
a.id,
|
||
a.title,
|
||
a.description,
|
||
a.published_at,
|
||
a.canonical_url,
|
||
src.name AS source_name,
|
||
src.default_category,
|
||
src.trust_score AS source_trust_score,
|
||
src.pr_risk_score AS source_pr_risk_score,
|
||
s.constructive_score,
|
||
s.cortisol_score,
|
||
s.ragebait_score,
|
||
s.agency_score,
|
||
s.human_benefit_score,
|
||
s.pr_risk_score,
|
||
s.accepted,
|
||
s.reason_code
|
||
FROM articles a
|
||
JOIN sources src ON src.id = a.source_id
|
||
LEFT JOIN article_scores s ON s.article_id = a.id
|
||
{where}
|
||
ORDER BY
|
||
CASE WHEN s.model_name LIKE 'heuristic-%' THEN 0 ELSE 1 END,
|
||
COALESCE(a.published_at, a.discovered_at) DESC
|
||
LIMIT ?
|
||
""",
|
||
(limit,),
|
||
).fetchall()
|
||
|
||
|
||
def _article_prompt(article: sqlite3.Row) -> str:
|
||
return "\n".join(
|
||
[
|
||
f"Source: {article['source_name']}",
|
||
f"Source category: {article['default_category'] or 'unknown'}",
|
||
f"Source trust score: {article['source_trust_score']}/10",
|
||
f"Source PR risk score: {article['source_pr_risk_score']}/10",
|
||
f"Published: {article['published_at'] or 'unknown'}",
|
||
f"Title: {article['title']}",
|
||
f"Snippet: {article['description'] or ''}",
|
||
f"URL: {article['canonical_url']}",
|
||
]
|
||
)
|
||
|
||
|
||
def _bounded_int(value: object) -> int:
|
||
try:
|
||
parsed = int(value)
|
||
except (TypeError, ValueError):
|
||
parsed = 0
|
||
return max(0, min(10, parsed))
|