Add topic/flavor categorization and category browsing

- New taxonomy module: single source of truth for 6 topics x 5 flavors,
  shared by the LLM response schema (enum-constrained) and validation.
- Classifier now assigns one topic + one flavor per article; json_schema
  enums force valid values, with coercion as a safety net.
- article_scores gains topic/flavor columns via an idempotent migration.
- New 'list-category' command to browse by topic and/or flavor, ranked by
  composite score.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-05-30 11:21:53 +00:00
parent f4842ed100
commit 38057d0354
5 changed files with 165 additions and 6 deletions
+61 -1
View File
@@ -38,6 +38,12 @@ def main() -> None:
source_parser = subparsers.add_parser("list-sources", help="Show configured sources")
source_parser.add_argument("--active-only", action="store_true")
cat_parser = subparsers.add_parser("list-category", help="Browse articles by topic and/or flavor")
cat_parser.add_argument("--topic", help="Filter by topic, e.g. science, environment, animals")
cat_parser.add_argument("--flavor", help="Filter by flavor, e.g. breakthrough, discovery, feelgood")
cat_parser.add_argument("--limit", type=int, default=20)
cat_parser.add_argument("--all", action="store_true", help="Include not-accepted articles")
subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats")
runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
@@ -90,6 +96,8 @@ def main() -> None:
list_recent(conn, limit=args.limit, accepted_only=args.accepted_only)
elif args.command == "list-sources":
list_sources(conn, active_only=args.active_only)
elif args.command == "list-category":
list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all)
elif args.command == "source-report":
source_report(conn)
elif args.command == "list-runs":
@@ -109,7 +117,10 @@ def main() -> None:
)
for article_id, scores in results:
accepted = "yes" if scores["accepted"] else "no"
print(f"[{article_id}] accepted={accepted} reason={scores['reason_code']}")
print(
f"[{article_id}] accepted={accepted} {scores['topic']}/{scores['flavor']} "
f"reason={scores['reason_code']}"
)
print(f" {scores['reason_text']}")
if args.dry_run:
print("Dry run only; database was not updated.")
@@ -179,6 +190,55 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
print(f" {row['canonical_url']}")
def list_category(
conn: sqlite3.Connection,
topic: str | None,
flavor: str | None,
limit: int,
accepted_only: bool,
) -> None:
clauses = []
params: list = []
if accepted_only:
clauses.append("s.accepted = 1")
if topic:
clauses.append("s.topic = ?")
params.append(topic.lower())
if flavor:
clauses.append("s.flavor = ?")
params.append(flavor.lower())
where = ("WHERE " + " AND ".join(clauses)) if clauses else ""
params.append(limit)
rows = conn.execute(
f"""
SELECT
a.id, a.title, a.canonical_url, a.published_at,
src.name AS source_name,
s.topic, s.flavor, s.accepted,
s.constructive_score, s.cortisol_score, s.reason_code,
(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
- s.cortisol_score - s.ragebait_score - s.pr_risk_score) AS rank_score
FROM articles a
JOIN sources src ON src.id = a.source_id
JOIN article_scores s ON s.article_id = a.id
{where}
ORDER BY rank_score DESC, COALESCE(a.published_at, a.discovered_at) DESC
LIMIT ?
""",
params,
).fetchall()
label = " / ".join(filter(None, [topic, flavor])) or "all categories"
print(f"{label} ({len(rows)} shown)")
for row in rows:
accepted = "" if row["accepted"] else " [not accepted]"
print(f"[{row['id']}] {row['topic']}/{row['flavor']} | {row['source_name']}{accepted}")
print(f" {row['title']}")
print(f" score={row['rank_score']} reason={row['reason_code']}")
print(f" {row['canonical_url']}")
def llm_client_from_args(args: argparse.Namespace) -> LocalModelClient:
client = LocalModelClient.from_env()
if getattr(args, "base_url", None):
+15
View File
@@ -56,6 +56,8 @@ CREATE TABLE IF NOT EXISTS article_scores (
accepted INTEGER,
reason_code TEXT,
reason_text TEXT,
topic TEXT,
flavor TEXT,
model_name TEXT,
scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
@@ -102,4 +104,17 @@ def connect(db_path: Path | str) -> sqlite3.Connection:
def init_db(conn: sqlite3.Connection) -> None:
    """Create the schema, upgrade older databases in place, then commit.

    Runs the SCHEMA script, applies the column migrations from _migrate, and
    commits the result on ``conn``.
    """
    conn.executescript(SCHEMA)
    # SCHEMA relies on CREATE TABLE IF NOT EXISTS, which leaves an existing
    # table untouched, so columns added later are applied as a separate step.
    _migrate(conn)
    conn.commit()
def _migrate(conn: sqlite3.Connection) -> None:
    """Add columns introduced after the initial schema to existing databases.

    CREATE TABLE IF NOT EXISTS never alters an existing table, so new columns
    need an explicit, idempotent ALTER guarded by the current column set.

    This function does not commit; the caller decides when to do so.
    """
    # PRAGMA table_info yields (cid, name, type, notnull, dflt_value, pk).
    # Reading the name by position works for plain tuple rows as well as
    # sqlite3.Row, so the migration does not depend on the row factory.
    existing = {row[1] for row in conn.execute("PRAGMA table_info(article_scores)")}
    for column in ("topic", "flavor"):
        if column not in existing:
            # The column name comes from the fixed tuple above, never from
            # user input, so interpolating it into the DDL is safe.
            conn.execute(f"ALTER TABLE article_scores ADD COLUMN {column} TEXT")
+34 -5
View File
@@ -7,6 +7,15 @@ import urllib.error
import urllib.request
from dataclasses import dataclass
from .taxonomy import (
FLAVORS,
TOPICS,
coerce_flavor,
coerce_topic,
flavors_prompt_block,
topics_prompt_block,
)
DEFAULT_BASE_URL = "http://127.0.0.1:1234/v1"
DEFAULT_MODEL = "gpt-oss"
@@ -29,6 +38,8 @@ CLASSIFICATION_SCHEMA = {
"novelty_score",
"pr_risk_score",
"accepted",
"topic",
"flavor",
"reason_code",
"reason_text",
],
@@ -41,6 +52,8 @@ CLASSIFICATION_SCHEMA = {
"novelty_score": _SCORE_FIELD,
"pr_risk_score": _SCORE_FIELD,
"accepted": {"type": "boolean"},
"topic": {"type": "string", "enum": list(TOPICS)},
"flavor": {"type": "string", "enum": list(FLAVORS)},
"reason_code": {"type": "string"},
"reason_text": {"type": "string"},
},
@@ -61,8 +74,16 @@ Judge emotional aftertaste, not simple positivity. Accept stories that leave a r
Reject stories centered on fear, outrage, partisan conflict, crime, tragedy, disaster repetition, celebrity drama, market panic, or corporate PR without clear public benefit.
Also assign one topic and one flavor, choosing the single best fit.
Topic (what the story is about):
{topics}
Flavor (why it belongs in a calm, uplifting digest):
{flavors}
Return only JSON with this exact shape:
{
{{
"constructive_score": 0,
"cortisol_score": 0,
"ragebait_score": 0,
@@ -71,10 +92,12 @@ Return only JSON with this exact shape:
"novelty_score": 0,
"pr_risk_score": 0,
"accepted": false,
"topic": "one_of_the_allowed_topics",
"flavor": "one_of_the_allowed_flavors",
"reason_code": "short_snake_case",
"reason_text": "one concise sentence"
}
"""
}}
""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block())
@dataclass
@@ -218,6 +241,8 @@ def normalize_scores(data: dict, model_name: str) -> dict:
"novelty_score": _bounded_int(data.get("novelty_score")),
"pr_risk_score": _bounded_int(data.get("pr_risk_score")),
"accepted": 1 if bool(data.get("accepted")) else 0,
"topic": coerce_topic(data.get("topic")),
"flavor": coerce_flavor(data.get("flavor")),
"reason_code": str(data.get("reason_code") or "model_no_reason")[:120],
"reason_text": str(data.get("reason_text") or "")[:1000],
"model_name": model_name,
@@ -230,9 +255,9 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
INSERT INTO article_scores (
article_id, constructive_score, cortisol_score, ragebait_score,
agency_score, human_benefit_score, novelty_score, pr_risk_score,
accepted, reason_code, reason_text, model_name, scored_at
accepted, topic, flavor, reason_code, reason_text, model_name, scored_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
ON CONFLICT(article_id) DO UPDATE SET
constructive_score = excluded.constructive_score,
cortisol_score = excluded.cortisol_score,
@@ -242,6 +267,8 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
novelty_score = excluded.novelty_score,
pr_risk_score = excluded.pr_risk_score,
accepted = excluded.accepted,
topic = excluded.topic,
flavor = excluded.flavor,
reason_code = excluded.reason_code,
reason_text = excluded.reason_text,
model_name = excluded.model_name,
@@ -257,6 +284,8 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
scores["novelty_score"],
scores["pr_risk_score"],
scores["accepted"],
scores["topic"],
scores["flavor"],
scores["reason_code"],
scores["reason_text"],
scores["model_name"],
+54
View File
@@ -0,0 +1,54 @@
"""Single source of truth for article topic/flavor categories.
Both the LLM response schema (enum constraints) and the post-hoc validation in
normalize_scores import from here, so the allowed values can never drift apart.
Adjusting a category here + re-running `classify` is all it takes to reshape the
browsable feeds.
"""
from __future__ import annotations
# Topical axis: what the story is primarily about.
# Keys are the canonical topic identifiers accepted by coerce_topic(); values
# are the short descriptions rendered into bullet lines by topics_prompt_block().
TOPICS: dict[str, str] = {
    "science": "research, discoveries, space, physics, technology",
    "environment": "conservation, climate solutions, ecosystems, clean energy",
    "health": "medicine, wellbeing, mental health, public health",
    "community": "local action, humanitarian work, social progress, kindness, fair work",
    "culture": "arts, history, heritage, sport, human-interest",
    "animals": "wildlife, nature discoveries, charming animal stories",
}
# Tonal axis: why the story is worth surfacing in a calm, uplifting digest.
# Keys are the canonical flavor identifiers accepted by coerce_flavor(); values
# are the short descriptions rendered into bullet lines by flavors_prompt_block().
FLAVORS: dict[str, str] = {
    "breakthrough": "a significant advance or innovation with clear public benefit",
    "discovery": "newly found or learned; calm and fascinating, low on agency",
    "solution": "people actively repairing, restoring, or solving a problem",
    "feelgood": "a heartwarming human, community, or kindness story",
    "perspective": "useful advice, insight, or framing the reader can apply",
}
# Fallbacks returned by coerce_topic() / coerce_flavor() when a value is
# missing or is not one of the keys above.
DEFAULT_TOPIC = "science"
DEFAULT_FLAVOR = "discovery"
def coerce_topic(value: object) -> str:
    """Return ``value`` as a known topic key, or DEFAULT_TOPIC if it is not one.

    The value is stringified, stripped and lower-cased before the lookup;
    falsy values (``None``, ``""``, ``0``) are treated as empty.
    """
    candidate = str(value or "").strip().lower()
    if candidate in TOPICS:
        return candidate
    return DEFAULT_TOPIC
def coerce_flavor(value: object) -> str:
    """Return ``value`` as a known flavor key, or DEFAULT_FLAVOR if it is not one.

    The value is stringified, stripped and lower-cased before the lookup;
    falsy values (``None``, ``""``, ``0``) are treated as empty.
    """
    candidate = str(value or "").strip().lower()
    if candidate in FLAVORS:
        return candidate
    return DEFAULT_FLAVOR
def _bullet_list(mapping: dict[str, str]) -> str:
    """Render ``mapping`` as one ``- key: description`` line per entry.

    Lines follow the mapping's iteration order and are joined with newlines;
    an empty mapping yields an empty string.
    """
    bullets = []
    for name, blurb in mapping.items():
        bullets.append(f"- {name}: {blurb}")
    return "\n".join(bullets)
def topics_prompt_block() -> str:
    """Return the TOPICS mapping rendered as a bullet list by _bullet_list."""
    rendered = _bullet_list(TOPICS)
    return rendered
def flavors_prompt_block() -> str:
    """Return the FLAVORS mapping rendered as a bullet list by _bullet_list."""
    rendered = _bullet_list(FLAVORS)
    return rendered
+1
View File
@@ -0,0 +1 @@
- Ability to temporarily silence some categories. For example, a user may not want to see health-related articles, even good ones, so they are not reminded of an ongoing medical issue -- a way to deliberately avoid a topic for a while.