Add topic/flavor categorization and category browsing

- New taxonomy module: single source of truth for 6 topics x 5 flavors,
  shared by the LLM response schema (enum-constrained) and validation.
- Classifier now assigns one topic + one flavor per article; json_schema
  enums force valid values, with coercion as a safety net.
- article_scores gains topic/flavor columns via an idempotent migration.
- New 'list-category' command to browse by topic and/or flavor, ranked by
  composite score.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 11:21:53 +00:00
parent f4842ed100
commit 38057d0354
5 changed files with 165 additions and 6 deletions
@@ -38,6 +38,12 @@ def main() -> None:
    source_parser = subparsers.add_parser("list-sources", help="Show configured sources")
    source_parser.add_argument("--active-only", action="store_true")

+    cat_parser = subparsers.add_parser("list-category", help="Browse articles by topic and/or flavor")
+    cat_parser.add_argument("--topic", help="Filter by topic, e.g. science, environment, animals")
+    cat_parser.add_argument("--flavor", help="Filter by flavor, e.g. breakthrough, discovery, feelgood")
+    cat_parser.add_argument("--limit", type=int, default=20)
+    cat_parser.add_argument("--all", action="store_true", help="Include not-accepted articles")
+
    subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats")

    runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
@@ -90,6 +96,8 @@ def main() -> None:
        list_recent(conn, limit=args.limit, accepted_only=args.accepted_only)
    elif args.command == "list-sources":
        list_sources(conn, active_only=args.active_only)
+    elif args.command == "list-category":
+        list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all)
    elif args.command == "source-report":
        source_report(conn)
    elif args.command == "list-runs":
@@ -109,7 +117,10 @@ def main() -> None:
        )
        for article_id, scores in results:
            accepted = "yes" if scores["accepted"] else "no"
-            print(f"[{article_id}] accepted={accepted} reason={scores['reason_code']}")
+            print(
+                f"[{article_id}] accepted={accepted} {scores['topic']}/{scores['flavor']} "
+                f"reason={scores['reason_code']}"
+            )
            print(f"  {scores['reason_text']}")
        if args.dry_run:
            print("Dry run only; database was not updated.")
@@ -179,6 +190,55 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
        print(f"  {row['canonical_url']}")


+def list_category(
+    conn: sqlite3.Connection,
+    topic: str | None,
+    flavor: str | None,
+    limit: int,
+    accepted_only: bool,
+) -> None:
+    clauses = []
+    params: list = []
+    if accepted_only:
+        clauses.append("s.accepted = 1")
+    if topic:
+        clauses.append("s.topic = ?")
+        params.append(topic.lower())
+    if flavor:
+        clauses.append("s.flavor = ?")
+        params.append(flavor.lower())
+    where = ("WHERE " + " AND ".join(clauses)) if clauses else ""
+    params.append(limit)
+
+    rows = conn.execute(
+        f"""
+        SELECT
+            a.id, a.title, a.canonical_url, a.published_at,
+            src.name AS source_name,
+            s.topic, s.flavor, s.accepted,
+            s.constructive_score, s.cortisol_score, s.reason_code,
+            (s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
+             - s.cortisol_score - s.ragebait_score - s.pr_risk_score) AS rank_score
+        FROM articles a
+        JOIN sources src ON src.id = a.source_id
+        JOIN article_scores s ON s.article_id = a.id
+        {where}
+        ORDER BY rank_score DESC, COALESCE(a.published_at, a.discovered_at) DESC
+        LIMIT ?
+        """,
+        params,
+    ).fetchall()
+
+    label = " / ".join(filter(None, [topic, flavor])) or "all categories"
+    print(f"{label} ({len(rows)} shown)")
+    for row in rows:
+        accepted = "" if row["accepted"] else " [not accepted]"
+        print(f"[{row['id']}] {row['topic']}/{row['flavor']} | {row['source_name']}{accepted}")
+        print(f"  {row['title']}")
+        print(f"  score={row['rank_score']} reason={row['reason_code']}")
+        print(f"  {row['canonical_url']}")
+
+
 def llm_client_from_args(args: argparse.Namespace) -> LocalModelClient:
    client = LocalModelClient.from_env()
    if getattr(args, "base_url", None):
@@ -56,6 +56,8 @@ CREATE TABLE IF NOT EXISTS article_scores (
    accepted INTEGER,
    reason_code TEXT,
    reason_text TEXT,
+    topic TEXT,
+    flavor TEXT,
    model_name TEXT,
    scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
 );
@@ -102,4 +104,17 @@ def connect(db_path: Path | str) -> sqlite3.Connection:

 def init_db(conn: sqlite3.Connection) -> None:
    conn.executescript(SCHEMA)
+    _migrate(conn)
    conn.commit()
+
+
+def _migrate(conn: sqlite3.Connection) -> None:
+    """Add columns introduced after the initial schema to existing databases.
+
+    CREATE TABLE IF NOT EXISTS never alters an existing table, so new columns
+    need an explicit, idempotent ALTER guarded by the current column set.
+    """
+    cols = {row["name"] for row in conn.execute("PRAGMA table_info(article_scores)")}
+    for column in ("topic", "flavor"):
+        if column not in cols:
+            conn.execute(f"ALTER TABLE article_scores ADD COLUMN {column} TEXT")
@@ -7,6 +7,15 @@ import urllib.error
 import urllib.request
 from dataclasses import dataclass

+from .taxonomy import (
+    FLAVORS,
+    TOPICS,
+    coerce_flavor,
+    coerce_topic,
+    flavors_prompt_block,
+    topics_prompt_block,
+)
+

 DEFAULT_BASE_URL = "http://127.0.0.1:1234/v1"
 DEFAULT_MODEL = "gpt-oss"
@@ -29,6 +38,8 @@ CLASSIFICATION_SCHEMA = {
        "novelty_score",
        "pr_risk_score",
        "accepted",
+        "topic",
+        "flavor",
        "reason_code",
        "reason_text",
    ],
@@ -41,6 +52,8 @@ CLASSIFICATION_SCHEMA = {
        "novelty_score": _SCORE_FIELD,
        "pr_risk_score": _SCORE_FIELD,
        "accepted": {"type": "boolean"},
+        "topic": {"type": "string", "enum": list(TOPICS)},
+        "flavor": {"type": "string", "enum": list(FLAVORS)},
        "reason_code": {"type": "string"},
        "reason_text": {"type": "string"},
    },
@@ -61,8 +74,16 @@ Judge emotional aftertaste, not simple positivity. Accept stories that leave a r

 Reject stories centered on fear, outrage, partisan conflict, crime, tragedy, disaster repetition, celebrity drama, market panic, or corporate PR without clear public benefit.

+Also assign one topic and one flavor, choosing the single best fit.
+
+Topic (what the story is about):
+{topics}
+
+Flavor (why it belongs in a calm, uplifting digest):
+{flavors}
+
 Return only JSON with this exact shape:
-{
+{{
  "constructive_score": 0,
  "cortisol_score": 0,
  "ragebait_score": 0,
@@ -71,10 +92,12 @@ Return only JSON with this exact shape:
  "novelty_score": 0,
  "pr_risk_score": 0,
  "accepted": false,
+  "topic": "one_of_the_allowed_topics",
+  "flavor": "one_of_the_allowed_flavors",
  "reason_code": "short_snake_case",
  "reason_text": "one concise sentence"
-}
-"""
+}}
+""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block())


@dataclass
@@ -218,6 +241,8 @@ def normalize_scores(data: dict, model_name: str) -> dict:
        "novelty_score": _bounded_int(data.get("novelty_score")),
        "pr_risk_score": _bounded_int(data.get("pr_risk_score")),
        "accepted": 1 if bool(data.get("accepted")) else 0,
+        "topic": coerce_topic(data.get("topic")),
+        "flavor": coerce_flavor(data.get("flavor")),
        "reason_code": str(data.get("reason_code") or "model_no_reason")[:120],
        "reason_text": str(data.get("reason_text") or "")[:1000],
        "model_name": model_name,
@@ -230,9 +255,9 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
        INSERT INTO article_scores (
            article_id, constructive_score, cortisol_score, ragebait_score,
            agency_score, human_benefit_score, novelty_score, pr_risk_score,
-            accepted, reason_code, reason_text, model_name, scored_at
+            accepted, topic, flavor, reason_code, reason_text, model_name, scored_at
        )
-        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(article_id) DO UPDATE SET
            constructive_score = excluded.constructive_score,
            cortisol_score = excluded.cortisol_score,
@@ -242,6 +267,8 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
            novelty_score = excluded.novelty_score,
            pr_risk_score = excluded.pr_risk_score,
            accepted = excluded.accepted,
+            topic = excluded.topic,
+            flavor = excluded.flavor,
            reason_code = excluded.reason_code,
            reason_text = excluded.reason_text,
            model_name = excluded.model_name,
@@ -257,6 +284,8 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
            scores["novelty_score"],
            scores["pr_risk_score"],
            scores["accepted"],
+            scores["topic"],
+            scores["flavor"],
            scores["reason_code"],
            scores["reason_text"],
            scores["model_name"],
@@ -0,0 +1,54 @@
+"""Single source of truth for article topic/flavor categories.
+
+Both the LLM response schema (enum constraints) and the post-hoc validation in
+normalize_scores import from here, so the allowed values can never drift apart.
+Adjusting a category here + re-running `classify` is all it takes to reshape the
+browsable feeds.
+"""
+
+from __future__ import annotations
+
+
+# Topical axis: what the story is primarily about.
+TOPICS: dict[str, str] = {
+    "science": "research, discoveries, space, physics, technology",
+    "environment": "conservation, climate solutions, ecosystems, clean energy",
+    "health": "medicine, wellbeing, mental health, public health",
+    "community": "local action, humanitarian work, social progress, kindness, fair work",
+    "culture": "arts, history, heritage, sport, human-interest",
+    "animals": "wildlife, nature discoveries, charming animal stories",
+}
+
+# Tonal axis: why the story is worth surfacing in a calm, uplifting digest.
+FLAVORS: dict[str, str] = {
+    "breakthrough": "a significant advance or innovation with clear public benefit",
+    "discovery": "newly found or learned; calm and fascinating, low on agency",
+    "solution": "people actively repairing, restoring, or solving a problem",
+    "feelgood": "a heartwarming human, community, or kindness story",
+    "perspective": "useful advice, insight, or framing the reader can apply",
+}
+
+DEFAULT_TOPIC = "science"
+DEFAULT_FLAVOR = "discovery"
+
+
+def coerce_topic(value: object) -> str:
+    text = str(value or "").strip().lower()
+    return text if text in TOPICS else DEFAULT_TOPIC
+
+
+def coerce_flavor(value: object) -> str:
+    text = str(value or "").strip().lower()
+    return text if text in FLAVORS else DEFAULT_FLAVOR
+
+
+def _bullet_list(mapping: dict[str, str]) -> str:
+    return "\n".join(f"- {key}: {desc}" for key, desc in mapping.items())
+
+
+def topics_prompt_block() -> str:
+    return _bullet_list(TOPICS)
+
+
+def flavors_prompt_block() -> str:
+    return _bullet_list(FLAVORS)
@@ -0,0 +1 @@
+- Ability to temporarily silence some categories (for example, a user may not want to see health-related articles, even good ones, so that they are not reminded of an ongoing medical issue -- a way to deliberately avoid a topic for a while)
				`@@ -0,0 +1 @@`
				`- Ability to temporarily silence some categories (for example, a user may not want to see health-related articles, even good ones, so that they are not reminded of an ongoing medical issue -- a way to deliberately avoid a topic for a while)`