Add topic/flavor categorization and category browsing
- New taxonomy module: single source of truth for 6 topics x 5 flavors, shared by the LLM response schema (enum-constrained) and validation.
- Classifier now assigns one topic + one flavor per article; json_schema enums force valid values, with coercion as a safety net.
- article_scores gains topic/flavor columns via an idempotent migration.
- New 'list-category' command to browse by topic and/or flavor, ranked by composite score.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+61
-1
@@ -38,6 +38,12 @@ def main() -> None:
|
||||
source_parser = subparsers.add_parser("list-sources", help="Show configured sources")
|
||||
source_parser.add_argument("--active-only", action="store_true")
|
||||
|
||||
cat_parser = subparsers.add_parser("list-category", help="Browse articles by topic and/or flavor")
|
||||
cat_parser.add_argument("--topic", help="Filter by topic, e.g. science, environment, animals")
|
||||
cat_parser.add_argument("--flavor", help="Filter by flavor, e.g. breakthrough, discovery, feelgood")
|
||||
cat_parser.add_argument("--limit", type=int, default=20)
|
||||
cat_parser.add_argument("--all", action="store_true", help="Include not-accepted articles")
|
||||
|
||||
subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats")
|
||||
|
||||
runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
|
||||
@@ -90,6 +96,8 @@ def main() -> None:
|
||||
list_recent(conn, limit=args.limit, accepted_only=args.accepted_only)
|
||||
elif args.command == "list-sources":
|
||||
list_sources(conn, active_only=args.active_only)
|
||||
elif args.command == "list-category":
|
||||
list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all)
|
||||
elif args.command == "source-report":
|
||||
source_report(conn)
|
||||
elif args.command == "list-runs":
|
||||
@@ -109,7 +117,10 @@ def main() -> None:
|
||||
)
|
||||
for article_id, scores in results:
|
||||
accepted = "yes" if scores["accepted"] else "no"
|
||||
print(f"[{article_id}] accepted={accepted} reason={scores['reason_code']}")
|
||||
print(
|
||||
f"[{article_id}] accepted={accepted} {scores['topic']}/{scores['flavor']} "
|
||||
f"reason={scores['reason_code']}"
|
||||
)
|
||||
print(f" {scores['reason_text']}")
|
||||
if args.dry_run:
|
||||
print("Dry run only; database was not updated.")
|
||||
@@ -179,6 +190,55 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
|
||||
print(f" {row['canonical_url']}")
|
||||
|
||||
|
||||
def list_category(
|
||||
conn: sqlite3.Connection,
|
||||
topic: str | None,
|
||||
flavor: str | None,
|
||||
limit: int,
|
||||
accepted_only: bool,
|
||||
) -> None:
|
||||
clauses = []
|
||||
params: list = []
|
||||
if accepted_only:
|
||||
clauses.append("s.accepted = 1")
|
||||
if topic:
|
||||
clauses.append("s.topic = ?")
|
||||
params.append(topic.lower())
|
||||
if flavor:
|
||||
clauses.append("s.flavor = ?")
|
||||
params.append(flavor.lower())
|
||||
where = ("WHERE " + " AND ".join(clauses)) if clauses else ""
|
||||
params.append(limit)
|
||||
|
||||
rows = conn.execute(
|
||||
f"""
|
||||
SELECT
|
||||
a.id, a.title, a.canonical_url, a.published_at,
|
||||
src.name AS source_name,
|
||||
s.topic, s.flavor, s.accepted,
|
||||
s.constructive_score, s.cortisol_score, s.reason_code,
|
||||
(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
|
||||
- s.cortisol_score - s.ragebait_score - s.pr_risk_score) AS rank_score
|
||||
FROM articles a
|
||||
JOIN sources src ON src.id = a.source_id
|
||||
JOIN article_scores s ON s.article_id = a.id
|
||||
{where}
|
||||
ORDER BY rank_score DESC, COALESCE(a.published_at, a.discovered_at) DESC
|
||||
LIMIT ?
|
||||
""",
|
||||
params,
|
||||
).fetchall()
|
||||
|
||||
label = " / ".join(filter(None, [topic, flavor])) or "all categories"
|
||||
print(f"{label} ({len(rows)} shown)")
|
||||
for row in rows:
|
||||
accepted = "" if row["accepted"] else " [not accepted]"
|
||||
print(f"[{row['id']}] {row['topic']}/{row['flavor']} | {row['source_name']}{accepted}")
|
||||
print(f" {row['title']}")
|
||||
print(f" score={row['rank_score']} reason={row['reason_code']}")
|
||||
print(f" {row['canonical_url']}")
|
||||
|
||||
|
||||
def llm_client_from_args(args: argparse.Namespace) -> LocalModelClient:
|
||||
client = LocalModelClient.from_env()
|
||||
if getattr(args, "base_url", None):
|
||||
|
||||
@@ -56,6 +56,8 @@ CREATE TABLE IF NOT EXISTS article_scores (
|
||||
accepted INTEGER,
|
||||
reason_code TEXT,
|
||||
reason_text TEXT,
|
||||
topic TEXT,
|
||||
flavor TEXT,
|
||||
model_name TEXT,
|
||||
scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
@@ -102,4 +104,17 @@ def connect(db_path: Path | str) -> sqlite3.Connection:
|
||||
|
||||
def init_db(conn: sqlite3.Connection) -> None:
    """Create the schema on ``conn``, apply column migrations, and commit.

    Runs the ``SCHEMA`` script first, then ``_migrate`` to add any columns an
    already-existing database is missing, and finally commits the connection.
    """
    # Step 1: run the schema script.
    conn.executescript(SCHEMA)
    # Step 2: add any columns the existing article_scores table is missing.
    _migrate(conn)
    # Step 3: commit the connection.
    conn.commit()
|
||||
|
||||
|
||||
def _migrate(conn: sqlite3.Connection) -> None:
    """Add columns introduced after the initial schema to existing databases.

    CREATE TABLE IF NOT EXISTS never alters an existing table, so new columns
    need an explicit, idempotent ALTER guarded by the current column set.

    This function does not commit.

    Args:
        conn: Open connection to the database holding ``article_scores``.
    """
    # PRAGMA table_info yields (cid, name, type, notnull, dflt_value, pk).
    # Reading the name positionally works for plain tuple rows as well as
    # sqlite3.Row, so the migration does not depend on the row factory.
    existing = {row[1] for row in conn.execute("PRAGMA table_info(article_scores)")}
    for column in ("topic", "flavor"):
        if column not in existing:
            # Column names come from the fixed tuple above, never from user
            # input, so interpolating them into the statement is safe.
            conn.execute(f"ALTER TABLE article_scores ADD COLUMN {column} TEXT")
|
||||
|
||||
+34
-5
@@ -7,6 +7,15 @@ import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .taxonomy import (
|
||||
FLAVORS,
|
||||
TOPICS,
|
||||
coerce_flavor,
|
||||
coerce_topic,
|
||||
flavors_prompt_block,
|
||||
topics_prompt_block,
|
||||
)
|
||||
|
||||
|
||||
DEFAULT_BASE_URL = "http://127.0.0.1:1234/v1"
|
||||
DEFAULT_MODEL = "gpt-oss"
|
||||
@@ -29,6 +38,8 @@ CLASSIFICATION_SCHEMA = {
|
||||
"novelty_score",
|
||||
"pr_risk_score",
|
||||
"accepted",
|
||||
"topic",
|
||||
"flavor",
|
||||
"reason_code",
|
||||
"reason_text",
|
||||
],
|
||||
@@ -41,6 +52,8 @@ CLASSIFICATION_SCHEMA = {
|
||||
"novelty_score": _SCORE_FIELD,
|
||||
"pr_risk_score": _SCORE_FIELD,
|
||||
"accepted": {"type": "boolean"},
|
||||
"topic": {"type": "string", "enum": list(TOPICS)},
|
||||
"flavor": {"type": "string", "enum": list(FLAVORS)},
|
||||
"reason_code": {"type": "string"},
|
||||
"reason_text": {"type": "string"},
|
||||
},
|
||||
@@ -61,8 +74,16 @@ Judge emotional aftertaste, not simple positivity. Accept stories that leave a r
|
||||
|
||||
Reject stories centered on fear, outrage, partisan conflict, crime, tragedy, disaster repetition, celebrity drama, market panic, or corporate PR without clear public benefit.
|
||||
|
||||
Also assign one topic and one flavor, choosing the single best fit.
|
||||
|
||||
Topic (what the story is about):
|
||||
{topics}
|
||||
|
||||
Flavor (why it belongs in a calm, uplifting digest):
|
||||
{flavors}
|
||||
|
||||
Return only JSON with this exact shape:
|
||||
{
|
||||
{{
|
||||
"constructive_score": 0,
|
||||
"cortisol_score": 0,
|
||||
"ragebait_score": 0,
|
||||
@@ -71,10 +92,12 @@ Return only JSON with this exact shape:
|
||||
"novelty_score": 0,
|
||||
"pr_risk_score": 0,
|
||||
"accepted": false,
|
||||
"topic": "one_of_the_allowed_topics",
|
||||
"flavor": "one_of_the_allowed_flavors",
|
||||
"reason_code": "short_snake_case",
|
||||
"reason_text": "one concise sentence"
|
||||
}
|
||||
"""
|
||||
}}
|
||||
""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block())
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -218,6 +241,8 @@ def normalize_scores(data: dict, model_name: str) -> dict:
|
||||
"novelty_score": _bounded_int(data.get("novelty_score")),
|
||||
"pr_risk_score": _bounded_int(data.get("pr_risk_score")),
|
||||
"accepted": 1 if bool(data.get("accepted")) else 0,
|
||||
"topic": coerce_topic(data.get("topic")),
|
||||
"flavor": coerce_flavor(data.get("flavor")),
|
||||
"reason_code": str(data.get("reason_code") or "model_no_reason")[:120],
|
||||
"reason_text": str(data.get("reason_text") or "")[:1000],
|
||||
"model_name": model_name,
|
||||
@@ -230,9 +255,9 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
|
||||
INSERT INTO article_scores (
|
||||
article_id, constructive_score, cortisol_score, ragebait_score,
|
||||
agency_score, human_benefit_score, novelty_score, pr_risk_score,
|
||||
accepted, reason_code, reason_text, model_name, scored_at
|
||||
accepted, topic, flavor, reason_code, reason_text, model_name, scored_at
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
|
||||
ON CONFLICT(article_id) DO UPDATE SET
|
||||
constructive_score = excluded.constructive_score,
|
||||
cortisol_score = excluded.cortisol_score,
|
||||
@@ -242,6 +267,8 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
|
||||
novelty_score = excluded.novelty_score,
|
||||
pr_risk_score = excluded.pr_risk_score,
|
||||
accepted = excluded.accepted,
|
||||
topic = excluded.topic,
|
||||
flavor = excluded.flavor,
|
||||
reason_code = excluded.reason_code,
|
||||
reason_text = excluded.reason_text,
|
||||
model_name = excluded.model_name,
|
||||
@@ -257,6 +284,8 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
|
||||
scores["novelty_score"],
|
||||
scores["pr_risk_score"],
|
||||
scores["accepted"],
|
||||
scores["topic"],
|
||||
scores["flavor"],
|
||||
scores["reason_code"],
|
||||
scores["reason_text"],
|
||||
scores["model_name"],
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
"""Single source of truth for article topic/flavor categories.
|
||||
|
||||
Both the LLM response schema (enum constraints) and the post-hoc validation in
|
||||
normalize_scores import from here, so the allowed values can never drift apart.
|
||||
Adjusting a category here + re-running `classify` is all it takes to reshape the
|
||||
browsable feeds.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
# Topic axis -- the subject a story is primarily about.
# Maps each allowed topic key to the short description shown to the model.
# Key order is significant: it is the order used for the schema enum and for
# the bullet list rendered into the prompt.
TOPICS: dict[str, str] = {
    "science": "research, discoveries, space, physics, technology",
    "environment": "conservation, climate solutions, ecosystems, clean energy",
    "health": "medicine, wellbeing, mental health, public health",
    "community": "local action, humanitarian work, social progress, kindness, fair work",
    "culture": "arts, history, heritage, sport, human-interest",
    "animals": "wildlife, nature discoveries, charming animal stories",
}

# Flavor axis -- the reason a story earns a place in a calm, uplifting digest.
# Same shape and ordering rules as TOPICS.
FLAVORS: dict[str, str] = {
    "breakthrough": "a significant advance or innovation with clear public benefit",
    "discovery": "newly found or learned; calm and fascinating, low on agency",
    "solution": "people actively repairing, restoring, or solving a problem",
    "feelgood": "a heartwarming human, community, or kindness story",
    "perspective": "useful advice, insight, or framing the reader can apply",
}

# Fallback categories returned by the coerce_* helpers for unrecognized values.
DEFAULT_TOPIC = "science"
DEFAULT_FLAVOR = "discovery"
|
||||
|
||||
|
||||
def coerce_topic(value: object) -> str:
    """Return ``value`` as a known topic key, or ``DEFAULT_TOPIC`` otherwise.

    The value is stringified (falsy values become the empty string), trimmed
    and lower-cased before being looked up in ``TOPICS``.
    """
    candidate = str(value or "").strip().lower()
    if candidate in TOPICS:
        return candidate
    return DEFAULT_TOPIC
|
||||
|
||||
|
||||
def coerce_flavor(value: object) -> str:
    """Return ``value`` as a known flavor key, or ``DEFAULT_FLAVOR`` otherwise.

    The value is stringified (falsy values become the empty string), trimmed
    and lower-cased before being looked up in ``FLAVORS``.
    """
    candidate = str(value or "").strip().lower()
    if candidate in FLAVORS:
        return candidate
    return DEFAULT_FLAVOR
|
||||
|
||||
|
||||
def _bullet_list(mapping: dict[str, str]) -> str:
    """Render ``mapping`` as newline-separated ``- key: description`` bullets.

    Entries appear in the mapping's iteration order; an empty mapping yields
    an empty string.
    """
    bullets = []
    for key, desc in mapping.items():
        bullets.append("- {}: {}".format(key, desc))
    return "\n".join(bullets)
|
||||
|
||||
|
||||
def topics_prompt_block() -> str:
    """Return ``TOPICS`` rendered as a bullet list, one ``- key: description`` per line."""
    rendered = _bullet_list(TOPICS)
    return rendered
|
||||
|
||||
|
||||
def flavors_prompt_block() -> str:
    """Return ``FLAVORS`` rendered as a bullet list, one ``- key: description`` per line."""
    rendered = _bullet_list(FLAVORS)
    return rendered
|
||||
Reference in New Issue
Block a user