Phase B1: multi-tag groupings model (backend)

Three-layer organization: primary topic (one per article, for ranking and
brief balance) + grouping tags (1-4 per article from a controlled vocabulary,
the organic "wandering" axis) + tonal flavor.

- taxonomy: add technology + learning topics; 4 calm tag families
  (Discovery & Wonder, People & Kindness, Solutions & Progress, Mind & Craft)
  defined in code, not the DB; ALLOWED_TAGS union + coerce_tags validation.
- db: article_tags(article_id, tag) join table + tag index.
- llm: tags added to the classifier json_schema (enum-constrained, maxItems 4)
  and system prompt; normalize_scores coerces tags; upsert_article_score
  replaces a row's tags atomically on every (re)classification.
- queries: feed gains a tag filter and exposes tags via group_concat; tag_counts.
- api: Article.tags, feed tag param, and /api/families with per-tag counts.
- tests: coerce/normalize/upsert/tag-filter/reclassify-replace/tag_counts +
  /api/families. 99 passing.

Corpus reclassify (re-tag + new primary topics) runs separately against the
local LLM. Frontend (B2) pairs with this; the live site is unchanged until then.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-01 18:35:25 +00:00
parent c7f4db3973
commit a47a1504c8
8 changed files with 203 additions and 16 deletions
+21 -3
View File
@@ -34,7 +34,7 @@ from .hero import safe_to_lead
from .llm import LocalModelClient
from .moods import MOODS, mood_filter
from .paywall import is_paywalled
from .taxonomy import FLAVORS, TOPICS
from .taxonomy import FAMILIES, FLAVORS, TOPICS
ROOT = Path(__file__).resolve().parents[1]
DEFAULT_DB = ROOT / "data" / "goodnews.sqlite3"
@@ -126,9 +126,11 @@ class Article(BaseModel):
model_name: str | None = None
rank: int | None = None # position within a brief, when applicable
paywalled: bool = False
tags: list[str] = []
@classmethod
def from_row(cls, row: dict) -> "Article":
raw_tags = row.get("tags")
return cls(
id=row["id"],
title=row["title"],
@@ -146,6 +148,7 @@ class Article(BaseModel):
model_name=row.get("model_name"),
rank=row.get("rank"),
paywalled=is_paywalled(row.get("canonical_url")),
tags=[t for t in (raw_tags.split(",") if raw_tags else []) if t],
)
@@ -240,6 +243,20 @@ def create_app() -> FastAPI:
# client merges with the user's own Calm Filters.
return MOODS
@app.get("/api/families")
def families() -> list[dict]:
    # Explore UI payload: the grouping vocabulary arranged by calm family,
    # with each tag annotated by how many articles currently carry it.
    with get_conn() as conn:
        counts = queries.tag_counts(conn)

    payload = []
    for name, d in FAMILIES.items():
        # Tags nobody has used yet are still listed, with a count of 0.
        tag_entries = [{"key": t, "count": counts.get(t, 0)} for t in d["tags"]]
        payload.append(
            {
                "name": name,
                "description": d["description"],
                "tags": tag_entries,
            }
        )
    return payload
@app.get("/api/category-counts", response_model=list[CategoryCount])
def category_counts(accepted_only: bool = True, prefs: str | None = Query(None)) -> list[CategoryCount]:
fp = prefs_from_json(prefs)
@@ -267,6 +284,7 @@ def create_app() -> FastAPI:
offset: int = Query(0, ge=0),
prefs: str | None = Query(None),
exclude: str = Query("", description="comma-separated article ids the reader has dismissed"),
tag: str | None = Query(None, description="grouping tag to browse"),
) -> FeedResponse:
if topic and topic.lower() not in TOPICS:
raise HTTPException(400, f"unknown topic: {topic}")
@@ -285,14 +303,14 @@ def create_app() -> FastAPI:
fetch_n = min(2000, (offset + limit) * 4 + 50 + len(excl))
raw = queries.feed(
conn, topic=topic, flavor=flavor, accepted_only=accepted_only,
limit=fetch_n, offset=0, **kw,
limit=fetch_n, offset=0, tag=tag, **kw,
)
kept = [a for a in filter_articles(raw, fp, now) if a["id"] not in excl]
rows = kept[offset : offset + limit]
else:
rows = queries.feed(
conn, topic=topic, flavor=flavor, accepted_only=accepted_only,
limit=limit, offset=offset, **kw,
limit=limit, offset=offset, tag=tag, **kw,
)
# Keep the top of a browse view readable: stable-sort paywalled items
# below readable ones (composite order preserved within each group).
+8
View File
@@ -70,6 +70,14 @@ CREATE TABLE IF NOT EXISTS article_scores (
scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS article_tags (
article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
tag TEXT NOT NULL,
PRIMARY KEY (article_id, tag)
);
CREATE INDEX IF NOT EXISTS idx_article_tags_tag ON article_tags(tag);
CREATE TABLE IF NOT EXISTS article_embeddings (
article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
vector BLOB NOT NULL,
+23 -3
View File
@@ -9,11 +9,15 @@ from collections.abc import Callable
from dataclasses import dataclass
from .taxonomy import (
ALLOWED_TAGS,
FLAVORS,
MAX_TAGS,
TOPICS,
coerce_flavor,
coerce_tags,
coerce_topic,
flavors_prompt_block,
tags_prompt_block,
topics_prompt_block,
)
@@ -42,6 +46,7 @@ CLASSIFICATION_SCHEMA = {
"accepted",
"topic",
"flavor",
"tags",
"reason_code",
"reason_text",
],
@@ -56,6 +61,7 @@ CLASSIFICATION_SCHEMA = {
"accepted": {"type": "boolean"},
"topic": {"type": "string", "enum": list(TOPICS)},
"flavor": {"type": "string", "enum": list(FLAVORS)},
"tags": {"type": "array", "items": {"type": "string", "enum": list(ALLOWED_TAGS)}, "maxItems": MAX_TAGS},
"reason_code": {"type": "string"},
"reason_text": {"type": "string"},
},
@@ -76,14 +82,20 @@ Judge emotional aftertaste, not simple positivity. Accept stories that leave a r
Reject stories centered on fear, outrage, partisan conflict, crime, tragedy, disaster repetition, celebrity drama, market panic, or corporate PR without clear public benefit.
Also assign one topic and one flavor, choosing the single best fit.
Also assign one primary topic and one flavor (the single best fit), plus 1-4 grouping tags.
Topic (what the story is about):
Primary topic (what the story is mainly about):
{topics}
Flavor (why it belongs in a calm, uplifting digest):
{flavors}
Grouping tags — choose ONLY from this controlled vocabulary:
{tags}
Tag discipline: assign 1-4 tags; prefer fewer, stronger ones; never tag by weak
association; pick tags a reader would reasonably use to find this story later.
Return only JSON with this exact shape:
{{
"constructive_score": 0,
@@ -96,10 +108,11 @@ Return only JSON with this exact shape:
"accepted": false,
"topic": "one_of_the_allowed_topics",
"flavor": "one_of_the_allowed_flavors",
"tags": ["one_to_four_allowed_tags"],
"reason_code": "short_snake_case",
"reason_text": "one concise sentence"
}}
""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block())
""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block(), tags=tags_prompt_block())
@dataclass
@@ -285,6 +298,7 @@ def normalize_scores(data: dict, model_name: str) -> dict:
"accepted": 1 if bool(data.get("accepted")) else 0,
"topic": coerce_topic(data.get("topic")),
"flavor": coerce_flavor(data.get("flavor")),
"tags": coerce_tags(data.get("tags")),
"reason_code": str(data.get("reason_code") or "model_no_reason")[:120],
"reason_text": str(data.get("reason_text") or "")[:1000],
"model_name": model_name,
@@ -333,6 +347,12 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
scores["model_name"],
),
)
# Replace this article's grouping tags (controlled vocabulary, 0-4).
conn.execute("DELETE FROM article_tags WHERE article_id = ?", (article_id,))
for tag in scores.get("tags") or []:
conn.execute(
"INSERT OR IGNORE INTO article_tags (article_id, tag) VALUES (?, ?)", (article_id, tag)
)
def _classification_candidates(
+21
View File
@@ -36,6 +36,7 @@ _ARTICLE_COLUMNS = f"""
s.reason_code,
s.reason_text,
s.model_name,
(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
{RANK_SCORE_SQL} AS rank_score
"""
@@ -53,6 +54,7 @@ def feed(
mute_flavors: list[str] | None = None,
max_cortisol: int | None = None,
max_ragebait: int | None = None,
tag: str | None = None,
) -> list[dict]:
"""Return ranked articles with categorical filters applied in SQL.
@@ -94,6 +96,9 @@ def feed(
if max_ragebait is not None:
clauses.append("COALESCE(s.ragebait_score, 0) <= ?")
params.append(max_ragebait)
if tag:
clauses.append("EXISTS (SELECT 1 FROM article_tags at WHERE at.article_id = a.id AND at.tag = ?)")
params.append(tag.lower())
where = "WHERE " + " AND ".join(clauses)
params.extend([limit, offset])
@@ -148,6 +153,22 @@ def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int =
}
def tag_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> dict:
    """Count non-duplicate, scored articles per grouping tag.

    Args:
        conn: Open database connection whose rows support access by column name.
        accepted_only: When true (the default), only articles whose score row
            has ``accepted = 1`` are counted.

    Returns:
        Mapping of tag slug to article count. Tags carried by no matching
        article are absent from the mapping.
    """
    where = "WHERE a.duplicate_of IS NULL"
    if accepted_only:
        where += " AND s.accepted = 1"

    sql = f"""
        SELECT t.tag, COUNT(*) AS count
        FROM article_tags t
        JOIN articles a ON a.id = t.article_id
        JOIN article_scores s ON s.article_id = a.id
        {where}
        GROUP BY t.tag
    """

    totals = {}
    for row in conn.execute(sql).fetchall():
        totals[row["tag"]] = row["count"]
    return totals
def category_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> list[dict]:
"""Return per topic/flavor article counts for building browse UIs.
+49 -2
View File
@@ -9,16 +9,45 @@ browsable feeds.
from __future__ import annotations
# Topical axis: what the story is primarily about.
# Primary topic — exactly one per article. Used for ranking, brief balance, and
# source reports (the "machine organization" axis).
TOPICS: dict[str, str] = {
"science": "research, discoveries, space, physics, technology",
"science": "research, discoveries, space, physics",
"technology": "computing, AI, engineering, gadgets, digital tools",
"environment": "conservation, climate solutions, ecosystems, clean energy",
"health": "medicine, wellbeing, mental health, public health",
"community": "local action, humanitarian work, social progress, kindness, fair work",
"culture": "arts, history, heritage, sport, human-interest",
"animals": "wildlife, nature discoveries, charming animal stories",
"learning": "education, personal growth, practical knowledge, curiosity",
}
# Groupings — 1-4 per article, the "human wandering" axis. A controlled
# vocabulary (never free-form) organised into calm families for the Explore UI.
# Families live in code, not the DB. Tag slugs are lowercase, hyphenated.
# Maps each family's display name to {"description": short blurb,
# "tags": the tag slugs that belong to that family}. Served family by family
# from /api/families and rendered into the classifier prompt by
# tags_prompt_block().
FAMILIES: dict[str, dict] = {
"Discovery & Wonder": {
"description": "Awe, science, and the natural world.",
"tags": ["science", "space", "animals", "nature", "archaeology", "technology", "curiosity"],
},
"People & Kindness": {
"description": "Community, generosity, and human warmth.",
"tags": ["community", "helping", "culture", "generosity", "resilience", "local-wins"],
},
"Solutions & Progress": {
"description": "Problems being solved.",
"tags": ["environment", "climate-solutions", "public-health", "cities", "clean-energy", "innovation"],
},
"Mind & Craft": {
"description": "Ideas, learning, and making.",
"tags": ["learning", "ideas", "arts", "books", "creativity", "perspective", "work-life"],
},
}
# Flat allowed-tag set (union of all families), for enum + validation.
# dict.fromkeys keeps first-seen order and drops any slug that is listed in
# more than one family, so every tag appears exactly once.
ALLOWED_TAGS: tuple[str, ...] = tuple(dict.fromkeys(t for f in FAMILIES.values() for t in f["tags"]))
# Upper bound on tags kept per article; the default cap for coerce_tags().
MAX_TAGS = 4
# Tonal axis: why the story is worth surfacing in a calm, uplifting digest.
FLAVORS: dict[str, str] = {
"breakthrough": "a significant advance or innovation with clear public benefit",
@@ -42,6 +71,24 @@ def coerce_flavor(value: object) -> str:
return text if text in FLAVORS else DEFAULT_FLAVOR
def coerce_tags(value: object, max_tags: int = MAX_TAGS) -> list[str]:
    """Validate a model-supplied tag list against the controlled vocabulary.

    Each item is stringified, stripped, and lowercased. Items that are not in
    ``ALLOWED_TAGS`` and repeats of an already-kept tag are dropped; the
    survivors keep their first-seen order.

    Args:
        value: Raw ``tags`` value from the model. Anything that is not a
            ``list`` is treated as "no tags".
        max_tags: Maximum number of tags to keep. Zero or a negative value
            yields an empty list.

    Returns:
        At most ``max_tags`` valid, unique tag slugs.
    """
    if not isinstance(value, list):
        return []
    out: list[str] = []
    for item in value:
        # Check the cap before appending so a non-positive max_tags can never
        # let a tag through.
        if len(out) >= max_tags:
            break
        tag = str(item).strip().lower()
        if tag in ALLOWED_TAGS and tag not in out:
            out.append(tag)
    return out
def tags_prompt_block() -> str:
    """Render the tag vocabulary as one ``- Family: tag, tag, ...`` line per family."""
    lines = []
    for family, spec in FAMILIES.items():
        joined_tags = ", ".join(spec["tags"])
        lines.append(f"- {family}: {joined_tags}")
    return "\n".join(lines)
def _bullet_list(mapping: dict[str, str]) -> str:
    """Render *mapping* as one ``- key: description`` bullet per line."""
    bullets = []
    for name, text in mapping.items():
        bullets.append(f"- {name}: {text}")
    return "\n".join(bullets)
+10 -8
View File
@@ -6,17 +6,19 @@
$ = informational
* Ability to silence some categories temporarily (Maybe a user doesn't even want to see health-related articles, even good ones, so they're not reminded of an ongoing medical issue -- a way to avoid something purposely for a bit) [done: pause topics/flavors in Boundaries]
* Terms to avoid list (To filter even good news that you'd rather not hear about) [done: avoid words/phrases in Boundaries]
| Favorite/save articles [tabled: needs accounts/logins for a larger footprint]
| Soothing background colors/gradients per each category as you scroll. Maybe a user preference. [tabled: revisit deliberately; if done, whisper-quiet translucent tints, not neon]
$ I really like the coloring for the metadata highlighting in each card (The grading bubbles)
| Persistent history + favorites with sign-in [tabled: needs accounts]
-l Somehow include a daily inspirational/motivational/uplifting quote that would change each day.
- Allow ability to forward/share articles
##### Completed Sections #####
* Some articles are behind paywalls.. what can we do? [done: domain-level paywall flag, readable hero, paywalled downweighted out of the daily five, and a "paywall-heavy" advisory source-health flag (Nature/New Scientist). Replace handles any that remain in browse.]
* After an article is read, can we add a refresh button to fetch a replacement for it in the list? [done: "Find one I can read" / Replace swaps in the next readable article]
* I want the top 5 to be there, but I want the remaining categories to be hidden behind their selections. So the main screen should show just the current highlights, and then the other articles should only be visible when in that category. [done]
* Title headings should be a little larger -- if you select Today, Today should look like a proper heading, bold and beautiful. Switching to Wonder should show "Wonder" all nice and whatnot. [done]
-l Somehow include a daily inspirational/motivational/uplifting quote that would change each day.
- Allow ability to forward/share articles
* Session-only history of seen/swapped-away articles, recoverable without an account [done: History panel; Replace no longer recycles seen stories]
| Persistent history + favorites with sign-in [tabled: needs accounts]
* Ability to silence some categories temporarily (Maybe a user doesn't even want to see health-related articles, even good ones, so they're not reminded of an ongoing medical issue -- a way to avoid something purposely for a bit) [done: pause topics/flavors in Boundaries]
* Terms to avoid list (To filter even good news that you'd rather not hear about) [done: avoid words/phrases in Boundaries]
* Session-only history of seen/swapped-away articles, recoverable without an account [done: History panel; Replace no longer recycles seen stories]
+7
View File
@@ -80,3 +80,10 @@ def test_feed_excludes_dismissed(client):
r = client.get("/api/feed", params={"exclude": "1"})
ids = [i["id"] for i in r.json()["items"]]
assert 1 not in ids
def test_families_endpoint(client):
    """/api/families lists the known families, each with a list of tags."""
    payload = client.get("/api/families").json()
    family_names = [entry["name"] for entry in payload]
    assert "Discovery & Wonder" in family_names
    for entry in payload:
        assert "tags" in entry and isinstance(entry["tags"], list)
+64
View File
@@ -0,0 +1,64 @@
from goodnews.taxonomy import coerce_tags
from goodnews.db import connect, init_db
from goodnews.llm import normalize_scores, upsert_article_score
from goodnews import queries
def test_coerce_tags_validates_dedupes_caps():
    """coerce_tags drops unknown tags, removes repeats, and keeps at most four."""
    # Unknown and repeated entries are discarded; first-seen order is kept.
    assert coerce_tags(["science", "space", "bogus", "science"]) == ["science", "space"]

    # Five valid tags in, only the first four come back.
    five_valid = ["science", "space", "animals", "nature", "archaeology"]
    assert coerce_tags(five_valid) == ["science", "space", "animals", "nature"]

    # Anything that is not a list yields no tags.
    assert coerce_tags("not-a-list") == []
    assert coerce_tags(None) == []
def test_normalize_includes_valid_tags_only():
    """normalize_scores keeps the new primary topic and filters out unknown tags."""
    raw = {"topic": "technology", "flavor": "discovery", "tags": ["space", "nope"]}
    normalized = normalize_scores(raw, "m")
    assert normalized["topic"] == "technology"  # new primary topic accepted
    assert normalized["tags"] == ["space"]
def _db():
    """Build an in-memory database holding one source and articles 1 and 2."""
    conn = connect(":memory:")
    init_db(conn)
    conn.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
    insert_article = "INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (?,1,?,?,?)"
    for article_id in (1, 2):
        row = (article_id, f"http://s/{article_id}", f"t{article_id}", f"h{article_id}")
        conn.execute(insert_article, row)
    conn.commit()
    return conn
def _score(tags, topic="science"):
    """Return a normalized, accepted score payload carrying *tags* and *topic*."""
    raw = {
        "topic": topic,
        "flavor": "discovery",
        "accepted": True,
        "constructive_score": 7,
        "agency_score": 2,
        "human_benefit_score": 2,
        "tags": tags,
    }
    return normalize_scores(raw, "m")
def test_upsert_writes_tags_and_feed_filters_by_tag():
    """Upserted tags drive the feed's tag filter and are exposed on each row."""
    conn = _db()
    upsert_article_score(conn, 1, _score(["space", "animals"]))
    upsert_article_score(conn, 2, _score(["community"], topic="community"))
    conn.commit()

    # Filtering by a tag returns only the article that carries it.
    space_hits = queries.feed(conn, tag="space", limit=50)
    assert [row["id"] for row in space_hits] == [1]
    community_hits = queries.feed(conn, tag="community", limit=50)
    assert [row["id"] for row in community_hits] == [2]

    # The unfiltered feed exposes an article's tags as one comma-joined string.
    first = next(row for row in queries.feed(conn, limit=50) if row["id"] == 1)
    assert set(first["tags"].split(",")) == {"space", "animals"}
def test_reclassify_replaces_old_tags():
    """Scoring an article a second time swaps its tags instead of adding to them."""
    conn = _db()
    upsert_article_score(conn, 1, _score(["space", "animals"]))
    conn.commit()
    upsert_article_score(conn, 1, _score(["science"]))  # re-tag
    conn.commit()

    animals_ids = [row["id"] for row in queries.feed(conn, tag="animals", limit=50)]
    assert animals_ids == []  # old tag gone
    science_ids = [row["id"] for row in queries.feed(conn, tag="science", limit=50)]
    assert science_ids == [1]
def test_tag_counts():
    """tag_counts reports how many accepted articles carry each tag."""
    conn = _db()
    upsert_article_score(conn, 1, _score(["space", "science"]))
    upsert_article_score(conn, 2, _score(["science"]))
    conn.commit()

    totals = queries.tag_counts(conn)
    assert totals["science"] == 2
    assert totals["space"] == 1