Phase B1: multi-tag groupings model (backend)

Three-layer organization: primary topic (one per article, for ranking and brief balance) + grouping tags (1-4 per article from a controlled vocabulary, the organic "wandering" axis) + tonal flavor. - taxonomy: add technology + learning topics; 4 calm tag families (Discovery & Wonder, People & Kindness, Solutions & Progress, Mind & Craft) defined in code, not the DB; ALLOWED_TAGS union + coerce_tags validation. - db: article_tags(article_id, tag) join table + tag index. - llm: tags added to the classifier json_schema (enum-constrained, maxItems 4) and system prompt; normalize_scores coerces tags; upsert_article_score replaces a row's tags atomically on every (re)classification. - queries: feed gains a tag filter and exposes tags via group_concat; tag_counts. - api: Article.tags, feed tag param, and /api/families with per-tag counts. - tests: coerce/normalize/upsert/tag-filter/reclassify-replace/tag_counts + /api/families. 99 passing. Corpus reclassify (re-tag + new primary topics) runs separately against the local LLM. Frontend (B2) pairs with this; the live site is unchanged until then. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 18:35:25 +00:00
parent c7f4db3973
commit a47a1504c8
8 changed files with 203 additions and 16 deletions
@@ -34,7 +34,7 @@ from .hero import safe_to_lead
 from .llm import LocalModelClient
 from .moods import MOODS, mood_filter
 from .paywall import is_paywalled
-from .taxonomy import FLAVORS, TOPICS
+from .taxonomy import FAMILIES, FLAVORS, TOPICS

 ROOT = Path(__file__).resolve().parents[1]
 DEFAULT_DB = ROOT / "data" / "goodnews.sqlite3"
@@ -126,9 +126,11 @@ class Article(BaseModel):
    model_name: str | None = None
    rank: int | None = None  # position within a brief, when applicable
    paywalled: bool = False
+    tags: list[str] = []

    @classmethod
    def from_row(cls, row: dict) -> "Article":
+        raw_tags = row.get("tags")
        return cls(
            id=row["id"],
            title=row["title"],
@@ -146,6 +148,7 @@ class Article(BaseModel):
            model_name=row.get("model_name"),
            rank=row.get("rank"),
            paywalled=is_paywalled(row.get("canonical_url")),
+            tags=[t for t in (raw_tags.split(",") if raw_tags else []) if t],
        )


@@ -240,6 +243,20 @@ def create_app() -> FastAPI:
        # client merges with the user's own Calm Filters.
        return MOODS

+    @app.get("/api/families")
+    def families() -> list[dict]:
+        # Grouping vocabulary organised into calm families for the Explore UI.
+        with get_conn() as conn:
+            counts = queries.tag_counts(conn)
+        return [
+            {
+                "name": name,
+                "description": d["description"],
+                "tags": [{"key": t, "count": counts.get(t, 0)} for t in d["tags"]],
+            }
+            for name, d in FAMILIES.items()
+        ]
+
    @app.get("/api/category-counts", response_model=list[CategoryCount])
    def category_counts(accepted_only: bool = True, prefs: str | None = Query(None)) -> list[CategoryCount]:
        fp = prefs_from_json(prefs)
@@ -267,6 +284,7 @@ def create_app() -> FastAPI:
        offset: int = Query(0, ge=0),
        prefs: str | None = Query(None),
        exclude: str = Query("", description="comma-separated article ids the reader has dismissed"),
+        tag: str | None = Query(None, description="grouping tag to browse"),
    ) -> FeedResponse:
        if topic and topic.lower() not in TOPICS:
            raise HTTPException(400, f"unknown topic: {topic}")
@@ -285,14 +303,14 @@ def create_app() -> FastAPI:
                fetch_n = min(2000, (offset + limit) * 4 + 50 + len(excl))
                raw = queries.feed(
                    conn, topic=topic, flavor=flavor, accepted_only=accepted_only,
-                    limit=fetch_n, offset=0, **kw,
+                    limit=fetch_n, offset=0, tag=tag, **kw,
                )
                kept = [a for a in filter_articles(raw, fp, now) if a["id"] not in excl]
                rows = kept[offset : offset + limit]
            else:
                rows = queries.feed(
                    conn, topic=topic, flavor=flavor, accepted_only=accepted_only,
-                    limit=limit, offset=offset, **kw,
+                    limit=limit, offset=offset, tag=tag, **kw,
                )
        # Keep the top of a browse view readable: stable-sort paywalled items
        # below readable ones (composite order preserved within each group).
@@ -70,6 +70,14 @@ CREATE TABLE IF NOT EXISTS article_scores (
    scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
 );

+CREATE TABLE IF NOT EXISTS article_tags (
+    article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
+    tag TEXT NOT NULL,
+    PRIMARY KEY (article_id, tag)
+);
+
+CREATE INDEX IF NOT EXISTS idx_article_tags_tag ON article_tags(tag);
+
 CREATE TABLE IF NOT EXISTS article_embeddings (
    article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
    vector BLOB NOT NULL,
@@ -9,11 +9,15 @@ from collections.abc import Callable
 from dataclasses import dataclass

 from .taxonomy import (
+    ALLOWED_TAGS,
    FLAVORS,
+    MAX_TAGS,
    TOPICS,
    coerce_flavor,
+    coerce_tags,
    coerce_topic,
    flavors_prompt_block,
+    tags_prompt_block,
    topics_prompt_block,
 )

@@ -42,6 +46,7 @@ CLASSIFICATION_SCHEMA = {
        "accepted",
        "topic",
        "flavor",
+        "tags",
        "reason_code",
        "reason_text",
    ],
@@ -56,6 +61,7 @@ CLASSIFICATION_SCHEMA = {
        "accepted": {"type": "boolean"},
        "topic": {"type": "string", "enum": list(TOPICS)},
        "flavor": {"type": "string", "enum": list(FLAVORS)},
+        "tags": {"type": "array", "items": {"type": "string", "enum": list(ALLOWED_TAGS)}, "maxItems": MAX_TAGS},
        "reason_code": {"type": "string"},
        "reason_text": {"type": "string"},
    },
@@ -76,14 +82,20 @@ Judge emotional aftertaste, not simple positivity. Accept stories that leave a r

 Reject stories centered on fear, outrage, partisan conflict, crime, tragedy, disaster repetition, celebrity drama, market panic, or corporate PR without clear public benefit.

-Also assign one topic and one flavor, choosing the single best fit.
+Also assign one primary topic and one flavor (the single best fit), plus 1-4 grouping tags.

-Topic (what the story is about):
+Primary topic (what the story is mainly about):
 {topics}

 Flavor (why it belongs in a calm, uplifting digest):
 {flavors}

+Grouping tags — choose ONLY from this controlled vocabulary:
+{tags}
+
+Tag discipline: assign 1-4 tags; prefer fewer, stronger ones; never tag by weak
+association; pick tags a reader would reasonably use to find this story later.
+
 Return only JSON with this exact shape:
 {{
  "constructive_score": 0,
@@ -96,10 +108,11 @@ Return only JSON with this exact shape:
  "accepted": false,
  "topic": "one_of_the_allowed_topics",
  "flavor": "one_of_the_allowed_flavors",
+  "tags": ["one_to_four_allowed_tags"],
  "reason_code": "short_snake_case",
  "reason_text": "one concise sentence"
 }}
-""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block())
+""".format(topics=topics_prompt_block(), flavors=flavors_prompt_block(), tags=tags_prompt_block())


@dataclass
@@ -285,6 +298,7 @@ def normalize_scores(data: dict, model_name: str) -> dict:
        "accepted": 1 if bool(data.get("accepted")) else 0,
        "topic": coerce_topic(data.get("topic")),
        "flavor": coerce_flavor(data.get("flavor")),
+        "tags": coerce_tags(data.get("tags")),
        "reason_code": str(data.get("reason_code") or "model_no_reason")[:120],
        "reason_text": str(data.get("reason_text") or "")[:1000],
        "model_name": model_name,
@@ -333,6 +347,12 @@ def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict
            scores["model_name"],
        ),
    )
+    # Replace this article's grouping tags (controlled vocabulary, 0-4).
+    conn.execute("DELETE FROM article_tags WHERE article_id = ?", (article_id,))
+    for tag in scores.get("tags") or []:
+        conn.execute(
+            "INSERT OR IGNORE INTO article_tags (article_id, tag) VALUES (?, ?)", (article_id, tag)
+        )


 def _classification_candidates(
@@ -36,6 +36,7 @@ _ARTICLE_COLUMNS = f"""
    s.reason_code,
    s.reason_text,
    s.model_name,
+    (SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
    {RANK_SCORE_SQL} AS rank_score
 """

@@ -53,6 +54,7 @@ def feed(
    mute_flavors: list[str] | None = None,
    max_cortisol: int | None = None,
    max_ragebait: int | None = None,
+    tag: str | None = None,
 ) -> list[dict]:
    """Return ranked articles with categorical filters applied in SQL.

@@ -94,6 +96,9 @@ def feed(
    if max_ragebait is not None:
        clauses.append("COALESCE(s.ragebait_score, 0) <= ?")
        params.append(max_ragebait)
+    if tag:
+        clauses.append("EXISTS (SELECT 1 FROM article_tags at WHERE at.article_id = a.id AND at.tag = ?)")
+        params.append(tag.lower())

    where = "WHERE " + " AND ".join(clauses)
    params.extend([limit, offset])
@@ -148,6 +153,22 @@ def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int =
    }


+def tag_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> dict:
+    """How many shown (accepted, non-duplicate) articles carry each grouping tag."""
+    where = "WHERE a.duplicate_of IS NULL" + (" AND s.accepted = 1" if accepted_only else "")
+    rows = conn.execute(
+        f"""
+        SELECT t.tag, COUNT(*) AS count
+        FROM article_tags t
+        JOIN articles a ON a.id = t.article_id
+        JOIN article_scores s ON s.article_id = a.id
+        {where}
+        GROUP BY t.tag
+        """
+    ).fetchall()
+    return {r["tag"]: r["count"] for r in rows}
+
+
 def category_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> list[dict]:
    """Return per topic/flavor article counts for building browse UIs.

@@ -9,16 +9,45 @@ browsable feeds.
 from __future__ import annotations


-# Topical axis: what the story is primarily about.
+# Primary topic — exactly one per article. Used for ranking, brief balance, and
+# source reports (the "machine organization" axis).
 TOPICS: dict[str, str] = {
-    "science": "research, discoveries, space, physics, technology",
+    "science": "research, discoveries, space, physics",
+    "technology": "computing, AI, engineering, gadgets, digital tools",
    "environment": "conservation, climate solutions, ecosystems, clean energy",
    "health": "medicine, wellbeing, mental health, public health",
    "community": "local action, humanitarian work, social progress, kindness, fair work",
    "culture": "arts, history, heritage, sport, human-interest",
    "animals": "wildlife, nature discoveries, charming animal stories",
+    "learning": "education, personal growth, practical knowledge, curiosity",
 }

+# Groupings — 1–4 per article, the "human wandering" axis. A controlled
+# vocabulary (never free-form) organised into calm families for the Explore UI.
+# Families live in code, not the DB. Tag slugs are lowercase, hyphenated.
+# Each family maps its display name to a "description" string and an ordered
+# "tags" list; the order here is the order used wherever FAMILIES is iterated.
+FAMILIES: dict[str, dict] = {
+    "Discovery & Wonder": {
+        "description": "Awe, science, and the natural world.",
+        "tags": ["science", "space", "animals", "nature", "archaeology", "technology", "curiosity"],
+    },
+    "People & Kindness": {
+        "description": "Community, generosity, and human warmth.",
+        "tags": ["community", "helping", "culture", "generosity", "resilience", "local-wins"],
+    },
+    "Solutions & Progress": {
+        "description": "Problems being solved.",
+        "tags": ["environment", "climate-solutions", "public-health", "cities", "clean-energy", "innovation"],
+    },
+    "Mind & Craft": {
+        "description": "Ideas, learning, and making.",
+        "tags": ["learning", "ideas", "arts", "books", "creativity", "perspective", "work-life"],
+    },
+}
+
+# Flat allowed-tag set (union of all families), for enum + validation.
+# dict.fromkeys de-duplicates while keeping first-seen order, so a tag listed
+# under two families would appear only once here.
+ALLOWED_TAGS: tuple[str, ...] = tuple(dict.fromkeys(t for f in FAMILIES.values() for t in f["tags"]))
+# Upper bound on tags kept per article (default cap for tag validation).
+MAX_TAGS = 4
+
 # Tonal axis: why the story is worth surfacing in a calm, uplifting digest.
 FLAVORS: dict[str, str] = {
    "breakthrough": "a significant advance or innovation with clear public benefit",
@@ -42,6 +71,24 @@ def coerce_flavor(value: object) -> str:
    return text if text in FLAVORS else DEFAULT_FLAVOR


+def coerce_tags(value: object, max_tags: int = MAX_TAGS) -> list[str]:
+    """Validate a model-supplied tag list against the controlled vocabulary."""
+    if not isinstance(value, list):
+        return []
+    out: list[str] = []
+    for item in value:
+        tag = str(item).strip().lower()
+        if tag in ALLOWED_TAGS and tag not in out:
+            out.append(tag)
+        if len(out) >= max_tags:
+            break
+    return out
+
+
+def tags_prompt_block() -> str:
+    return "\n".join(f"- {family}: {', '.join(d['tags'])}" for family, d in FAMILIES.items())
+
+
 def _bullet_list(mapping: dict[str, str]) -> str:
    return "\n".join(f"- {key}: {desc}" for key, desc in mapping.items())

@@ -6,17 +6,19 @@
 $ = informational


-* Ability to silence some categories temporarily (Maybe a user doesn't even want to see health-related articles, even good ones, so they're not reminded of an ongoing medical issue -- a way to avoid something purposely for a bit)  [done: pause topics/flavors in Boundaries]
-* Terms to avoid list (To filter even good news that you'd rather not hear about)  [done: avoid words/phrases in Boundaries]
 | Favorite/save articles  [tabled: needs accounts/logins for a larger footprint]
 | Soothing background colors/gradients per each category as you scroll.  Maybe a user preference.  [tabled: revisit deliberately; if done, whisper-quiet translucent tints, not neon]
-$ I really like the coloring for the metadata highlighting in each card (The grading bubbles)
+| Persistent history + favorites with sign-in  [tabled: needs accounts]
+
+-l Somehow include a daily inspirational/motivational/uplifting quote that would change each day.
+- Allow ability to forward/share articles
+
+
+##### Completed Sections #####
 * Some articles are behind paywalls.. what can we do?  [done: domain-level paywall flag, readable hero, paywalled downweighted out of the daily five, and a "paywall-heavy" advisory source-health flag (Nature/New Scientist). Replace handles any that remain in browse.]
 * After an article is read, can we add a refresh button to fetch a replacement for it in the list?  [done: "Find one I can read" / Replace swaps in the next readable article]
* I want the top 5 to be there, but I want the remaining categories to be hidden behind their selections.  So the main screen should show just the current highlights, and then the other articles should only be visible when in that category.  [done]
* Title headings should be a little larger -- if you select Today, Today should look like a proper heading, bold and beautiful.  Switching to Wonder should show "Wonder" all nice and whatnot.  [done]
-
-l Shomehow include a daily inspirational/motivational/uplifting quote that would change each day.
- Allow ability to forward/share articles
-* Session-only history of seen/swapped-away articles, recoverable without an account  [done: History panel; Replace no longer recycles seen stories]
-| Persistent history + favorites with sign-in  [tabled: needs accounts]
+* Ability to silence some categories temporarily (Maybe a user doesn't even want to see health-related articles, even good ones, so they're not reminded of an ongoing medical issue -- a way to avoid something purposely for a bit)  [done: pause topics/flavors in Boundaries]
+* Terms to avoid list (To filter even good news that you'd rather not hear about)  [done: avoid words/phrases in Boundaries]
+* Session-only history of seen/swapped-away articles, recoverable without an account  [done: History panel; Replace no longer recycles seen stories]
@@ -80,3 +80,10 @@ def test_feed_excludes_dismissed(client):
    r = client.get("/api/feed", params={"exclude": "1"})
    ids = [i["id"] for i in r.json()["items"]]
    assert 1 not in ids
+
+
+def test_families_endpoint(client):
+    fams = client.get("/api/families").json()
+    names = [f["name"] for f in fams]
+    assert "Discovery & Wonder" in names
+    assert all("tags" in f and isinstance(f["tags"], list) for f in fams)
@@ -0,0 +1,64 @@
+from goodnews.taxonomy import coerce_tags
+from goodnews.db import connect, init_db
+from goodnews.llm import normalize_scores, upsert_article_score
+from goodnews import queries
+
+
+def test_coerce_tags_validates_dedupes_caps():
+    assert coerce_tags(["science", "space", "bogus", "science"]) == ["science", "space"]
+    assert coerce_tags(["science", "space", "animals", "nature", "archaeology"]) == \
+        ["science", "space", "animals", "nature"]  # capped at 4
+    assert coerce_tags("not-a-list") == []
+    assert coerce_tags(None) == []
+
+
+def test_normalize_includes_valid_tags_only():
+    s = normalize_scores({"topic": "technology", "flavor": "discovery", "tags": ["space", "nope"]}, "m")
+    assert s["topic"] == "technology"   # new primary topic accepted
+    assert s["tags"] == ["space"]
+
+
+def _db():
+    c = connect(":memory:"); init_db(c)
+    c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
+    for aid in (1, 2):
+        c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (?,1,?,?,?)",
+                  (aid, f"http://s/{aid}", f"t{aid}", f"h{aid}"))
+    c.commit()
+    return c
+
+
+def _score(tags, topic="science"):
+    return normalize_scores({"topic": topic, "flavor": "discovery", "accepted": True,
+                             "constructive_score": 7, "agency_score": 2, "human_benefit_score": 2,
+                             "tags": tags}, "m")
+
+
+def test_upsert_writes_tags_and_feed_filters_by_tag():
+    c = _db()
+    upsert_article_score(c, 1, _score(["space", "animals"]))
+    upsert_article_score(c, 2, _score(["community"], topic="community"))
+    c.commit()
+    assert [r["id"] for r in queries.feed(c, tag="space", limit=50)] == [1]
+    assert [r["id"] for r in queries.feed(c, tag="community", limit=50)] == [2]
+    row1 = next(r for r in queries.feed(c, limit=50) if r["id"] == 1)
+    assert set(row1["tags"].split(",")) == {"space", "animals"}
+
+
+def test_reclassify_replaces_old_tags():
+    c = _db()
+    upsert_article_score(c, 1, _score(["space", "animals"]))
+    c.commit()
+    upsert_article_score(c, 1, _score(["science"]))  # re-tag
+    c.commit()
+    assert [r["id"] for r in queries.feed(c, tag="animals", limit=50)] == []   # old tag gone
+    assert [r["id"] for r in queries.feed(c, tag="science", limit=50)] == [1]
+
+
+def test_tag_counts():
+    c = _db()
+    upsert_article_score(c, 1, _score(["space", "science"]))
+    upsert_article_score(c, 2, _score(["science"]))
+    c.commit()
+    counts = queries.tag_counts(c)
+    assert counts["science"] == 2 and counts["space"] == 1