Files
upbeatBytes/goodnews/queries.py
T
thejayman77 a47a1504c8 Phase B1: multi-tag groupings model (backend)
Three-layer organization: primary topic (one per article, for ranking and
brief balance) + grouping tags (1-4 per article from a controlled vocabulary,
the organic "wandering" axis) + tonal flavor.

- taxonomy: add technology + learning topics; 4 calm tag families
  (Discovery & Wonder, People & Kindness, Solutions & Progress, Mind & Craft)
  defined in code, not the DB; ALLOWED_TAGS union + coerce_tags validation.
- db: article_tags(article_id, tag) join table + tag index.
- llm: tags added to the classifier json_schema (enum-constrained, maxItems 4)
  and system prompt; normalize_scores coerces tags; upsert_article_score
  replaces a row's tags atomically on every (re)classification.
- queries: feed gains a tag filter and exposes tags via group_concat; tag_counts.
- api: Article.tags, feed tag param, and /api/families with per-tag counts.
- tests: coerce/normalize/upsert/tag-filter/reclassify-replace/tag_counts +
  /api/families. 99 passing.

Corpus reclassify (re-tag + new primary topics) runs separately against the
local LLM. Frontend (B2) pairs with this; the live site is unchanged until then.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 18:35:25 +00:00

206 lines
6.8 KiB
Python

"""Read-only query helpers over the goodNews database.
Pure stdlib and framework-agnostic: returns plain dicts so the same functions
back both the CLI and the JSON API. All article output is metadata + a link to
the original source — never stored bodies.
"""
from __future__ import annotations
import sqlite3
# Composite ranking used wherever a "best first" order is needed. Kept as a
# single SQL expression so every query that embeds it ranks identically.
#
# The expression adds the three positive per-article scores and the source's
# trust score, then subtracts the three negative per-article scores. It is a
# bare SQL fragment, so the enclosing query must bind the aliases `s`
# (article_scores) and `src` (sources).
#
# NOTE: standard SQL arithmetic applies -- if any operand is NULL the whole
# expression evaluates to NULL (e.g. for rows reached through a LEFT JOIN on
# article_scores that have no score row).
RANK_SCORE_SQL = (
    "(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score "
    "- s.cortisol_score - s.ragebait_score - s.pr_risk_score)"
)
# Shared SELECT column list for every query that returns article rows, so all
# callers expose the same keys.
#
# The fragment expects these table aliases in the enclosing query:
#   a   -> articles
#   src -> sources
#   s   -> article_scores
#
# `tags` comes from a correlated subquery over article_tags and is a single
# string built by group_concat (comma-separated by default), or NULL when the
# article has no tag rows. SQLite does not promise any particular order of the
# concatenated tags, so callers should not depend on it.
#
# `rank_score` is the composite RANK_SCORE_SQL expression, exposed under an
# alias so queries can ORDER BY it.
_ARTICLE_COLUMNS = f"""
a.id,
a.title,
a.description,
a.canonical_url,
a.published_at,
a.image_url,
src.name AS source_name,
s.topic,
s.flavor,
s.accepted,
s.constructive_score,
s.cortisol_score,
s.ragebait_score,
s.agency_score,
s.human_benefit_score,
s.pr_risk_score,
s.reason_code,
s.reason_text,
s.model_name,
(SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
{RANK_SCORE_SQL} AS rank_score
"""
def feed(
    conn: sqlite3.Connection,
    topic: str | None = None,
    flavor: str | None = None,
    accepted_only: bool = True,
    limit: int = 30,
    offset: int = 0,
    include_topics: list[str] | None = None,
    include_flavors: list[str] | None = None,
    mute_topics: list[str] | None = None,
    mute_flavors: list[str] | None = None,
    max_cortisol: int | None = None,
    max_ragebait: int | None = None,
    tag: str | None = None,
) -> list[dict]:
    """Return ranked, non-duplicate articles with all filters applied in SQL.

    Categorical filters (topic/flavor include & mute, cortisol/ragebait
    ceilings, tag) must be applied here, not after ranking -- otherwise
    low-ranked-but-matching items (e.g. 'discovery' for a Wonder lane) fall
    outside any over-fetch window. Word-boundary avoid-terms remain a Python
    pass on the caller side.

    Args:
        conn: Open database connection. Rows are converted with ``dict(row)``,
            so the connection is expected to use a mapping-style row factory
            such as ``sqlite3.Row``.
        topic: Keep only rows whose topic equals this value (lowercased).
        flavor: Keep only rows whose flavor equals this value (lowercased).
        accepted_only: When true, keep only rows with ``accepted = 1``.
        limit: Value bound to the SQL ``LIMIT``.
        offset: Value bound to the SQL ``OFFSET``.
        include_topics: Keep only rows whose topic is in this list.
        include_flavors: Keep only rows whose flavor is in this list.
        mute_topics: Drop rows whose topic is in this list; rows with a NULL
            topic are kept.
        mute_flavors: Drop rows whose flavor is in this list; rows with a NULL
            flavor are kept.
        max_cortisol: Inclusive ceiling on ``cortisol_score`` (NULL counts as 0).
        max_ragebait: Inclusive ceiling on ``ragebait_score`` (NULL counts as 0).
        tag: Keep only articles that have this grouping tag (lowercased).

    Returns:
        One dict per article, keyed by the shared article column list, ordered
        best-first by ``rank_score``, then newest-first, then by article id.
    """
    clauses = ["a.duplicate_of IS NULL"]
    params: list = []

    if accepted_only:
        clauses.append("s.accepted = 1")
    if topic:
        clauses.append("s.topic = ?")
        params.append(topic.lower())
    if flavor:
        clauses.append("s.flavor = ?")
        params.append(flavor.lower())

    def _in(column: str, values: list[str], negate: bool = False) -> None:
        """Append a parameterized ``IN`` / ``NOT IN`` clause for *column*."""
        vals = [v.lower() for v in values]
        placeholders = ",".join("?" * len(vals))
        op = "NOT IN" if negate else "IN"
        # COALESCE keeps NULL-category rows from being dropped by NOT IN.
        clauses.append(f"COALESCE({column}, '') {op} ({placeholders})")
        params.extend(vals)

    if include_topics:
        _in("s.topic", include_topics)
    if include_flavors:
        _in("s.flavor", include_flavors)
    if mute_topics:
        _in("s.topic", mute_topics, negate=True)
    if mute_flavors:
        _in("s.flavor", mute_flavors, negate=True)

    if max_cortisol is not None:
        clauses.append("COALESCE(s.cortisol_score, 0) <= ?")
        params.append(max_cortisol)
    if max_ragebait is not None:
        clauses.append("COALESCE(s.ragebait_score, 0) <= ?")
        params.append(max_ragebait)

    if tag:
        clauses.append(
            "EXISTS (SELECT 1 FROM article_tags at WHERE at.article_id = a.id AND at.tag = ?)"
        )
        params.append(tag.lower())

    where = "WHERE " + " AND ".join(clauses)
    params.extend([limit, offset])

    # The trailing `a.id` key makes the ordering total. Without it, rows that
    # tie on rank_score and timestamp come back in an unspecified order, so
    # LIMIT/OFFSET paging could repeat or skip articles between pages.
    rows = conn.execute(
        f"""
        SELECT {_ARTICLE_COLUMNS}
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        JOIN article_scores s ON s.article_id = a.id
        {where}
        ORDER BY rank_score DESC,
                 COALESCE(a.published_at, a.discovered_at) DESC,
                 a.id
        LIMIT ? OFFSET ?
        """,
        params,
    ).fetchall()
    return [dict(row) for row in rows]
def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int = 10) -> dict:
    """Return a stored daily brief (latest if no date) with its ranked items.

    Args:
        conn: Open database connection. Rows are read by column name, so the
            connection is expected to use a mapping-style row factory such as
            ``sqlite3.Row``.
        brief_date: Date key of the brief to load. When omitted or empty, the
            most recent stored brief date is used.
        limit: Maximum number of items to return.

    Returns:
        A dict with the keys ``brief_date``, ``title``, ``created_at`` and
        ``items``. Every return path carries the same four keys: when no brief
        exists the metadata values are ``None`` and ``items`` is empty.
    """
    target_date = brief_date or _latest_brief_date(conn)
    if not target_date:
        # No brief stored at all. Keep the same key set as the other return
        # paths so callers can index "created_at" unconditionally.
        return {"brief_date": None, "title": None, "created_at": None, "items": []}

    header = conn.execute(
        "SELECT brief_date, title, created_at FROM daily_briefs WHERE brief_date = ?",
        (target_date,),
    ).fetchone()
    if not header:
        # A date was requested (or resolved) but no brief row matches it.
        return {"brief_date": target_date, "title": None, "created_at": None, "items": []}

    # LEFT JOIN on article_scores: an item is still listed when its article
    # has no score row; the score-derived columns are then NULL.
    rows = conn.execute(
        f"""
        SELECT bi.rank, bi.selection_reason, {_ARTICLE_COLUMNS}
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        WHERE b.brief_date = ?
        ORDER BY bi.rank
        LIMIT ?
        """,
        (target_date, limit),
    ).fetchall()
    return {
        "brief_date": header["brief_date"],
        "title": header["title"],
        "created_at": header["created_at"],
        "items": [dict(row) for row in rows],
    }
def tag_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> dict:
    """Count non-duplicate articles per grouping tag.

    With ``accepted_only`` set (the default) only accepted articles are
    counted. The result maps each tag to its article count; tags with no
    qualifying article do not appear.
    """
    conditions = ["a.duplicate_of IS NULL"]
    if accepted_only:
        conditions.append("s.accepted = 1")
    where = "WHERE " + " AND ".join(conditions)

    sql = f"""
        SELECT t.tag, COUNT(*) AS count
        FROM article_tags t
        JOIN articles a ON a.id = t.article_id
        JOIN article_scores s ON s.article_id = a.id
        {where}
        GROUP BY t.tag
        """

    counts: dict = {}
    for record in conn.execute(sql).fetchall():
        counts[record["tag"]] = record["count"]
    return counts
def category_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> list[dict]:
    """Return article counts per (topic, flavor) pair for browse UIs.

    Duplicates are excluded by joining ``articles``, so the numbers line up
    with what the feed query returns for each topic/flavor. With
    ``accepted_only`` the count covers accepted rows; otherwise it covers
    every row that has a topic.
    """
    if accepted_only:
        score_filter = "s.accepted = 1"
    else:
        score_filter = "s.topic IS NOT NULL"
    condition = " AND ".join(["a.duplicate_of IS NULL", score_filter])

    sql = f"""
        SELECT s.topic, s.flavor, COUNT(*) AS count
        FROM article_scores s
        JOIN articles a ON a.id = s.article_id
        WHERE {condition}
        GROUP BY s.topic, s.flavor
        ORDER BY s.topic, s.flavor
        """
    records = conn.execute(sql).fetchall()
    return list(map(dict, records))
def available_dates(conn: sqlite3.Connection, limit: int = 30) -> list[str]:
    """Return up to ``limit`` stored brief dates, most recent first."""
    sql = "SELECT brief_date FROM daily_briefs ORDER BY brief_date DESC LIMIT ?"
    dates: list[str] = []
    for record in conn.execute(sql, (limit,)).fetchall():
        dates.append(record["brief_date"])
    return dates
def _latest_brief_date(conn: sqlite3.Connection) -> str | None:
    """Return the most recent stored brief date, or None when there is none."""
    newest = conn.execute(
        "SELECT brief_date FROM daily_briefs ORDER BY brief_date DESC LIMIT 1"
    ).fetchone()
    if not newest:
        return None
    return newest["brief_date"]