a47a1504c8
Three-layer organization: primary topic (one per article, for ranking and brief balance) + grouping tags (1-4 per article from a controlled vocabulary, the organic "wandering" axis) + tonal flavor. - taxonomy: add technology + learning topics; 4 calm tag families (Discovery & Wonder, People & Kindness, Solutions & Progress, Mind & Craft) defined in code, not the DB; ALLOWED_TAGS union + coerce_tags validation. - db: article_tags(article_id, tag) join table + tag index. - llm: tags added to the classifier json_schema (enum-constrained, maxItems 4) and system prompt; normalize_scores coerces tags; upsert_article_score replaces a row's tags atomically on every (re)classification. - queries: feed gains a tag filter and exposes tags via group_concat; tag_counts. - api: Article.tags, feed tag param, and /api/families with per-tag counts. - tests: coerce/normalize/upsert/tag-filter/reclassify-replace/tag_counts + /api/families. 99 passing. Corpus reclassify (re-tag + new primary topics) runs separately against the local LLM. Frontend (B2) pairs with this; the live site is unchanged until then. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
206 lines
6.8 KiB
Python
206 lines
6.8 KiB
Python
"""Read-only query helpers over the goodNews database.

Pure stdlib and framework-agnostic: returns plain dicts so the same functions
back both the CLI and the JSON API. All article output is metadata + a link to
the original source — never stored bodies.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
|
|
# Composite ranking used everywhere a "best first" order is needed. Kept as one
# expression so brief, category feeds, and the API all rank identically.
# The expression adds the constructive, agency, human-benefit and source-trust
# scores and subtracts the cortisol, ragebait and PR-risk scores. It refers to
# the aliases `s` and `src`, so any query embedding it must define both.
RANK_SCORE_SQL = (
    "(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score "
    "- s.cortisol_score - s.ragebait_score - s.pr_risk_score)"
)

# Shared SELECT list for article rows. Queries that interpolate it must alias
# the articles table as `a`, the sources table as `src`, and the scores table
# as `s`. `tags` is produced by a correlated group_concat over article_tags,
# so it arrives as a single string rather than a list (NULL when the article
# has no tag rows, per SQLite's aggregate semantics). `rank_score` is the
# RANK_SCORE_SQL expression above.
_ARTICLE_COLUMNS = f"""
    a.id,
    a.title,
    a.description,
    a.canonical_url,
    a.published_at,
    a.image_url,
    src.name AS source_name,
    s.topic,
    s.flavor,
    s.accepted,
    s.constructive_score,
    s.cortisol_score,
    s.ragebait_score,
    s.agency_score,
    s.human_benefit_score,
    s.pr_risk_score,
    s.reason_code,
    s.reason_text,
    s.model_name,
    (SELECT group_concat(t.tag) FROM article_tags t WHERE t.article_id = a.id) AS tags,
    {RANK_SCORE_SQL} AS rank_score
"""
|
|
|
|
|
|
def feed(
    conn: sqlite3.Connection,
    topic: str | None = None,
    flavor: str | None = None,
    accepted_only: bool = True,
    limit: int = 30,
    offset: int = 0,
    include_topics: list[str] | None = None,
    include_flavors: list[str] | None = None,
    mute_topics: list[str] | None = None,
    mute_flavors: list[str] | None = None,
    max_cortisol: int | None = None,
    max_ragebait: int | None = None,
    tag: str | None = None,
) -> list[dict]:
    """Return one page of non-duplicate articles, best-ranked first.

    Every categorical restriction is pushed into the WHERE clause so that it
    takes effect before LIMIT/OFFSET: filtering after ranking would lose
    matching articles that rank too low to land inside the fetched window.
    Word-boundary avoid-terms are not handled here; callers apply them in
    Python afterwards.

    Filters (each is skipped when its argument is falsy / None):
      * accepted_only -- keep only rows with ``s.accepted = 1``.
      * topic / flavor -- exact match on the lower-cased value.
      * include_topics / include_flavors -- value must be in the list.
      * mute_topics / mute_flavors -- value must not be in the list.
      * max_cortisol / max_ragebait -- inclusive ceiling; a NULL score
        counts as 0.
      * tag -- the article must have a matching row in article_tags.

    Rows are ordered by rank_score, then by published_at (falling back to
    discovered_at), both descending. Each row is converted with ``dict(row)``,
    which assumes the connection's row factory yields mapping-like rows such
    as ``sqlite3.Row``.
    """
    conditions = ["a.duplicate_of IS NULL"]
    bindings: list = []

    def add(condition: str, *values) -> None:
        # Append a clause together with its bound values so that placeholder
        # order and parameter order can never drift apart.
        conditions.append(condition)
        bindings.extend(values)

    if accepted_only:
        add("s.accepted = 1")
    if topic:
        add("s.topic = ?", topic.lower())
    if flavor:
        add("s.flavor = ?", flavor.lower())

    membership_filters = (
        ("s.topic", include_topics, "IN"),
        ("s.flavor", include_flavors, "IN"),
        ("s.topic", mute_topics, "NOT IN"),
        ("s.flavor", mute_flavors, "NOT IN"),
    )
    for column, wanted, operator in membership_filters:
        if not wanted:
            continue
        lowered = [value.lower() for value in wanted]
        marks = ",".join("?" for _ in lowered)
        # COALESCE maps a NULL category to '' so NOT IN does not silently
        # discard rows that have no category at all.
        add(f"COALESCE({column}, '') {operator} ({marks})", *lowered)

    score_ceilings = (
        ("s.cortisol_score", max_cortisol),
        ("s.ragebait_score", max_ragebait),
    )
    for column, ceiling in score_ceilings:
        if ceiling is not None:
            add(f"COALESCE({column}, 0) <= ?", ceiling)

    if tag:
        add(
            "EXISTS (SELECT 1 FROM article_tags at WHERE at.article_id = a.id AND at.tag = ?)",
            tag.lower(),
        )

    where = "WHERE " + " AND ".join(conditions)
    sql = f"""
        SELECT {_ARTICLE_COLUMNS}
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        JOIN article_scores s ON s.article_id = a.id
        {where}
        ORDER BY rank_score DESC, COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ? OFFSET ?
    """
    page = conn.execute(sql, [*bindings, limit, offset]).fetchall()
    return [dict(record) for record in page]
|
|
|
|
|
|
def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int = 10) -> dict:
    """Return a stored daily brief (latest if no date) with its ranked items.

    Args:
        conn: Open database connection. Rows are read by column name and
            converted with ``dict(row)``, so the row factory is assumed to
            yield mapping-like rows such as ``sqlite3.Row``.
        brief_date: ``brief_date`` value of the brief to load. When falsy,
            the most recent stored brief is used.
        limit: Maximum number of items to return.

    Returns:
        A dict that always has the keys ``brief_date``, ``title``,
        ``created_at`` and ``items``. When no brief is stored at all, every
        field is ``None`` and ``items`` is empty; when the requested date has
        no brief, ``brief_date`` echoes the request and the rest is empty.
        Otherwise ``items`` holds the brief's articles ordered by their
        stored rank.
    """
    target_date = brief_date or _latest_brief_date(conn)
    if not target_date:
        # Nothing requested and nothing stored. Include "created_at" so this
        # result has the same keys as every other return path.
        return {"brief_date": None, "title": None, "created_at": None, "items": []}

    header = conn.execute(
        "SELECT brief_date, title, created_at FROM daily_briefs WHERE brief_date = ?",
        (target_date,),
    ).fetchone()
    if not header:
        return {"brief_date": target_date, "title": None, "created_at": None, "items": []}

    # LEFT JOIN on article_scores: a brief item is still listed when its
    # article has no score row; the score columns are then NULL.
    rows = conn.execute(
        f"""
        SELECT bi.rank, bi.selection_reason, {_ARTICLE_COLUMNS}
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        WHERE b.brief_date = ?
        ORDER BY bi.rank
        LIMIT ?
        """,
        (target_date, limit),
    ).fetchall()
    return {
        "brief_date": header["brief_date"],
        "title": header["title"],
        "created_at": header["created_at"],
        "items": [dict(row) for row in rows],
    }
|
|
|
|
|
|
def tag_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> dict:
    """Count, per grouping tag, the non-duplicate articles that carry it.

    Only articles with an article_scores row are counted. With
    ``accepted_only`` (the default) the count is further limited to accepted
    articles; otherwise rejected ones are included as well.

    Returns a ``{tag: count}`` mapping; tags with no qualifying article are
    absent rather than reported as zero.
    """
    conditions = ["a.duplicate_of IS NULL"]
    if accepted_only:
        conditions.append("s.accepted = 1")
    where = "WHERE " + " AND ".join(conditions)

    sql = f"""
        SELECT t.tag, COUNT(*) AS count
        FROM article_tags t
        JOIN articles a ON a.id = t.article_id
        JOIN article_scores s ON s.article_id = a.id
        {where}
        GROUP BY t.tag
    """
    return {record["tag"]: record["count"] for record in conn.execute(sql)}
|
|
|
|
|
|
def category_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> list[dict]:
    """Count non-duplicate articles per (topic, flavor) pair for browse UIs.

    Duplicates are excluded through the join on articles so the numbers line
    up with what the feed query returns for the same topic/flavor.

    With ``accepted_only`` (the default) only accepted rows are counted;
    otherwise every scored row that has a topic is counted.

    Returns a list of ``{"topic", "flavor", "count"}`` dicts ordered by topic,
    then flavor.
    """
    if accepted_only:
        score_filter = "s.accepted = 1"
    else:
        score_filter = "s.topic IS NOT NULL"
    where = " AND ".join(["a.duplicate_of IS NULL", score_filter])

    sql = f"""
        SELECT s.topic, s.flavor, COUNT(*) AS count
        FROM article_scores s
        JOIN articles a ON a.id = s.article_id
        WHERE {where}
        GROUP BY s.topic, s.flavor
        ORDER BY s.topic, s.flavor
    """
    return [dict(record) for record in conn.execute(sql)]
|
|
|
|
|
|
def available_dates(conn: sqlite3.Connection, limit: int = 30) -> list[str]:
    """Return up to ``limit`` stored brief dates, sorted by brief_date descending."""
    cursor = conn.execute(
        "SELECT brief_date FROM daily_briefs ORDER BY brief_date DESC LIMIT ?",
        (limit,),
    )
    return [record["brief_date"] for record in cursor]
|
|
|
|
|
|
def _latest_brief_date(conn: sqlite3.Connection) -> str | None:
|
|
row = conn.execute(
|
|
"SELECT brief_date FROM daily_briefs ORDER BY brief_date DESC LIMIT 1"
|
|
).fetchone()
|
|
return row["brief_date"] if row else None
|