# upbeatBytes/goodnews/queries.py

"""Read-only query helpers over the goodNews database.

Pure stdlib and framework-agnostic: returns plain dicts so the same functions
back both the CLI and the JSON API. All article output is metadata + a link to
the original source — never stored bodies.
"""

from __future__ import annotations

import sqlite3

# Composite ranking used everywhere a "best first" order is needed. Kept as one
# expression so brief, category feeds, and the API all rank identically.
#
# The score adds the constructive, agency, and human-benefit scores plus the
# source's trust score, then subtracts the cortisol, ragebait, and PR-risk
# scores. It is a raw SQL fragment, so any query embedding it must alias the
# scores table as `s` and the sources table as `src`.
# NOTE(review): there is no COALESCE here, so under SQL NULL arithmetic a NULL
# in any single component makes the whole rank NULL — confirm that is intended.
RANK_SCORE_SQL = (
    "(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score "
    "- s.cortisol_score - s.ragebait_score - s.pr_risk_score)"
)

# Shared SELECT list for the article queries in this module: article metadata,
# the source name, the per-article score columns, and the computed rank_score.
# Like RANK_SCORE_SQL it is a raw SQL fragment and expects the aliases `a`
# (articles), `src` (sources), and `s` (article_scores) to be in scope.
# Only metadata and the canonical URL are selected — no article body column.
_ARTICLE_COLUMNS = f"""
    a.id,
    a.title,
    a.description,
    a.canonical_url,
    a.published_at,
    a.image_url,
    src.name AS source_name,
    s.topic,
    s.flavor,
    s.accepted,
    s.constructive_score,
    s.cortisol_score,
    s.ragebait_score,
    s.agency_score,
    s.human_benefit_score,
    s.pr_risk_score,
    s.reason_code,
    s.reason_text,
    s.model_name,
    {RANK_SCORE_SQL} AS rank_score
"""


def feed(
    conn: sqlite3.Connection,
    topic: str | None = None,
    flavor: str | None = None,
    accepted_only: bool = True,
    limit: int = 30,
    offset: int = 0,
    include_topics: list[str] | None = None,
    include_flavors: list[str] | None = None,
    mute_topics: list[str] | None = None,
    mute_flavors: list[str] | None = None,
    max_cortisol: int | None = None,
    max_ragebait: int | None = None,
) -> list[dict]:
    """Return ranked articles with categorical filters applied in SQL.

    Categorical filters (topic/flavor include & mute, cortisol/ragebait ceilings)
    must be applied here, not after ranking — otherwise low-ranked-but-matching
    items (e.g. 'discovery' for a Wonder lane) fall outside any over-fetch window.
    Word-boundary avoid-terms remain a Python pass on the caller side.

    Rows whose ``duplicate_of`` is set are always excluded. Results are ordered
    by rank score, then by publication time (falling back to discovery time),
    then by article id. The id tiebreaker makes the order deterministic for rows
    that tie on the first two keys, so LIMIT/OFFSET pages do not repeat or skip
    them (assuming ``a.id`` is unique per result row).

    Args:
        conn: Open database connection. Rows are converted with ``dict(row)``,
            so the connection presumably uses ``row_factory = sqlite3.Row``.
        topic: If truthy, keep only rows whose topic equals this value.
        flavor: If truthy, keep only rows whose flavor equals this value.
        accepted_only: When true, keep only rows with ``s.accepted = 1``.
        limit: Passed through unchanged to SQL ``LIMIT``.
        offset: Passed through unchanged to SQL ``OFFSET``.
        include_topics: If non-empty, topic must be one of these.
        include_flavors: If non-empty, flavor must be one of these.
        mute_topics: If non-empty, topic must not be one of these.
        mute_flavors: If non-empty, flavor must not be one of these.
        max_cortisol: If not None, cortisol score (NULL counted as 0) must be
            at most this value.
        max_ragebait: If not None, ragebait score (NULL counted as 0) must be
            at most this value.

    All string filter values are lowercased before being bound as parameters.

    Returns:
        One plain dict per matching article, best-ranked first.
    """
    # Clauses and params are appended in lockstep so that each "?" lines up
    # with its bound value.
    clauses = ["a.duplicate_of IS NULL"]
    params: list = []
    if accepted_only:
        clauses.append("s.accepted = 1")

    # Single-value equality filters.
    for column, value in (("s.topic", topic), ("s.flavor", flavor)):
        if value:
            clauses.append(f"{column} = ?")
            params.append(value.lower())

    # Multi-value include / mute filters.
    membership_filters = (
        ("s.topic", include_topics, "IN"),
        ("s.flavor", include_flavors, "IN"),
        ("s.topic", mute_topics, "NOT IN"),
        ("s.flavor", mute_flavors, "NOT IN"),
    )
    for column, values, op in membership_filters:
        if not values:
            continue
        lowered = [v.lower() for v in values]
        placeholders = ",".join("?" * len(lowered))
        # COALESCE keeps NULL-category rows from being dropped by NOT IN.
        clauses.append(f"COALESCE({column}, '') {op} ({placeholders})")
        params.extend(lowered)

    # Score ceilings; a NULL score is treated as 0 and therefore passes.
    ceilings = (
        ("s.cortisol_score", max_cortisol),
        ("s.ragebait_score", max_ragebait),
    )
    for column, ceiling in ceilings:
        if ceiling is not None:
            clauses.append(f"COALESCE({column}, 0) <= ?")
            params.append(ceiling)

    where = "WHERE " + " AND ".join(clauses)
    params.extend([limit, offset])

    rows = conn.execute(
        f"""
        SELECT {_ARTICLE_COLUMNS}
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        JOIN article_scores s ON s.article_id = a.id
        {where}
        ORDER BY rank_score DESC,
                 COALESCE(a.published_at, a.discovered_at) DESC,
                 a.id DESC
        LIMIT ? OFFSET ?
        """,
        params,
    ).fetchall()
    return [dict(row) for row in rows]


def brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int = 10) -> dict:
    """Look up one stored daily brief together with its items in rank order.

    When ``brief_date`` is falsy, the most recent stored brief date is used.
    The result always carries the keys ``brief_date``, ``title`` and ``items``:

    * no date could be resolved: ``brief_date`` and ``title`` are None, no items;
    * no brief stored for the resolved date: that date, ``title`` None, no items;
    * otherwise: the stored date and title plus at most ``limit`` item dicts,
      ordered by their stored rank.
    """
    wanted = brief_date if brief_date else _latest_brief_date(conn)
    if not wanted:
        return {"brief_date": None, "title": None, "items": []}

    header_query = "SELECT brief_date, title FROM daily_briefs WHERE brief_date = ?"
    meta = conn.execute(header_query, (wanted,)).fetchone()
    if not meta:
        return {"brief_date": wanted, "title": None, "items": []}

    # Scores are LEFT JOINed, so a brief item without a score row is still
    # returned (its score columns come back as NULL).
    item_rows = conn.execute(
        f"""
        SELECT bi.rank, bi.selection_reason, {_ARTICLE_COLUMNS}
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        WHERE b.brief_date = ?
        ORDER BY bi.rank
        LIMIT ?
        """,
        (wanted, limit),
    ).fetchall()
    items = [dict(entry) for entry in item_rows]
    return {"brief_date": meta["brief_date"], "title": meta["title"], "items": items}


def category_counts(conn: sqlite3.Connection, accepted_only: bool = True) -> list[dict]:
    """Return per topic/flavor article counts for building browse UIs.

    Uses the same row set as the feed query in this module: scores joined to
    articles and to sources, with duplicates excluded. The sources join matters
    because the feed inner-joins sources, so an article whose ``source_id`` has
    no matching source row never appears there and must not be counted here.

    Args:
        conn: Open database connection. Rows are converted with ``dict(row)``,
            so the connection presumably uses ``row_factory = sqlite3.Row``.
        accepted_only: When true, count only rows with ``s.accepted = 1``;
            otherwise count every row that has a non-NULL topic.

    Returns:
        Dicts with keys ``topic``, ``flavor`` and ``count``, sorted by topic
        and then flavor.
    """
    status_clause = "s.accepted = 1" if accepted_only else "s.topic IS NOT NULL"
    rows = conn.execute(
        f"""
        SELECT s.topic, s.flavor, COUNT(*) AS count
        FROM article_scores s
        JOIN articles a ON a.id = s.article_id
        JOIN sources src ON src.id = a.source_id
        WHERE a.duplicate_of IS NULL AND {status_clause}
        GROUP BY s.topic, s.flavor
        ORDER BY s.topic, s.flavor
        """
    ).fetchall()
    return [dict(row) for row in rows]


def available_dates(conn: sqlite3.Connection, limit: int = 30) -> list[str]:
    """List the stored brief dates, newest first, capped at ``limit`` entries."""
    query = "SELECT brief_date FROM daily_briefs ORDER BY brief_date DESC LIMIT ?"
    cursor = conn.execute(query, (limit,))
    dates = []
    for record in cursor.fetchall():
        dates.append(record["brief_date"])
    return dates


def _latest_brief_date(conn: sqlite3.Connection) -> str | None:
    """Return the greatest stored ``brief_date``, or None when no brief exists."""
    newest = conn.execute(
        "SELECT brief_date FROM daily_briefs ORDER BY brief_date DESC LIMIT 1"
    ).fetchone()
    if not newest:
        return None
    return newest["brief_date"]