# upbeatBytes/goodnews/briefs.py

from __future__ import annotations

import sqlite3
from datetime import date

from .paywall import is_paywalled


def build_daily_brief(
    conn: sqlite3.Connection,
    brief_date: str | None = None,
    limit: int = 5,
    replace: bool = False,
    window_days: int = 3,
) -> int:
    """Build (or rebuild) the daily brief for one date and return its id.

    Args:
        conn: Open SQLite connection. Rows are read by column name
            (``row["id"]``), so the connection needs a name-addressable
            row factory such as ``sqlite3.Row``.
        brief_date: Date string identifying the brief; defaults to today's
            local date in ISO format.
        limit: Maximum number of articles placed in the brief.
        replace: When a brief already exists for the date, rebuild it
            instead of returning the existing id.
        window_days: Forwarded to the candidate query to bound how far back
            articles may be dated.

    Returns:
        The id of the existing brief (when present and ``replace`` is false)
        or of the newly written brief.

    The rebuild is committed on success. If any step fails, the pending
    changes are rolled back and the exception is re-raised, so a failed
    rebuild never leaves the day's brief deleted or half-written.
    """
    target_date = brief_date or date.today().isoformat()
    existing = conn.execute("SELECT id FROM daily_briefs WHERE brief_date = ?", (target_date,)).fetchone()
    if existing and not replace:
        return int(existing["id"])

    stale_brief_id = existing["id"] if existing else None
    try:
        brief_id = _write_brief(conn, target_date, stale_brief_id, limit, window_days)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    return brief_id


def _write_brief(
    conn: sqlite3.Connection,
    target_date: str,
    stale_brief_id: int | None,
    limit: int,
    window_days: int,
) -> int:
    """Write the brief row and its ranked items; the caller commits or rolls back."""
    if stale_brief_id is not None:
        # Remove the old items explicitly: SQLite only cascades deletes when
        # foreign-key enforcement is switched on, and a reused rowid would
        # otherwise attach the stale items to the brief inserted below.
        conn.execute("DELETE FROM daily_brief_items WHERE brief_id = ?", (stale_brief_id,))
        conn.execute("DELETE FROM daily_briefs WHERE id = ?", (stale_brief_id,))

    # NOTE(review): the title says "Five" regardless of `limit` — confirm
    # whether it should follow the requested size.
    brief_id = conn.execute(
        "INSERT INTO daily_briefs (brief_date, title) VALUES (?, ?)",
        (target_date, f"Five Good Things Today - {target_date}"),
    ).lastrowid

    rows = _candidate_articles(conn, target_date, window_days)
    # A calm daily brief shouldn't repeatedly hand the reader a locked door:
    # push paywalled candidates below readable ones (stable, so composite order
    # is preserved within each group) before selecting the items.
    rows = sorted(rows, key=lambda r: is_paywalled(r["canonical_url"]))
    selected = _select_diverse(rows, limit)
    for index, row in enumerate(selected, start=1):
        conn.execute(
            """
            INSERT INTO daily_brief_items (brief_id, article_id, rank, selection_reason)
            VALUES (?, ?, ?, ?)
            """,
            (
                brief_id,
                row["id"],
                index,
                _selection_reason(row),
            ),
        )
    return int(brief_id)


def show_brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int = 10) -> list[sqlite3.Row]:
    """Return the ranked items of one brief, joined with article details.

    Args:
        conn: Open SQLite connection.
        brief_date: Date of the brief to show; when omitted, the most recent
            stored brief date is used.
        limit: Maximum number of items returned.

    Returns:
        Item rows ordered by rank, or an empty list when no brief date can be
        resolved. Score columns come from a LEFT JOIN, so they are NULL for
        articles without a score row.
    """
    query = """
        SELECT
            b.brief_date,
            bi.rank,
            bi.selection_reason,
            a.title,
            a.description,
            a.canonical_url,
            a.published_at,
            src.name AS source_name,
            src.default_category,
            s.constructive_score,
            s.cortisol_score,
            s.ragebait_score,
            s.agency_score,
            s.human_benefit_score,
            s.reason_code,
            s.reason_text,
            s.model_name
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        WHERE b.brief_date = ?
        ORDER BY bi.rank
        LIMIT ?
        """
    # Only fall back to the latest stored date when the caller gave none.
    wanted_date = brief_date if brief_date else _latest_brief_date(conn)
    if wanted_date:
        cursor = conn.execute(query, (wanted_date, limit))
        return cursor.fetchall()
    return []


def _candidate_articles(
    conn: sqlite3.Connection, target_date: str, window_days: int = 3
) -> list[sqlite3.Row]:
    """Brief candidates, sparse-day-proof.

    Prefers articles dated on target_date, but widens to the preceding
    `window_days` so the brief still fills on slow news days. Anything already
    featured in a brief within the last 7 days (other than this same date, which
    is being rebuilt) is excluded so backfilled stories cannot linger across
    consecutive days.
    """
    return conn.execute(
        """
        SELECT
            a.id,
            a.title,
            a.description,
            a.canonical_url,
            a.published_at,
            a.discovered_at,
            src.name AS source_name,
            src.default_category,
            src.trust_score,
            s.constructive_score,
            s.cortisol_score,
            s.ragebait_score,
            s.agency_score,
            s.human_benefit_score,
            s.novelty_score,
            s.pr_risk_score,
            s.reason_code,
            s.reason_text,
            s.model_name,
            CASE WHEN date(COALESCE(a.published_at, a.discovered_at)) = date(?)
                 THEN 1 ELSE 0 END AS is_today
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1
          AND a.duplicate_of IS NULL
          AND date(COALESCE(a.published_at, a.discovered_at)) <= date(?)
          AND date(COALESCE(a.published_at, a.discovered_at)) > date(?, '-' || ? || ' days')
          AND a.id NOT IN (
              SELECT bi.article_id
              FROM daily_brief_items bi
              JOIN daily_briefs b ON b.id = bi.brief_id
              WHERE b.brief_date <> ?
                AND b.brief_date <= date(?)
                AND b.brief_date > date(?, '-7 days')
          )
        ORDER BY
            is_today DESC,
            (s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
             - s.cortisol_score - s.ragebait_score - s.pr_risk_score) DESC,
            COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT 50
        """,
        (target_date, target_date, target_date, window_days, target_date, target_date, target_date),
    ).fetchall()


def _select_diverse(rows: list[sqlite3.Row], limit: int) -> list[sqlite3.Row]:
    """Pick up to `limit` items from `rows` (already ranked best-first).

    Contract:
    1. Prefer higher-ranked items.
    2. Source diversity: take at most one item per source while other sources
       remain; only repeat a source once distinct sources are exhausted.
    3. Category diversity: if the result ended up single-category and a different
       category is available in the pool, swap in the highest-ranked off-category
       candidate by evicting the lowest-ranked currently-selected item (so we
       gain breadth without dropping a higher-ranked pick).
    """
    selected: list[sqlite3.Row] = []
    seen_sources: set = set()

    # Pass 1: best-first, one per source.
    for row in rows:
        if len(selected) >= limit:
            break
        if row["source_name"] in seen_sources:
            continue
        selected.append(row)
        seen_sources.add(row["source_name"])

    # Pass 2: if short on distinct sources, backfill best-first regardless.
    if len(selected) < limit:
        selected_ids = {row["id"] for row in selected}
        for row in rows:
            if len(selected) >= limit:
                break
            if row["id"] in selected_ids:
                continue
            selected.append(row)
            selected_ids.add(row["id"])

    # Pass 3: ensure >= 2 categories when the pool allows it.
    categories = {row["default_category"] for row in selected}
    if len(categories) < 2:
        selected_ids = {row["id"] for row in selected}
        for row in rows:
            if row["id"] in selected_ids:
                continue
            if row["default_category"] not in categories:
                selected[-1] = row  # evict the lowest-ranked selected item
                break

    return selected


def _selection_reason(row: sqlite3.Row) -> str:
    return (
        f"{row['reason_code']}; constructive={row['constructive_score']}, "
        f"agency={row['agency_score']}, human_benefit={row['human_benefit_score']}, "
        f"cortisol={row['cortisol_score']}, source={row['source_name']}"
    )


def _latest_brief_date(conn: sqlite3.Connection) -> str | None:
    row = conn.execute("SELECT brief_date FROM daily_briefs ORDER BY brief_date DESC LIMIT 1").fetchone()
    return row["brief_date"] if row else None