upbeatBytes/goodnews/api.py

"""FastAPI service for goodNews.

A read-only JSON API over the ingestion database, plus a small static site that
consumes it. The same endpoints back both the website and any future companion
app; the auto-generated OpenAPI docs at /docs are that shared contract.

Run with the bundled CLI:  goodnews serve
Or directly:               uvicorn goodnews.api:app --host 0.0.0.0 --port 8000

The database path comes from GOODNEWS_DB (falling back to the repo's data dir),
so the API and CLI always read the same file.
"""

from __future__ import annotations

import os
import re
import sqlite3
from collections import Counter
from contextlib import contextmanager
from datetime import datetime, timezone
from pathlib import Path

from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel

from . import feeds, queries
from .db import connect, init_db
from .filters import filter_articles, prefs_from_json
from .llm import LocalModelClient
from .taxonomy import FLAVORS, TOPICS

# Repository root: two levels above this module file.
ROOT = Path(__file__).resolve().parents[1]
# Database used when GOODNEWS_DB does not name another file.
DEFAULT_DB = ROOT.joinpath("data", "goodnews.sqlite3")
# Bundled website assets live in a "static" directory beside this module.
STATIC_DIR = Path(__file__).resolve().with_name("static")


def db_path() -> Path:
    """Return the path of the SQLite database the API should read.

    The ``GOODNEWS_DB`` environment variable wins when it is set to a
    non-empty value; otherwise ``DEFAULT_DB`` is used. An empty value is
    treated as unset, because ``Path("")`` would silently resolve to the
    current working directory rather than to a database file.

    Returns:
        The configured database path.
    """
    override = os.environ.get("GOODNEWS_DB")
    return Path(override) if override else DEFAULT_DB


@contextmanager
def get_conn():
    """Yield a database connection and close it when the block exits.

    The connection is opened with ``connect`` on the path returned by
    ``db_path()`` and is closed whether the ``with`` body finishes normally
    or raises.
    """
    connection = connect(db_path())
    try:
        yield connection
    finally:
        # Runs on both success and error paths.
        connection.close()


# --- Response models (the companion-app contract) ---------------------------


class Category(BaseModel):
    # One taxonomy entry as served by /api/categories.
    # `key` is the mapping key and `description` the mapping value of an entry
    # in TOPICS or FLAVORS.
    key: str
    description: str


class CategoriesResponse(BaseModel):
    # Response body of /api/categories: every topic and every flavor, each
    # with its description.
    topics: list[Category]
    flavors: list[Category]


class CategoryCount(BaseModel):
    # One element of the /api/category-counts response: a (topic, flavor)
    # pair and how many articles fall into it. Either label may be null.
    topic: str | None
    flavor: str | None
    count: int


class Article(BaseModel):
    # A single article as exposed by the feed and brief endpoints. Only
    # id, title, url, source and accepted are required; everything else
    # defaults to None.
    id: int
    title: str
    description: str | None = None
    url: str
    image_url: str | None = None
    published_at: str | None = None
    source: str
    topic: str | None = None
    flavor: str | None = None
    accepted: bool
    rank_score: int | None = None
    reason_code: str | None = None
    reason_text: str | None = None
    model_name: str | None = None
    rank: int | None = None  # position within a brief, when applicable

    @classmethod
    def from_row(cls, row: dict) -> "Article":
        """Build an ``Article`` from a query-result mapping.

        ``id``, ``title``, ``canonical_url`` and ``source_name`` must be
        present (a missing one raises ``KeyError``). The remaining columns
        are read with ``.get`` and fall back to ``None``; ``accepted`` is
        coerced to ``bool``.
        """
        # Required columns; two of them are renamed on the way out.
        payload = {
            "id": row["id"],
            "title": row["title"],
            "url": row["canonical_url"],
            "source": row["source_name"],
        }
        # Optional columns whose name matches the model field one-to-one.
        for column in (
            "description",
            "image_url",
            "published_at",
            "topic",
            "flavor",
            "rank_score",
            "reason_code",
            "reason_text",
            "model_name",
            "rank",
        ):
            payload[column] = row.get(column)
        # A missing or falsy value becomes False.
        payload["accepted"] = bool(row.get("accepted"))
        return cls(**payload)


class FeedResponse(BaseModel):
    # Response body of /api/feed. `topic` and `flavor` echo the filters that
    # were requested; `count` is the number of entries in `items` (this page),
    # not a total across pages.
    topic: str | None
    flavor: str | None
    count: int
    items: list[Article]


class BriefResponse(BaseModel):
    # Response body of /api/brief: the brief's date and title as returned by
    # queries.brief (either may be null), plus its articles.
    brief_date: str | None
    title: str | None
    items: list[Article]


class RejectedExample(BaseModel):
    # One entry of SourcePreview.examples_rejected: an article title paired
    # with a reason string (presumably why it was rejected — confirm against
    # feeds.preview_feed).
    title: str
    reason: str


class SourcePreview(BaseModel):
    # Response body of /api/source-preview. It is constructed directly from
    # the mapping returned by feeds.preview_feed, so these field names must
    # match that mapping's keys exactly.
    # NOTE(review): the meaning of each metric (sample size, averages, 7-day
    # window, mixes) is defined in feeds.preview_feed, not here — confirm
    # there before documenting units or ranges.
    url: str
    sampled: int
    classified: bool
    accepted: int
    acceptance_rate: float
    avg_cortisol: float
    avg_ragebait: float
    avg_pr_risk: float
    newest_published: str | None
    recent_7d: int
    topic_mix: dict[str, int]
    flavor_mix: dict[str, int]
    examples_accepted: list[str]
    examples_rejected: list[RejectedExample]


# --- App --------------------------------------------------------------------


def create_app() -> FastAPI:
    """Build the goodNews FastAPI application.

    Registers GET-only CORS for any origin, the JSON endpoints (``/healthz``
    and ``/api/*``) and, when the bundled static directory exists, the
    website mounted at ``/``.

    Returns:
        The configured application.
    """
    app = FastAPI(
        title="goodNews API",
        version="0.1.0",
        description="Constructive, uplifting news — metadata and links only.",
    )

    # The website and companion app may live on other origins; allow them.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["GET"],
        allow_headers=["*"],
    )

    @app.get("/healthz")
    def healthz() -> dict:
        """Report service status and the number of scored articles."""
        with get_conn() as conn:
            # Run init_db before counting so the query below has its table.
            init_db(conn)
            scored = conn.execute("SELECT COUNT(*) FROM article_scores").fetchone()[0]
        return {"status": "ok", "scored_articles": scored}

    @app.get("/api/categories", response_model=CategoriesResponse)
    def categories() -> CategoriesResponse:
        """List every topic and flavor with its description."""
        return CategoriesResponse(
            topics=[Category(key=k, description=v) for k, v in TOPICS.items()],
            flavors=[Category(key=k, description=v) for k, v in FLAVORS.items()],
        )

    @app.get("/api/category-counts", response_model=list[CategoryCount])
    def category_counts(accepted_only: bool = True, prefs: str | None = Query(None)) -> list[CategoryCount]:
        """Count articles per (topic, flavor) pair, honouring ``prefs`` if given."""
        fp = prefs_from_json(prefs)
        with get_conn() as conn:
            if fp.is_empty():
                rows = queries.category_counts(conn, accepted_only=accepted_only)
            else:
                # Count over the SAME filtered set the feed would return, so the
                # browse numbers always match what the user actually sees.
                allrows = queries.feed(conn, accepted_only=accepted_only, limit=100000, offset=0)
                kept = filter_articles(allrows, fp, datetime.now(timezone.utc))
                counts = Counter((r["topic"], r["flavor"]) for r in kept)
                # str() keeps the sort total when a topic or flavor is None.
                rows = [
                    {"topic": t, "flavor": f, "count": n}
                    for (t, f), n in sorted(counts.items(), key=lambda kv: (str(kv[0][0]), str(kv[0][1])))
                ]
        return [CategoryCount(**row) for row in rows]

    @app.get("/api/feed", response_model=FeedResponse)
    def feed(
        topic: str | None = Query(None),
        flavor: str | None = Query(None),
        accepted_only: bool = True,
        limit: int = Query(30, ge=1, le=100),
        offset: int = Query(0, ge=0),
        prefs: str | None = Query(None),
    ) -> FeedResponse:
        """Return one page of articles, optionally narrowed by topic, flavor and ``prefs``.

        Responds with 400 when ``topic`` or ``flavor`` is not a known key.
        """
        # Validation is case-insensitive, so query with the same lower-cased
        # key that was validated. Passing the raw value on would let e.g.
        # "Science" pass the check yet be handed to queries.feed verbatim.
        if topic:
            topic_key = topic.lower()
            if topic_key not in TOPICS:
                raise HTTPException(400, f"unknown topic: {topic}")
            topic = topic_key
        if flavor:
            flavor_key = flavor.lower()
            if flavor_key not in FLAVORS:
                raise HTTPException(400, f"unknown flavor: {flavor}")
            flavor = flavor_key
        fp = prefs_from_json(prefs)
        with get_conn() as conn:
            if fp.is_empty():
                rows = queries.feed(
                    conn, topic=topic, flavor=flavor, accepted_only=accepted_only, limit=limit, offset=offset
                )
            else:
                # Over-fetch, apply the calm filters in Python (word-boundary
                # avoid-terms can't be done in SQL), then slice to the page.
                fetch_n = min(2000, (offset + limit) * 4 + 50)
                raw = queries.feed(
                    conn, topic=topic, flavor=flavor, accepted_only=accepted_only, limit=fetch_n, offset=0
                )
                filtered = filter_articles(raw, fp, datetime.now(timezone.utc))
                rows = filtered[offset : offset + limit]
        return FeedResponse(
            topic=topic,
            flavor=flavor,
            count=len(rows),
            items=[Article.from_row(r) for r in rows],
        )

    @app.get("/api/brief", response_model=BriefResponse)
    def brief(
        date: str | None = Query(None),
        limit: int = Query(10, ge=1, le=50),
        prefs: str | None = Query(None),
    ) -> BriefResponse:
        """Return a stored brief, filtered down by ``prefs`` when given."""
        fp = prefs_from_json(prefs)
        with get_conn() as conn:
            data = queries.brief(conn, brief_date=date, limit=limit)
        items = data["items"]
        if not fp.is_empty():
            # MVP: filter the stored brief DOWN; no refill from outside the brief.
            items = filter_articles(items, fp, datetime.now(timezone.utc))
        return BriefResponse(
            brief_date=data["brief_date"],
            title=data["title"],
            items=[Article.from_row(r) for r in items],
        )

    @app.get("/api/brief-dates", response_model=list[str])
    def brief_dates(limit: int = Query(30, ge=1, le=365)) -> list[str]:
        """List the dates for which a brief is available."""
        with get_conn() as conn:
            return queries.available_dates(conn, limit=limit)

    @app.get("/api/source-preview", response_model=SourcePreview)
    def source_preview(
        url: str = Query(..., max_length=2048),
        sample: int = Query(25, ge=1, le=50),
        classify: bool = Query(False, description="Also classify with the local model (accurate but slower)"),
    ) -> SourcePreview:
        """Sample a candidate feed URL and summarise how it would score.

        Responds with 400 for a non-http(s) URL and 502 when the feed cannot
        be previewed.
        """
        # Read-only sample scoring; nothing is persisted. Only http(s) is allowed.
        # NOTE: fetching a user-supplied URL is an SSRF surface — before exposing
        # this publicly, also block private/loopback/link-local address ranges.
        if not re.match(r"^https?://", url, re.IGNORECASE):
            raise HTTPException(400, "url must start with http:// or https://")
        client = LocalModelClient.from_env() if classify else None
        try:
            data = feeds.preview_feed(url, sample=sample, client=client)
        except Exception as exc:
            # Boundary handler: any preview failure becomes a 502. Chain the
            # cause so the original traceback is kept for server-side logs.
            raise HTTPException(502, f"could not preview feed: {exc}") from exc
        return SourcePreview(**data)

    # Static site last, mounted at root, so /api/* and /healthz win.
    if STATIC_DIR.is_dir():
        app.mount("/", StaticFiles(directory=str(STATIC_DIR), html=True), name="site")

    return app


# Module-level ASGI application: the `goodnews.api:app` target used by the
# uvicorn command shown in the module docstring.
app = create_app()