# upbeatBytes/goodnews/wotd.py

"""Word of the Day — an uplifting word a day, grounded in a real dictionary.

"LLM proposes, dictionary disposes": the LLM suggests positive/calming words; each is
validated + enriched against the free Dictionary API (dictionaryapi.dev) for the REAL
definition, IPA pronunciation, example sentences, and a human pronunciation clip. That
rules out hallucinated definitions — the authoritative data is the dictionary's. The
audio clip (public-domain, usually Wiktionary) is cached to our origin; the page falls
back to the browser's speech synthesis when a word has no clip.

All network/LLM work happens before the brief DB write. Same pick lifecycle as the
other small joys.
"""
from __future__ import annotations

import json
import os
import re
import sqlite3
import urllib.parse
import urllib.request
from pathlib import Path

from . import daily
from .localtime import local_today

# Base URL of the free Dictionary API; _lookup appends the URL-quoted word to it.
DICT_BASE = "https://api.dictionaryapi.dev/api/v2/entries/en"
# Request headers _http_bytes sends with every download.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
# Row limit for _candidates when no word is featured (never-shown first, then oldest-shown).
_NO_REPEAT_POOL = 60
_TARGET_POOL = 30          # keep harvesting (a batch/day) until the pool reaches this
# Default number of words harvest() asks the LLM to propose per run.
_HARVEST_BATCH = 12
# _cache_audio discards downloads shorter than this many bytes.
_MIN_AUDIO_BYTES = 500


def cache_dir() -> Path:
    """Return the directory that holds cached pronunciation clips, creating it if needed.

    A non-empty ``GOODNEWS_WOTD_AUDIO`` names the directory outright. Otherwise it is a
    ``wotd_audio`` folder next to the database file (``GOODNEWS_DB``, defaulting to
    ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_WOTD_AUDIO")
    if explicit:
        target = Path(explicit)
    else:
        db_path = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_path.parent / "wotd_audio"
    target.mkdir(parents=True, exist_ok=True)
    return target


def _http_bytes(url: str, timeout: int = 30) -> tuple[bytes, str]:
    """Fetch ``url`` with our request headers and return ``(body, content_type)``.

    ``content_type`` is the response's ``Content-Type`` header, or ``""`` when the header is
    absent. Errors raised by ``urllib`` are not caught here.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
        content_type = response.headers.get("Content-Type") or ""
    return body, content_type


def _propose_words(client, n: int) -> list[dict]:
    """Ask for word + the intended part of speech, so _lookup picks the sense the LLM meant
    (e.g. 'serene' the adjective, not the archaic noun).

    Returns a list of ``{"word": <lowercased str>, "pos": <lowercased str or None>}`` in the
    order proposed, each word at most once. Entries may be bare strings (no part of speech)
    or objects with "word" / "pos" keys; anything whose word is not a non-blank string is
    dropped. Returns ``[]`` when the reply holds no JSON object, the JSON is malformed, or
    "words" is not a list. Errors raised by ``client.chat_text`` propagate to the caller.
    """
    user = (
        f"Suggest {n} English vocabulary words for an uplifting 'word of the day' — positive, "
        "calming, hopeful, or quietly beautiful in meaning (e.g. serene, kindness, dawn, "
        "resilience, wonder). Real, usable words; vary common and slightly elevated. For each, "
        "give the part of speech you intend (the everyday modern sense, not an archaic one). "
        'Reply with JSON only: {"words": [{"word": "serene", "pos": "adjective"}, ...]}'
    )
    txt = client.chat_text([{"role": "user", "content": user}])
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return []
    try:
        payload = json.loads(m.group(0))
    except ValueError:  # malformed JSON is "no proposals", same as no JSON at all
        return []
    words = payload.get("words") if isinstance(payload, dict) else None
    if not isinstance(words, list):  # e.g. a bare string would otherwise be iterated per character
        return []
    out = []
    seen = set()
    for item in words:
        if isinstance(item, dict):
            raw_word, raw_pos = item.get("word"), item.get("pos")
        else:
            raw_word, raw_pos = item, None
        # Only genuine strings count: str(None) would otherwise smuggle in the word "none".
        if not isinstance(raw_word, str):
            continue
        word = raw_word.strip().lower()
        if not word or word in seen:
            continue
        seen.add(word)
        pos = raw_pos.strip().lower() if isinstance(raw_pos, str) else ""
        out.append({"word": word, "pos": pos or None})
    return out


def _polish(client, word: str, part_of_speech: str | None, definition: str) -> dict | None:
    """LLM polish for display: rewrite the real dictionary gloss as ONE warm plain sentence,
    and write two clear everyday example sentences. Grounded in the real definition (the
    dictionary stays the anchor); returns None on any trouble so callers fall back to raw.

    On success returns ``{"gloss": <str>, "examples": [<str>, ...]}`` with whitespace runs
    collapsed and at most two examples. "Trouble" covers: the LLM call raising, a non-text
    reply, no JSON object in the reply, malformed JSON, or a missing/blank/non-string gloss.
    Example entries that are not non-blank strings are dropped rather than stringified.
    """
    pos = f" ({part_of_speech})" if part_of_speech else ""
    user = (
        f'The word is "{word}"{pos}. Its dictionary definition is: "{definition}".\n'
        "1) Rewrite that definition as ONE warm, plain-language sentence that a general reader "
        "instantly understands. Stay faithful to the meaning; do not invent extra facts.\n"
        "2) Write TWO short, natural example sentences that clearly show the word used in "
        "everyday life — concrete and easy to picture, not abstract, archaic, or a proper-noun "
        f'title. Each must actually use the word "{word}".\n'
        'Reply with JSON only: {"gloss": "...", "examples": ["...", "..."]}'
    )
    try:
        txt = client.chat_text([{"role": "user", "content": user}])
    except Exception:  # noqa: BLE001 — polish is best-effort; raw dictionary data stands
        return None
    if not isinstance(txt, str):  # re.search would raise on a non-text reply
        return None
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return None
    try:
        data = json.loads(m.group(0))
    except ValueError:
        return None
    if not isinstance(data, dict):
        return None
    raw_gloss = data.get("gloss")
    gloss = " ".join(raw_gloss.split()) if isinstance(raw_gloss, str) else ""
    if not gloss:
        return None
    raw_examples = data.get("examples")
    if isinstance(raw_examples, str):      # a lone sentence instead of a list
        raw_examples = [raw_examples]
    elif not isinstance(raw_examples, list):
        raw_examples = []                  # null / number / object → no examples, keep the gloss
    # Keep real sentences only; str(None) would otherwise surface as the example "None".
    examples = [" ".join(e.split()) for e in raw_examples if isinstance(e, str) and e.strip()]
    return {"gloss": gloss, "examples": examples[:2]}


def _lookup(word: str, prefer_pos: str | None = None) -> dict | None:
    """Check a word against the dictionary and collect what the page displays.

    Only the first entry of the response is used. The chosen sense is the first meaning whose
    part of speech equals ``prefer_pos`` (when given) and whose leading definition is
    non-blank; failing that, the first meaning with a non-blank leading definition.

    Returns None when the request fails, the response is not a non-empty list, the entry's
    first meaning carries no definitions, or no meaning is usable. Otherwise returns a dict
    with ``word``, ``part_of_speech``, ``phonetic``, ``audio_url``, ``definition`` and up to
    three ``examples`` (the chosen sense's examples first).
    """
    try:
        payload = daily.http_json(f"{DICT_BASE}/{urllib.parse.quote(word)}")
    except Exception:  # noqa: BLE001 — unknown word / network → just skip it
        return None
    if not (isinstance(payload, list) and payload):
        return None
    entry = payload[0]
    meanings = entry.get("meanings") or []
    if not meanings or not (meanings[0].get("definitions") or []):
        return None

    def leading_definition(meaning) -> str:
        # Stripped text of the meaning's first definition, or "" when there is none.
        definitions = meaning.get("definitions") or []
        if not definitions:
            return ""
        return (definitions[0].get("definition") or "").strip()

    def matches_pos(meaning) -> bool:
        return (meaning.get("partOfSpeech") or "").strip().lower() == prefer_pos

    # The LLM's intended part of speech wins; otherwise take the first usable sense.
    chosen = None
    if prefer_pos:
        chosen = next((mn for mn in meanings if matches_pos(mn) and leading_definition(mn)), None)
    if chosen is None:
        chosen = next((mn for mn in meanings if leading_definition(mn)), None)
    if chosen is None:
        return None
    definition = leading_definition(chosen)

    # The entry-level phonetic is preferred; the phonetics list fills in what is missing.
    phonetic = entry.get("phonetic")
    phonetics = entry.get("phonetics") or []
    if not phonetic:
        phonetic = next((p["text"] for p in phonetics if p.get("text")), phonetic)
    audio_url = next((p["audio"] for p in phonetics if p.get("audio")), None)

    ordered = [chosen, *(mn for mn in meanings if mn is not chosen)]   # chosen sense's examples first
    examples = [
        d["example"].strip()
        for mn in ordered
        for d in (mn.get("definitions") or [])
        if d.get("example")
    ]
    return {
        "word": (entry.get("word") or word).strip().lower(),
        "part_of_speech": chosen.get("partOfSpeech"),
        "phonetic": phonetic,
        "audio_url": audio_url,
        "definition": definition,
        "examples": examples[:3],
    }


def _cache_audio(audio_url: str | None, word: str) -> str | None:
    """Download the pronunciation clip to our origin (atomic). Returns filename or None.

    ``audio_url`` may be protocol-relative (``//host/...``); it is fetched over https. The
    clip is stored as ``<word>.ogg`` or ``<word>.mp3`` inside ``cache_dir()``, written to a
    temporary file first and then moved into place.

    Returns None (the caller simply has no clip) when there is no URL, the URL is not
    http(s), ``word`` is not safe to use as a file name, the download fails or is shorter
    than ``_MIN_AUDIO_BYTES``, or the file cannot be written.
    """
    if not audio_url:
        return None
    if audio_url.startswith("//"):
        audio_url = "https:" + audio_url
    # urlopen also understands file:// and ftp://; only web URLs belong in a public cache.
    if not audio_url.lower().startswith(("http://", "https://")):
        return None
    # `word` comes from the dictionary response and becomes a file name: letters, digits,
    # underscore, apostrophe and hyphen only, so it can never contain a path separator or dot.
    if not isinstance(word, str) or not re.fullmatch(r"[\w'-]+", word):
        return None
    try:
        data, ctype = _http_bytes(audio_url)
    except Exception:  # noqa: BLE001 — a missing clip is fine; the page falls back to speech synthesis
        return None
    if len(data) < _MIN_AUDIO_BYTES:
        return None
    # Judge the extension on the URL without its query/fragment, case-insensitively.
    bare_url = audio_url.split("?", 1)[0].split("#", 1)[0].lower()
    ext = ".ogg" if ("ogg" in ctype.lower() or bare_url.endswith(".ogg")) else ".mp3"
    fname = f"{word}{ext}"
    try:
        cdir = cache_dir()
    except OSError:  # cache directory cannot be created → no clip rather than a crash
        return None
    tmp = cdir / f".{word}.tmp"
    try:
        tmp.write_bytes(data)
        os.replace(tmp, cdir / fname)
    except OSError:
        try:
            tmp.unlink()
        except OSError:
            pass
        return None
    return fname


def _pool_count(conn: sqlite3.Connection) -> int:
    """Return the number of rows currently in ``wotd_pool``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM wotd_pool").fetchone()
    return total


def harvest(conn: sqlite3.Connection, client, count: int = _HARVEST_BATCH) -> dict:
    """Propose words → validate/enrich via dictionary → cache audio → add new ones.
    All network up front; one brief write at the end.

    A proposal is skipped when it is not purely alphabetic, was already handled earlier in
    this batch, is already in the pool, or is unknown to the dictionary. The same checks are
    applied to the dictionary's own spelling of the word (the one that gets stored), so a
    word is never looked up, downloaded or polished twice in one run.

    Returns ``{"proposed": <words the LLM offered>, "added": <rows actually inserted>,
    "pool": <pool size afterwards>}``. If proposing fails, nothing is written and
    ``proposed``/``added`` are 0.
    """
    try:
        words = _propose_words(client, count)
    except Exception:  # noqa: BLE001
        return {"proposed": 0, "added": 0, "pool": _pool_count(conn)}

    def in_pool(word: str) -> bool:
        return conn.execute("SELECT 1 FROM wotd_pool WHERE word=?", (word,)).fetchone() is not None

    rows = []
    handled = set()   # every spelling already dealt with in this batch
    for item in words:
        w = item["word"]
        if w in handled or not w.isalpha() or in_pool(w):
            continue
        handled.add(w)
        info = _lookup(w, item.get("pos"))
        if not info:
            continue
        stored_word = info["word"]
        if stored_word != w:
            # The dictionary's headword may differ from the proposal; it is what gets inserted,
            # so it must be new as well before spending a download and an LLM call on it.
            if stored_word in handled or in_pool(stored_word):
                continue
            handled.add(stored_word)
        audio_file = _cache_audio(info["audio_url"], stored_word)
        polished = _polish(client, stored_word, info["part_of_speech"], info["definition"])
        gloss = polished["gloss"] if polished else None
        usage = json.dumps(polished["examples"]) if polished else None
        rows.append((stored_word, info["part_of_speech"], info["phonetic"], audio_file,
                     info["audio_url"], info["definition"], json.dumps(info["examples"]), gloss, usage))
    before = _pool_count(conn)
    conn.executemany(
        "INSERT OR IGNORE INTO wotd_pool (source, word, part_of_speech, phonetic, audio_file, audio_url, definition, examples, gloss, usage) "
        "VALUES ('llm', ?, ?, ?, ?, ?, ?, ?, ?, ?)", rows,
    )
    conn.commit()
    after = _pool_count(conn)
    # Count by pool size: INSERT OR IGNORE may drop rows, so len(rows) could overstate it.
    return {"proposed": len(words), "added": after - before, "pool": after}


def _candidates(conn: sqlite3.Connection, avoid: int | None = None) -> list[int]:
    """Return the pool ids eligible to become the daily pick.

    Unblocked featured rows take priority (ordered by id). When there are none, the list is
    the first ``_NO_REPEAT_POOL`` unblocked rows, never-shown first and then by oldest
    ``shown_at``. ``avoid`` removes that id from the list unless doing so would empty it.
    """
    ids = [
        row[0]
        for row in conn.execute("SELECT id FROM wotd_pool WHERE blocked=0 AND featured=1 ORDER BY id")
    ]
    if not ids:
        cursor = conn.execute(
            "SELECT id FROM wotd_pool WHERE blocked=0 ORDER BY shown_at IS NOT NULL, shown_at, id LIMIT ?",
            (_NO_REPEAT_POOL,),
        )
        ids = [row[0] for row in cursor]
    if avoid is None:
        return ids
    remaining = [pool_id for pool_id in ids if pool_id != avoid]
    # Better to repeat the avoided word than to have nothing to show.
    return remaining if remaining else ids


def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False,
               avoid: int | None = None, client=None) -> dict | None:
    """Return the daily word for ``feature_date``, choosing and storing one when needed.

    ``feature_date`` defaults to ``local_today()``. An existing ``daily_wotd`` row for that
    date is returned as-is unless ``force`` is set. Otherwise the pick is the first id of
    ``daily.seeded_order(candidates, feature_date)``, where the candidates come from
    ``_candidates(conn, avoid)``; None is returned when there are no candidates.

    If the picked pool word has no gloss and a ``client`` is given, it is polished now and the
    result written back to the pool. The pick is upserted into ``daily_wotd``, the pool row's
    ``shown_at`` is set to ``feature_date``, and the transaction is committed.

    Rows are read by column name, so ``conn`` needs a row factory that supports that
    (e.g. ``sqlite3.Row``).
    """
    feature_date = feature_date or local_today()
    by_date_sql = "SELECT * FROM daily_wotd WHERE feature_date=?"
    current = conn.execute(by_date_sql, (feature_date,)).fetchone()
    if current and not force:
        return dict(current)

    pool_ids = _candidates(conn, avoid)
    if not pool_ids:
        return None
    pick_id = daily.seeded_order(pool_ids, feature_date)[0]
    word_row = conn.execute("SELECT * FROM wotd_pool WHERE id=?", (pick_id,)).fetchone()

    gloss = word_row["gloss"]
    usage = word_row["usage"]
    if not gloss and client:                         # lazy polish for older pool words; cached back
        polished = _polish(client, word_row["word"], word_row["part_of_speech"], word_row["definition"])
        if polished:
            gloss = polished["gloss"]
            usage = json.dumps(polished["examples"])
            conn.execute("UPDATE wotd_pool SET gloss=?, usage=? WHERE id=?", (gloss, usage, pick_id))

    upsert_sql = (
        "INSERT INTO daily_wotd (feature_date, pool_id, word, part_of_speech, phonetic, audio_file, definition, examples, gloss, usage) "
        "VALUES (?,?,?,?,?,?,?,?,?,?) "
        "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, word=excluded.word, "
        "part_of_speech=excluded.part_of_speech, phonetic=excluded.phonetic, audio_file=excluded.audio_file, "
        "definition=excluded.definition, examples=excluded.examples, gloss=excluded.gloss, usage=excluded.usage"
    )
    upsert_args = (
        feature_date, word_row["id"], word_row["word"], word_row["part_of_speech"], word_row["phonetic"],
        word_row["audio_file"], word_row["definition"], word_row["examples"], gloss, usage,
    )
    conn.execute(upsert_sql, upsert_args)
    conn.execute("UPDATE wotd_pool SET shown_at=? WHERE id=?", (feature_date, pick_id))
    conn.commit()
    stored = conn.execute(by_date_sql, (feature_date,)).fetchone()
    return dict(stored)


def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Return the stored daily word for ``feature_date``, else the most recent one.

    When ``feature_date`` is omitted or has no row, the row with the greatest
    ``feature_date`` is returned instead. Returns None only when ``daily_wotd`` is empty.
    """
    found = None
    if feature_date:
        found = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (feature_date,)).fetchone()
    if not found:
        found = conn.execute("SELECT * FROM daily_wotd ORDER BY feature_date DESC LIMIT 1").fetchone()
    if not found:
        return None
    return dict(found)


def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Top the pool up toward _TARGET_POOL (a batch a day), then pick today's word.

    Harvesting only happens when a ``client`` is supplied and the pool is below the target.
    Returns ``{"pool": <pool size after the run>, "harvested": <harvest() result or None>,
    "picked": <today's word or None>}``.
    """
    needs_top_up = bool(client) and _pool_count(conn) < _TARGET_POOL
    harvest_report = harvest(conn, client) if needs_top_up else None
    todays = pick_daily(conn, client=client)
    picked_word = todays.get("word") if todays else None
    return {"pool": _pool_count(conn), "harvested": harvest_report, "picked": picked_word}