"""Word of the Day — an uplifting word a day, grounded in a real dictionary.

"LLM proposes, dictionary disposes": the LLM suggests positive/calming words; each is
validated + enriched against the free Dictionary API (dictionaryapi.dev) for the REAL
definition, IPA pronunciation, example sentences, and a human pronunciation clip. That rules
out hallucinated definitions — the authoritative data is the dictionary's.

The audio clip (public-domain, usually Wiktionary) is cached to our origin; the page falls
back to the browser's speech synthesis when a word has no clip. All network/LLM work happens
before the brief DB write. Same pick lifecycle as the other small joys.
"""
from __future__ import annotations

import json
import os
import re
import sqlite3
import urllib.parse
import urllib.request
from pathlib import Path

from . import daily
from .localtime import local_today

DICT_BASE = "https://api.dictionaryapi.dev/api/v2/entries/en"
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
_TARGET_POOL = 30  # keep harvesting (a batch/day) until the pool reaches this
_HARVEST_BATCH = 12  # default number of words requested from the LLM per harvest
_MIN_AUDIO_BYTES = 500  # downloads smaller than this are not kept as a clip

# Greedy on purpose: grabs from the first "{" to the last "}" so nested objects survive.
_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.S)


def cache_dir() -> Path:
    """Return the directory that holds cached pronunciation clips, creating it if needed.

    GOODNEWS_WOTD_AUDIO overrides the location; otherwise it is a ``wotd_audio`` folder
    next to the database file named by GOODNEWS_DB (default ``data/goodnews.sqlite3``).
    """
    override = os.environ.get("GOODNEWS_WOTD_AUDIO")
    if override:
        directory = Path(override)
    else:
        db_path = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        directory = db_path.parent / "wotd_audio"
    directory.mkdir(parents=True, exist_ok=True)
    return directory


def _http_bytes(url: str, timeout: int = 30) -> tuple[bytes, str]:
    """GET ``url`` with our User-Agent; return (body, Content-Type or "").

    Errors from urllib propagate to the caller.
    """
    req = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return r.read(), (r.headers.get("Content-Type") or "")


def _extract_json_object(txt: object) -> dict | None:
    """Parse the outermost ``{...}`` span of an LLM reply.

    Returns None when the reply is not a string, holds no braces, is not valid JSON, or
    does not decode to a JSON object.
    """
    if not isinstance(txt, str):
        return None
    m = _JSON_OBJECT_RE.search(txt)
    if not m:
        return None
    try:
        data = json.loads(m.group(0))
    except ValueError:
        return None
    return data if isinstance(data, dict) else None


def _propose_words(client, n: int) -> list[dict]:
    """Ask for word + the intended part of speech, so _lookup picks the sense the LLM
    meant (e.g. 'serene' the adjective, not the archaic noun).

    Returns a list of ``{"word": <lowercased str>, "pos": <lowercased str or None>}``.
    Entries may be given by the LLM as bare strings (pos is then None) or as objects.
    An unparseable reply yields an empty list; errors raised by ``client.chat_text``
    itself propagate to the caller.
    """
    user = (
        f"Suggest {n} English vocabulary words for an uplifting 'word of the day' — positive, "
        "calming, hopeful, or quietly beautiful in meaning (e.g. serene, kindness, dawn, "
        "resilience, wonder). Real, usable words; vary common and slightly elevated. For each, "
        "give the part of speech you intend (the everyday modern sense, not an archaic one). "
        'Reply with JSON only: {"words": [{"word": "serene", "pos": "adjective"}, ...]}'
    )
    data = _extract_json_object(client.chat_text([{"role": "user", "content": user}]))
    if data is None:
        return []
    words = data.get("words")
    if not isinstance(words, list):  # e.g. {"words": null} — nothing usable
        return []
    out = []
    for w in words:
        if isinstance(w, str):
            if w.strip():
                out.append({"word": w.strip().lower(), "pos": None})
        elif isinstance(w, dict):
            # `or ""` keeps a null/missing word from turning into the literal "none".
            text = str(w.get("word") or "").strip().lower()
            if not text:
                continue
            pos = str(w["pos"]).strip().lower() if w.get("pos") else ""
            out.append({"word": text, "pos": pos or None})
    return out


def _polish(client, word: str, part_of_speech: str | None, definition: str) -> dict | None:
    """LLM polish for display: rewrite the real dictionary gloss as ONE warm plain
    sentence, and write two clear everyday example sentences.

    Grounded in the real definition (the dictionary stays the anchor). Returns
    ``{"gloss": str, "examples": [str, ...]}`` with at most two examples, or None on any
    trouble (LLM error, unparseable reply, empty gloss, no example that uses the word) so
    callers fall back to the raw dictionary data.
    """
    pos = f" ({part_of_speech})" if part_of_speech else ""
    user = (
        f'The word is "{word}"{pos}. Its dictionary definition is: "{definition}".\n'
        "1) Rewrite that definition as ONE warm, plain-language sentence that a general reader "
        "instantly understands. Stay faithful to the meaning; do not invent extra facts.\n"
        "2) Write TWO short, natural example sentences that clearly show the word used in "
        "everyday life — concrete and easy to picture, not abstract, archaic, or a proper-noun "
        f'title. Each must actually use the word "{word}".\n'
        'Reply with JSON only: {"gloss": "...", "examples": ["...", "..."]}'
    )
    try:
        txt = client.chat_text([{"role": "user", "content": user}])
    except Exception:  # noqa: BLE001 — polish is best-effort; raw dictionary data stands
        return None
    data = _extract_json_object(txt)
    if data is None:
        return None
    # Collapse runs of whitespace/newlines so the text renders as a single line.
    gloss = " ".join(str(data.get("gloss") or "").split())
    raw_examples = data.get("examples")
    if not isinstance(raw_examples, list):  # a bare string would be iterated per character
        raw_examples = []
    examples = [" ".join(str(e).split()) for e in raw_examples if str(e).strip()]
    # Enforce the contract: keep only sentences that actually use the word, and require at
    # least one. A gloss with no usable examples falls back to the raw dictionary data.
    needle = word.lower()
    examples = [e for e in examples if needle in e.lower()]
    if not gloss or not examples:
        return None
    return {"gloss": gloss, "examples": examples[:2]}


def _first_definition(meaning: dict) -> str:
    """Return the stripped text of a meaning's first definition, or "" if it has none."""
    defs = meaning.get("definitions")
    if not isinstance(defs, list) or not defs or not isinstance(defs[0], dict):
        return ""
    text = defs[0].get("definition")
    return text.strip() if isinstance(text, str) else ""


def _lookup(word: str, prefer_pos: str | None = None) -> dict | None:
    """Validate + enrich a word via the dictionary. Returns None if it's not a real word.

    When prefer_pos is given, picks the meaning of that part of speech (the sense the LLM
    meant); otherwise, or when no such sense exists, the first meaning whose first
    definition has text. Only the first entry of the API response is used.

    Returns a dict with keys ``word``, ``part_of_speech``, ``phonetic``, ``audio_url``,
    ``definition`` and ``examples`` (at most three, the chosen sense's first).
    """
    try:
        data = daily.http_json(f"{DICT_BASE}/{urllib.parse.quote(word)}")
    except Exception:  # noqa: BLE001 — unknown word / network → just skip it
        return None
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        return None
    entry = data[0]
    meanings = [mn for mn in (entry.get("meanings") or []) if isinstance(mn, dict)]
    # A sense is usable when its first definition has text. An empty FIRST sense must not
    # disqualify the whole word — a later sense may still be fine.
    usable = [mn for mn in meanings if _first_definition(mn)]
    if not usable:
        return None

    # Prefer the meaning whose part of speech matches the LLM's intent; else the first usable one.
    wanted = (prefer_pos or "").strip().lower()
    chosen = usable[0]
    if wanted:
        for mn in usable:
            if (mn.get("partOfSpeech") or "").strip().lower() == wanted:
                chosen = mn
                break
    definition = _first_definition(chosen)

    # Top-level "phonetic" wins; otherwise the first phonetics item that carries text.
    # The audio URL is the first non-empty one, independent of which item gave the text.
    phonetic = entry.get("phonetic")
    audio_url = None
    for p in (entry.get("phonetics") or []):
        if not isinstance(p, dict):
            continue
        if not phonetic and p.get("text"):
            phonetic = p["text"]
        if not audio_url and p.get("audio"):
            audio_url = p["audio"]

    examples = []
    for m in [chosen] + [mn for mn in meanings if mn is not chosen]:  # chosen sense's examples first
        for d in (m.get("definitions") or []):
            example = d.get("example") if isinstance(d, dict) else None
            if isinstance(example, str) and example.strip():
                examples.append(example.strip())

    return {
        "word": (entry.get("word") or word).strip().lower(),
        "part_of_speech": chosen.get("partOfSpeech"),
        "phonetic": phonetic,
        "audio_url": audio_url,
        "definition": definition,
        "examples": examples[:3],
    }


def _cache_audio(audio_url: str | None, word: str) -> str | None:
    """Download the pronunciation clip to our origin (atomic). Returns filename or None.

    None is returned when there is no URL, the URL is not http(s), the word yields no
    safe file name, the download fails, the payload is under _MIN_AUDIO_BYTES, or the
    file cannot be written. The file is written to a temp name and then renamed into
    place, so a partially written clip is never visible under the final name.
    """
    if not audio_url:
        return None
    if audio_url.startswith("//"):  # protocol-relative URL
        audio_url = "https:" + audio_url
    # The URL comes from a third-party API and urlopen also understands file:/ftp: —
    # only fetch over http(s).
    if not audio_url.lower().startswith(("http://", "https://")):
        return None
    # The headword also comes from the API; keep path separators and dots out of the file
    # name. Purely alphabetic words pass through unchanged.
    stem = re.sub(r"[^\w-]+", "_", word).strip("_")
    if not stem:
        return None
    try:
        data, ctype = _http_bytes(audio_url)
    except Exception:  # noqa: BLE001 — no clip is fine; the page falls back to speech synthesis
        return None
    if len(data) < _MIN_AUDIO_BYTES:
        return None
    # Look at the URL path only, so a query string cannot hide the extension.
    url_path = urllib.parse.urlsplit(audio_url).path.lower()
    ext = ".ogg" if ("ogg" in ctype.lower() or url_path.endswith(".ogg")) else ".mp3"
    fname = f"{stem}{ext}"
    cdir = cache_dir()
    tmp = cdir / f".{stem}.tmp"
    try:
        tmp.write_bytes(data)
        os.replace(tmp, cdir / fname)
    except OSError:
        try:
            tmp.unlink()
        except OSError:
            pass
        return None
    return fname


def _pool_count(conn: sqlite3.Connection) -> int:
    """Return the number of rows in wotd_pool."""
    return conn.execute("SELECT COUNT(*) FROM wotd_pool").fetchone()[0]


def harvest(conn: sqlite3.Connection, client, count: int = _HARVEST_BATCH) -> dict:
    """Propose words → validate/enrich via dictionary → cache audio → add new ones.

    All network up front; one brief write at the end.

    Returns ``{"proposed": <words the LLM returned>, "added": <rows actually inserted>,
    "pool": <pool size afterwards>}``. A failure while proposing words is reported as
    zero proposed/added rather than raised.
    """
    try:
        words = _propose_words(client, count)
    except Exception:  # noqa: BLE001 — a failed proposal just means nothing harvested today
        return {"proposed": 0, "added": 0, "pool": _pool_count(conn)}

    def in_pool(candidate: str) -> bool:
        return conn.execute("SELECT 1 FROM wotd_pool WHERE word=?", (candidate,)).fetchone() is not None

    rows = []
    seen: set[str] = set()  # words already handled in this batch
    for item in words:
        w = item["word"]
        # Skip repeats within the batch before spending any network/LLM calls on them.
        if w in seen or not w.isalpha():
            continue
        seen.add(w)
        if in_pool(w):
            continue
        info = _lookup(w, item.get("pos"))
        if not info:
            continue
        headword = info["word"]
        if headword != w:
            # The dictionary may answer under a different spelling than the one proposed;
            # that spelling is what gets stored, so it must be new as well.
            if headword in seen or in_pool(headword):
                continue
            seen.add(headword)
        audio_file = _cache_audio(info["audio_url"], headword)
        polished = _polish(client, headword, info["part_of_speech"], info["definition"])
        gloss = polished["gloss"] if polished else None
        usage = json.dumps(polished["examples"]) if polished else None
        rows.append((headword, info["part_of_speech"], info["phonetic"], audio_file, info["audio_url"],
                     info["definition"], json.dumps(info["examples"]), gloss, usage))

    before = _pool_count(conn)
    conn.executemany(
        "INSERT OR IGNORE INTO wotd_pool (source, word, part_of_speech, phonetic, audio_file, audio_url, definition, examples, gloss, usage) "
        "VALUES ('llm', ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        rows,
    )
    conn.commit()
    after = _pool_count(conn)
    # Counted from the table rather than len(rows) because INSERT OR IGNORE may skip rows.
    return {"proposed": len(words), "added": after - before, "pool": after}


def _candidates(conn: sqlite3.Connection, avoid: int | None = None) -> list[int]:
    """Return the pool ids eligible to be picked.

    Unblocked rows flagged ``featured`` take priority; otherwise the ids chosen by
    ``daily.freshest`` from all unblocked rows. ``avoid`` removes one id unless that
    would leave nothing to pick.
    """
    featured = conn.execute("SELECT id FROM wotd_pool WHERE blocked=0 AND featured=1 ORDER BY id").fetchall()
    if featured:
        ids = [r[0] for r in featured]
    else:
        # The freshest cohort only (never-shown, else the oldest-shown group) — picking
        # across the whole pool is what re-fed recent words day to day.
        rows = conn.execute("SELECT id, shown_at FROM wotd_pool WHERE blocked=0").fetchall()
        ids = daily.freshest(rows)
    if avoid is not None:
        ids = [i for i in ids if i != avoid] or ids
    return ids


def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False,
               avoid: int | None = None, client=None) -> dict | None:
    """Pick (or return the already-picked) word for ``feature_date``.

    ``feature_date`` defaults to ``local_today()``. An existing pick is returned as-is
    unless ``force`` is set, in which case the day's row is overwritten. ``avoid`` is a
    pool id to steer away from. When the chosen pool word has no gloss and a ``client``
    is given, it is polished now and the result is saved back to the pool.

    Returns the daily_wotd row as a dict, or None when the pool has no candidates.
    Rows are read by column name, so the connection is presumably configured with
    ``sqlite3.Row`` as its row factory — confirm at the call site.
    """
    feature_date = feature_date or local_today()
    existing = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (feature_date,)).fetchone()
    if existing and not force:
        return dict(existing)
    ids = _candidates(conn, avoid)
    if not ids:
        return None
    pick_id = daily.seeded_order(ids, feature_date)[0]
    row = conn.execute("SELECT * FROM wotd_pool WHERE id=?", (pick_id,)).fetchone()
    gloss, usage = row["gloss"], row["usage"]
    if not gloss and client:  # lazy polish for older pool words; cached back
        polished = _polish(client, row["word"], row["part_of_speech"], row["definition"])
        if polished:
            gloss, usage = polished["gloss"], json.dumps(polished["examples"])
            conn.execute("UPDATE wotd_pool SET gloss=?, usage=? WHERE id=?", (gloss, usage, pick_id))
    conn.execute(
        "INSERT INTO daily_wotd (feature_date, pool_id, word, part_of_speech, phonetic, audio_file, definition, examples, gloss, usage) "
        "VALUES (?,?,?,?,?,?,?,?,?,?) "
        "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, word=excluded.word, "
        "part_of_speech=excluded.part_of_speech, phonetic=excluded.phonetic, audio_file=excluded.audio_file, "
        "definition=excluded.definition, examples=excluded.examples, gloss=excluded.gloss, usage=excluded.usage",
        (feature_date, row["id"], row["word"], row["part_of_speech"], row["phonetic"], row["audio_file"],
         row["definition"], row["examples"], gloss, usage),
    )
    # Stamp the pool row with the date it was shown.
    conn.execute("UPDATE wotd_pool SET shown_at=? WHERE id=?", (feature_date, pick_id))
    conn.commit()
    return dict(conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (feature_date,)).fetchone())


def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Return the pick for ``feature_date``, falling back to the most recent pick.

    The fallback also applies when ``feature_date`` is given but has no row. Returns
    None only when daily_wotd is empty.
    """
    if feature_date:
        row = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (feature_date,)).fetchone()
        if row:
            return dict(row)
    row = conn.execute("SELECT * FROM daily_wotd ORDER BY feature_date DESC LIMIT 1").fetchone()
    return dict(row) if row else None


def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Top the pool up toward _TARGET_POOL (a batch a day), then pick today's word.

    Harvesting is skipped when no ``client`` is given or the pool is already at target.
    Returns ``{"pool": <size>, "harvested": <harvest() result or None>,
    "picked": <today's word or None>}``.
    """
    harvested = None
    if client and _pool_count(conn) < _TARGET_POOL:
        harvested = harvest(conn, client)
    picked = pick_daily(conn, client=client)
    return {"pool": _pool_count(conn), "harvested": harvested, "picked": (picked or {}).get("word")}