Files
upbeatBytes/goodnews/wotd.py
T
thejayman77 cebbed58ab WOTD #4/#5 content quality + Editorial Asymmetric /word page (CD)
Content quality ("LLM polishes, dictionary anchors"):
- New wotd._polish: rewrites the real dictionary gloss into ONE warm plain
  sentence + two clear everyday example sentences, grounded in the real
  definition (no invented meanings). Stored in new wotd_pool/daily_wotd columns
  gloss + usage, alongside the raw definition/examples which stay the anchor.
- harvest() polishes each new word; pick_daily() lazily polishes + caches back
  any older pooled word that lacks a gloss (client threaded through run_daily).
- Admin word-add polishes on insert; re-pick passes an LLM client so quote
  meaning / word gloss fill on a forced fresh pick.
- /api/word/today now prefers gloss + usage, falling back to the raw dictionary
  def/examples when polish is absent (so it's always safe).
- db._migrate adds gloss/usage to wotd_pool + daily_wotd (idempotent ALTER).

Frontend — /word redesigned to CD's "Editorial Asymmetric": faded oversized
initial bleeding off the right, vertical part-of-speech rail, big Newsreader
word, airy definition, left-ruled italic example sentences, outline Listen
button + date. (Uses our self-hosted Newsreader/Hanken stack rather than the
mockup's Google fonts; the made-up syllable respelling is omitted since we only
have real IPA.)

Tests: _polish parse/trim/cap, harvest stores gloss/usage, pick lazy-polishes
older words, admin gloss flows through to /api/word/today. 403 backend + 27 fe.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-23 06:08:14 -04:00

279 lines
12 KiB
Python

"""Word of the Day — an uplifting word a day, grounded in a real dictionary.
"LLM proposes, dictionary disposes": the LLM suggests positive/calming words; each is
validated + enriched against the free Dictionary API (dictionaryapi.dev) for the REAL
definition, IPA pronunciation, example sentences, and a human pronunciation clip. That
rules out hallucinated definitions — the authoritative data is the dictionary's. The
audio clip (public-domain, usually Wiktionary) is cached to our origin; the page falls
back to the browser's speech synthesis when a word has no clip.
All network/LLM work happens before the brief DB write. Same pick lifecycle as the
other small joys.
"""
from __future__ import annotations
import json
import os
import re
import sqlite3
import urllib.parse
import urllib.request
from pathlib import Path
from . import daily
from .localtime import local_today
# Base URL of the dictionary lookup endpoint; _lookup appends the URL-quoted word.
DICT_BASE = "https://api.dictionaryapi.dev/api/v2/entries/en"
# Request headers sent by _http_bytes when downloading pronunciation clips.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
# Max number of non-featured pool rows _candidates considers (never-shown first, then oldest shown).
_NO_REPEAT_POOL = 60
_TARGET_POOL = 30 # keep harvesting (a batch/day) until the pool reaches this
# Default number of words harvest() asks the LLM to propose per run.
_HARVEST_BATCH = 12
# Downloads smaller than this many bytes are rejected by _cache_audio.
_MIN_AUDIO_BYTES = 500
def cache_dir() -> Path:
    """Return the directory holding cached pronunciation clips, creating it if needed.

    A non-empty ``GOODNEWS_WOTD_AUDIO`` environment variable names the directory outright.
    Otherwise the clips live in a ``wotd_audio`` folder beside the database file named by
    ``GOODNEWS_DB`` (default ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_WOTD_AUDIO")
    if explicit:
        target = Path(explicit)
    else:
        db_path = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_path.parent / "wotd_audio"
    target.mkdir(parents=True, exist_ok=True)
    return target
def _http_bytes(url: str, timeout: int = 30) -> tuple[bytes, str]:
    """Fetch ``url`` with the module's User-Agent and return ``(body, content_type)``.

    ``content_type`` is an empty string when the response has no ``Content-Type`` header.
    Any error raised by ``urllib`` propagates to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
        content_type = response.headers.get("Content-Type") or ""
    return body, content_type
def _propose_words(client, n: int) -> list[dict]:
    """Ask the LLM for ``n`` uplifting words plus the part of speech it intends.

    The intended part of speech lets _lookup pick the sense the LLM meant (e.g. 'serene' the
    adjective, not the archaic noun).

    Returns a list of ``{"word": <lowercased str>, "pos": <lowercased str or None>}`` dicts in
    the order the model proposed them. Bare-string items are accepted with ``pos`` None.
    A reply with no JSON object, malformed JSON, or a non-list "words" value yields ``[]``.
    Items whose word is missing, blank, or not a string are skipped (a JSON ``null`` must not
    become the word "none"). Errors raised by ``client.chat_text`` propagate to the caller.
    """
    user = (
        f"Suggest {n} English vocabulary words for an uplifting 'word of the day' — positive, "
        "calming, hopeful, or quietly beautiful in meaning (e.g. serene, kindness, dawn, "
        "resilience, wonder). Real, usable words; vary common and slightly elevated. For each, "
        "give the part of speech you intend (the everyday modern sense, not an archaic one). "
        'Reply with JSON only: {"words": [{"word": "serene", "pos": "adjective"}, ...]}'
    )
    txt = client.chat_text([{"role": "user", "content": user}])
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return []
    try:
        payload = json.loads(m.group(0))
    except ValueError:  # the braces did not enclose valid JSON
        return []
    items = payload.get("words") if isinstance(payload, dict) else None
    if not isinstance(items, list):
        return []
    out = []
    for item in items:
        if isinstance(item, dict):
            raw_word, raw_pos = item.get("word"), item.get("pos")
        else:
            raw_word, raw_pos = item, None
        # Only genuine strings count: str(None) would otherwise smuggle in the word "none".
        if not isinstance(raw_word, str) or not raw_word.strip():
            continue
        pos = raw_pos.strip().lower() if isinstance(raw_pos, str) else ""
        out.append({"word": raw_word.strip().lower(), "pos": pos or None})
    return out
def _polish(client, word: str, part_of_speech: str | None, definition: str) -> dict | None:
    """LLM polish for display, grounded in the real dictionary definition.

    Asks the model to rewrite the dictionary gloss as ONE warm plain sentence and to write two
    clear everyday example sentences. The dictionary stays the anchor.

    Returns ``{"gloss": str, "examples": list[str]}`` with whitespace collapsed and at most two
    examples, or None on any trouble (client error, non-text reply, no/malformed JSON, missing
    gloss) so callers fall back to the raw dictionary data. A single string given for
    "examples" is treated as one example; non-string or blank examples are dropped.
    """
    pos = f" ({part_of_speech})" if part_of_speech else ""
    user = (
        f'The word is "{word}"{pos}. Its dictionary definition is: "{definition}".\n'
        "1) Rewrite that definition as ONE warm, plain-language sentence that a general reader "
        "instantly understands. Stay faithful to the meaning; do not invent extra facts.\n"
        "2) Write TWO short, natural example sentences that clearly show the word used in "
        "everyday life — concrete and easy to picture, not abstract, archaic, or a proper-noun "
        f'title. Each must actually use the word "{word}".\n'
        'Reply with JSON only: {"gloss": "...", "examples": ["...", "..."]}'
    )
    try:
        txt = client.chat_text([{"role": "user", "content": user}])
    except Exception:  # noqa: BLE001 — polish is best-effort; raw dictionary data stands
        return None
    if not isinstance(txt, str):
        return None
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return None
    try:
        data = json.loads(m.group(0))
    except ValueError:
        return None
    if not isinstance(data, dict):
        return None
    raw_gloss = data.get("gloss")
    gloss = " ".join(raw_gloss.split()) if isinstance(raw_gloss, str) else ""
    if not gloss:
        return None
    raw_examples = data.get("examples")
    if isinstance(raw_examples, str):
        # A lone string must not be iterated character by character.
        raw_examples = [raw_examples]
    elif not isinstance(raw_examples, list):
        raw_examples = []
    examples = [" ".join(e.split()) for e in raw_examples if isinstance(e, str) and e.strip()]
    return {"gloss": gloss, "examples": examples[:2]}
def _lookup(word: str, prefer_pos: str | None = None) -> dict | None:
    """Validate + enrich a word via the dictionary. Returns None if it's not a real word.

    When ``prefer_pos`` is given (compared case-insensitively), the meaning with that part of
    speech is chosen — the sense the LLM meant. Otherwise, or when no such meaning is usable,
    the first usable meaning is chosen. A meaning is usable when its first definition has
    non-blank text.

    Returns a dict with keys ``word`` (lowercased), ``part_of_speech``, ``phonetic``,
    ``audio_url``, ``definition`` and ``examples`` (at most three, the chosen sense's first).
    Returns None when the request fails, the response is not a non-empty list, or no meaning
    is usable.
    """
    # safe="" so a stray "/" in the word cannot change the request path.
    url = DICT_BASE + "/" + urllib.parse.quote(word, safe="")
    try:
        data = daily.http_json(url)
    except Exception:  # noqa: BLE001 — unknown word / network → just skip it
        return None
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        return None
    entry = data[0]
    meanings = [mn for mn in (entry.get("meanings") or []) if isinstance(mn, dict)]

    def first_definition(meaning: dict) -> str:
        """Stripped text of the meaning's first definition, or '' when it has none."""
        definitions = meaning.get("definitions") or []
        if not definitions or not isinstance(definitions[0], dict):
            return ""
        return (definitions[0].get("definition") or "").strip()

    # Consider every meaning, not just the first: a later sense may be the usable one.
    usable = [mn for mn in meanings if first_definition(mn)]
    if not usable:
        return None
    # Prefer the meaning whose part of speech matches the LLM's intent; else the first usable one.
    wanted = (prefer_pos or "").strip().lower()
    chosen = usable[0]
    if wanted:
        for mn in usable:
            if (mn.get("partOfSpeech") or "").strip().lower() == wanted:
                chosen = mn
                break
    definition = first_definition(chosen)

    # Top-level phonetic wins; otherwise take the first phonetics entry that has text/audio.
    phonetic = entry.get("phonetic")
    audio_url = None
    for p in (entry.get("phonetics") or []):
        if not isinstance(p, dict):
            continue
        if not phonetic and p.get("text"):
            phonetic = p["text"]
        if not audio_url and p.get("audio"):
            audio_url = p["audio"]

    examples = []
    for mn in [chosen] + [other for other in meanings if other is not chosen]:  # chosen sense first
        for d in (mn.get("definitions") or []):
            example = d.get("example") if isinstance(d, dict) else None
            if isinstance(example, str) and example.strip():
                examples.append(example.strip())
    return {
        "word": (entry.get("word") or word).strip().lower(),
        "part_of_speech": chosen.get("partOfSpeech"),
        "phonetic": phonetic,
        "audio_url": audio_url,
        "definition": definition,
        "examples": examples[:3],
    }
def _cache_audio(audio_url: str | None, word: str) -> str | None:
    """Download the pronunciation clip to our origin (atomic). Returns filename or None.

    The clip is saved in ``cache_dir()`` as ``<word>.ogg`` or ``<word>.mp3``. It is written to
    a temporary file first and then moved into place with ``os.replace``. Returns None when
    there is no URL, the word is not a safe filename, the download fails, the payload is
    smaller than ``_MIN_AUDIO_BYTES``, or the file cannot be written.
    """
    if not audio_url:
        return None
    # The word becomes a filename; refuse anything that could leave the cache directory.
    if not word or word.startswith(".") or any(sep in word for sep in ("/", "\\", "\0")):
        return None
    if audio_url.startswith("//"):  # protocol-relative URL
        audio_url = "https:" + audio_url
    try:
        data, ctype = _http_bytes(audio_url)
    except Exception:  # noqa: BLE001 — the clip is optional
        return None
    if len(data) < _MIN_AUDIO_BYTES:
        return None
    # Decide the extension from the content type, or from the URL path (ignoring any query).
    url_path = urllib.parse.urlsplit(audio_url).path.lower()
    ext = ".ogg" if ("ogg" in ctype.lower() or url_path.endswith(".ogg")) else ".mp3"
    fname = f"{word}{ext}"
    cdir = cache_dir()
    tmp = cdir / f".{word}.tmp"
    try:
        tmp.write_bytes(data)
        os.replace(tmp, cdir / fname)
    except OSError:
        # Best-effort cleanup of the partial temp file.
        try:
            tmp.unlink()
        except OSError:
            pass
        return None
    return fname
def _pool_count(conn: sqlite3.Connection) -> int:
    """Return the total number of rows in ``wotd_pool``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM wotd_pool").fetchone()
    return total
def harvest(conn: sqlite3.Connection, client, count: int = _HARVEST_BATCH) -> dict:
    """Propose words → validate/enrich via dictionary → cache audio → add new ones.

    All network up front; one brief write at the end. Each distinct word is processed at most
    once per call. Words already in the pool — either as proposed or as the dictionary's own
    headword — are skipped before any audio download or LLM polish is spent on them.

    Returns ``{"proposed": <words the LLM returned>, "added": <rows actually inserted>,
    "pool": <pool size afterwards>}``. If proposing fails, nothing is written and both
    "proposed" and "added" are 0.
    """
    try:
        words = _propose_words(client, count)
    except Exception:  # noqa: BLE001 — a failed proposal round just adds nothing
        return {"proposed": 0, "added": 0, "pool": _pool_count(conn)}

    def in_pool(candidate: str) -> bool:
        """True when the pool already has a row for this word."""
        found = conn.execute("SELECT 1 FROM wotd_pool WHERE word=?", (candidate,)).fetchone()
        return found is not None

    rows = []
    seen: set[str] = set()  # words already handled in this batch
    for item in words:
        w = item["word"]
        if not w.isalpha() or w in seen or in_pool(w):
            continue
        seen.add(w)
        info = _lookup(w, item.get("pos"))
        if not info:
            continue
        # The dictionary may answer with a different headword than the one proposed.
        canonical = info["word"]
        if canonical != w:
            if canonical in seen or in_pool(canonical):
                continue
            seen.add(canonical)
        audio_file = _cache_audio(info["audio_url"], canonical)
        polished = _polish(client, canonical, info["part_of_speech"], info["definition"])
        gloss = polished["gloss"] if polished else None
        usage = json.dumps(polished["examples"]) if polished else None
        rows.append((canonical, info["part_of_speech"], info["phonetic"], audio_file,
                     info["audio_url"], info["definition"], json.dumps(info["examples"]), gloss, usage))
    before = _pool_count(conn)
    conn.executemany(
        "INSERT OR IGNORE INTO wotd_pool (source, word, part_of_speech, phonetic, audio_file, audio_url, definition, examples, gloss, usage) "
        "VALUES ('llm', ?, ?, ?, ?, ?, ?, ?, ?, ?)", rows,
    )
    conn.commit()
    after = _pool_count(conn)
    # Count via pool size so rows dropped by INSERT OR IGNORE are not reported as added.
    return {"proposed": len(words), "added": after - before, "pool": after}
def _candidates(conn: sqlite3.Connection, avoid: int | None = None) -> list[int]:
    """Return the pool ids eligible to be picked.

    Unblocked featured rows take priority (ordered by id). When there are none, the result is
    up to ``_NO_REPEAT_POOL`` unblocked rows, never-shown first, then by oldest ``shown_at``,
    then id. If ``avoid`` is given it is removed from the result, unless that would leave
    nothing — then the unfiltered list is returned.
    """
    featured_rows = conn.execute(
        "SELECT id FROM wotd_pool WHERE blocked=0 AND featured=1 ORDER BY id"
    ).fetchall()
    ids = [r[0] for r in featured_rows]
    if not ids:
        fallback_rows = conn.execute(
            "SELECT id FROM wotd_pool WHERE blocked=0 ORDER BY shown_at IS NOT NULL, shown_at, id LIMIT ?",
            (_NO_REPEAT_POOL,),
        ).fetchall()
        ids = [r[0] for r in fallback_rows]
    if avoid is None:
        return ids
    remaining = [i for i in ids if i != avoid]
    return remaining if remaining else ids
def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False,
               avoid: int | None = None, client=None) -> dict | None:
    """Choose (or return the already-chosen) word for ``feature_date``.

    ``feature_date`` defaults to ``local_today()``. An existing ``daily_wotd`` row for that
    date is returned as-is unless ``force`` is set. Otherwise a pool id is taken from
    ``_candidates(conn, avoid)`` via ``daily.seeded_order`` and upserted into ``daily_wotd``,
    and the pool row's ``shown_at`` is set to the date. When the pool row has no gloss and a
    ``client`` is supplied, the word is polished and the result cached back onto the pool row.

    Returns the ``daily_wotd`` row as a dict, or None when there are no candidates.
    """
    day = feature_date or local_today()
    current = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (day,)).fetchone()
    if current and not force:
        return dict(current)

    pool_ids = _candidates(conn, avoid)
    if not pool_ids:
        return None
    chosen_id = daily.seeded_order(pool_ids, day)[0]
    word_row = conn.execute("SELECT * FROM wotd_pool WHERE id=?", (chosen_id,)).fetchone()

    gloss = word_row["gloss"]
    usage = word_row["usage"]
    if not gloss and client:  # lazy polish for older pool words; cached back
        fresh = _polish(client, word_row["word"], word_row["part_of_speech"], word_row["definition"])
        if fresh:
            gloss = fresh["gloss"]
            usage = json.dumps(fresh["examples"])
            conn.execute("UPDATE wotd_pool SET gloss=?, usage=? WHERE id=?", (gloss, usage, chosen_id))

    values = (
        day, word_row["id"], word_row["word"], word_row["part_of_speech"], word_row["phonetic"],
        word_row["audio_file"], word_row["definition"], word_row["examples"], gloss, usage,
    )
    conn.execute(
        "INSERT INTO daily_wotd (feature_date, pool_id, word, part_of_speech, phonetic, audio_file, definition, examples, gloss, usage) "
        "VALUES (?,?,?,?,?,?,?,?,?,?) "
        "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, word=excluded.word, "
        "part_of_speech=excluded.part_of_speech, phonetic=excluded.phonetic, audio_file=excluded.audio_file, "
        "definition=excluded.definition, examples=excluded.examples, gloss=excluded.gloss, usage=excluded.usage",
        values,
    )
    conn.execute("UPDATE wotd_pool SET shown_at=? WHERE id=?", (day, chosen_id))
    conn.commit()
    stored = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (day,)).fetchone()
    return dict(stored)
def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Return the ``daily_wotd`` row for ``feature_date``, else the most recent row.

    Falls back to the row with the greatest ``feature_date`` when no date is given or the date
    has no row. Returns None when the table is empty.
    """
    match = None
    if feature_date:
        match = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (feature_date,)).fetchone()
    if not match:
        match = conn.execute("SELECT * FROM daily_wotd ORDER BY feature_date DESC LIMIT 1").fetchone()
    if match:
        return dict(match)
    return None
def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Top the pool up toward _TARGET_POOL (a batch a day), then pick today's word.

    Harvesting only runs when a ``client`` is supplied and the pool is below the target.
    Returns ``{"pool": <pool size>, "harvested": <harvest() result or None>,
    "picked": <today's word or None>}``.
    """
    needs_top_up = client and _pool_count(conn) < _TARGET_POOL
    harvest_stats = harvest(conn, client) if needs_top_up else None
    todays = pick_daily(conn, client=client)
    picked_word = todays.get("word") if todays else None
    return {"pool": _pool_count(conn), "harvested": harvest_stats, "picked": picked_word}