Files
upbeatBytes/goodnews/wotd.py
T
thejayman77 0ae789752e fix: QOTD/WOTD freshness — pick within the freshest cohort, not the rotated pool
Both selectors ordered candidates least-recently-shown, then daily.seeded_order()
ROTATED the whole list and took [0] — an arbitrary date-hashed item, undoing the
ordering. Result: repeats (quote id 2 on 6/28+6/29; word "harmony" on 6/25+6/28),
no guarantee a pool item is shown before it recurs.

Fix: daily.freshest(rows) returns the freshest cohort only — every NEVER-shown
item while any remain, else the oldest-shown group. quote/wotd _candidates use it;
seeded_order now picks deterministically WITHIN that cohort. So every pool item is
featured once before any repeat, then cycles oldest-first. Dropped the unused
_NO_REPEAT_POOL window. Tests: no-repeat-until-exhausted (quote + wotd) + a
freshest() unit test. 428 backend tests green.

(Separate follow-up: expand the QOTD pool from 16 → 90+ vetted public-domain
quotes for a longer no-repeat window.)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 05:39:06 -04:00

280 lines
13 KiB
Python

"""Word of the Day — an uplifting word a day, grounded in a real dictionary.
"LLM proposes, dictionary disposes": the LLM suggests positive/calming words; each is
validated + enriched against the free Dictionary API (dictionaryapi.dev) for the REAL
definition, IPA pronunciation, example sentences, and a human pronunciation clip. That
rules out hallucinated definitions — the authoritative data is the dictionary's. The
audio clip (public-domain, usually Wiktionary) is cached to our origin; the page falls
back to the browser's speech synthesis when a word has no clip.
All network/LLM work happens before the brief DB write. Same pick lifecycle as the
other small joys.
"""
from __future__ import annotations
import json
import os
import re
import sqlite3
import urllib.parse
import urllib.request
from pathlib import Path
from . import daily
from .localtime import local_today
# Base URL of the free Dictionary API; _lookup appends the URL-quoted word to it.
DICT_BASE = "https://api.dictionaryapi.dev/api/v2/entries/en"
# Request headers sent by _http_bytes when it downloads a pronunciation clip.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
_TARGET_POOL = 30 # keep harvesting (a batch/day) until the pool reaches this
# Default number of words harvest() asks the LLM to propose per run.
_HARVEST_BATCH = 12
# _cache_audio rejects any downloaded clip smaller than this many bytes.
_MIN_AUDIO_BYTES = 500
def cache_dir() -> Path:
    """Return the directory holding cached pronunciation clips, creating it if needed.

    ``GOODNEWS_WOTD_AUDIO`` wins when set (and non-empty); otherwise the directory is
    ``wotd_audio`` beside the database file named by ``GOODNEWS_DB`` (default
    ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_WOTD_AUDIO")
    if explicit:
        target = Path(explicit)
    else:
        db_path = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_path.parent / "wotd_audio"
    target.mkdir(parents=True, exist_ok=True)
    return target
def _http_bytes(url: str, timeout: int = 30) -> tuple[bytes, str]:
    """Fetch ``url`` with the module's User-Agent header; return ``(body, content_type)``.

    ``content_type`` is ``""`` when the response has no Content-Type header. Errors
    raised by ``urlopen`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
        content_type = response.headers.get("Content-Type") or ""
    return body, content_type
def _propose_words(client, n: int) -> list[dict]:
    """Ask the LLM for ``n`` uplifting words plus the part of speech it intends.

    The part of speech lets _lookup pick the sense the LLM meant (e.g. 'serene' the
    adjective, not the archaic noun).

    Returns a list of ``{"word": <lowercased str>, "pos": <lowercased str or None>}`` in
    the order proposed, with repeated words dropped so each word is processed only once
    downstream. Returns ``[]`` when the reply contains no JSON object, the JSON is
    malformed, or "words" is not a list. Exceptions raised by ``client.chat_text``
    propagate to the caller.
    """
    user = (
        f"Suggest {n} English vocabulary words for an uplifting 'word of the day' — positive, "
        "calming, hopeful, or quietly beautiful in meaning (e.g. serene, kindness, dawn, "
        "resilience, wonder). Real, usable words; vary common and slightly elevated. For each, "
        "give the part of speech you intend (the everyday modern sense, not an archaic one). "
        'Reply with JSON only: {"words": [{"word": "serene", "pos": "adjective"}, ...]}'
    )
    txt = client.chat_text([{"role": "user", "content": user}])
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return []
    try:
        proposed = json.loads(m.group(0)).get("words", [])
    except ValueError:  # malformed JSON → nothing proposed (same stance as _polish)
        return []
    if not isinstance(proposed, list):
        return []
    out = []
    seen = set()
    for w in proposed:
        if isinstance(w, str):
            raw_word, raw_pos = w, None
        elif isinstance(w, dict):
            raw_word, raw_pos = w.get("word"), w.get("pos")
        else:
            continue
        # Only real strings count: str(None) would otherwise become the word "none".
        if not isinstance(raw_word, str):
            continue
        word = raw_word.strip().lower()
        if not word or word in seen:
            continue
        seen.add(word)
        pos = (str(raw_pos).strip().lower() or None) if raw_pos else None
        out.append({"word": word, "pos": pos})
    return out
def _polish(client, word: str, part_of_speech: str | None, definition: str) -> dict | None:
    """LLM polish for display, grounded in the real dictionary definition.

    Asks the LLM to rewrite the dictionary gloss as ONE warm plain sentence and to write
    two clear everyday example sentences that use ``word``. The dictionary stays the
    anchor: the real definition is passed in the prompt.

    Returns ``{"gloss": str, "examples": [str, ...]}`` (at most two examples, each
    containing ``word`` case-insensitively), or None on any trouble — LLM error,
    non-text reply, missing/malformed JSON, empty gloss, or no usable example — so
    callers fall back to the raw dictionary data.
    """
    pos = f" ({part_of_speech})" if part_of_speech else ""
    user = (
        f'The word is "{word}"{pos}. Its dictionary definition is: "{definition}".\n'
        "1) Rewrite that definition as ONE warm, plain-language sentence that a general reader "
        "instantly understands. Stay faithful to the meaning; do not invent extra facts.\n"
        "2) Write TWO short, natural example sentences that clearly show the word used in "
        "everyday life — concrete and easy to picture, not abstract, archaic, or a proper-noun "
        f'title. Each must actually use the word "{word}".\n'
        'Reply with JSON only: {"gloss": "...", "examples": ["...", "..."]}'
    )
    try:
        txt = client.chat_text([{"role": "user", "content": user}])
    except Exception:  # noqa: BLE001 — polish is best-effort; raw dictionary data stands
        return None
    if not isinstance(txt, str):
        return None
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return None
    try:
        data = json.loads(m.group(0))
    except ValueError:
        return None
    if not isinstance(data, dict):
        return None
    gloss = " ".join(str(data.get("gloss") or "").split())
    raw_examples = data.get("examples")
    # A non-list "examples" (number, bool, object) is a broken reply, not something to iterate.
    if not isinstance(raw_examples, list):
        return None
    needle = word.lower()
    # Enforce the contract: keep only real sentences that actually use the word, and
    # require at least one. A gloss with no usable examples falls back to the raw data.
    examples = [
        " ".join(e.split())
        for e in raw_examples
        if isinstance(e, str) and e.strip() and needle in e.lower()
    ]
    if not gloss or not examples:
        return None
    return {"gloss": gloss, "examples": examples[:2]}
def _lookup(word: str, prefer_pos: str | None = None) -> dict | None:
    """Validate + enrich a word via the dictionary API.

    Returns None when the request fails, the response is not a non-empty list, the first
    entry's first meaning has no definitions, or no meaning has a non-blank first
    definition. Otherwise returns a dict with "word", "part_of_speech", "phonetic",
    "audio_url", "definition" and up to three "examples".

    When ``prefer_pos`` is given, the first meaning with that part of speech (the sense
    the LLM meant) is chosen; otherwise, or if none matches, the first usable meaning.
    """
    try:
        payload = daily.http_json(f"{DICT_BASE}/{urllib.parse.quote(word)}")
    except Exception:  # noqa: BLE001 — unknown word / network → just skip it
        return None
    if not (isinstance(payload, list) and payload):
        return None
    entry = payload[0]
    senses = entry.get("meanings") or []
    if not senses or not (senses[0].get("definitions") or []):
        return None

    def first_gloss(sense) -> str:
        # Stripped text of the sense's first definition, or "" when it has none.
        defs = sense.get("definitions") or []
        return (defs[0].get("definition") or "").strip() if defs else ""

    # Prefer the sense whose part of speech matches the LLM's intent; else the first usable one.
    chosen = None
    if prefer_pos:
        chosen = next(
            (
                s for s in senses
                if (s.get("partOfSpeech") or "").strip().lower() == prefer_pos and first_gloss(s)
            ),
            None,
        )
    if chosen is None:
        chosen = next((s for s in senses if first_gloss(s)), None)
    if chosen is None:
        return None

    # Top-level phonetic wins; otherwise the first phonetics item with text. The audio
    # URL is the first non-empty one listed.
    phonetic = entry.get("phonetic")
    audio_url = None
    for item in entry.get("phonetics") or []:
        if not phonetic and item.get("text"):
            phonetic = item["text"]
        if not audio_url and item.get("audio"):
            audio_url = item["audio"]

    # Chosen sense's examples first, then the remaining senses in dictionary order.
    ordered = [chosen, *(s for s in senses if s is not chosen)]
    examples = [
        d["example"].strip()
        for s in ordered
        for d in (s.get("definitions") or [])
        if d.get("example")
    ]
    return {
        "word": (entry.get("word") or word).strip().lower(),
        "part_of_speech": chosen.get("partOfSpeech"),
        "phonetic": phonetic,
        "audio_url": audio_url,
        "definition": first_gloss(chosen),
        "examples": examples[:3],
    }
def _cache_audio(audio_url: str | None, word: str) -> str | None:
    """Download the pronunciation clip to our origin (atomic). Returns filename or None.

    Best-effort: returns None when there is no URL, the URL is not http(s), ``word`` is
    not usable as a plain filename, the download fails, the clip is smaller than
    ``_MIN_AUDIO_BYTES``, or the file cannot be written. The clip is written to a
    temporary file and moved into place with ``os.replace`` so a partial download is
    never served. The extension is ``.ogg`` when the content type or URL says so,
    otherwise ``.mp3``.
    """
    if not audio_url:
        return None
    if audio_url.startswith("//"):  # protocol-relative URL
        audio_url = "https:" + audio_url
    # The URL comes from a third-party API response, and urlopen also understands
    # file:/ftp:/data: URLs — only ever fetch over http(s).
    try:
        scheme = urllib.parse.urlsplit(audio_url).scheme.lower()
    except ValueError:
        return None
    if scheme not in ("http", "https"):
        return None
    # ``word`` becomes a filename: refuse anything that could leave the cache directory.
    if not word or word in (".", "..") or any(ch in word for ch in ("/", "\\", "\0")):
        return None
    try:
        data, ctype = _http_bytes(audio_url)
    except Exception:  # noqa: BLE001 — no clip is fine; the page falls back to speech synthesis
        return None
    if len(data) < _MIN_AUDIO_BYTES:
        return None
    ext = ".ogg" if ("ogg" in ctype or audio_url.endswith(".ogg")) else ".mp3"
    fname = f"{word}{ext}"
    tmp = None
    try:
        cdir = cache_dir()
        tmp = cdir / f".{word}.tmp"
        tmp.write_bytes(data)
        os.replace(tmp, cdir / fname)
    except OSError:
        # Clean up a half-written temp file, if one was created.
        if tmp is not None:
            try:
                tmp.unlink()
            except OSError:
                pass
        return None
    return fname
def _pool_count(conn: sqlite3.Connection) -> int:
    """Return the total number of rows in ``wotd_pool``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM wotd_pool").fetchone()
    return total
def harvest(conn: sqlite3.Connection, client, count: int = _HARVEST_BATCH) -> dict:
    """Propose words → validate/enrich via dictionary → cache audio → add new ones.

    All network work happens up front, per word; the pool is written once at the end.
    Proposals that are not purely alphabetic, are already in the pool, or fail the
    dictionary lookup are skipped. Polish is optional: a word whose polish fails is
    stored with NULL gloss/usage.

    Returns ``{"proposed": n, "added": n, "pool": n}``; if proposing words raises, the
    result reports zero proposed/added and the current pool size.
    """
    try:
        proposals = _propose_words(client, count)
    except Exception:  # noqa: BLE001
        return {"proposed": 0, "added": 0, "pool": _pool_count(conn)}

    def already_pooled(candidate: str) -> bool:
        hit = conn.execute("SELECT 1 FROM wotd_pool WHERE word=?", (candidate,)).fetchone()
        return hit is not None

    pending = []
    for proposal in proposals:
        candidate = proposal["word"]
        if not candidate.isalpha():
            continue
        if already_pooled(candidate):
            continue
        info = _lookup(candidate, proposal.get("pos"))
        if not info:
            continue
        clip = _cache_audio(info["audio_url"], info["word"])
        polished = _polish(client, info["word"], info["part_of_speech"], info["definition"])
        if polished:
            gloss = polished["gloss"]
            usage = json.dumps(polished["examples"])
        else:
            gloss = usage = None
        pending.append((
            info["word"],
            info["part_of_speech"],
            info["phonetic"],
            clip,
            info["audio_url"],
            info["definition"],
            json.dumps(info["examples"]),
            gloss,
            usage,
        ))

    # Count before/after so "added" reflects rows actually inserted (OR IGNORE may skip some).
    size_before = _pool_count(conn)
    conn.executemany(
        "INSERT OR IGNORE INTO wotd_pool (source, word, part_of_speech, phonetic, audio_file, audio_url, definition, examples, gloss, usage) "
        "VALUES ('llm', ?, ?, ?, ?, ?, ?, ?, ?, ?)", pending,
    )
    conn.commit()
    size_after = _pool_count(conn)
    return {"proposed": len(proposals), "added": size_after - size_before, "pool": size_after}
def _candidates(conn: sqlite3.Connection, avoid: int | None = None) -> list[int]:
    """Return the pool ids eligible for the next pick.

    Unblocked words flagged ``featured`` take priority (ordered by id). Otherwise the
    result is whatever ``daily.freshest`` returns for the unblocked ``(id, shown_at)``
    rows. ``avoid`` removes that id from the result unless doing so would leave nothing,
    in which case the unfiltered list is returned.
    """
    featured_ids = [
        r[0]
        for r in conn.execute("SELECT id FROM wotd_pool WHERE blocked=0 AND featured=1 ORDER BY id")
    ]
    if featured_ids:
        pool = featured_ids
    else:
        # The freshest cohort only (never-shown, else the oldest-shown group) — picking
        # across the whole pool is what re-fed recent words day to day.
        unblocked = conn.execute("SELECT id, shown_at FROM wotd_pool WHERE blocked=0").fetchall()
        pool = daily.freshest(unblocked)
    if avoid is None:
        return pool
    remaining = [i for i in pool if i != avoid]
    return remaining if remaining else pool
def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False,
               avoid: int | None = None, client=None) -> dict | None:
    """Pick (or return the already-picked) word for ``feature_date``.

    ``feature_date`` defaults to ``local_today()``. An existing pick for that date is
    returned as-is unless ``force`` is set. Otherwise a pool id is chosen from
    ``_candidates(conn, avoid)`` via ``daily.seeded_order``, upserted into
    ``daily_wotd``, and the pool row's ``shown_at`` is set to the date. When the pool row
    has no gloss and a ``client`` is given, the word is polished first and a successful
    result is cached back onto the pool row.

    Returns the ``daily_wotd`` row as a dict, or None when there are no candidates.
    """
    day = feature_date or local_today()
    current = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (day,)).fetchone()
    if current and not force:
        return dict(current)

    pool = _candidates(conn, avoid)
    if not pool:
        return None
    pick_id = daily.seeded_order(pool, day)[0]
    word_row = conn.execute("SELECT * FROM wotd_pool WHERE id=?", (pick_id,)).fetchone()

    gloss = word_row["gloss"]
    usage = word_row["usage"]
    if not gloss and client:  # lazy polish for older pool words; cached back
        polished = _polish(client, word_row["word"], word_row["part_of_speech"], word_row["definition"])
        if polished:
            gloss = polished["gloss"]
            usage = json.dumps(polished["examples"])
            conn.execute("UPDATE wotd_pool SET gloss=?, usage=? WHERE id=?", (gloss, usage, pick_id))

    snapshot = (
        day,
        word_row["id"],
        word_row["word"],
        word_row["part_of_speech"],
        word_row["phonetic"],
        word_row["audio_file"],
        word_row["definition"],
        word_row["examples"],
        gloss,
        usage,
    )
    conn.execute(
        "INSERT INTO daily_wotd (feature_date, pool_id, word, part_of_speech, phonetic, audio_file, definition, examples, gloss, usage) "
        "VALUES (?,?,?,?,?,?,?,?,?,?) "
        "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, word=excluded.word, "
        "part_of_speech=excluded.part_of_speech, phonetic=excluded.phonetic, audio_file=excluded.audio_file, "
        "definition=excluded.definition, examples=excluded.examples, gloss=excluded.gloss, usage=excluded.usage",
        snapshot,
    )
    conn.execute("UPDATE wotd_pool SET shown_at=? WHERE id=?", (day, pick_id))
    conn.commit()
    stored = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (day,)).fetchone()
    return dict(stored)
def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Return the stored pick for ``feature_date``, falling back to the most recent pick.

    With no ``feature_date``, or none stored for it, the row with the greatest
    ``feature_date`` is returned. Returns None when ``daily_wotd`` is empty.
    """
    row = None
    if feature_date:
        row = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (feature_date,)).fetchone()
    if not row:
        row = conn.execute("SELECT * FROM daily_wotd ORDER BY feature_date DESC LIMIT 1").fetchone()
    if not row:
        return None
    return dict(row)
def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Top the pool up toward _TARGET_POOL (a batch a day), then pick today's word.

    Harvesting only runs when a ``client`` is supplied and the pool is below target.
    Returns ``{"pool": <pool size after the run>, "harvested": <harvest() result or None>,
    "picked": <today's word or None>}``.
    """
    if client and _pool_count(conn) < _TARGET_POOL:
        harvested = harvest(conn, client)
    else:
        harvested = None
    picked = pick_daily(conn, client=client)
    todays_word = picked.get("word") if picked else None
    return {"pool": _pool_count(conn), "harvested": harvested, "picked": todays_word}