0ae789752e
Both selectors ordered candidates least-recently-shown, then daily.seeded_order() ROTATED the whole list and took [0] — an arbitrary date-hashed item, undoing the ordering. Result: repeats (quote id 2 on 6/28+6/29; word "harmony" on 6/25+6/28), no guarantee a pool item is shown before it recurs. Fix: daily.freshest(rows) returns the freshest cohort only — every NEVER-shown item while any remain, else the oldest-shown group. quote/wotd _candidates use it; seeded_order now picks deterministically WITHIN that cohort. So every pool item is featured once before any repeat, then cycles oldest-first. Dropped the unused _NO_REPEAT_POOL window. Tests: no-repeat-until-exhausted (quote + wotd) + a freshest() unit test. 428 backend tests green. (Separate follow-up: expand the QOTD pool from 16 → 90+ vetted public-domain quotes for a longer no-repeat window.) Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
280 lines
13 KiB
Python
280 lines
13 KiB
Python
"""Word of the Day — an uplifting word a day, grounded in a real dictionary.
|
|
|
|
"LLM proposes, dictionary disposes": the LLM suggests positive/calming words; each is
|
|
validated + enriched against the free Dictionary API (dictionaryapi.dev) for the REAL
|
|
definition, IPA pronunciation, example sentences, and a human pronunciation clip. That
|
|
rules out hallucinated definitions — the authoritative data is the dictionary's. The
|
|
audio clip (public-domain, usually Wiktionary) is cached to our origin; the page falls
|
|
back to the browser's speech synthesis when a word has no clip.
|
|
|
|
All network/LLM work happens before the brief DB write. Same pick lifecycle as the
|
|
other small joys.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import urllib.parse
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
from . import daily
|
|
from .localtime import local_today
|
|
|
|
# Base URL of the dictionary API; _lookup appends the URL-quoted word to it.
DICT_BASE = "https://api.dictionaryapi.dev/api/v2/entries/en"
# Request headers sent with every _http_bytes download.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
_TARGET_POOL = 30 # keep harvesting (a batch/day) until the pool reaches this
# Default number of words harvest() asks the LLM to propose per run.
_HARVEST_BATCH = 12
# _cache_audio discards downloaded clips smaller than this many bytes.
_MIN_AUDIO_BYTES = 500
|
|
|
|
|
|
def cache_dir() -> Path:
    """Return the directory where pronunciation clips are cached, creating it if needed.

    ``GOODNEWS_WOTD_AUDIO`` wins when it is set to a non-empty value. Otherwise the
    clips live in a ``wotd_audio`` folder next to the database file named by
    ``GOODNEWS_DB`` (default ``data/goodnews.sqlite3``).
    """
    explicit = os.environ.get("GOODNEWS_WOTD_AUDIO")
    if explicit:
        target = Path(explicit)
    else:
        db_path = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_path.parent / "wotd_audio"
    # Missing parents are created; an already-existing directory is not an error.
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
|
|
|
|
def _http_bytes(url: str, timeout: int = 30) -> tuple[bytes, str]:
    """Fetch ``url`` and return ``(body, content_type)``.

    The request carries the module's ``_UA`` headers. ``content_type`` is the
    response's ``Content-Type`` header, or ``""`` when that header is missing or
    empty. Errors raised by ``urllib`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
        content_type = response.headers.get("Content-Type")
    return body, (content_type if content_type else "")
|
|
|
|
|
|
def _propose_words(client, n: int) -> list[dict]:
    """Ask the LLM for ``n`` uplifting words plus the part of speech it intends.

    The part of speech lets _lookup pick the sense the LLM meant (e.g. 'serene' the
    adjective, not the archaic noun).

    Args:
        client: object exposing ``chat_text(messages) -> str``.
        n: how many words to request.

    Returns:
        A list of ``{"word": <lowercased str>, "pos": <lowercased str or None>}``.
        The list is empty when the reply holds no JSON object, the JSON is malformed,
        or ``"words"`` is not a list. Entries that are neither a non-blank string nor
        a dict with a non-blank ``"word"`` are skipped.

    Errors raised by ``client.chat_text`` itself propagate to the caller.
    """
    user = (
        f"Suggest {n} English vocabulary words for an uplifting 'word of the day' — positive, "
        "calming, hopeful, or quietly beautiful in meaning (e.g. serene, kindness, dawn, "
        "resilience, wonder). Real, usable words; vary common and slightly elevated. For each, "
        "give the part of speech you intend (the everyday modern sense, not an archaic one). "
        'Reply with JSON only: {"words": [{"word": "serene", "pos": "adjective"}, ...]}'
    )
    txt = client.chat_text([{"role": "user", "content": user}])
    # The model may wrap the JSON in chatter; grab the outermost {...} span.
    m = re.search(r"\{.*\}", txt or "", re.S)
    if not m:
        return []
    try:
        payload = json.loads(m.group(0))
    except ValueError:
        # Malformed JSON is treated like "no JSON at all", matching _polish.
        return []
    proposals = payload.get("words") if isinstance(payload, dict) else None
    if not isinstance(proposals, list):
        return []

    out = []
    for item in proposals:
        if isinstance(item, str):
            raw_word, raw_pos = item, None
        elif isinstance(item, dict):
            raw_word, raw_pos = item.get("word"), item.get("pos")
        else:
            continue
        # `or ""` keeps a JSON null from turning into the literal word "none".
        word = str(raw_word or "").strip().lower()
        if not word:
            continue
        pos = (str(raw_pos).strip().lower() or None) if raw_pos else None
        out.append({"word": word, "pos": pos})
    return out
|
|
|
|
|
|
def _polish(client, word: str, part_of_speech: str | None, definition: str) -> dict | None:
    """Ask the LLM to turn the real dictionary gloss into display-ready copy.

    The prompt requests ONE warm plain-language sentence restating ``definition`` and
    two everyday example sentences that use ``word``. The dictionary definition stays
    the anchor; this only rephrases it.

    Returns:
        ``{"gloss": str, "examples": [str, ...]}`` with at most two examples, each of
        which contains ``word`` (case-insensitive). Returns None on any trouble, so
        callers fall back to the raw dictionary data: the LLM call failed, the reply
        had no parseable JSON object, the gloss was empty, or no example used the word.
    """
    pos_note = "" if not part_of_speech else f" ({part_of_speech})"
    prompt = (
        f'The word is "{word}"{pos_note}. Its dictionary definition is: "{definition}".\n'
        "1) Rewrite that definition as ONE warm, plain-language sentence that a general reader "
        "instantly understands. Stay faithful to the meaning; do not invent extra facts.\n"
        "2) Write TWO short, natural example sentences that clearly show the word used in "
        "everyday life — concrete and easy to picture, not abstract, archaic, or a proper-noun "
        f'title. Each must actually use the word "{word}".\n'
        'Reply with JSON only: {"gloss": "...", "examples": ["...", "..."]}'
    )
    try:
        reply = client.chat_text([{"role": "user", "content": prompt}])
    except Exception:  # noqa: BLE001 — polish is best-effort; raw dictionary data stands
        return None

    found = re.search(r"\{.*\}", reply, re.S)
    if found is None:
        return None
    try:
        payload = json.loads(found.group(0))
    except ValueError:
        return None

    # Collapse every run of whitespace to a single space.
    gloss = " ".join(str(payload.get("gloss") or "").split())

    # Enforce the contract: keep only sentences that actually use the word, and require
    # at least one. A gloss with no usable examples falls back to the raw dictionary data.
    needle = word.lower()
    usable = []
    for raw in payload.get("examples") or []:
        sentence = " ".join(str(raw).split())
        if sentence and needle in sentence.lower():
            usable.append(sentence)

    if not (gloss and usable):
        return None
    return {"gloss": gloss, "examples": usable[:2]}
|
|
|
|
|
|
def _lookup(word: str, prefer_pos: str | None = None) -> dict | None:
    """Validate + enrich a word via the dictionary. Returns None if it's not a real word.

    Only the first entry of the dictionary response is used. A meaning is "usable"
    when its first definition has non-blank text.

    Args:
        word: the word to look up (URL-quoted before it is appended to DICT_BASE).
        prefer_pos: part of speech the LLM intended. When a usable meaning with that
            part of speech exists (compared case-insensitively, surrounding whitespace
            ignored) it is chosen; otherwise the first usable meaning is.

    Returns:
        None when the lookup fails, the response is not a non-empty list, or no
        meaning is usable. Otherwise a dict with keys ``word`` (lowercased headword,
        falling back to the argument), ``part_of_speech``, ``phonetic``, ``audio_url``,
        ``definition`` and ``examples`` (up to three, the chosen sense's first).
    """
    try:
        data = daily.http_json(f"{DICT_BASE}/{urllib.parse.quote(word)}")
    except Exception:  # noqa: BLE001 — unknown word / network → just skip it
        return None
    if not isinstance(data, list) or not data:
        return None
    entry = data[0]
    if not isinstance(entry, dict):
        return None

    meanings = [mn for mn in (entry.get("meanings") or []) if isinstance(mn, dict)]
    # Consider every meaning, not only the first: an entry whose first meaning lacks a
    # definition can still carry a perfectly good later one.
    usable = [mn for mn in meanings if _first_definition(mn)]
    if not usable:
        return None

    # Prefer the meaning whose part of speech matches the LLM's intent; else the first usable one.
    wanted = (prefer_pos or "").strip().lower()
    chosen = usable[0]
    if wanted:
        for mn in usable:
            if (mn.get("partOfSpeech") or "").strip().lower() == wanted:
                chosen = mn
                break

    # Top-level phonetic wins; otherwise take the first text / audio found in `phonetics`.
    phonetic = entry.get("phonetic")
    audio_url = None
    for p in (entry.get("phonetics") or []):
        if not isinstance(p, dict):
            continue
        if not phonetic and p.get("text"):
            phonetic = p["text"]
        if not audio_url and p.get("audio"):
            audio_url = p["audio"]

    examples = []
    for mn in [chosen] + [other for other in meanings if other is not chosen]:  # chosen sense's examples first
        for d in (mn.get("definitions") or []):
            example = d.get("example") if isinstance(d, dict) else None
            if isinstance(example, str) and example.strip():
                examples.append(example.strip())

    return {
        "word": (entry.get("word") or word).strip().lower(),
        "part_of_speech": chosen.get("partOfSpeech"),
        "phonetic": phonetic,
        "audio_url": audio_url,
        "definition": _first_definition(chosen),
        "examples": examples[:3],
    }


def _first_definition(meaning: dict) -> str:
    """Return the stripped text of ``meaning``'s first definition, or "" when it has none."""
    definitions = meaning.get("definitions") or []
    if not definitions or not isinstance(definitions[0], dict):
        return ""
    text = definitions[0].get("definition")
    return text.strip() if isinstance(text, str) else ""
|
|
|
|
|
|
def _cache_audio(audio_url: str | None, word: str) -> str | None:
    """Download the pronunciation clip to our origin (atomic). Returns filename or None.

    Args:
        audio_url: clip URL as reported by the dictionary. A protocol-relative
            ``//host/...`` URL is upgraded to ``https:``.
        word: used as the file stem, so the clip is stored as ``<word>.ogg`` or
            ``<word>.mp3`` inside ``cache_dir()``.

    Returns:
        The cached file's name (not its full path), or None when there is no URL, the
        URL is not http(s), ``word`` is not a safe file stem, the download fails, the
        payload is smaller than ``_MIN_AUDIO_BYTES``, or the file cannot be written.
    """
    if not audio_url:
        return None
    if audio_url.startswith("//"):
        audio_url = "https:" + audio_url

    # The URL comes from a third-party API and urlopen would happily read file:// etc.,
    # so only plain web URLs are fetched.
    try:
        parsed = urllib.parse.urlsplit(audio_url)
    except ValueError:
        return None
    if parsed.scheme not in ("http", "https"):
        return None

    # `word` becomes part of a path: refuse anything that could leave cache_dir().
    if not word or word in (".", "..") or any(ch in word for ch in ("/", "\\", "\x00")):
        return None

    try:
        data, ctype = _http_bytes(audio_url)
    except Exception:  # noqa: BLE001
        return None
    if len(data) < _MIN_AUDIO_BYTES:
        return None

    # Judge the extension from the URL *path* so a query string does not hide ".ogg".
    is_ogg = "ogg" in (ctype or "").lower() or parsed.path.lower().endswith(".ogg")
    fname = f"{word}{'.ogg' if is_ogg else '.mp3'}"
    cdir = cache_dir()
    tmp = cdir / f".{word}.tmp"
    try:
        # Write to a temp name first, then rename into place so readers never see a
        # half-written clip.
        tmp.write_bytes(data)
        os.replace(tmp, cdir / fname)
    except OSError:
        try:
            tmp.unlink()
        except OSError:
            pass
        return None
    return fname
|
|
|
|
|
|
def _pool_count(conn: sqlite3.Connection) -> int:
    """Return how many rows ``wotd_pool`` holds (no filtering is applied)."""
    (total,) = conn.execute("SELECT COUNT(*) FROM wotd_pool").fetchone()
    return total
|
|
|
|
|
|
def harvest(conn: sqlite3.Connection, client, count: int = _HARVEST_BATCH) -> dict:
    """Propose words → validate/enrich via dictionary → cache audio → add new ones.

    All network up front; one brief write at the end.

    A proposal is skipped when it is not purely alphabetic, repeats an earlier proposal
    in the same batch, is already in ``wotd_pool``, or fails the dictionary lookup. The
    dictionary's own headword gets the same duplicate checks, so no audio download or
    LLM polish is spent on a word that would not be added.

    Args:
        conn: database connection holding the ``wotd_pool`` table.
        client: LLM client passed on to ``_propose_words`` and ``_polish``.
        count: how many words to ask the LLM for.

    Returns:
        ``{"proposed": int, "added": int, "pool": int}``: proposals received, rows
        actually added, and the pool size afterwards. If proposing fails, ``proposed``
        and ``added`` are 0 and nothing is written.
    """
    try:
        words = _propose_words(client, count)
    except Exception:  # noqa: BLE001
        return {"proposed": 0, "added": 0, "pool": _pool_count(conn)}

    def in_pool(candidate: str) -> bool:
        hit = conn.execute("SELECT 1 FROM wotd_pool WHERE word=?", (candidate,)).fetchone()
        return hit is not None

    rows = []
    seen: set[str] = set()  # every word already handled in this batch
    for item in words:
        w = item["word"]
        if not w.isalpha() or w in seen or in_pool(w):
            continue
        seen.add(w)
        info = _lookup(w, item.get("pos"))
        if not info:
            continue
        headword = info["word"]
        if headword != w:
            # The dictionary may normalise the spelling; re-check under its headword
            # before paying for the audio download and the LLM polish.
            if headword in seen or in_pool(headword):
                continue
            seen.add(headword)
        audio_file = _cache_audio(info["audio_url"], headword)
        polished = _polish(client, headword, info["part_of_speech"], info["definition"])
        gloss = polished["gloss"] if polished else None
        usage = json.dumps(polished["examples"]) if polished else None
        rows.append((headword, info["part_of_speech"], info["phonetic"], audio_file,
                     info["audio_url"], info["definition"], json.dumps(info["examples"]), gloss, usage))

    before = _pool_count(conn)
    conn.executemany(
        "INSERT OR IGNORE INTO wotd_pool (source, word, part_of_speech, phonetic, audio_file, audio_url, definition, examples, gloss, usage) "
        "VALUES ('llm', ?, ?, ?, ?, ?, ?, ?, ?, ?)", rows,
    )
    conn.commit()
    after = _pool_count(conn)
    return {"proposed": len(words), "added": after - before, "pool": after}
|
|
|
|
|
|
def _candidates(conn: sqlite3.Connection, avoid: int | None = None) -> list[int]:
    """Return the pool ids today's word may be drawn from.

    Unblocked rows flagged ``featured`` take precedence: when any exist, their ids
    (ordered by id) are the candidates. Otherwise the ``(id, shown_at)`` rows of every
    unblocked word are handed to ``daily.freshest`` and its result is used.

    ``avoid`` removes that id from the result, unless doing so would leave nothing, in
    which case the unfiltered list is returned.
    """
    pinned = [
        row[0]
        for row in conn.execute("SELECT id FROM wotd_pool WHERE blocked=0 AND featured=1 ORDER BY id")
    ]
    if pinned:
        pool_ids = pinned
    else:
        # The freshest cohort only (never-shown, else the oldest-shown group) — picking
        # across the whole pool is what re-fed recent words day to day.
        unblocked = conn.execute("SELECT id, shown_at FROM wotd_pool WHERE blocked=0").fetchall()
        pool_ids = daily.freshest(unblocked)

    if avoid is None:
        return pool_ids
    remaining = [pid for pid in pool_ids if pid != avoid]
    return remaining if remaining else pool_ids
|
|
|
|
|
|
def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False,
               avoid: int | None = None, client=None) -> dict | None:
    """Choose and store the word for ``feature_date``, returning its ``daily_wotd`` row.

    Rows are read by column name, so ``conn`` is expected to use a mapping-style row
    factory such as ``sqlite3.Row``.

    Args:
        conn: database connection with the ``wotd_pool`` and ``daily_wotd`` tables.
        feature_date: date key of the pick; defaults to ``local_today()``.
        force: when False, an already-stored pick for the date is returned untouched;
            when True a new pick is made and overwrites it.
        avoid: pool id to steer away from (passed to ``_candidates``).
        client: optional LLM client. When given and the picked word has no gloss yet,
            the word is polished now and the result is cached back on the pool row.

    Returns:
        The stored ``daily_wotd`` row as a dict, or None when there are no candidates.
    """
    feature_date = feature_date or local_today()
    by_date = "SELECT * FROM daily_wotd WHERE feature_date=?"

    current = conn.execute(by_date, (feature_date,)).fetchone()
    if current and not force:
        return dict(current)

    ids = _candidates(conn, avoid)
    if not ids:
        return None
    pick_id = daily.seeded_order(ids, feature_date)[0]
    row = conn.execute("SELECT * FROM wotd_pool WHERE id=?", (pick_id,)).fetchone()

    gloss = row["gloss"]
    usage = row["usage"]
    if not gloss and client:  # lazy polish for older pool words; cached back
        polished = _polish(client, row["word"], row["part_of_speech"], row["definition"])
        if polished:
            gloss = polished["gloss"]
            usage = json.dumps(polished["examples"])
            conn.execute("UPDATE wotd_pool SET gloss=?, usage=? WHERE id=?", (gloss, usage, pick_id))

    # Upsert: a forced re-pick replaces the row already stored for this date.
    values = (
        feature_date, row["id"], row["word"], row["part_of_speech"], row["phonetic"],
        row["audio_file"], row["definition"], row["examples"], gloss, usage,
    )
    conn.execute(
        "INSERT INTO daily_wotd (feature_date, pool_id, word, part_of_speech, phonetic, audio_file, definition, examples, gloss, usage) "
        "VALUES (?,?,?,?,?,?,?,?,?,?) "
        "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, word=excluded.word, "
        "part_of_speech=excluded.part_of_speech, phonetic=excluded.phonetic, audio_file=excluded.audio_file, "
        "definition=excluded.definition, examples=excluded.examples, gloss=excluded.gloss, usage=excluded.usage",
        values,
    )
    # Stamp the pool row so later candidate selection can see when it was last shown.
    conn.execute("UPDATE wotd_pool SET shown_at=? WHERE id=?", (feature_date, pick_id))
    conn.commit()

    stored = conn.execute(by_date, (feature_date,)).fetchone()
    return dict(stored)
|
|
|
|
|
|
def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Return the stored pick for ``feature_date``, else the most recent stored pick.

    When ``feature_date`` is omitted, or no row exists for it, the row with the greatest
    ``feature_date`` is returned instead. Returns None only when ``daily_wotd`` is empty.
    """
    row = None
    if feature_date:
        row = conn.execute("SELECT * FROM daily_wotd WHERE feature_date=?", (feature_date,)).fetchone()
    if not row:
        # No exact match: fall back to the newest pick on record.
        row = conn.execute("SELECT * FROM daily_wotd ORDER BY feature_date DESC LIMIT 1").fetchone()
    return dict(row) if row else None
|
|
|
|
|
|
def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Top the pool up toward _TARGET_POOL (a batch a day), then pick today's word.

    Harvesting only happens when a ``client`` is supplied and the pool is still below
    ``_TARGET_POOL``; the pick is attempted either way.

    Returns:
        ``{"pool": <pool size after the run>, "harvested": <harvest() result or None>,
        "picked": <today's word or None>}``.
    """
    needs_top_up = bool(client) and _pool_count(conn) < _TARGET_POOL
    harvested = harvest(conn, client) if needs_top_up else None
    picked = pick_daily(conn, client=client)
    word = picked.get("word") if picked else None
    return {"pool": _pool_count(conn), "harvested": harvested, "picked": word}
|