#!/usr/bin/env python3 """Build Bloom's accepted-word dictionary (one-time / regenerable build step). The make-or-break of Bloom is the accepted-word list: large and natural enough that a normal word is never rejected, but free of obscure crossword-ese and of anything offensive (so a shared board can't be made abusive). Recipe: base = ENABLE (~173k word-game words, NO proper nouns) → "is it a real word" ∩ keep words with wordfreq zipf >= ZIPF_MIN → "is it natural/common" − profanity/slur blocklist (LDNOOBW en) → "is it safe to share" − any word containing 's' (the wheel never has S, so an S-word can never be formed → it can never be accepted → drop it) − words < 4 letters Two tiers are vendored to goodnews/data/bloom_words.json: "accept" (zipf >= ACCEPT_MIN) — the generous set that COUNTS when typed "common" (zipf >= COMMON_MIN) — a tighter subset used only to DESIGN puzzles (pangram is always recognizable; top tier is reachable with everyday vocabulary) Pre-filtered + vendored so the game needs no wordfreq at runtime. Usage: python scripts/build_bloom_words.py preview # show sizes+samples per threshold python scripts/build_bloom_words.py write # vendor at the chosen thresholds """ from __future__ import annotations import json import random import sys from pathlib import Path import wordfreq ROOT = Path(__file__).resolve().parents[1] OUT = ROOT / "goodnews" / "data" / "bloom_words.json" BASE = Path("/tmp/enable1.txt") BAD = Path("/tmp/ldnoobw_en.txt") MIN_LEN = 4 # Accept is VERY generous so a normal word (incl. inflected forms like "beefed", # "aced") is never rejected — a frequency cut splits inflections, so we keep the # floor low and only trim the genuinely obscure/archaic tail. Tiers are based on # `common` (below), NOT on accept, so generosity never makes the game harder. 
# NOTE: not referenced by any code in this file — `write` emits every candidate
# as "accept" with no frequency floor. Kept as the documented historical floor.
ACCEPT_MIN = 2.0
COMMON_MIN = 3.3   # the DESIGNED puzzle: recognizable words; drives tiers + pangram

# Commands understood by main(); anything else is rejected before any I/O.
_COMMANDS = ("preview", "write")
# Upper bound on the number of example words printed per threshold in `preview`.
_PREVIEW_SAMPLE_SIZE = 18


def _load_candidates() -> tuple[list[str], set[str]]:
    """Load the word lists and return the candidates that pass the static filters.

    Reads ``BASE`` and ``BAD`` (one entry per line; blank lines ignored, entries
    stripped and lower-cased) plus the JSON allowlist at
    ``goodnews/data/bloom_allow.json``. Allowlisted entries are removed from the
    blocklist before it is applied.

    A base word is kept when it is at least ``MIN_LEN`` characters long, purely
    alphabetic, contains no ``s`` and is not on the effective blocklist.

    Returns:
        ``(candidates, blocklist)`` where ``candidates`` is the sorted list of
        surviving words and ``blocklist`` is the effective blocklist (after the
        allowlist has been subtracted).

    Raises:
        FileNotFoundError: if any of the three input files is missing.
    """
    # Explicit encoding so the build does not depend on the platform locale.
    base = {
        w.strip().lower()
        for w in BASE.read_text(encoding="utf-8").splitlines()
        if w.strip()
    }
    bad = {
        w.strip().lower()
        for w in BAD.read_text(encoding="utf-8").splitlines()
        if w.strip()
    }
    # LDNOOBW conflates clinical anatomy/biology with profanity — "block abuse,
    # not biology": allow legitimate medical/anatomical/normal words back in.
    # NOTE(review): presumably a JSON array of lower-case strings — entries are
    # used as-is (not normalized); confirm against the data file.
    allow_path = ROOT / "goodnews" / "data" / "bloom_allow.json"
    allow = set(json.loads(allow_path.read_text(encoding="utf-8")))
    bad -= allow

    # Sorted so the result is deterministic (set iteration order is not).
    out = sorted(
        w
        for w in base
        if len(w) >= MIN_LEN
        and w.isalpha()
        and "s" not in w  # wheel never contains S → an S-word is never makeable
        and w not in bad
    )
    return out, bad


def _filter(cands: list[str], zipf_min: float) -> list[str]:
    """Return, sorted, the words of ``cands`` whose English zipf frequency is >= ``zipf_min``."""
    return sorted(w for w in cands if wordfreq.zipf_frequency(w, "en") >= zipf_min)


def main() -> None:
    """Command-line entry point.

    ``sys.argv[1]`` selects the command (default ``preview``):

    * ``preview`` — print, for several zipf thresholds, how many candidates
      survive plus a reproducible random sample of them.
    * ``write`` — write the ``accept`` / ``common`` tiers to ``OUT`` as JSON.

    Raises:
        SystemExit: with status 2 when the command is not recognized. This is
            checked before any input file is read.
    """
    cmd = sys.argv[1] if len(sys.argv) > 1 else "preview"
    if cmd not in _COMMANDS:
        # Fail fast and non-zero so a typo in a build pipeline is not mistaken
        # for a successful run.
        print(f"unknown command: {cmd}", file=sys.stderr)
        sys.exit(2)

    cands, bad = _load_candidates()
    print(f"candidates (real, alpha, >=4, no-S, not-blocked): {len(cands)} | blocklist {len(bad)}")

    if cmd == "preview":
        rng = random.Random(7)  # fixed seed → the same samples on every run
        for z in (2.5, 2.8, 3.0, 3.3):
            words = _filter(cands, z)
            # random.sample raises ValueError if asked for more than available.
            sample = rng.sample(words, min(_PREVIEW_SAMPLE_SIZE, len(words)))
            print(f"\nzipf>={z}: {len(words)} words")
            print(" sample:", ", ".join(sorted(sample)))
        return

    # cmd == "write"
    # ACCEPT is now BROAD: every valid dictionary word (real ENABLE word, ≥4,
    # no-S, not profane). No frequency floor — tiers are decoupled (common-based),
    # so obscure-but-real words like "arraign" count automatically as bonus finds
    # without ever becoming a pangram or making the game harder. Runtime curation
    # (allow/block individual words) is DB-backed (bloom_word_overrides), no deploy.
    accept = sorted(cands)
    common = _filter(cands, COMMON_MIN)
    OUT.write_text(json.dumps({"accept": accept, "common": common}), encoding="utf-8")
    print(f"\nwrote accept={len(accept)} (ALL valid words), "
          f"common={len(common)} (zipf>={COMMON_MIN}) → {OUT}")


if __name__ == "__main__":
    main()