# upbeatBytes/scripts/build_bloom_words.py

#!/usr/bin/env python3
"""Build Bloom's accepted-word dictionary (one-time / regenerable build step).

The make-or-break of Bloom is the accepted-word list: large and natural enough
that a normal word is never rejected, but free of obscure crossword-ese and of
anything offensive (so a shared board can't be made abusive).

Recipe:
  base   = ENABLE (~173k word-game words, NO proper nouns)  → "is it a real word"
  ∩ keep words with wordfreq zipf >= COMMON_MIN            → "is it natural/common"
                               ("common" tier only; "accept" skips this frequency cut)
  − profanity/slur blocklist (LDNOOBW en)                   → "is it safe to share"
  − any word containing 's'   (the wheel never has S, so an S-word can never be
                               formed → it can never be accepted → drop it)
  − words < 4 letters

Two tiers are vendored to goodnews/data/bloom_words.json:
  "accept" (no frequency floor)  — every valid candidate; the generous set that COUNTS when typed
  "common" (zipf >= COMMON_MIN)  — a tighter subset used only to DESIGN puzzles
                                   (pangram is always recognizable; top tier is
                                   reachable with everyday vocabulary)
Pre-filtered + vendored so the game needs no wordfreq at runtime.

Usage:
  python scripts/build_bloom_words.py preview        # show sizes+samples per threshold
  python scripts/build_bloom_words.py write          # vendor at the chosen thresholds
"""
from __future__ import annotations

import json
import random
import sys
from pathlib import Path

import wordfreq

# Parent of the directory that contains this script (parents[0] is the script's
# own directory, parents[1] the one above it).
ROOT = Path(__file__).resolve().parents[1]
# Destination of the vendored two-tier dictionary produced by the `write` command.
OUT = ROOT / "goodnews" / "data" / "bloom_words.json"
# Build inputs, read from fixed /tmp paths by _load_candidates(). This script
# does not fetch them, so both files must already exist before it is run.
BASE = Path("/tmp/enable1.txt")   # base word list, one word per line
BAD = Path("/tmp/ldnoobw_en.txt")  # profanity/slur blocklist, one entry per line
# Words shorter than this are dropped from the candidates.
MIN_LEN = 4
# Accept is VERY generous so a normal word (incl. inflected forms like "beefed",
# "aced") is never rejected — a frequency cut splits inflections, so we keep the
# floor low and only trim the genuinely obscure/archaic tail. Tiers are based on
# `common` (below), NOT on accept, so generosity never makes the game harder.
# NOTE(review): no code in this script references ACCEPT_MIN — the `write`
# command vendors every candidate as "accept" with no frequency floor. Confirm
# nothing imports this constant before removing it.
ACCEPT_MIN = 2.0
COMMON_MIN = 3.3   # the DESIGNED puzzle: recognizable words; drives tiers + pangram


def _load_candidates() -> tuple[list[str], set[str]]:
    """Load the base word list and blocklist and return the surviving candidates.

    Reads ``BASE`` and ``BAD`` (one entry per line; blank lines ignored, entries
    stripped and lowercased) plus the JSON allow-list at
    ``ROOT/goodnews/data/bloom_allow.json``.

    Returns:
        A ``(candidates, blocklist)`` pair. ``candidates`` holds every base word
        that is purely alphabetic, at least ``MIN_LEN`` letters long, contains
        no ``s`` and is not blocklisted; it is in no particular order (it is
        built by iterating a set), so callers sort it. ``blocklist`` is the
        effective blocklist, i.e. the ``BAD`` entries minus the allow-list.

    Raises:
        OSError: if any of the three input files cannot be read.
        json.JSONDecodeError: if the allow-list is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    base = {
        line.strip().lower()
        for line in BASE.read_text(encoding="utf-8").splitlines()
        if line.strip()
    }
    bad = {
        line.strip().lower()
        for line in BAD.read_text(encoding="utf-8").splitlines()
        if line.strip()
    }
    # LDNOOBW conflates clinical anatomy/biology with profanity — "block abuse,
    # not biology": allow legitimate medical/anatomical/normal words back in.
    allow_path = ROOT / "goodnews" / "data" / "bloom_allow.json"
    bad -= set(json.loads(allow_path.read_text(encoding="utf-8")))
    candidates = [
        w
        for w in base
        if len(w) >= MIN_LEN
        and w.isalpha()
        and "s" not in w  # wheel never contains S → an S-word is never makeable
        and w not in bad
    ]
    return candidates, bad


def _filter(cands: list[str], zipf_min: float) -> list[str]:
    return sorted(w for w in cands if wordfreq.zipf_frequency(w, "en") >= zipf_min)


def main() -> None:
    """Run the CLI: ``preview`` (the default) or ``write``.

    ``preview`` prints, for a few zipf thresholds, how many candidates survive
    the cut together with a reproducible random sample of them. ``write``
    vendors the two-tier dictionary to ``OUT`` as JSON.

    Raises:
        SystemExit: with status 2 when the command is not recognized.
    """
    cmd = sys.argv[1] if len(sys.argv) > 1 else "preview"
    # Reject an unknown command before reading the word lists, and report the
    # failure to the shell instead of exiting with status 0.
    if cmd not in ("preview", "write"):
        print(f"unknown command: {cmd}", file=sys.stderr)
        sys.exit(2)

    cands, bad = _load_candidates()
    print(f"candidates (real, alpha, >=4, no-S, not-blocked): {len(cands)}  | blocklist {len(bad)}")

    if cmd == "preview":
        rng = random.Random(7)  # fixed seed → the same samples on every run
        for z in (2.5, 2.8, 3.0, 3.3):
            words = _filter(cands, z)
            # Clamp the sample size: random.sample raises ValueError when asked
            # for more items than the population holds.
            sample = rng.sample(words, min(18, len(words)))
            print(f"\nzipf>={z}: {len(words)} words")
            print("  sample:", ", ".join(sorted(sample)))
        return

    # cmd == "write"
    # ACCEPT is now BROAD: every valid dictionary word (real ENABLE word, ≥4,
    # no-S, not profane). No frequency floor — tiers are decoupled (common-based),
    # so obscure-but-real words like "arraign" count automatically as bonus finds
    # without ever becoming a pangram or making the game harder. Runtime curation
    # (allow/block individual words) is DB-backed (bloom_word_overrides), no deploy.
    accept = sorted(cands)
    common = _filter(cands, COMMON_MIN)
    OUT.write_text(json.dumps({"accept": accept, "common": common}), encoding="utf-8")
    print(f"\nwrote accept={len(accept)} (ALL valid words), "
          f"common={len(common)} (zipf>={COMMON_MIN}) → {OUT}")


if __name__ == "__main__":
    main()