scripts/build-wordlists.ts at trunk · jeremy.herve.bzh/atmot

AT Mot — a bilingual (EN/FR) daily word game native to the AT Protocol.
atmot / scripts / build-wordlists.ts
at trunk 8.9 kB View raw
Jeremy Herve Trim the word-list source note to drop the third-party project reference 1d ago
  1/**
  2 * build-wordlists.ts — generate the per-language word lists shipped with AT Mot.
  3 *
  4 * Sources (both license-clean and independently sourced):
  5 *   - English dictionary: ENABLE word list — public domain.
  6 *   - French dictionary: an-array-of-french-words — MIT licensed.
  7 *   - Commonness rankings (EN and FR): hermitdave/FrequencyWords (OpenSubtitles) — MIT licensed.
  8 *
  9 * Output: src/data/words.<lang>.json, committed to the repo. The ANSWER order is
 10 * a fixed seeded shuffle baked into the file, so every client worldwide indexes
 11 * the identical sequence (UTC daily agreement is guaranteed by shared data, not
 12 * by runtime computation). Re-run with `npm run build:words`.
 13 *
 14 * The output is reproducible: same sources + same seeds => byte-identical files.
 15 */
 16import { writeFileSync, mkdirSync } from 'node:fs';
 17import { fileURLToPath } from 'node:url';
 18import { dirname, resolve } from 'node:path';
 19import { normalizeWord } from '../src/engine/normalize.ts';
 20import { encodeAnswers } from '../src/engine/obfuscate.ts';
 21import { isInflectedSForm } from './plurals.ts';
 22import frenchDictionary from 'an-array-of-french-words' with { type: 'json' };
 23
 24const HERE = dirname(fileURLToPath(import.meta.url));
 25const DATA_DIR = resolve(HERE, '../src/data');
 26
 27const WORD_LENGTH = 5;
 28
 29const SOURCES = {
 30  enable: 'https://raw.githubusercontent.com/dolph/dictionary/master/enable1.txt',
 31  enFreq:
 32    'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/en/en_50k.txt',
 33  frFreq:
 34    'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/fr/fr_50k.txt',
 35};
 36
 37// Fixed per-language seeds: changing these reshuffles all history, so never do.
 38const SEED = { en: 0x6d6f7401, fr: 0x6d6f7402 };
 39
 40async function fetchText(url: string): Promise<string> {
 41  const res = await fetch(url, { headers: { 'user-agent': 'atmot-build/1.0' } });
 42  if (!res.ok) throw new Error(`fetch ${url} -> ${res.status}`);
 43  return res.text();
 44}
 45
 46/** Deterministic PRNG (mulberry32). */
 47function mulberry32(seed: number): () => number {
 48  let a = seed >>> 0;
 49  return () => {
 50    a |= 0;
 51    a = (a + 0x6d2b79f5) | 0;
 52    let t = Math.imul(a ^ (a >>> 15), 1 | a);
 53    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
 54    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
 55  };
 56}
 57
 58/** Fisher–Yates with a seeded PRNG; pure, deterministic. */
 59function seededShuffle<T>(items: readonly T[], seed: number): T[] {
 60  const rand = mulberry32(seed);
 61  const a = items.slice();
 62  for (let i = a.length - 1; i > 0; i--) {
 63    const j = Math.floor(rand() * (i + 1));
 64    [a[i], a[j]] = [a[j]!, a[i]!];
 65  }
 66  return a;
 67}
 68
 69/** Parse a hermitdave frequency file ("word count" per line) into ranked tokens. */
 70function parseFreq(text: string): string[] {
 71  const out: string[] = [];
 72  for (const line of text.split('\n')) {
 73    const word = line.split(' ')[0]?.trim();
 74    if (word) out.push(word);
 75  }
 76  return out;
 77}
 78
 79interface WordList {
 80  lang: string;
 81  source: string;
 82  license: string;
 83  wordLength: number;
 84  counts: { answers: number; allowed: number };
 85  /** Ordered daily-answer sequence (seeded shuffle). */
 86  answers: string[];
 87  /** Sorted set of valid guesses (superset of answers). */
 88  allowed: string[];
 89  /** Optional: normalized -> accented original (provenance / reference only). */
 90  accents?: Record<string, string>;
 91}
 92
 93function write(list: WordList): void {
 94  mkdirSync(DATA_DIR, { recursive: true });
 95  const file = resolve(DATA_DIR, `words.${list.lang}.json`);
 96  // The ordered answer sequence ships obfuscated (a deliberate speed bump, not
 97  // encryption — see src/engine/obfuscate.ts) so the daily word isn't readable
 98  // in plaintext in the bundle or the repo. `answersEnc` decodes back to the
 99  // identical ordered list, so the per-day mapping is unchanged.
100  const out = {
101    lang: list.lang,
102    source: list.source,
103    license: list.license,
104    wordLength: list.wordLength,
105    counts: list.counts,
106    answersEnc: encodeAnswers(list.answers),
107    allowed: list.allowed,
108    ...(list.accents ? { accents: list.accents } : {}),
109  };
110  writeFileSync(file, JSON.stringify(out, null, 0) + '\n');
111  console.log(
112    `words.${list.lang}.json: ${list.counts.answers} answers, ${list.counts.allowed} allowed`,
113  );
114}
115
/**
 * Build and write the English word list (`words.en.json`).
 *
 * - Allowed guesses: every ENABLE word that normalizes to `WORD_LENGTH` letters.
 * - Answers: the most common of those (by frequency-list order), minus obvious
 *   -S inflections, capped, then put in a fixed seeded order.
 */
async function buildEnglish(): Promise<void> {
  // Upper bound on the number of answer candidates taken from the ranking.
  const ANSWER_POOL_CAP = 2500;

  const [enableRaw, freqRaw] = await Promise.all([
    fetchText(SOURCES.enable),
    fetchText(SOURCES.enFreq),
  ]);

  // Normalize the full ENABLE dictionary (every length): the singular of a
  // 5-letter plural is shorter than five letters, so plural detection needs
  // words of all lengths.
  const everyWord = new Set<string>();
  for (const line of enableRaw.split('\n')) {
    const normalized = normalizeWord(line);
    if (normalized.length > 0) everyWord.add(normalized);
  }
  const inDictionary = (candidate: string): boolean => everyWord.has(candidate);

  // The 5-letter subset is the allowed-guess set.
  const guessable = new Set<string>();
  for (const entry of everyWord) {
    if (entry.length === WORD_LENGTH) guessable.add(entry);
  }

  // Commonness ranking from the frequency list (only words present in ENABLE).
  const rankedCandidates = parseFreq(freqRaw)
    .map(normalizeWord)
    .filter((w) => w.length === WORD_LENGTH && guessable.has(w));

  // Heuristic: keep obvious -S inflections (plurals, 3rd-person verbs) out of
  // the answer pool so it stays mostly base words. They remain valid guesses.
  // Iterating a Set visits each distinct word once, in first-seen order.
  const answerPool: string[] = [];
  for (const candidate of new Set(rankedCandidates)) {
    if (answerPool.length >= ANSWER_POOL_CAP) break;
    if (!isInflectedSForm(candidate, inDictionary)) answerPool.push(candidate);
  }

  const answers = seededShuffle(answerPool, SEED.en);
  const allowed = Array.from(guessable).sort();

  write({
    lang: 'en',
    source:
      'Dictionary: ENABLE word list (https://github.com/dolph/dictionary). ' +
      'Commonness ranking: hermitdave/FrequencyWords en_50k (OpenSubtitles).',
    license: 'ENABLE: Public Domain. Frequency ranking: MIT.',
    wordLength: WORD_LENGTH,
    counts: { answers: answers.length, allowed: allowed.length },
    answers,
    allowed,
  });
}
166
/**
 * Build and write the French word list (`words.fr.json`).
 *
 * - Allowed guesses: every single-part `an-array-of-french-words` entry that
 *   normalizes to `WORD_LENGTH` letters and is not a known elision stem.
 * - Answers: the most common of those that contain a vowel (by frequency-list
 *   order), capped, then put in a fixed seeded order.
 * - Accents: normalized -> first accented spelling, for the words shipped.
 */
async function buildFrench(): Promise<void> {
  // Upper bound on the number of answer candidates taken from the ranking.
  const ANSWER_POOL_CAP = 2200;
  const MULTI_PART = /['’-]/;
  const NON_ASCII_LETTER = /[^a-z]/i;
  const VOWELS = /[AEIOUY]/;
  // Elision stems that appear as bare dictionary artifacts but aren't
  // standalone playable words (e.g. "jusqu" from "jusqu'à").
  const FR_FRAGMENTS = new Set(['JUSQU', 'LORSQU', 'PUISQU', 'QUOIQU', 'PRESQU']);

  const ranked = parseFreq(await fetchText(SOURCES.frFreq));

  // Validity dictionary from an-array-of-french-words (MIT). This is what
  // filters out the names (AARON), English loanwords, and apostrophe-mangled
  // contractions (ESTCE, JUSQU) that pollute raw subtitle frequency lists.
  const validNormalized = new Set<string>();
  const accentedByNormalized = new Map<string, string>();
  for (const entry of frenchDictionary as string[]) {
    // Multi-part entries (contractions/compounds like "est-ce", "jusqu'",
    // "a-t-il") normalize to fragments that aren't real single words.
    if (MULTI_PART.test(entry)) continue;
    const normalized = normalizeWord(entry);
    if (normalized.length !== WORD_LENGTH || FR_FRAGMENTS.has(normalized)) continue;
    validNormalized.add(normalized);
    // Remember only the first spelling that contains a non-ASCII-letter character.
    if (!accentedByNormalized.has(normalized) && NON_ASCII_LETTER.test(entry)) {
      accentedByNormalized.set(normalized, entry);
    }
  }

  // Commonness ordering from the frequency list, gated by dictionary validity.
  const allowedSet = new Set<string>();
  const answerPool: string[] = [];
  for (const token of ranked) {
    const normalized = normalizeWord(token);
    const playable =
      normalized.length === WORD_LENGTH &&
      VOWELS.test(normalized) &&
      validNormalized.has(normalized); // must be a real French word
    if (!playable || allowedSet.has(normalized)) continue;
    allowedSet.add(normalized);
    if (answerPool.length < ANSWER_POOL_CAP) answerPool.push(normalized);
  }

  // Allowed-guess set = every valid 5-letter dictionary word (superset of
  // answers), so players can guess uncommon-but-real words.
  validNormalized.forEach((word) => allowedSet.add(word));

  const answers = seededShuffle(answerPool, SEED.fr);
  const allowed = Array.from(allowedSet).sort();

  // Trim the accents map to words we actually ship. Keys are inserted in
  // allowedSet order, which fixes their order in the written JSON.
  const shippedAccents: Record<string, string> = {};
  for (const word of allowedSet) {
    const accented = accentedByNormalized.get(word);
    if (accented) shippedAccents[word] = accented;
  }

  write({
    lang: 'fr',
    source:
      'Dictionary: an-array-of-french-words (https://github.com/words/an-array-of-french-words). ' +
      'Commonness ranking: hermitdave/FrequencyWords fr_50k (OpenSubtitles). ' +
      'Independently sourced.',
    license: 'Dictionary: MIT. Frequency ranking: MIT.',
    wordLength: WORD_LENGTH,
    counts: { answers: answers.length, allowed: allowed.length },
    answers,
    allowed,
    accents: shippedAccents,
  });
}
232
// Run the builders one after another (English, then French) so their log
// lines appear in a stable order, then report completion.
for (const build of [buildEnglish, buildFrench]) {
  await build();
}
console.log('done.');
Configure Feed

Configure Feed