AT Mot — a bilingual (EN/FR) daily word game native to the AT Protocol.
1/**
2 * build-wordlists.ts — generate the per-language word lists shipped with AT Mot.
3 *
 * Sources (all license-clean and independently sourced):
 * - English dictionary: ENABLE word list — public domain.
 * - French dictionary: an-array-of-french-words — MIT licensed.
 * - Commonness rankings (both languages): hermitdave/FrequencyWords
 *   (OpenSubtitles frequency lists) — MIT licensed.
8 *
9 * Output: src/data/words.<lang>.json, committed to the repo. The ANSWER order is
10 * a fixed seeded shuffle baked into the file, so every client worldwide indexes
11 * the identical sequence (UTC daily agreement is guaranteed by shared data, not
12 * by runtime computation). Re-run with `npm run build:words`.
13 *
14 * The output is reproducible: same sources + same seeds => byte-identical files.
15 */
16import { writeFileSync, mkdirSync } from 'node:fs';
17import { fileURLToPath } from 'node:url';
18import { dirname, resolve } from 'node:path';
19import { normalizeWord } from '../src/engine/normalize.ts';
20import { encodeAnswers } from '../src/engine/obfuscate.ts';
21import { isInflectedSForm } from './plurals.ts';
22import frenchDictionary from 'an-array-of-french-words' with { type: 'json' };
23
// Directory containing this script, derived from the module's own URL.
const HERE = dirname(fileURLToPath(import.meta.url));
// Directory the generated words.<lang>.json files are written to.
const DATA_DIR = resolve(HERE, '../src/data');

// Letter count every shipped word (answer or allowed guess) must have.
const WORD_LENGTH = 5;

// Remote inputs downloaded at build time.
const SOURCES = {
  // ENABLE English dictionary (words of every length).
  enable: 'https://raw.githubusercontent.com/dolph/dictionary/master/enable1.txt',
  // English frequency list used as the commonness ranking.
  enFreq:
    'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/en/en_50k.txt',
  // French frequency list used as the commonness ranking.
  frFreq:
    'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/fr/fr_50k.txt',
};

// Fixed per-language seeds: changing these reshuffles all history, so never do.
// Each one is passed to seededShuffle() to order that language's answer list.
const SEED = { en: 0x6d6f7401, fr: 0x6d6f7402 };
39
/**
 * Download `url` and resolve with the response body as text.
 *
 * A fixed `user-agent` header is sent with the request. If the response is not
 * OK, the promise rejects with an `Error` naming the URL and the HTTP status.
 */
async function fetchText(url: string): Promise<string> {
  const headers = { 'user-agent': 'atmot-build/1.0' };
  const response = await fetch(url, { headers });
  if (response.ok) return response.text();
  throw new Error(`fetch ${url} -> ${response.status}`);
}
45
/**
 * Deterministic PRNG (mulberry32).
 *
 * @param seed Starting state; coerced to an unsigned 32-bit integer.
 * @returns A generator function; each call advances the state and yields a
 *   number in [0, 1). Equal seeds produce equal sequences.
 */
function mulberry32(seed: number): () => number {
  let state = seed >>> 0;
  return () => {
    // Advance the 32-bit state by the fixed increment (wrapping via `| 0`).
    state = (state + 0x6d2b79f5) | 0;
    // Two multiply/xor-shift mixing rounds over the new state.
    const first = Math.imul(state ^ (state >>> 15), state | 1);
    const second = (first + Math.imul(first ^ (first >>> 7), first | 61)) ^ first;
    // Reinterpret as unsigned 32-bit and scale by 2^32.
    return ((second ^ (second >>> 14)) >>> 0) / 4294967296;
  };
}
57
/**
 * Fisher–Yates shuffle driven by a mulberry32 stream seeded with `seed`.
 *
 * Pure and deterministic: `items` is copied, not mutated, and the same
 * items + seed always produce the same order.
 */
function seededShuffle<T>(items: readonly T[], seed: number): T[] {
  const next = mulberry32(seed);
  const shuffled = items.slice();
  // Walk from the last slot down to index 1, swapping each slot with a
  // PRNG-chosen slot at or below it.
  for (let hi = shuffled.length - 1; hi >= 1; hi -= 1) {
    const pick = Math.floor(next() * (hi + 1));
    const held = shuffled[hi]!;
    shuffled[hi] = shuffled[pick]!;
    shuffled[pick] = held;
  }
  return shuffled;
}
68
/**
 * Parse a hermitdave frequency file ("word count" per line) into ranked tokens.
 *
 * For every line, the text before the first space is trimmed and kept, in file
 * order. Lines whose leading token is empty after trimming (blank lines, lines
 * that start with a space) are skipped.
 */
function parseFreq(text: string): string[] {
  return text
    .split('\n')
    .map((row) => {
      const cut = row.indexOf(' ');
      return (cut === -1 ? row : row.slice(0, cut)).trim();
    })
    .filter((token) => token.length > 0);
}
78
/** One language's word list as assembled by a builder and handed to `write()`. */
interface WordList {
  /** Language code; also used in the output filename (`words.<lang>.json`). */
  lang: string;
  /** Human-readable attribution for the dictionary and ranking sources. */
  source: string;
  /** License summary for those sources. */
  license: string;
  /** Letter count shared by the words in this list. */
  wordLength: number;
  /** Sizes of `answers` and `allowed`, as supplied by the builder. */
  counts: { answers: number; allowed: number };
  /** Ordered daily-answer sequence (seeded shuffle). */
  answers: string[];
  /** Sorted set of valid guesses (superset of answers). */
  allowed: string[];
  /** Optional: normalized -> accented original (provenance / reference only). */
  accents?: Record<string, string>;
}
92
/**
 * Serialize one language's word list to `words.<lang>.json` inside DATA_DIR.
 *
 * Creates the data directory if it is missing, writes compact JSON followed by
 * a trailing newline, and logs the answer/allowed counts.
 */
function write(list: WordList): void {
  const { lang, counts, accents } = list;
  mkdirSync(DATA_DIR, { recursive: true });

  // The ordered answer sequence ships obfuscated (a deliberate speed bump, not
  // encryption — see src/engine/obfuscate.ts) so the daily word isn't readable
  // in plaintext in the bundle or the repo. `answersEnc` decodes back to the
  // identical ordered list, so the per-day mapping is unchanged.
  const base = {
    lang,
    source: list.source,
    license: list.license,
    wordLength: list.wordLength,
    counts,
    answersEnc: encodeAnswers(list.answers),
    allowed: list.allowed,
  };
  // `accents` is emitted last, and only when the list carries one.
  const payload = accents ? { ...base, accents } : base;

  const target = resolve(DATA_DIR, `words.${lang}.json`);
  writeFileSync(target, `${JSON.stringify(payload)}\n`);
  console.log(`words.${lang}.json: ${counts.answers} answers, ${counts.allowed} allowed`);
}
115
/**
 * Build and write the English word list.
 *
 * Allowed guesses are every 5-letter word in ENABLE. The answer pool is the
 * first 2500 distinct frequency-ranked words that are allowed guesses and are
 * not flagged as -S inflections, stored in seeded-shuffle order.
 */
async function buildEnglish(): Promise<void> {
  const [dictionaryText, frequencyText] = await Promise.all([
    fetchText(SOURCES.enable),
    fetchText(SOURCES.enFreq),
  ]);

  // Keep the whole ENABLE dictionary (every length) next to the 5-letter
  // subset: the singular of a 5-letter plural is shorter than five letters, so
  // the inflection check has to look up words of all lengths.
  const everyWord = new Set<string>();
  const fiveLetterWords = new Set<string>(); // becomes the allowed-guess set
  for (const line of dictionaryText.split('\n')) {
    const entry = normalizeWord(line);
    if (entry.length > 0) {
      everyWord.add(entry);
      if (entry.length === WORD_LENGTH) fiveLetterWords.add(entry);
    }
  }
  const inDictionary = (candidate: string): boolean => everyWord.has(candidate);

  // Frequency-ranked candidates restricted to words present in ENABLE. The Set
  // drops repeats while keeping each word at its first (highest-ranked) spot.
  const rankedCandidates = new Set(
    parseFreq(frequencyText)
      .map(normalizeWord)
      .filter((w) => w.length === WORD_LENGTH && fiveLetterWords.has(w)),
  );

  // Heuristic: obvious -S inflections (plurals, 3rd-person verbs) are kept out
  // of the answer pool so it stays mostly base words. They stay valid guesses.
  const answerPool: string[] = [];
  for (const candidate of rankedCandidates) {
    if (!isInflectedSForm(candidate, inDictionary)) {
      answerPool.push(candidate);
      if (answerPool.length >= 2500) break;
    }
  }

  const answers = seededShuffle(answerPool, SEED.en);
  const allowed = Array.from(fiveLetterWords).sort();

  const source = [
    'Dictionary: ENABLE word list (https://github.com/dolph/dictionary). ',
    'Commonness ranking: hermitdave/FrequencyWords en_50k (OpenSubtitles).',
  ].join('');

  write({
    lang: 'en',
    source,
    license: 'ENABLE: Public Domain. Frequency ranking: MIT.',
    wordLength: WORD_LENGTH,
    counts: { answers: answers.length, allowed: allowed.length },
    answers,
    allowed,
  });
}
166
/**
 * Build and write the French word list.
 *
 * Allowed guesses are every valid 5-letter entry of an-array-of-french-words.
 * The answer pool is the first 2200 distinct frequency-ranked words that are
 * valid dictionary words and contain a vowel, stored in seeded-shuffle order.
 * An accents map (normalized -> accented spelling) is shipped for reference.
 */
async function buildFrench(): Promise<void> {
  const ranked = parseFreq(await fetchText(SOURCES.frFreq));

  // Elision stems that appear as bare dictionary artifacts but aren't
  // standalone playable words (e.g. "jusqu" from "jusqu'à").
  const FR_FRAGMENTS = new Set(['JUSQU', 'LORSQU', 'PUISQU', 'QUOIQU', 'PRESQU']);

  // Validity dictionary built from an-array-of-french-words (MIT): the set of
  // normalized 5-letter forms that are real French words, plus the first
  // accented spelling seen for each. Gating on this set is what filters out the
  // names (AARON), English loanwords, and apostrophe-mangled contractions
  // (ESTCE, JUSQU) that pollute raw subtitle frequency lists.
  const realWords = new Set<string>();
  const accentedSpelling: Record<string, string> = {};
  const hasJoiner = /['’-]/;
  const hasNonPlainLetter = /[^a-z]/i;
  for (const entry of frenchDictionary as string[]) {
    // Multi-part entries (contractions/compounds like "est-ce", "jusqu'",
    // "a-t-il") would normalize to fragment "words", so they are skipped.
    if (hasJoiner.test(entry)) continue;
    const norm = normalizeWord(entry);
    if (norm.length !== WORD_LENGTH || FR_FRAGMENTS.has(norm)) continue;
    realWords.add(norm);
    if (!(norm in accentedSpelling) && hasNonPlainLetter.test(entry)) {
      accentedSpelling[norm] = entry;
    }
  }

  // Commonness ordering from the frequency list, gated by dictionary validity.
  // `allowedSet` doubles as the "already seen" set while walking the ranking.
  const allowedSet = new Set<string>();
  const answerPool: string[] = [];
  const VOWELS = /[AEIOUY]/;
  for (const token of ranked) {
    const norm = normalizeWord(token);
    const playable =
      norm.length === WORD_LENGTH && VOWELS.test(norm) && realWords.has(norm);
    if (!playable || allowedSet.has(norm)) continue;
    allowedSet.add(norm);
    if (answerPool.length < 2200) answerPool.push(norm);
  }

  // Allowed-guess set = every valid 5-letter dictionary word (superset of
  // answers), so players can guess uncommon-but-real words.
  for (const norm of realWords) allowedSet.add(norm);

  const answers = seededShuffle(answerPool, SEED.fr);
  const allowed = Array.from(allowedSet).sort();

  // Trim the accents map to words we actually ship (keys follow the insertion
  // order of `allowedSet`).
  const shippedAccents: Record<string, string> = {};
  for (const norm of allowedSet) {
    const spelling = accentedSpelling[norm];
    if (spelling) shippedAccents[norm] = spelling;
  }

  const source = [
    'Dictionary: an-array-of-french-words (https://github.com/words/an-array-of-french-words). ',
    'Commonness ranking: hermitdave/FrequencyWords fr_50k (OpenSubtitles). ',
    'Independently sourced.',
  ].join('');

  write({
    lang: 'fr',
    source,
    license: 'Dictionary: MIT. Frequency ranking: MIT.',
    wordLength: WORD_LENGTH,
    counts: { answers: answers.length, allowed: allowed.length },
    answers,
    allowed,
    accents: shippedAccents,
  });
}
232
// Run the builders sequentially — English, then French — and report completion.
for (const build of [buildEnglish, buildFrench]) {
  await build();
}
console.log('done.');