AT Mot — a bilingual (EN/FR) daily word game native to the AT Protocol.
0

Configure Feed

Select the types of activity you want to include in your feed.

at trunk 8.9 kB View raw
/**
 * build-wordlists.ts — generate the per-language word lists shipped with AT Mot.
 *
 * Sources (all license-clean and independently sourced):
 *   - English dictionary: ENABLE word list — public domain.
 *   - French dictionary: an-array-of-french-words (npm package) — MIT licensed.
 *   - Commonness rankings (both languages): hermitdave/FrequencyWords
 *     (OpenSubtitles frequency lists) — MIT licensed.
 *
 * Output: src/data/words.<lang>.json, committed to the repo. The ANSWER order is
 * a fixed seeded shuffle baked into the file, so every client worldwide indexes
 * the identical sequence (UTC daily agreement is guaranteed by shared data, not
 * by runtime computation). Re-run with `npm run build:words`.
 *
 * The output is reproducible: same sources + same seeds => byte-identical files.
 *
 * NOTE(review): the remote sources below are fetched from each repository's
 * `master` branch, so "same sources" only holds while upstream is unchanged.
 * Consider pinning the URLs to a commit if strict reproducibility is required.
 */
import { writeFileSync, mkdirSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { dirname, resolve } from 'node:path';
import { normalizeWord } from '../src/engine/normalize.ts';
import { encodeAnswers } from '../src/engine/obfuscate.ts';
import { isInflectedSForm } from './plurals.ts';
import frenchDictionary from 'an-array-of-french-words' with { type: 'json' };

/** Directory containing this script. */
const HERE = dirname(fileURLToPath(import.meta.url));
/** Destination directory for the generated words.<lang>.json files. */
const DATA_DIR = resolve(HERE, '../src/data');

/** Every answer and every allowed guess has exactly this many letters. */
const WORD_LENGTH = 5;

/** Remote plain-text inputs downloaded at build time. */
const SOURCES = {
  // ENABLE dictionary, one word per line.
  enable: 'https://raw.githubusercontent.com/dolph/dictionary/master/enable1.txt',
  // English frequency ranking ("word count" per line).
  enFreq:
    'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/en/en_50k.txt',
  // French frequency ranking ("word count" per line).
  frFreq:
    'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/fr/fr_50k.txt',
};

// Fixed per-language seeds: changing these reshuffles all history, so never do.
const SEED = { en: 0x6d6f7401, fr: 0x6d6f7402 };

/**
 * Maximum number of daily answers kept per language, taken from the top of the
 * commonness ranking. Like the seeds, changing these changes the shuffled
 * answer sequence, so treat them as frozen.
 */
const ANSWER_POOL_SIZE = { en: 2500, fr: 2200 };

/**
 * Download `url` and return the response body as text.
 *
 * @throws Error when the response status is not OK (message carries the URL
 *   and the HTTP status code).
 */
async function fetchText(url: string): Promise<string> {
  const res = await fetch(url, { headers: { 'user-agent': 'atmot-build/1.0' } });
  if (!res.ok) throw new Error(`fetch ${url} -> ${res.status}`);
  return res.text();
}

/**
 * Deterministic PRNG (mulberry32).
 *
 * Returns a generator that yields floats in [0, 1). The arithmetic must stay
 * exactly as written: the shipped answer order is derived from this sequence.
 */
function mulberry32(seed: number): () => number {
  let a = seed >>> 0;
  return () => {
    a |= 0;
    a = (a + 0x6d2b79f5) | 0;
    let t = Math.imul(a ^ (a >>> 15), 1 | a);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    // Force unsigned 32-bit, then scale by 2^32 into [0, 1).
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

/**
 * Fisher–Yates with a seeded PRNG; pure, deterministic.
 *
 * @param items Input sequence; it is copied, never mutated.
 * @param seed  PRNG seed; the same items and seed always give the same order.
 * @returns A new shuffled array.
 */
function seededShuffle<T>(items: readonly T[], seed: number): T[] {
  const rand = mulberry32(seed);
  const a = items.slice();
  for (let i = a.length - 1; i > 0; i--) {
    const j = Math.floor(rand() * (i + 1));
    [a[i], a[j]] = [a[j]!, a[i]!];
  }
  return a;
}

/**
 * Parse a hermitdave frequency file ("word count" per line) into ranked tokens.
 *
 * Keeps the first space-separated field of every line, in file order, and
 * skips lines whose first field is empty.
 */
function parseFreq(text: string): string[] {
  const out: string[] = [];
  for (const line of text.split('\n')) {
    const word = line.split(' ')[0]?.trim();
    if (word) out.push(word);
  }
  return out;
}

/** In-memory description of one language's word list before serialization. */
interface WordList {
  lang: string;
  source: string;
  license: string;
  wordLength: number;
  counts: { answers: number; allowed: number };
  /** Ordered daily-answer sequence (seeded shuffle). */
  answers: string[];
  /** Sorted set of valid guesses (superset of answers). */
  allowed: string[];
  /** Optional: normalized -> accented original (provenance / reference only). */
  accents?: Record<string, string>;
}

/**
 * Serialize `list` to `words.<lang>.json` inside DATA_DIR and log a summary.
 *
 * @throws Error when `list.answers` is empty, so a bad download can never
 *   silently overwrite the committed data file with an unusable list.
 */
function write(list: WordList): void {
  if (list.answers.length === 0) {
    throw new Error(`words.${list.lang}.json: refusing to write an empty answer list`);
  }
  mkdirSync(DATA_DIR, { recursive: true });
  const file = resolve(DATA_DIR, `words.${list.lang}.json`);
  // The ordered answer sequence ships obfuscated (a deliberate speed bump, not
  // encryption — see src/engine/obfuscate.ts) so the daily word isn't readable
  // in plaintext in the bundle or the repo. `answersEnc` decodes back to the
  // identical ordered list, so the per-day mapping is unchanged.
  const out = {
    lang: list.lang,
    source: list.source,
    license: list.license,
    wordLength: list.wordLength,
    counts: list.counts,
    answersEnc: encodeAnswers(list.answers),
    allowed: list.allowed,
    ...(list.accents ? { accents: list.accents } : {}),
  };
  writeFileSync(file, JSON.stringify(out, null, 0) + '\n');
  console.log(
    `words.${list.lang}.json: ${list.counts.answers} answers, ${list.counts.allowed} allowed`,
  );
}

/**
 * Build and write the English word list: allowed guesses are every
 * WORD_LENGTH-letter ENABLE word; answers are the most common of those,
 * minus obvious -S inflections, in seeded-shuffle order.
 */
async function buildEnglish(): Promise<void> {
  const [enableRaw, freqRaw] = await Promise.all([
    fetchText(SOURCES.enable),
    fetchText(SOURCES.enFreq),
  ]);

  // Full ENABLE dictionary (every length): the singular of a 5-letter plural is
  // shorter than five letters, so plural detection needs words of all lengths.
  const dictAll = new Set<string>();
  const allowedSet = new Set<string>(); // 5-letter subset => the allowed-guess set.
  for (const w of enableRaw.split('\n')) {
    const n = normalizeWord(w);
    if (n.length === 0) continue;
    dictAll.add(n);
    if (n.length === WORD_LENGTH) allowedSet.add(n);
  }
  const isWord = (w: string): boolean => dictAll.has(w);

  // Commonness ranking from the frequency list (only words present in ENABLE).
  const freqRanked = parseFreq(freqRaw)
    .map(normalizeWord)
    .filter((w) => w.length === WORD_LENGTH && allowedSet.has(w));

  // Heuristic: keep obvious -S inflections (plurals, 3rd-person verbs) out of the
  // answer pool so it stays mostly base words. They remain valid as guesses.
  const seen = new Set<string>();
  const answerPool: string[] = [];
  for (const w of freqRanked) {
    if (seen.has(w)) continue;
    seen.add(w);
    if (isInflectedSForm(w, isWord)) continue;
    answerPool.push(w);
    if (answerPool.length >= ANSWER_POOL_SIZE.en) break;
  }

  const answers = seededShuffle(answerPool, SEED.en);
  const allowed = [...allowedSet].sort();

  write({
    lang: 'en',
    source:
      'Dictionary: ENABLE word list (https://github.com/dolph/dictionary). ' +
      'Commonness ranking: hermitdave/FrequencyWords en_50k (OpenSubtitles).',
    license: 'ENABLE: Public Domain. Frequency ranking: MIT.',
    wordLength: WORD_LENGTH,
    counts: { answers: answers.length, allowed: allowed.length },
    answers,
    allowed,
  });
}

/**
 * Build and write the French word list: allowed guesses are every valid
 * WORD_LENGTH-letter dictionary word; answers are the most common of those
 * according to the frequency ranking, in seeded-shuffle order.
 */
async function buildFrench(): Promise<void> {
  const freqRaw = await fetchText(SOURCES.frFreq);
  const ranked = parseFreq(freqRaw);

  // Build a validity dictionary from an-array-of-french-words (MIT). Map each
  // word's normalized form to its first (accented) spelling for reference, and
  // remember which normalized forms are real French words. This is what filters
  // out the names (AARON), English loanwords, and apostrophe-mangled contractions
  // (ESTCE, JUSQU) that pollute raw subtitle frequency lists.
  // Elision stems that appear as bare dictionary artifacts but aren't standalone
  // playable words (e.g. "jusqu" from "jusqu'à").
  const FR_FRAGMENTS = new Set(['JUSQU', 'LORSQU', 'PUISQU', 'QUOIQU', 'PRESQU']);
  // Loop-invariant patterns, hoisted out of the dictionary scan.
  const MULTI_PART = /['’-]/;
  const NON_ASCII_LETTER = /[^a-z]/i;

  const validNormalized = new Set<string>();
  const accents: Record<string, string> = {};
  for (const word of frenchDictionary as string[]) {
    // Skip multi-part entries (contractions/compounds like "est-ce", "jusqu'",
    // "a-t-il") — they normalize to fragment "words" (ESTCE, JUSQU) that aren't
    // real single playable words.
    if (MULTI_PART.test(word)) continue;
    const n = normalizeWord(word);
    if (n.length !== WORD_LENGTH) continue;
    if (FR_FRAGMENTS.has(n)) continue;
    validNormalized.add(n);
    // Record only the first spelling that contains a non-ASCII-letter character.
    if (!(n in accents) && NON_ASCII_LETTER.test(word)) accents[n] = word;
  }

  // Commonness ordering from the frequency list, gated by dictionary validity.
  const allowedSet = new Set<string>();
  const answerPool: string[] = [];
  const VOWELS = /[AEIOUY]/;
  for (const raw of ranked) {
    const n = normalizeWord(raw);
    if (n.length !== WORD_LENGTH || !VOWELS.test(n)) continue;
    if (!validNormalized.has(n)) continue; // must be a real French word
    if (!allowedSet.has(n)) {
      allowedSet.add(n);
      if (answerPool.length < ANSWER_POOL_SIZE.fr) answerPool.push(n);
    }
  }

  // Allowed-guess set = every valid 5-letter dictionary word (superset of answers),
  // so players can guess uncommon-but-real words.
  for (const n of validNormalized) allowedSet.add(n);

  const answers = seededShuffle(answerPool, SEED.fr);
  const allowed = [...allowedSet].sort();
  // Trim the accents map to words we actually ship.
  const shippedAccents: Record<string, string> = {};
  for (const n of allowedSet) if (accents[n]) shippedAccents[n] = accents[n]!;

  write({
    lang: 'fr',
    source:
      'Dictionary: an-array-of-french-words (https://github.com/words/an-array-of-french-words). ' +
      'Commonness ranking: hermitdave/FrequencyWords fr_50k (OpenSubtitles). ' +
      'Independently sourced.',
    license: 'Dictionary: MIT. Frequency ranking: MIT.',
    wordLength: WORD_LENGTH,
    counts: { answers: answers.length, allowed: allowed.length },
    answers,
    allowed,
    accents: shippedAccents,
  });
}

// Script entry point: build English first, then French, then report completion.
await buildEnglish();
await buildFrench();
console.log('done.');