This repository has no description
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 4.8 kB View raw
// Cluster-then-retrieve recommender: preserves a user's multiple distinct interests.
// Contrasts NAIVE pooled top-K (one cluster can dominate) vs CLUSTERED round-robin (balanced).
import pg from "pg";
import { readFileSync } from "node:fs";
import { createHash } from "node:crypto";

/**
 * Look up a setting in process.env, falling back to a `KEY=value` line in
 * `../.env` or `.env` (first match wins).
 * @param {string} k - Variable name.
 * @returns {string|undefined} The value, or undefined when it is not set anywhere.
 */
function fromEnv(k) {
  if (process.env[k]) return process.env[k];
  // Escape the key so regex metacharacters in it cannot change the pattern.
  const escapedKey = k.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const linePattern = new RegExp(`^\\s*${escapedKey}\\s*=\\s*(.+)$`, "m");
  for (const p of ["../.env", ".env"]) {
    try {
      const m = readFileSync(p, "utf8").match(linePattern);
      if (m) return m[1].trim();
    } catch {
      // Best effort: a missing or unreadable .env file just means "try the next one".
    }
  }
  return undefined;
}

const pool = new pg.Pool({
  connectionString: fromEnv("DB_CONNECTION_STRING"),
  ssl: { rejectUnauthorized: false },
  max: 4,
});

const USER = process.env.USER_DID || "did:plc:y7g2koy4nqw7434s67fgfjca";
const K = parseInt(process.env.K ?? "10", 10);
// cosine-dist threshold to consider two seeds "same interest"
const T = parseFloat(process.env.CLUSTER_T ?? "0.22");

/** md5 hex digest of the first 500 characters of `s` (null/undefined hashes as ""). */
const hash = (s) => createHash("md5").update((s ?? "").slice(0, 500)).digest("hex");

/** Parse a bracketed, comma-separated vector literal such as "[0.1,0.2]" into numbers. */
const parseVec = (s) => s.replace(/^\[|\]$/g, "").split(",").map(Number);

/**
 * Cosine distance (1 - cosine similarity) between two equal-length vectors.
 * The norms are divided out so the value matches the `<=>` operator used in
 * the SQL below even when the stored embeddings are not unit-length.
 * A zero vector has no direction; it is treated as maximally dissimilar (1).
 */
const cosDist = (a, b) => {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA * normB);
  return denom === 0 ? 1 : 1 - dot / denom;
};

/**
 * Single-linkage clustering: items i and j share a cluster when a chain of
 * pairwise distances below `threshold` connects them (union-find).
 * @param {object[]} items - Objects carrying a numeric `vec` array.
 * @param {number} threshold - Cosine-distance cutoff.
 * @returns {object[][]} Clusters, ordered by the first appearance of a member.
 */
function clusterSeeds(items, threshold) {
  const parent = items.map((_, i) => i);
  // Find with path compression.
  const find = (x) => (parent[x] === x ? x : (parent[x] = find(parent[x])));
  for (let i = 0; i < items.length; i++) {
    for (let j = i + 1; j < items.length; j++) {
      if (cosDist(items[i].vec, items[j].vec) < threshold) parent[find(i)] = find(j);
    }
  }
  const byRoot = new Map();
  items.forEach((item, i) => {
    const root = find(i);
    if (!byRoot.has(root)) byRoot.set(root, []);
    byRoot.get(root).push(item);
  });
  return [...byRoot.values()];
}

/**
 * Take turns across the queues, one pick per queue per round, skipping any
 * candidate whose repo_name was already picked. Stops at `limit` picks or when
 * every queue is exhausted (no fixed cap on the number of rounds).
 * @param {object[][]} queues - Per-cluster candidates, each sorted best-first.
 * @param {number} limit - Maximum number of picks.
 * @returns {object[]} Picks in the order they were taken.
 */
function roundRobin(queues, limit) {
  const cursors = queues.map(() => 0);
  const usedNames = new Set();
  const picked = [];
  let progressed = true;
  while (picked.length < limit && progressed) {
    progressed = false;
    for (let qi = 0; qi < queues.length && picked.length < limit; qi++) {
      const queue = queues[qi];
      // Names are only ever added to usedNames, so anything skipped once stays
      // skipped; advancing a cursor is equivalent to rescanning from the start.
      while (cursors[qi] < queue.length && usedNames.has(queue[cursors[qi]].repo_name)) cursors[qi]++;
      if (cursors[qi] < queue.length) {
        const next = queue[cursors[qi]++];
        usedNames.add(next.repo_name);
        picked.push(next);
        progressed = true;
      }
    }
  }
  return picked;
}

/**
 * Load the user's embedded repos, cluster them into interests, fetch nearest
 * neighbours for every seed, and print the naive vs. round-robin rankings.
 * The pool is always closed, including when a query fails.
 */
async function main() {
  try {
    if (!Number.isInteger(K) || K < 1) throw new Error(`K must be a positive integer (got ${process.env.K})`);
    if (!Number.isFinite(T)) throw new Error(`CLUSTER_T must be a number (got ${process.env.CLUSTER_T})`);

    // 1) the user's contributed repos (here: owned) with embeddings
    const seeds = (await pool.query(
      `select repo_did, repo_name, content, embedding::text as etext
       from tangled_readmes where embedding is not null and repo_uri like $1`,
      [`at://${USER}/%`])).rows;
    if (seeds.length < 2) {
      console.log("not enough embedded seed repos for this user");
      return;
    }
    seeds.forEach((s) => (s.vec = parseVec(s.etext)));
    console.log(`USER ${USER}`);
    console.log(`contributed repos (${seeds.length}): ${seeds.map((s) => s.repo_name).join(", ")}\n`);

    // 2) cluster seeds: single-linkage connected components at threshold T (union-find)
    const clusterList = clusterSeeds(seeds, T);
    console.log(`${clusterList.length} interest cluster(s):`);
    clusterList.forEach((c, i) => console.log(` [${i + 1}] ${c.map((s) => s.repo_name).join(", ")}`));

    // 3) retrieve neighbors per seed (drop user's own repos), tag with cluster + min dist
    const ownRepoDids = [...new Set(seeds.map((s) => s.repo_did))];
    const seenContent = new Set(seeds.map((s) => hash(s.content)));
    // content hash -> { repo_name, dist, clusterIdx }
    const cand = new Map();
    for (let ci = 0; ci < clusterList.length; ci++) {
      for (const seed of clusterList[ci]) {
        const rows = (await pool.query(
          `select repo_name, repo_did, content, round((embedding <=> $1::vector)::numeric,4) dist
           from tangled_readmes where embedding is not null and repo_did <> all($2)
           order by embedding <=> $1::vector limit 25`,
          [seed.etext, ownRepoDids])).rows;
        for (const r of rows) {
          const h = hash(r.content);
          if (seenContent.has(h)) continue; // collapse forks / user's own content
          const prev = cand.get(h);
          const dist = Number(r.dist);
          // Keep the closest sighting; it decides which interest the candidate belongs to.
          if (!prev || dist < prev.dist) cand.set(h, { repo_name: r.repo_name, dist, clusterIdx: ci });
        }
      }
    }
    const all = [...cand.values()];
    const byDist = (a, b) => a.dist - b.dist;

    // 4a) NAIVE pooled: global top-K by distance
    const naive = [...all].sort(byDist).slice(0, K);

    // 4b) CLUSTERED round-robin: rank within each cluster, then take turns → balanced coverage
    const perCluster = clusterList.map((_, ci) => all.filter((c) => c.clusterIdx === ci).sort(byDist));
    const clustered = roundRobin(perCluster, K);

    const fmt = (arr) => arr.map((c, i) => ` ${String(i + 1).padStart(2)}. ${(c.repo_name ?? "?").padEnd(30)} dist=${c.dist} [interest ${c.clusterIdx + 1}]`).join("\n");
    const cover = (arr) => {
      const s = new Set(arr.map((c) => c.clusterIdx));
      return `${s.size}/${clusterList.length} interests`;
    };
    console.log(`\n===== NAIVE pooled top-${K} (covers ${cover(naive)}) =====\n${fmt(naive)}`);
    console.log(`\n===== CLUSTERED round-robin top-${K} (covers ${cover(clustered)}) =====\n${fmt(clustered)}`);
  } finally {
    // Release connections on every path so a failed query does not leave the pool open.
    await pool.end();
  }
}

main().catch((e) => { console.error("FATAL:", e); process.exit(1); });