This repository has no description
// Cluster-then-retrieve recommender: preserves a user's multiple distinct interests.
// Contrasts NAIVE pooled top-K (one cluster can dominate) vs CLUSTERED round-robin (balanced).
3import pg from "pg";
4import { readFileSync } from "node:fs";
5import { createHash } from "node:crypto";
/**
 * Resolve a configuration value by name.
 *
 * Lookup order: a non-empty `process.env[k]`, then the first `k = value` line
 * found in `../.env`, then `.env` (both relative to the current working directory).
 *
 * @param {string} k - Variable name to look up.
 * @returns {string|undefined} The trimmed value (one pair of matching surrounding
 *   quotes removed), or `undefined` when the name is not set anywhere.
 */
function fromEnv(k) {
  if (process.env[k]) return process.env[k];

  // Escape regex metacharacters so the name is always matched literally.
  const key = k.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Horizontal whitespace only: plain `\s` also matches newlines, which let an
  // empty `KEY=` line capture the following line as its value.
  const hs = "[^\\S\\r\\n]*";
  const linePattern = new RegExp(`^${hs}${key}${hs}=${hs}(.+)$`, "m");

  for (const p of ["../.env", ".env"]) {
    let text;
    try {
      text = readFileSync(p, "utf8");
    } catch {
      continue; // deliberate best-effort: a missing/unreadable file just means "try the next one"
    }
    const m = text.match(linePattern);
    if (!m) continue;
    const value = m[1].trim();
    const quote = value[0];
    const isQuoted = value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote);
    return isQuoted ? value.slice(1, -1) : value;
  }
  return undefined;
}
// Postgres connection pool used by main(); capped at 4 connections.
// NOTE(review): rejectUnauthorized:false turns off TLS certificate verification — confirm this is intended.
const pool = new pg.Pool({
  connectionString: fromEnv("DB_CONNECTION_STRING"),
  ssl: { rejectUnauthorized: false },
  max: 4,
});
8
/**
 * Read an integer setting from the environment.
 * Unset or blank falls back to `fallback`; anything that does not parse to an
 * integer >= `min` throws, instead of letting NaN/negative values flow into
 * slicing and loop bounds where they silently produce empty or wrong output.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or blank.
 * @param {number} [min=1] - Smallest accepted value.
 * @returns {number}
 * @throws {RangeError} When the variable is set to an unusable value.
 */
function envInt(name, fallback, min = 1) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") return fallback;
  const value = Number.parseInt(raw, 10);
  if (!Number.isInteger(value) || value < min) {
    throw new RangeError(`${name} must be an integer >= ${min}, got "${raw}"`);
  }
  return value;
}

/**
 * Read a floating-point setting from the environment.
 * Unset or blank falls back to `fallback`; a non-finite parse result throws.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or blank.
 * @returns {number}
 * @throws {RangeError} When the variable does not parse to a finite number.
 */
function envFloat(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") return fallback;
  const value = Number.parseFloat(raw);
  if (!Number.isFinite(value)) {
    throw new RangeError(`${name} must be a finite number, got "${raw}"`);
  }
  return value;
}

// DID of the user to recommend for (USER_DID overrides the built-in default).
const USER = process.env.USER_DID || "did:plc:y7g2koy4nqw7434s67fgfjca";
// How many recommendations each strategy prints.
const K = envInt("K", 10);
// cosine-dist threshold to consider two seeds "same interest"
const T = envFloat("CLUSTER_T", 0.22);
// Content fingerprint: MD5 hex digest of the first 500 characters of `s`
// (null/undefined is fingerprinted as the empty string).
const hash = (s) => {
  const head = (s ?? "").slice(0, 500);
  const md5 = createHash("md5");
  md5.update(head);
  return md5.digest("hex");
};
// Convert a vector's text form, e.g. "[0.1,0.2]", into an array of numbers:
// drop one leading "[" and one trailing "]" if present, then convert each comma-separated piece.
const parseVec = (s) => {
  let inner = s;
  if (inner.startsWith("[")) inner = inner.slice(1);
  if (inner.endsWith("]")) inner = inner.slice(0, -1);
  const numbers = [];
  for (const piece of inner.split(",")) {
    numbers.push(Number(piece));
  }
  return numbers;
};
/**
 * Cosine distance between two numeric vectors: 1 - cos(angle between a and b).
 *
 * The dot product is divided by both magnitudes, so the result is correct for
 * vectors of any length rather than only for unit vectors. For unit-length
 * inputs this equals the plain `1 - dot` up to floating-point rounding.
 *
 * @param {number[]} a - First vector; its length drives the iteration.
 * @param {number[]} b - Second vector, expected to have the same length as `a`.
 * @returns {number} Distance in [0, 2]; 1 when either vector has zero magnitude
 *   (no direction to compare, so it is treated as unrelated).
 */
const cosDist = (a, b) => {
  let dot = 0;
  let sqA = 0;
  let sqB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    sqA += a[i] * a[i];
    sqB += b[i] * b[i];
  }
  const denom = Math.sqrt(sqA * sqB);
  return denom === 0 ? 1 : 1 - dot / denom;
};
15
/**
 * Group seeds into interest clusters: single-linkage connected components,
 * joining two seeds whenever their cosine distance is below `threshold`.
 *
 * @param {Array<{vec: number[]}>} seeds - Seed rows with a parsed `vec`.
 * @param {number} threshold - Distance below which two seeds are linked.
 * @returns {Array<Array<object>>} Clusters, ordered by their first member's position in `seeds`.
 */
function clusterSeeds(seeds, threshold) {
  // Union-find over seed indices; find() compresses paths as it walks up.
  const parent = seeds.map((_, i) => i);
  const find = (x) => (parent[x] === x ? x : (parent[x] = find(parent[x])));
  for (let i = 0; i < seeds.length; i++) {
    for (let j = i + 1; j < seeds.length; j++) {
      if (cosDist(seeds[i].vec, seeds[j].vec) < threshold) parent[find(i)] = find(j);
    }
  }
  const byRoot = new Map();
  seeds.forEach((seed, i) => {
    const root = find(i);
    if (!byRoot.has(root)) byRoot.set(root, []);
    byRoot.get(root).push(seed);
  });
  return [...byRoot.values()];
}

/**
 * Interleave already-ranked lists: each round takes the next unpicked item
 * from every list in turn, stopping at `k` picks or when all lists run out.
 *
 * Progress is tracked with one cursor per list, so two different candidates
 * that happen to share a repo name are both eligible, and there is no fixed
 * cap on the number of rounds.
 *
 * @param {Array<Array<object>>} rankedLists - One best-first list per cluster.
 * @param {number} k - Maximum number of picks.
 * @returns {object[]} Picks in round-robin order.
 */
function pickRoundRobin(rankedLists, k) {
  const picks = [];
  const cursors = rankedLists.map(() => 0);
  let progressed = true;
  while (picks.length < k && progressed) {
    progressed = false;
    for (let ci = 0; ci < rankedLists.length && picks.length < k; ci++) {
      const next = rankedLists[ci][cursors[ci]];
      if (next === undefined) continue; // this cluster is exhausted
      cursors[ci] += 1;
      picks.push(next);
      progressed = true;
    }
  }
  return picks;
}

/**
 * Entry point. Loads the user's embedded repos as seeds, clusters them into
 * interests, fetches the 25 nearest repos for every seed, then prints two
 * top-K lists: NAIVE (global best by distance) and CLUSTERED (round-robin
 * across interests). The pool is closed on every exit path, including errors.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    // 1) the user's contributed repos (here: owned) with embeddings.
    //    LIKE wildcards in the DID are escaped so "%" / "_" match literally.
    const uriPattern = `at://${USER.replace(/[\\%_]/g, "\\$&")}/%`;
    const seeds = (await pool.query(
      `select repo_did, repo_name, content, embedding::text as etext
       from tangled_readmes where embedding is not null and repo_uri like $1`, [uriPattern])).rows;
    if (seeds.length < 2) {
      console.log("not enough embedded seed repos for this user");
      return;
    }
    for (const s of seeds) s.vec = parseVec(s.etext);
    console.log(`USER ${USER}`);
    console.log(`contributed repos (${seeds.length}): ${seeds.map((s) => s.repo_name).join(", ")}\n`);

    // 2) cluster seeds: single-linkage connected components at threshold T
    const clusterList = clusterSeeds(seeds, T);
    console.log(`→ ${clusterList.length} interest cluster(s):`);
    clusterList.forEach((c, i) => console.log(` [${i + 1}] ${c.map((s) => s.repo_name).join(", ")}`));

    // 3) retrieve neighbors per seed (drop user's own repos), tag with cluster + min dist
    const ownRepoDids = new Set(seeds.map((s) => s.repo_did));
    const seenContent = new Set(seeds.map((s) => hash(s.content)));
    // content hash -> { repo_name, dist, clusterIdx }; each candidate keeps its closest seed's cluster
    const cand = new Map();
    for (const [ci, cluster] of clusterList.entries()) {
      for (const seed of cluster) {
        const { rows } = await pool.query(
          `select repo_name, repo_did, content, round((embedding <=> $1::vector)::numeric,4) dist
           from tangled_readmes where embedding is not null and repo_did <> all($2)
           order by embedding <=> $1::vector limit 25`, [seed.etext, [...ownRepoDids]]);
        for (const r of rows) {
          const h = hash(r.content);
          if (seenContent.has(h)) continue; // collapse forks / user's own content
          const prev = cand.get(h);
          const dist = Number(r.dist); // coerce in case the driver hands the rounded value back as text
          if (!prev || dist < prev.dist) cand.set(h, { repo_name: r.repo_name, dist, clusterIdx: ci });
        }
      }
    }
    const all = [...cand.values()];
    const byDist = (a, b) => a.dist - b.dist;

    // 4a) NAIVE pooled: global top-K by distance
    const naive = [...all].sort(byDist).slice(0, K);

    // 4b) CLUSTERED round-robin: rank within each cluster, then take turns → balanced coverage
    const perCluster = clusterList.map(() => []);
    for (const c of all) perCluster[c.clusterIdx].push(c);
    for (const ranked of perCluster) ranked.sort(byDist);
    const clustered = pickRoundRobin(perCluster, K);

    const fmt = (arr) => arr.map((c, i) => ` ${String(i + 1).padStart(2)}. ${(c.repo_name ?? "?").padEnd(30)} dist=${c.dist} [interest ${c.clusterIdx + 1}]`).join("\n");
    const cover = (arr) => {
      const hit = new Set(arr.map((c) => c.clusterIdx));
      return `${hit.size}/${clusterList.length} interests`;
    };
    console.log(`\n===== NAIVE pooled top-${K} (covers ${cover(naive)}) =====\n${fmt(naive)}`);
    console.log(`\n===== CLUSTERED round-robin top-${K} (covers ${cover(clustered)}) =====\n${fmt(clustered)}`);
  } finally {
    await pool.end();
  }
}
// Run the script; on any failure, log it and exit with a non-zero status.
main().catch((err) => {
  console.error("FATAL:", err);
  process.exit(1);
});