This repository has no description
// Experiment: embed a Tangled issue as a query and vector-search the README embeddings.
// Validates the matching: (a) does the issue's OWN repo rank highly? (b) are other hits topical?
3import pg from "pg";
4import { readFileSync } from "node:fs";
5
/**
 * Look up a configuration value by name.
 *
 * Resolution order: a non-empty `process.env[key]` wins; otherwise the first
 * `KEY=value` line found in `../.env`, then `.env` (both resolved against the
 * current working directory) is used.
 *
 * @param {string} key - Variable name to look up.
 * @returns {string | undefined} The value with surrounding whitespace and one
 *   pair of matching quotes removed, or `undefined` when nothing defines it.
 */
function fromEnv(key) {
  if (process.env[key]) return process.env[key];

  // Escape the key so names containing regex metacharacters match literally.
  const escapedKey = String(key).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Use [ \t]* rather than \s*: \s also matches newlines, which would let an
  // empty `KEY=` line capture the following line as its value.
  const linePattern = new RegExp(`^[ \\t]*${escapedKey}[ \\t]*=[ \\t]*(.+)$`, "m");

  for (const path of ["../.env", ".env"]) {
    let contents;
    try {
      contents = readFileSync(path, "utf8");
    } catch {
      // Best-effort lookup: a missing or unreadable file means "try the next one".
      continue;
    }

    const match = linePattern.exec(contents);
    if (!match) continue;

    const value = match[1].trim();
    if (value === "") continue;

    // Strip quotes only when the same quote character wraps the whole value,
    // so a value that merely starts or ends with a quote is left intact.
    const quoted = /^(["'])(.*)\1$/s.exec(value);
    return quoted ? quoted[2] : value;
  }
  return undefined;
}
// Connection string and API key: read from the environment, falling back to a .env file.
const CONN = fromEnv("DB_CONNECTION_STRING");
const API_KEY = fromEnv("GEMINI_API_KEY");
// Embedding model used to build the query vectors.
const MODEL = "gemini-embedding-001";
// Number of issues to sample, from the ISSUES env var (default 3). A value
// that is not a non-negative integer falls back to the default instead of
// producing an invalid `limit` clause later on.
const requestedIssues = Number.parseInt(process.env.ISSUES ?? "3", 10);
const N = Number.isInteger(requestedIssues) && requestedIssues >= 0 ? requestedIssues : 3;

// Shared connection pool, capped at 3 clients.
// NOTE(review): `rejectUnauthorized: false` skips TLS certificate verification
// for the database connection — confirm that is intended for this experiment.
const pool = new pg.Pool({ connectionString: CONN, ssl: { rejectUnauthorized: false }, max: 3 });
18
/**
 * Embed `text` as a retrieval query and return it as a vector literal.
 *
 * Sends the first 8000 characters of `text` to the `embedContent` endpoint of
 * MODEL (task type RETRIEVAL_QUERY, 1536 output dimensions), scales the
 * returned values to unit Euclidean length, and formats them as `[v1,v2,...]`.
 *
 * @param {string} text - Query text to embed.
 * @returns {Promise<string>} Bracketed, comma-separated normalized vector.
 * @throws {Error} If the API key is missing, the HTTP response is not OK, or
 *   the response carries no numeric `embedding.values` array.
 */
async function embedQuery(text) {
  // Fail early with a clear message instead of sending "undefined" as the key.
  if (!API_KEY) throw new Error("GEMINI_API_KEY is not set");

  const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${MODEL}:embedContent`;
  const resp = await fetch(endpoint, {
    method: "POST",
    headers: { "content-type": "application/json", "x-goog-api-key": API_KEY },
    body: JSON.stringify({
      model: `models/${MODEL}`,
      content: { parts: [{ text: text.slice(0, 8000) }] },
      taskType: "RETRIEVAL_QUERY",
      outputDimensionality: 1536,
    }),
  });
  if (!resp.ok) throw new Error(`embed HTTP ${resp.status}: ${(await resp.text()).slice(0, 200)}`);

  const values = (await resp.json())?.embedding?.values;
  const isNumericVector =
    Array.isArray(values) && values.length > 0 && values.every((x) => Number.isFinite(x));
  if (!isNumericVector) {
    throw new Error("embed response did not contain a numeric embedding.values array");
  }

  // Normalize to unit length; an all-zero vector is left as-is (divide by 1).
  const norm = Math.sqrt(values.reduce((sum, x) => sum + x * x, 0)) || 1;
  return `[${values.map((x) => x / norm).join(",")}]`;
}
35
/**
 * Run the experiment end to end.
 *
 * Prints the total issue count and how many issues belong to a repo with an
 * embedded README. If any do, it samples up to N issues with a title and a
 * body longer than 80 characters (longest body first). For each one it embeds
 * "title + body" as a query, prints the 8 READMEs with the smallest `<=>`
 * distance, and prints the overall rank of the issue's own repo.
 *
 * The pool is always closed before this function settles, including when a
 * query or the embedding call throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    const total = (await pool.query(`select count(*)::int n from tangled_issues`)).rows[0].n;
    console.log(`tangled_issues total: ${total}`);

    const joinable = (await pool.query(`
      select count(*)::int n from tangled_issues i
      where exists (select 1 from tangled_readmes r where r.repo_did = i.repo_did and r.embedding is not null)`)).rows[0].n;
    console.log(`issues whose parent repo has an embedded README: ${joinable}\n`);
    if (joinable === 0) {
      console.log("No joinable issues — cannot run the own-repo sanity check.");
      return;
    }

    // Pick a few substantive issues (decent body) whose repo is embedded.
    // The sample size is bound as a parameter rather than spliced into the SQL text.
    const issues = (await pool.query(`
      select i.uri, i.repo_did, i.title, i.body,
             (select repo_name from tangled_readmes r where r.repo_did = i.repo_did limit 1) as parent_repo
      from tangled_issues i
      where i.title is not null and length(coalesce(i.body,'')) > 80
        and exists (select 1 from tangled_readmes r where r.repo_did = i.repo_did and r.embedding is not null)
      order by length(i.body) desc
      limit $1`, [N])).rows;

    for (const iss of issues) {
      const queryText = `${iss.title}\n\n${iss.body}`;
      console.log("\n" + "=".repeat(70));
      console.log(`ISSUE: ${iss.title}`);
      console.log(`parent repo: ${iss.parent_repo} (${iss.repo_did})`);
      console.log(`body: ${iss.body.replace(/\s+/g, " ").slice(0, 180)}…`);

      const qvec = await embedQuery(queryText);
      const hits = (await pool.query(`
        select repo_name, repo_did, round((embedding <=> $1::vector)::numeric, 4) as dist,
               (repo_did = $2) as is_parent
        from tangled_readmes
        where embedding is not null
        order by embedding <=> $1::vector
        limit 8`, [qvec, iss.repo_did])).rows;
      console.log("top README matches:");
      hits.forEach((h, idx) => {
        console.log(` ${idx + 1}. ${h.is_parent ? "👉 " : " "}${h.repo_name?.padEnd(32) ?? "(no name)"} dist=${h.dist}${h.is_parent ? " <-- OWN REPO" : ""}`);
      });

      // Where does the own repo rank overall? Compare against the own repo's
      // closest *embedded* README: a row with a NULL embedding would make the
      // comparison NULL for every row and falsely report rank #1.
      const rank = (await pool.query(`
        select 1 + count(*)::int as rnk
        from tangled_readmes
        where embedding is not null
          and (embedding <=> $1::vector) < (
            select min(embedding <=> $1::vector)
            from tangled_readmes
            where repo_did = $2 and embedding is not null)`,
        [qvec, iss.repo_did])).rows[0].rnk;
      console.log(` → own repo overall rank: #${rank} of all embedded READMEs`);
    }
  } finally {
    // Release the connections on every exit path, not only on success.
    await pool.end();
  }
}
// Entry point: run the experiment; on any failure print the error and exit with status 1.
main().catch((err) => {
  console.error("FATAL:", err);
  process.exit(1);
});