This repository has no description
1// Full experiment: fetch real Tangled issues live, embed as queries, vector-search READMEs.
2import pg from "pg";
3import { readFileSync } from "node:fs";
/**
 * Look up a configuration value by name.
 *
 * Resolution order: `process.env[k]` (when set and non-empty), then the first
 * `KEY=value` line for `k` found in `../.env`, then in `.env` (both relative to
 * the current working directory).
 *
 * @param {string} k - Name of the variable to look up.
 * @returns {string | undefined} The trimmed value (with one pair of matching
 *   surrounding quotes removed), or `undefined` when the key is found nowhere.
 */
function fromEnv(k) {
  const fromProcess = process.env[k];
  if (fromProcess) return fromProcess;

  // Escape regex metacharacters so the key is matched literally; otherwise a
  // key such as "A.B" would also match a line starting with "AXB=".
  const literalKey = String(k).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const assignment = new RegExp(`^\\s*${literalKey}\\s*=\\s*(.+)$`, "m");

  for (const path of ["../.env", ".env"]) {
    let text;
    try {
      text = readFileSync(path, "utf8");
    } catch {
      // Best effort: a missing or unreadable file just means "try the next one".
      continue;
    }
    const found = text.match(assignment);
    if (!found) continue;

    const value = found[1].trim();
    // .env files commonly wrap values in quotes; the quotes are not part of the value.
    const quoted = value.match(/^(["'])(.*)\1$/);
    return quoted ? quoted[2] : value;
  }
  return undefined;
}
// Gemini API key and embedding model used by embedQuery.
const API_KEY = fromEnv("GEMINI_API_KEY");
const MODEL = "gemini-embedding-001";

// Shared Postgres connection pool, capped at 4 connections.
// NOTE(review): `rejectUnauthorized: false` presumably turns off TLS certificate
// verification for the database connection — confirm this is intended.
const pool = new pg.Pool({
  connectionString: fromEnv("DB_CONNECTION_STRING"),
  ssl: { rejectUnauthorized: false },
  max: 4,
});
/**
 * Turn a PDS host value into a base URL that a path can be appended to.
 *
 * Values that already carry an `http://` or `https://` scheme (matched
 * case-insensitively) are kept; anything else is prefixed with `https://`.
 * Surrounding whitespace and trailing slashes are removed so that appending
 * `/xrpc/...` never produces a double slash.
 *
 * @param {string} h - Host name or full base URL.
 * @returns {string} Base URL without a trailing slash.
 */
const pdsUrl = (h) => {
  const host = String(h).trim().replace(/\/+$/, "");
  return /^https?:\/\//i.test(host) ? host : `https://${host}`;
};
9
/**
 * Embed `text` as a retrieval query via the Gemini `embedContent` endpoint and
 * return the L2-normalized vector as a bracketed, comma-separated literal
 * (the form the callers cast with `::vector`).
 *
 * Only the first 8000 characters of `text` are sent.
 *
 * @param {string} text - Query text to embed.
 * @returns {Promise<string>} Vector literal such as `[0.6,0.8]`.
 * @throws {Error} On a non-2xx response (`embed HTTP <status>...`), on a
 *   response without a usable `embedding.values` array, or when the request
 *   exceeds the timeout.
 */
async function embedQuery(text) {
  const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${MODEL}:embedContent`;
  const resp = await fetch(endpoint, {
    method: "POST",
    headers: { "content-type": "application/json", "x-goog-api-key": API_KEY },
    body: JSON.stringify({
      model: `models/${MODEL}`,
      content: { parts: [{ text: text.slice(0, 8000) }] },
      taskType: "RETRIEVAL_QUERY",
      outputDimensionality: 1536,
    }),
    // Without a deadline a stalled connection would hang the whole script.
    signal: AbortSignal.timeout(30_000),
  });

  if (!resp.ok) {
    // Include a short excerpt of the body: it usually says why the call failed.
    let detail = "";
    try {
      detail = (await resp.text()).slice(0, 300);
    } catch {
      // Body unreadable; the status code alone still identifies the failure.
    }
    throw new Error(`embed HTTP ${resp.status}${detail ? `: ${detail}` : ""}`);
  }

  const values = (await resp.json())?.embedding?.values;
  const usable =
    Array.isArray(values) && values.length > 0 && values.every((x) => Number.isFinite(x));
  if (!usable) throw new Error("embed response missing embedding.values");

  // Scale to unit length; an all-zero vector is left as-is (norm falls back to 1).
  // NOTE(review): presumably needed because reduced-dimension (1536) output is
  // not unit length — confirm against the Gemini embeddings documentation.
  const sumOfSquares = values.reduce((acc, x) => acc + x * x, 0);
  const norm = Math.sqrt(sumOfSquares) || 1;
  return `[${values.map((x) => x / norm).join(",")}]`;
}
20
/**
 * Map an issue's `repo` reference to the repo DID used as `repo_did` when
 * looking up READMEs.
 *
 * Two reference forms are handled:
 *  - `at://<owner>/<collection>/<rkey>`: looked up in `tangled_repos` by
 *    owner DID and rkey, taking `repo_did` or, failing that,
 *    `record_raw->>'repoDid'`;
 *  - any other non-empty string: treated as a bare DID and returned unchanged.
 *
 * @param {unknown} ref - The `repo` field of an issue record (remote data, so
 *   it is not assumed to be a string).
 * @returns {Promise<string | null>} The repo DID, or `null` when the reference
 *   is empty, not a string, malformed, or has no matching row.
 */
async function resolveRepoDid(ref) {
  // Issue records come from remote servers; anything but a non-empty string is unresolvable.
  if (typeof ref !== "string" || ref === "") return null;
  if (!ref.startsWith("at://")) return ref; // bare DID == repoDid

  const parts = ref.match(/^at:\/\/([^/]+)\/[^/]+\/(.+)$/);
  if (!parts) return null;
  const [, ownerDid, rkey] = parts;

  const { rows } = await pool.query(
    `select coalesce(repo_did, record_raw->>'repoDid') as rd from tangled_repos where owner_did=$1 and rkey=$2 limit 1`,
    [ownerDid, rkey],
  );
  return rows[0]?.rd ?? null;
}
32
/**
 * Fetch live issue records from the PDS of repo owners that have at least one
 * embedded README.
 *
 * Up to 120 distinct (owner DID, PDS host) pairs are read from the database,
 * then each owner's `sh.tangled.repo.issue` collection is listed (first 30
 * records only) by a fixed number of concurrent workers draining a shared
 * queue. Hosts that fail, time out, or answer with a non-2xx status are
 * skipped.
 *
 * @returns {Promise<object[]>} The `value` of every fetched record that has a
 *   truthy `title`, in completion order.
 */
async function fetchIssues() {
  const WORKER_COUNT = 14;
  const REQUEST_TIMEOUT_MS = 10000;

  const { rows } = await pool.query(`
    select distinct tr.owner_did, pa.pds_host
    from tangled_repos tr join tangled_pds_accounts pa on pa.did = tr.owner_did
    where exists (select 1 from tangled_readmes r where r.repo_did = coalesce(tr.repo_did, tr.record_raw->>'repoDid') and r.embedding is not null)
    limit 120`);

  const found = [];
  const pending = [...rows];

  // Each worker pops owners off the shared queue until it is empty.
  const worker = async () => {
    while (pending.length > 0) {
      const { owner_did: ownerDid, pds_host: pdsHost } = pending.pop();
      try {
        const url = new URL(`${pdsUrl(pdsHost)}/xrpc/com.atproto.repo.listRecords`);
        url.searchParams.set("repo", ownerDid);
        url.searchParams.set("collection", "sh.tangled.repo.issue");
        url.searchParams.set("limit", "30");

        // One deadline for the whole exchange: the signal stays armed while the
        // body is read, and there is no manual timer left behind on failure.
        const resp = await fetch(url, { signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS) });
        if (!resp.ok) continue;

        const payload = await resp.json();
        for (const rec of payload?.records ?? []) {
          // Skip malformed entries individually instead of dropping the host's remaining records.
          if (rec?.value?.title) found.push(rec.value);
        }
      } catch {
        // Deliberately best effort: an unreachable, slow, or misbehaving PDS is skipped.
      }
    }
  };

  await Promise.all(Array.from({ length: WORKER_COUNT }, () => worker()));
  return found;
}
57
/**
 * Run the experiment end to end and print the results to stdout.
 *
 * Steps: fetch live issues; keep those with a body longer than 60 characters;
 * resolve each one's repo DID and whether that repo has an embedded README;
 * pick up to 4 issues (embedded parent first, then longest body); for each,
 * embed title + body as a query, list the 8 nearest READMEs, and, when the
 * parent README is embedded, print the parent's overall rank.
 *
 * The connection pool is closed whether the run succeeds or fails.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    const issues = await fetchIssues();
    console.log(`fetched ${issues.length} live issues\n`);

    // Only substantive issues can be picked, so resolve repo DIDs for those
    // alone instead of issuing two queries for every fetched issue.
    const candidates = issues.filter((i) => (i.body ?? "").length > 60);
    for (const iss of candidates) {
      iss._repoDid = await resolveRepoDid(iss.repo);
      iss._embedded = iss._repoDid
        ? (await pool.query(`select repo_name from tangled_readmes where repo_did=$1 and embedding is not null limit 1`, [iss._repoDid])).rows[0]?.repo_name ?? null
        : null;
    }

    // Prefer issues whose parent README is embedded, then longer bodies.
    const pick = [...candidates]
      .sort((a, b) => (b._embedded ? 1 : 0) - (a._embedded ? 1 : 0) || (b.body?.length ?? 0) - (a.body?.length ?? 0))
      .slice(0, 4);

    for (const iss of pick) {
      console.log("\n" + "=".repeat(72));
      console.log(`ISSUE: ${iss.title}`);
      console.log(`own repo: ${iss._embedded ? iss._embedded + " (embedded ✓)" : "(parent README not embedded / unresolved)"}`);
      console.log(`body: ${(iss.body ?? "").replace(/\s+/g, " ").slice(0, 200)}…`);

      const qvec = await embedQuery(`${iss.title}\n\n${iss.body ?? ""}`);
      const hits = (await pool.query(`
        select repo_name, repo_did, round((embedding <=> $1::vector)::numeric,4) dist, (repo_did=$2) is_parent
        from tangled_readmes where embedding is not null
        order by embedding <=> $1::vector limit 8`, [qvec, iss._repoDid])).rows;
      console.log("top README matches:");
      hits.forEach((h, i) => console.log(` ${i + 1}. ${h.is_parent ? "👉" : " "} ${(h.repo_name ?? "(no name)").padEnd(34)} dist=${h.dist}${h.is_parent ? " <-- OWN REPO" : ""}`));

      if (iss._embedded) {
        // Rank = 1 + number of READMEs strictly closer than the parent's best
        // embedded row. The subquery ignores NULL embeddings: comparing against
        // a NULL distance would match nothing and always report rank #1.
        const rnk = (await pool.query(`
          select 1 + count(*)::int rnk from tangled_readmes
          where embedding is not null and (embedding <=> $1::vector) < (
            select min(embedding <=> $1::vector) from tangled_readmes
            where repo_did=$2 and embedding is not null)`,
          [qvec, iss._repoDid])).rows[0].rnk;
        console.log(` → own repo overall rank: #${rnk} of all embedded READMEs`);
      }
    }
  } finally {
    // Release database connections even when a step above throws.
    await pool.end();
  }
}
// Script entry point: report any failure from main() and exit with status 1.
main().catch((err) => {
  console.error("FATAL:", err);
  process.exit(1);
});