This repository has no description
1from __future__ import annotations
2
3import json
4from pathlib import Path
5
6import numpy as np
7import pytest
8
9from app.config import Settings
10from app.dedup import content_hash, row_content_hash
11from app.git_store import GitDataStore, load_git_store
12from app import db, recommend
13
14
def _unit(v: list[float]) -> np.ndarray:
    """Return *v* as a float32 array rescaled to unit Euclidean length.

    The input is converted with ``dtype=np.float32`` and divided by its own
    2-norm. No guard is applied for an all-zero input.
    """
    vec = np.asarray(v, dtype=np.float32)
    length = np.linalg.norm(vec)
    return vec / length
18
19
def _write_bundle(root: Path) -> None:
    """Write a tiny embedding bundle under *root* for the git-store tests.

    ``root / "data"`` is created (parents included); because ``exist_ok`` is
    not passed, the call raises ``FileExistsError`` if it already exists.

    Files written:

    * ``data/repos.f32.npy``  -- 3 unit-length float32 vectors of dimension 3.
    * ``data/issues.f32.npy`` -- 2 unit-length float32 vectors of dimension 3.
    * ``data/repos.jsonl`` and ``data/issues.jsonl`` -- one JSON object per
      line, newline-terminated; ``row`` values run 0..n-1 in file order.
    * ``manifest.json`` -- model name, dimension, metric and row counts.
    """
    model = "gemini-embedding-001"
    data_dir = root / "data"
    data_dir.mkdir(parents=True)

    # Repo rows 0 and 1 point in almost the same direction; row 2 is
    # orthogonal to row 0.
    repo_matrix = np.stack(
        [_unit(v) for v in ([1, 0, 0], [0.9, 0.1, 0], [0, 1, 0])]
    )
    # Issue row 0 lies close to repo rows 0/1; issue row 1 lies close to
    # repo row 2.
    issue_matrix = np.stack(
        [_unit(v) for v in ([0.95, 0.05, 0], [0, 0.95, 0.05])]
    )

    repo_rows = [
        {
            "row": 0,
            "subject_uri": "at://did:plc:alice/sh.tangled.repo/r1",
            "repo_did": "did:repo:alice-r1",
            "repo_name": "alice-r1",
            "owner_handle": "alice",
            "description": "Alice repo one",
            "topics": ["nix"],
            "created_at": "2026-01-01T00:00:00Z",
            "content_len": 200,
            "content_sha500": "aaa",
            "embedding_model": model,
            "embedded_at": "2026-01-01T00:00:00Z",
        },
        {
            "row": 1,
            "subject_uri": "at://did:plc:bob/sh.tangled.repo/r9",
            "repo_did": "did:repo:bob-r9",
            "repo_name": "bob-r9",
            "owner_handle": "bob",
            "description": "Bob similar repo",
            "topics": ["cli"],
            "created_at": "2026-01-02T00:00:00Z",
            "content_len": 180,
            "content_sha500": "bbb",
            "embedding_model": model,
            "embedded_at": "2026-01-02T00:00:00Z",
        },
        {
            "row": 2,
            "subject_uri": "at://did:plc:carol/sh.tangled.repo/web",
            "repo_did": "did:repo:carol-web",
            "repo_name": "web",
            "owner_handle": "carol",
            "description": "Different topic",
            "topics": ["web"],
            "created_at": "2026-01-03T00:00:00Z",
            "content_len": 500,
            "content_sha500": "ccc",
            "embedding_model": model,
            "embedded_at": "2026-01-03T00:00:00Z",
        },
    ]
    issue_rows = [
        {
            "row": 0,
            "subject_uri": "at://did:plc:bob/sh.tangled.repo.issue/i1",
            "repo_did": "did:repo:bob-r9",
            "rkey": "i1",
            "repo_uri": "at://did:plc:bob/sh.tangled.repo/r9",
            "author_did": "did:plc:other",
            "title": "Fix CLI",
            "body": "details",
            "owner_handle": "bob",
            "repo_name": "bob-r9",
            "repo_description": "Bob similar repo",
            "created_at": "2026-01-04T00:00:00Z",
            "embedding_model": model,
        },
        {
            "row": 1,
            "subject_uri": "at://did:plc:carol/sh.tangled.repo.issue/i9",
            "repo_did": "did:repo:carol-web",
            "rkey": "i9",
            "repo_uri": "at://did:plc:carol/sh.tangled.repo/web",
            "author_did": "did:plc:carol",
            "title": "Web thing",
            "body": "body",
            "owner_handle": "carol",
            "repo_name": "web",
            "repo_description": "Different topic",
            "created_at": "2026-01-05T00:00:00Z",
            "embedding_model": model,
        },
    ]

    np.save(data_dir / "repos.f32.npy", repo_matrix)
    np.save(data_dir / "issues.f32.npy", issue_matrix)

    # JSON Lines: one serialized record per line, each newline-terminated.
    for filename, rows in (
        ("repos.jsonl", repo_rows),
        ("issues.jsonl", issue_rows),
    ):
        payload = "".join(json.dumps(record) + "\n" for record in rows)
        (data_dir / filename).write_text(payload, encoding="utf-8")

    manifest = {
        "model": model,
        "dim": 3,
        "metric": "cosine",
        "counts": {"repos": 3, "issues": 2},
    }
    (root / "manifest.json").write_text(json.dumps(manifest), encoding="utf-8")
132
133
@pytest.fixture()
def git_bundle(tmp_path, monkeypatch):
    """Provide a loaded git-mode data bundle and yield its root directory.

    Writes the bundle under ``tmp_path / "bundle"``, sets ``DATA_STORAGE=git``
    and ``REC_DATA_DIR`` to that directory, removes ``REC_DATA_GIT_URL`` if it
    is set, then calls ``load_git_store`` with freshly built settings.

    The settings cache is cleared on the way out even when setup raises.
    pytest does not run the code after ``yield`` for a fixture that fails
    before yielding, so without the ``try/finally`` a settings object built
    from the patched environment could stay cached for later tests.
    """
    root = tmp_path / "bundle"
    _write_bundle(root)
    monkeypatch.setenv("DATA_STORAGE", "git")
    monkeypatch.setenv("REC_DATA_DIR", str(root))
    monkeypatch.delenv("REC_DATA_GIT_URL", raising=False)
    from app.config import get_settings

    # Discard anything cached before the environment was patched.
    get_settings.cache_clear()
    try:
        load_git_store(get_settings())
        yield root
    finally:
        # Runs on normal teardown and on setup failure alike.
        get_settings.cache_clear()
147
148
def test_row_content_hash_prefers_sha500():
    """``row_content_hash`` returns ``content_sha500`` when the row has one.

    Without that key, the result must equal ``content_hash`` of the row's
    ``content`` value.
    """
    row_with_sha = {"content_sha500": "deadbeef", "content": "x"}
    assert row_content_hash(row_with_sha) == "deadbeef"

    row_without_sha = {"content": "hello"}
    assert content_hash("hello") == row_content_hash(row_without_sha)
152
153
def test_git_store_load_and_knn(git_bundle):
    """Load the bundle from disk, fetch Alice's seed, and query repo neighbours.

    Expects exactly one seed for ``did:plc:alice`` and, with Alice's own repo
    passed as the second argument to ``knn_repos``, Bob's repo as the first
    hit at a distance below 0.2.
    """
    store = GitDataStore.load_from_dir(git_bundle)

    alice_seeds = store.load_seeds("did:plc:alice", min_chars=120)
    assert len(alice_seeds) == 1
    seed = alice_seeds[0]
    assert seed["repo_did"] == "did:repo:alice-r1"

    neighbours = store.knn_repos(
        seed["etext"],
        ["did:repo:alice-r1"],
        limit=5,
        min_chars=120,
    )
    assert neighbours
    nearest = neighbours[0]
    assert nearest["repo_did"] == "did:repo:bob-r9"
    assert nearest["distance"] < 0.2
164
165
def test_git_recommend_end_to_end(git_bundle):
    """Run ``recommend.recommend`` for Alice against the git-mode bundle.

    Checks the profile reports one tangled repo, that the top repo
    recommendation is ``bob-r9``, and that the top issue's URI ends in ``/i1``.
    """
    result = recommend.recommend("did:plc:alice")

    assert result.profile.sources.tangled.repos == 1

    recommended_repos = result.repos
    assert recommended_repos
    assert recommended_repos[0].name == "bob-r9"

    recommended_issues = result.issues
    assert recommended_issues
    assert recommended_issues[0].issueUri.endswith("/i1")
173
174
def test_db_dispatch_git_mode(git_bundle):
    """Exercise the ``db`` module's entry points while the git bundle is loaded.

    Expects ``readmes_embedded`` to be 3, ``ping()`` to return exactly
    ``True``, and ``get_questionnaire`` to return ``None`` for ``"at://x"``.
    """
    embedded_readmes = db.embedding_counts()["readmes_embedded"]
    assert embedded_readmes == 3

    ping_result = db.ping()
    assert ping_result is True

    questionnaire = db.get_questionnaire("at://x")
    assert questionnaire is None