This repository has no description
1"""Recommendation orchestration: seeds -> per-seed kNN -> merge -> dedup ->
2floor -> rerank -> contract shape.
3
4This is the only place that stitches the (pure) stages to the (impure) data
5access. Keeping it thin makes the algorithm easy to read top-to-bottom.
6"""
7
8from __future__ import annotations
9
10from concurrent.futures import ThreadPoolExecutor
11
12from app import db
13from app.config import Settings, get_settings
14from app.dedup import collapse_forks, row_content_hash
15from app.links import at_owner, repo_url, to_rfc3339
16from app.merge import merge_hits
17from app.profile import build_interests
18from app.quality import drop_issue
19from app.rank import DefaultScorer, apply_floor, rerank
20from app.schemas import (
21 IssueOut,
22 Profile,
23 Recommendations,
24 RepoOut,
25 Sources,
26 TangledSource,
27)
28from app.types import Candidate
29
30
def _empty(settings: Settings, seed_count: int) -> Recommendations:
    """Build the contract-shaped response that carries no recommendations.

    ``settings`` is accepted but not read here. ``seed_count`` is reported as
    the number of Tangled repos in the profile's sources.
    """
    tangled = TangledSource(repos=seed_count)
    blank_profile = Profile(interests=[], languages=[], sources=Sources(tangled=tangled))
    return Recommendations(profile=blank_profile, repos=[], issues=[])
41
42
43def _seed_label(seed: dict) -> str:
44 return seed.get("repo_name") or seed["repo_did"]
45
46
def _seed_url_map(seeds: list[dict], settings: Settings) -> dict[str, str]:
    """Build a lookup from seed label (repo name or did) to absolute repo URL.

    A seed whose owner handle or repo name is blank after stripping maps to
    an empty string. When two seeds share a label, the later one wins.
    """
    urls: dict[str, str] = {}
    for entry in seeds:
        owner = (entry.get("owner_handle") or "").strip()
        repo = (entry.get("repo_name") or "").strip()
        key = _seed_label(entry)
        if owner and repo:
            urls[key] = repo_url(settings.web_base, owner, repo)
        else:
            # Not enough information to build a link for this seed.
            urls[key] = ""
    return urls
56
57
58def _based_on_repo_url(c: Candidate, seed_urls: dict[str, str]) -> str:
59 return seed_urls.get(c.primary_seed, "")
60
61
def _repo_out(
    c: Candidate,
    settings: Settings,
    open_issues: dict[str, int],
    seed_urls: dict[str, str],
) -> RepoOut:
    """Shape a repo candidate into the ``RepoOut`` contract object.

    Missing payload fields become empty strings. ``open_issues`` is keyed by
    candidate key, and a candidate absent from it reports 0 open issues.
    """
    payload = c.payload
    owner_handle = payload.get("owner_handle") or ""
    repo_name = payload.get("repo_name") or ""

    owner = at_owner(owner_handle)
    description = (payload.get("description") or "").strip()
    issue_count = open_issues.get(c.key, 0)
    # The payload's creation timestamp is what gets reported as last activity.
    last_active = to_rfc3339(payload.get("created_at"))
    url = repo_url(settings.web_base, owner_handle, repo_name)
    based_on = _based_on_repo_url(c, seed_urls)

    return RepoOut(
        name=repo_name,
        owner=owner,
        language="",  # no language signal in the shared DB yet
        description=description,
        stars=0,  # no star signal yet (tangled_backlinks empty)
        openIssues=issue_count,
        lastActive=last_active,
        url=url,
        basedOnRepoUrl=based_on,
    )
82
83
def _issue_out(
    c: Candidate, settings: Settings, seed_urls: dict[str, str], with_questionnaire: set[str]
) -> IssueOut:
    """Shape an issue candidate into the ``IssueOut`` contract object.

    Missing payload fields become empty strings. ``hasQuestionnaire`` is true
    when the issue's stripped uri is a member of ``with_questionnaire``.
    """
    payload = c.payload
    owner_handle = payload.get("owner_handle") or ""
    repo_name = payload.get("repo_name") or ""
    issue_uri = (payload.get("uri") or "").strip()

    title = (payload.get("title") or "").strip()
    repo_slug = f"{owner_handle}/{repo_name}"
    owner = at_owner(owner_handle)
    repo_did = payload.get("repo_did") or ""
    rkey = payload.get("rkey") or ""
    # The issue links to its parent repo's page.
    url = repo_url(settings.web_base, owner_handle, repo_name)
    based_on = _based_on_repo_url(c, seed_urls)
    readme = (payload.get("repo_readme") or "").strip()
    has_questionnaire = issue_uri in with_questionnaire
    last_active = to_rfc3339(payload.get("created_at"))

    return IssueOut(
        title=title,
        repo=repo_slug,
        owner=owner,
        issueUri=issue_uri,
        repoDid=repo_did,
        rkey=rkey,
        url=url,
        basedOnRepoUrl=based_on,
        repoReadme=readme,
        hasQuestionnaire=has_questionnaire,
        labels=[],  # issue records carry no labels in the shared DB
        comments=0,  # no comment source yet
        language="",
        lastActive=last_active,
    )
107
108
109def _fetch_per_seed(seeds, query, workers) -> list[tuple[str, list[dict]]]:
110 """Run `query(seed) -> (label, rows)` across the user's seeds concurrently.
111
112 The DB is remote with multi-second round-trips, so the per-seed kNN queries
113 dominate request latency; fanning them out across a thread pool cuts it to
114 roughly one query's worth. `ThreadPoolExecutor.map` preserves seed order, so
115 the downstream merge/rerank stay deterministic (tie-breaks unchanged).
116 """
117 n = max(1, min(len(seeds), workers))
118 with ThreadPoolExecutor(max_workers=n) as ex:
119 return list(ex.map(query, seeds))
120
121
def _recommend_repos(seeds, exclude_dids, seed_hashes, settings) -> list[RepoOut]:
    """Recommend repos near the user's seed repos.

    Pipeline: per-seed kNN -> merge -> fork collapse -> distance floor ->
    drop candidates without an owner handle -> rerank (diversified, capped at
    ``settings.max_repos``) -> attach open-issue counts -> ``RepoOut``.

    Args:
        seeds: Seed rows; each needs ``etext`` and ``repo_did``, and may carry
            ``repo_name`` / ``owner_handle``.
        exclude_dids: Repo dids passed to the kNN query for exclusion.
        seed_hashes: Content hashes of the seeds, handed to ``merge_hits``.
        settings: Limits, floors and worker count for this request.

    Returns:
        The ranked repos in contract shape; empty when nothing survives.
    """
    seed_urls = _seed_url_map(seeds, settings)

    def query(s):
        rows = db.knn_repos(
            s["etext"], exclude_dids, settings.per_seed_limit, settings.min_readme_chars
        )
        # Label through the shared helper so hit labels always agree with the
        # keys of `seed_urls`, and a seed lacking a "repo_name" key falls back
        # to its did instead of raising KeyError.
        return (_seed_label(s), rows)

    per_seed_hits = _fetch_per_seed(seeds, query, settings.query_workers)

    candidates = merge_hits(per_seed_hits, seed_hashes)
    candidates = collapse_forks(candidates)
    candidates = apply_floor(candidates, settings.distance_floor)
    # The output's owner and url are derived from the owner handle, so a
    # candidate without one cannot be presented.
    candidates = [c for c in candidates if (c.payload.get("owner_handle") or "").strip()]
    ranked = rerank(candidates, DefaultScorer(), settings.max_repos, diversify=True)
    if not ranked:
        # Nothing to annotate: skip the remote open-issue-count round-trip.
        return []

    counts = db.open_issue_counts([c.key for c in ranked])
    return [_repo_out(c, settings, counts, seed_urls) for c in ranked]
141
142
def _recommend_issues(did, seeds, exclude_dids, settings) -> list[IssueOut]:
    """Recommend issues near the user's seed repos.

    Pipeline: per-seed kNN -> merge keyed by issue uri -> distance floor ->
    quality filter -> rerank (diversified, capped at ``settings.max_issues``)
    -> questionnaire lookup -> ``IssueOut``.

    Args:
        did: The requesting user's did, forwarded to the issue kNN query.
        seeds: Seed rows; each needs ``etext`` and ``repo_did``, and may carry
            ``repo_name`` / ``owner_handle``.
        exclude_dids: Repo dids passed to the kNN query for exclusion.
        settings: Limits, floors and worker count for this request.

    Returns:
        The ranked issues in contract shape; empty when nothing survives.
    """
    seed_urls = _seed_url_map(seeds, settings)

    def query(s):
        rows = db.knn_issues(s["etext"], exclude_dids, did, settings.per_seed_limit)
        # Label through the shared helper so hit labels always agree with the
        # keys of `seed_urls`, and a seed lacking a "repo_name" key falls back
        # to its did instead of raising KeyError.
        return (_seed_label(s), rows)

    per_seed_hits = _fetch_per_seed(seeds, query, settings.query_workers)

    # Key by issue uri — each issue is already unique. We deliberately do NOT run
    # collapse_forks here: that collapses by md5(content[:500]), which is right for
    # fork READMEs but would merge genuinely distinct issues that share an empty or
    # boilerplate body, silently dropping real recommendations.
    candidates = merge_hits(per_seed_hits, seed_content_hashes=set(), key_field="uri")
    candidates = apply_floor(candidates, settings.issue_distance_floor)
    # Drop issues whose parent repo is a sandbox/test repo or whose content is a
    # placeholder/test issue — they embed close to real interests but aren't real
    # contribution opportunities. (The README-length repo standard can't be used
    # here: issue-parent repos almost never have a README in the DB.)
    candidates = [
        c
        for c in candidates
        if not drop_issue(
            c.payload.get("repo_name") or "",
            c.payload.get("repo_description") or "",
            c.payload.get("title") or "",
            c.payload.get("content") or "",
        )
    ]
    ranked = rerank(candidates, DefaultScorer(), settings.max_issues, diversify=True)
    if not ranked:
        # Nothing to annotate: skip the remote questionnaire round-trip.
        return []

    with_questionnaire = db.questionnaires_present(
        [c.payload.get("uri") for c in ranked if c.payload.get("uri")]
    )
    return [_issue_out(c, settings, seed_urls, with_questionnaire) for c in ranked]
177
178
def recommend(did: str, settings: Settings | None = None) -> Recommendations:
    """Produce repo and issue recommendations for the user identified by ``did``.

    Loads the user's seed repos, runs the repo and issue pipelines against
    them, and wraps the results together with a profile built from the seeds.
    A user without seeds gets an empty response. ``settings`` falls back to
    ``get_settings()`` when not supplied.
    """
    if not settings:
        settings = get_settings()

    seeds = db.load_seeds(did, settings.min_readme_chars)
    if not seeds:
        return _empty(settings, 0)

    # The seeds' content hashes go to the repo pipeline's merge step; their
    # dids go to both kNN queries as the exclusion list.
    seed_hashes = set(map(row_content_hash, seeds))
    exclude_dids = [seed["repo_did"] for seed in seeds]

    repos = _recommend_repos(seeds, exclude_dids, seed_hashes, settings)
    issues = _recommend_issues(did, seeds, exclude_dids, settings)

    # Only label and slug are exposed from each interest entry.
    interest_tags = []
    for entry in build_interests(seeds, settings.max_interests):
        interest_tags.append({"label": entry["label"], "slug": entry["slug"]})

    sources = Sources(tangled=TangledSource(repos=len(seeds)))
    profile = Profile(interests=interest_tags, languages=[], sources=sources)
    return Recommendations(profile=profile, repos=repos, issues=issues)