This repository has no description
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 7.5 kB View raw
"""Recommendation orchestration: seeds -> per-seed kNN -> merge -> dedup ->
floor -> rerank -> contract shape.

This is the only place that stitches the (pure) stages to the (impure) data
access. Keeping it thin makes the algorithm easy to read top-to-bottom.
"""

from __future__ import annotations

from collections.abc import Callable
from concurrent.futures import ThreadPoolExecutor

from app import db
from app.config import Settings, get_settings
from app.dedup import collapse_forks, row_content_hash
from app.links import at_owner, repo_url, to_rfc3339
from app.merge import merge_hits
from app.profile import build_interests
from app.quality import drop_issue
from app.rank import DefaultScorer, apply_floor, rerank
from app.schemas import (
    IssueOut,
    Profile,
    Recommendations,
    RepoOut,
    Sources,
    TangledSource,
)
from app.types import Candidate


def _empty(settings: Settings, seed_count: int) -> Recommendations:
    """Return a response with no repos, issues, interests or languages.

    `seed_count` is reported as the number of Tangled source repos.
    `settings` is accepted for signature symmetry but is not read here.
    """
    return Recommendations(
        profile=Profile(
            interests=[],
            languages=[],
            sources=Sources(tangled=TangledSource(repos=seed_count)),
        ),
        repos=[],
        issues=[],
    )


def _seed_label(seed: dict) -> str:
    """Return the label identifying a seed: its repo name, else its repo DID.

    Raises `KeyError` if the seed has neither a truthy `repo_name` nor a
    `repo_did` key.
    """
    return seed.get("repo_name") or seed["repo_did"]


def _seed_url_map(seeds: list[dict], settings: Settings) -> dict[str, str]:
    """Map seed label (repo name or did) -> absolute Tangled repo URL.

    A seed missing either its owner handle or its repo name maps to "".
    """
    out: dict[str, str] = {}
    for seed in seeds:
        handle = (seed.get("owner_handle") or "").strip()
        name = (seed.get("repo_name") or "").strip()
        label = _seed_label(seed)
        out[label] = repo_url(settings.web_base, handle, name) if handle and name else ""
    return out


def _based_on_repo_url(c: Candidate, seed_urls: dict[str, str]) -> str:
    """Return the URL of the candidate's primary seed, or "" if it is unknown."""
    return seed_urls.get(c.primary_seed, "")


def _repo_out(
    c: Candidate,
    settings: Settings,
    open_issues: dict[str, int],
    seed_urls: dict[str, str],
) -> RepoOut:
    """Shape a ranked repo candidate into the `RepoOut` contract.

    `open_issues` is looked up by `c.key`; a repo absent from it reports 0.
    """
    p = c.payload
    handle = p.get("owner_handle") or ""
    name = p.get("repo_name") or ""
    return RepoOut(
        name=name,
        owner=at_owner(handle),
        language="",  # no language signal in the shared DB yet
        description=(p.get("description") or "").strip(),
        stars=0,  # no star signal yet (tangled_backlinks empty)
        openIssues=open_issues.get(c.key, 0),
        lastActive=to_rfc3339(p.get("created_at")),  # populated from created_at
        url=repo_url(settings.web_base, handle, name),
        basedOnRepoUrl=_based_on_repo_url(c, seed_urls),
    )


def _issue_out(
    c: Candidate, settings: Settings, seed_urls: dict[str, str], with_questionnaire: set[str]
) -> IssueOut:
    """Shape a ranked issue candidate into the `IssueOut` contract.

    `with_questionnaire` holds the issue URIs that have a questionnaire; the
    candidate's (stripped) URI is tested for membership in it.
    """
    p = c.payload
    handle = p.get("owner_handle") or ""
    name = p.get("repo_name") or ""
    uri = (p.get("uri") or "").strip()
    return IssueOut(
        title=(p.get("title") or "").strip(),
        repo=f"{handle}/{name}",
        owner=at_owner(handle),
        issueUri=uri,
        repoDid=p.get("repo_did") or "",
        rkey=p.get("rkey") or "",
        url=repo_url(settings.web_base, handle, name),  # the parent repo's URL
        basedOnRepoUrl=_based_on_repo_url(c, seed_urls),
        repoReadme=(p.get("repo_readme") or "").strip(),
        hasQuestionnaire=uri in with_questionnaire,
        labels=[],  # issue records carry no labels in the shared DB
        comments=0,  # no comment source yet
        language="",
        lastActive=to_rfc3339(p.get("created_at")),  # populated from created_at
    )


def _fetch_per_seed(
    seeds: list[dict],
    query: Callable[[dict], tuple[str, list[dict]]],
    workers: int,
) -> list[tuple[str, list[dict]]]:
    """Run `query(seed) -> (label, rows)` across the user's seeds concurrently.

    The DB is remote with multi-second round-trips, so the per-seed kNN queries
    dominate request latency; fanning them out across a thread pool cuts it to
    roughly one query's worth. `ThreadPoolExecutor.map` preserves seed order, so
    the downstream merge/rerank stay deterministic (tie-breaks unchanged).

    The pool size is `workers` capped at the number of seeds, and never below 1.
    """
    n = max(1, min(len(seeds), workers))
    with ThreadPoolExecutor(max_workers=n) as ex:
        return list(ex.map(query, seeds))


def _recommend_repos(
    seeds: list[dict],
    exclude_dids: list[str],
    seed_hashes: set,
    settings: Settings,
) -> list[RepoOut]:
    """Produce the ranked repo recommendations for a set of seeds.

    Stages, in order: per-seed kNN -> merge -> fork collapse -> distance floor
    -> drop candidates with a blank owner handle -> rerank (diversified, capped
    at `settings.max_repos`) -> attach open-issue counts.
    """
    seed_urls = _seed_url_map(seeds, settings)

    def query(s: dict) -> tuple[str, list[dict]]:
        rows = db.knn_repos(
            s["etext"], exclude_dids, settings.per_seed_limit, settings.min_readme_chars
        )
        # Use the shared label helper so hit labels always match the keys of
        # `seed_urls`, which are built with the same function.
        return (_seed_label(s), rows)

    per_seed_hits = _fetch_per_seed(seeds, query, settings.query_workers)

    candidates = merge_hits(per_seed_hits, seed_hashes)
    candidates = collapse_forks(candidates)
    candidates = apply_floor(candidates, settings.distance_floor)
    # NOTE(review): presumably dropped because a repo URL cannot be built
    # without an owner handle -- confirm.
    candidates = [c for c in candidates if (c.payload.get("owner_handle") or "").strip()]
    ranked = rerank(candidates, DefaultScorer(), settings.max_repos, diversify=True)

    counts = db.open_issue_counts([c.key for c in ranked])
    return [_repo_out(c, settings, counts, seed_urls) for c in ranked]


def _recommend_issues(
    did: str,
    seeds: list[dict],
    exclude_dids: list[str],
    settings: Settings,
) -> list[IssueOut]:
    """Produce the ranked issue recommendations for a set of seeds.

    Stages, in order: per-seed kNN -> merge keyed by issue uri -> distance floor
    -> drop sandbox/placeholder issues -> rerank (diversified, capped at
    `settings.max_issues`) -> flag issues that have a questionnaire.
    """
    seed_urls = _seed_url_map(seeds, settings)

    def query(s: dict) -> tuple[str, list[dict]]:
        rows = db.knn_issues(s["etext"], exclude_dids, did, settings.per_seed_limit)
        # Use the shared label helper so hit labels always match the keys of
        # `seed_urls`, which are built with the same function.
        return (_seed_label(s), rows)

    per_seed_hits = _fetch_per_seed(seeds, query, settings.query_workers)

    # Key by issue uri — each issue is already unique. We deliberately do NOT run
    # collapse_forks here: that collapses by md5(content[:500]), which is right for
    # fork READMEs but would merge genuinely distinct issues that share an empty or
    # boilerplate body, silently dropping real recommendations.
    candidates = merge_hits(per_seed_hits, seed_content_hashes=set(), key_field="uri")
    candidates = apply_floor(candidates, settings.issue_distance_floor)
    # Drop issues whose parent repo is a sandbox/test repo or whose content is a
    # placeholder/test issue — they embed close to real interests but aren't real
    # contribution opportunities. (The README-length repo standard can't be used
    # here: issue-parent repos almost never have a README in the DB.)
    candidates = [
        c
        for c in candidates
        if not drop_issue(
            c.payload.get("repo_name") or "",
            c.payload.get("repo_description") or "",
            c.payload.get("title") or "",
            c.payload.get("content") or "",
        )
    ]
    ranked = rerank(candidates, DefaultScorer(), settings.max_issues, diversify=True)
    with_questionnaire = db.questionnaires_present(
        [c.payload.get("uri") for c in ranked if c.payload.get("uri")]
    )
    return [_issue_out(c, settings, seed_urls, with_questionnaire) for c in ranked]


def recommend(did: str, settings: Settings | None = None) -> Recommendations:
    """Build the full recommendation response for the user identified by `did`.

    Loads the user's seed repos, then computes repo and issue recommendations
    from them along with an interest profile. When the user has no seeds the
    empty response is returned without issuing any kNN query.

    `settings` defaults to `get_settings()` when omitted.
    """
    settings = settings or get_settings()

    seeds = db.load_seeds(did, settings.min_readme_chars)
    if not seeds:
        return _empty(settings, 0)

    seed_hashes = {row_content_hash(s) for s in seeds}
    # The seeds' own repo DIDs are handed to the kNN queries as `exclude_dids`.
    exclude_dids = [s["repo_did"] for s in seeds]

    repos = _recommend_repos(seeds, exclude_dids, seed_hashes, settings)
    issues = _recommend_issues(did, seeds, exclude_dids, settings)

    interests = build_interests(seeds, settings.max_interests)
    profile = Profile(
        interests=[{"label": i["label"], "slug": i["slug"]} for i in interests],
        languages=[],
        sources=Sources(tangled=TangledSource(repos=len(seeds))),
    )
    return Recommendations(profile=profile, repos=repos, issues=issues)