recommendation/app/recommend.py at main · char.tngl.sh/sunstead-backend

This repository has no description
sunstead-backend / recommendation / app / recommend.py
at main 7.5 kB View raw
Mark Pokidko Sunstead backend — Tangled Discover + AI-Solve (snapshot, no history) 2d ago
  1"""Recommendation orchestration: seeds -> per-seed kNN -> merge -> dedup ->
  2floor -> rerank -> contract shape.
  3
  4This is the only place that stitches the (pure) stages to the (impure) data
  5access. Keeping it thin makes the algorithm easy to read top-to-bottom.
  6"""
  7
  8from __future__ import annotations
  9
 10from concurrent.futures import ThreadPoolExecutor
 11
 12from app import db
 13from app.config import Settings, get_settings
 14from app.dedup import collapse_forks, row_content_hash
 15from app.links import at_owner, repo_url, to_rfc3339
 16from app.merge import merge_hits
 17from app.profile import build_interests
 18from app.quality import drop_issue
 19from app.rank import DefaultScorer, apply_floor, rerank
 20from app.schemas import (
 21    IssueOut,
 22    Profile,
 23    Recommendations,
 24    RepoOut,
 25    Sources,
 26    TangledSource,
 27)
 28from app.types import Candidate
 29
 30
 31def _empty(settings: Settings, seed_count: int) -> Recommendations:
 32    return Recommendations(
 33        profile=Profile(
 34            interests=[],
 35            languages=[],
 36            sources=Sources(tangled=TangledSource(repos=seed_count)),
 37        ),
 38        repos=[],
 39        issues=[],
 40    )
 41
 42
 43def _seed_label(seed: dict) -> str:
 44    return seed.get("repo_name") or seed["repo_did"]
 45
 46
 47def _seed_url_map(seeds: list[dict], settings: Settings) -> dict[str, str]:
 48    """Map seed label (repo name or did) -> absolute Tangled repo URL."""
 49    out: dict[str, str] = {}
 50    for seed in seeds:
 51        handle = (seed.get("owner_handle") or "").strip()
 52        name = (seed.get("repo_name") or "").strip()
 53        label = _seed_label(seed)
 54        out[label] = repo_url(settings.web_base, handle, name) if handle and name else ""
 55    return out
 56
 57
 58def _based_on_repo_url(c: Candidate, seed_urls: dict[str, str]) -> str:
 59    return seed_urls.get(c.primary_seed, "")
 60
 61
 62def _repo_out(
 63    c: Candidate,
 64    settings: Settings,
 65    open_issues: dict[str, int],
 66    seed_urls: dict[str, str],
 67) -> RepoOut:
 68    p = c.payload
 69    handle = p.get("owner_handle") or ""
 70    name = p.get("repo_name") or ""
 71    return RepoOut(
 72        name=name,
 73        owner=at_owner(handle),
 74        language="",  # no language signal in the shared DB yet
 75        description=(p.get("description") or "").strip(),
 76        stars=0,      # no star signal yet (tangled_backlinks empty)
 77        openIssues=open_issues.get(c.key, 0),
 78        lastActive=to_rfc3339(p.get("created_at")),
 79        url=repo_url(settings.web_base, handle, name),
 80        basedOnRepoUrl=_based_on_repo_url(c, seed_urls),
 81    )
 82
 83
 84def _issue_out(
 85    c: Candidate, settings: Settings, seed_urls: dict[str, str], with_questionnaire: set[str]
 86) -> IssueOut:
 87    p = c.payload
 88    handle = p.get("owner_handle") or ""
 89    name = p.get("repo_name") or ""
 90    uri = (p.get("uri") or "").strip()
 91    return IssueOut(
 92        title=(p.get("title") or "").strip(),
 93        repo=f"{handle}/{name}",
 94        owner=at_owner(handle),
 95        issueUri=uri,
 96        repoDid=p.get("repo_did") or "",
 97        rkey=p.get("rkey") or "",
 98        url=repo_url(settings.web_base, handle, name),
 99        basedOnRepoUrl=_based_on_repo_url(c, seed_urls),
100        repoReadme=(p.get("repo_readme") or "").strip(),
101        hasQuestionnaire=uri in with_questionnaire,
102        labels=[],          # issue records carry no labels in the shared DB
103        comments=0,         # no comment source yet
104        language="",
105        lastActive=to_rfc3339(p.get("created_at")),
106    )
107
108
109def _fetch_per_seed(seeds, query, workers) -> list[tuple[str, list[dict]]]:
110    """Run `query(seed) -> (label, rows)` across the user's seeds concurrently.
111
112    The DB is remote with multi-second round-trips, so the per-seed kNN queries
113    dominate request latency; fanning them out across a thread pool cuts it to
114    roughly one query's worth. `ThreadPoolExecutor.map` preserves seed order, so
115    the downstream merge/rerank stay deterministic (tie-breaks unchanged).
116    """
117    n = max(1, min(len(seeds), workers))
118    with ThreadPoolExecutor(max_workers=n) as ex:
119        return list(ex.map(query, seeds))
120
121
def _recommend_repos(
    seeds: list[dict],
    exclude_dids: list[str],
    seed_hashes: set,
    settings: Settings,
) -> list[RepoOut]:
    """Produce the ranked repo recommendations for a user's seeds.

    Pipeline: one kNN query per seed (run concurrently) -> merge -> fork
    collapse -> distance floor -> drop candidates without an owner handle ->
    rerank (diversified, capped at `settings.max_repos`) -> `RepoOut` shape.

    Args:
        seeds: Seed rows; each must carry `etext` and either a `repo_name` or
            a `repo_did`.
        exclude_dids: Passed through to `db.knn_repos` as its exclusion list.
        seed_hashes: Content hashes of the seeds, handed to `merge_hits`.
        settings: Supplies limits, the distance floor, and the worker count.

    Returns:
        The ranked repos in output shape; empty when nothing survives.
    """
    seed_urls = _seed_url_map(seeds, settings)

    def query(s):
        rows = db.knn_repos(
            s["etext"], exclude_dids, settings.per_seed_limit, settings.min_readme_chars
        )
        # Label through _seed_label so hits are keyed exactly like the entries
        # of _seed_url_map; a divergent label would blank basedOnRepoUrl.
        return (_seed_label(s), rows)

    per_seed_hits = _fetch_per_seed(seeds, query, settings.query_workers)

    candidates = merge_hits(per_seed_hits, seed_hashes)
    candidates = collapse_forks(candidates)
    candidates = apply_floor(candidates, settings.distance_floor)
    # Keep only candidates with a non-blank owner handle (presumably because a
    # repo URL cannot be built without one — confirm against repo_url).
    candidates = [c for c in candidates if (c.payload.get("owner_handle") or "").strip()]
    ranked = rerank(candidates, DefaultScorer(), settings.max_repos, diversify=True)

    # The DB is remote; skip the round trip when there is nothing to count.
    keys = [c.key for c in ranked]
    counts = db.open_issue_counts(keys) if keys else {}
    return [_repo_out(c, settings, counts, seed_urls) for c in ranked]
141
142
def _recommend_issues(
    did: str,
    seeds: list[dict],
    exclude_dids: list[str],
    settings: Settings,
) -> list[IssueOut]:
    """Produce the ranked issue recommendations for a user's seeds.

    Pipeline: one kNN query per seed (run concurrently) -> merge keyed by
    issue uri -> distance floor -> drop sandbox/placeholder issues -> rerank
    (diversified, capped at `settings.max_issues`) -> `IssueOut` shape.

    Args:
        did: The requesting user's DID, passed through to `db.knn_issues`.
        seeds: Seed rows; each must carry `etext` and either a `repo_name` or
            a `repo_did`.
        exclude_dids: Passed through to `db.knn_issues` as its exclusion list.
        settings: Supplies limits, the issue distance floor, and worker count.

    Returns:
        The ranked issues in output shape; empty when nothing survives.
    """
    seed_urls = _seed_url_map(seeds, settings)

    def query(s):
        rows = db.knn_issues(s["etext"], exclude_dids, did, settings.per_seed_limit)
        # Label through _seed_label so hits are keyed exactly like the entries
        # of _seed_url_map; a divergent label would blank basedOnRepoUrl.
        return (_seed_label(s), rows)

    per_seed_hits = _fetch_per_seed(seeds, query, settings.query_workers)

    # Key by issue uri — each issue is already unique. We deliberately do NOT run
    # collapse_forks here: that collapses by md5(content[:500]), which is right for
    # fork READMEs but would merge genuinely distinct issues that share an empty or
    # boilerplate body, silently dropping real recommendations.
    candidates = merge_hits(per_seed_hits, seed_content_hashes=set(), key_field="uri")
    candidates = apply_floor(candidates, settings.issue_distance_floor)
    # Drop issues whose parent repo is a sandbox/test repo or whose content is a
    # placeholder/test issue — they embed close to real interests but aren't real
    # contribution opportunities. (The README-length repo standard can't be used
    # here: issue-parent repos almost never have a README in the DB.)
    candidates = [
        c
        for c in candidates
        if not drop_issue(
            c.payload.get("repo_name") or "",
            c.payload.get("repo_description") or "",
            c.payload.get("title") or "",
            c.payload.get("content") or "",
        )
    ]
    ranked = rerank(candidates, DefaultScorer(), settings.max_issues, diversify=True)

    # The DB is remote; skip the round trip when there are no uris to check.
    uris = [c.payload.get("uri") for c in ranked if c.payload.get("uri")]
    with_questionnaire = db.questionnaires_present(uris) if uris else set()
    return [_issue_out(c, settings, seed_urls, with_questionnaire) for c in ranked]
177
178
def recommend(did: str, settings: Settings | None = None) -> Recommendations:
    """Build the full recommendation response for the user identified by `did`.

    Loads the user's seeds, then derives repo recommendations, issue
    recommendations, and an interest profile from them. When the user has no
    seeds, the empty response is returned without running any kNN queries.
    `settings` falls back to `get_settings()` when not supplied.
    """
    if not settings:
        settings = get_settings()

    seeds = db.load_seeds(did, settings.min_readme_chars)
    if not seeds:
        return _empty(settings, 0)

    # Seed hashes go to the repo merge stage; seed repo DIDs go to both kNN
    # queries as their exclusion list.
    seed_hashes = {row_content_hash(seed) for seed in seeds}
    exclude_dids = [seed["repo_did"] for seed in seeds]

    repos = _recommend_repos(seeds, exclude_dids, seed_hashes, settings)
    issues = _recommend_issues(did, seeds, exclude_dids, settings)

    # Only the label and slug of each interest are exposed in the response.
    interest_tags = [
        {"label": entry["label"], "slug": entry["slug"]}
        for entry in build_interests(seeds, settings.max_interests)
    ]
    sources = Sources(tangled=TangledSource(repos=len(seeds)))
    profile = Profile(interests=interest_tags, languages=[], sources=sources)
    return Recommendations(profile=profile, repos=repos, issues=issues)
Configure Feed

Configure Feed