This repository has no description
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 3.4 kB View raw
"""Quality heuristics for issue recommendations (pure: no DB, no network).

Issues are ranked purely by body-embedding similarity, with no notion of whether
an issue is a real contribution opportunity or a throwaway. A test/sandbox repo's
issue, or a placeholder issue ("hello world", "test issue to explore tangled",
"[READ-ONLY]"), can embed close to a user's interests and rank at the top.

Our repo standard (REC_MIN_README_CHARS) can't be applied to issues — the issue
corpus and the README corpus barely overlap, so almost no issue's parent repo has
a README in the DB and a length gate would drop everything. Instead we judge the
parent repo by name/description and the issue by title/body, matching the kinds of
throwaway content observed in production.

Keep these conservative: a false positive silently hides a real contribution.
"""

from __future__ import annotations

import re

# Repo name tokens that mark a scratchpad/sandbox. Matched on word tokens (split
# on non-alphanumerics), so "latest"/"fastest"/"contest" are NOT caught.
_TEST_TOKENS = frozenset({
    "test", "tests", "testing", "tester",
    "sandbox", "playground", "scratch", "scratchpad",
    "demo", "demos", "example", "examples", "sample", "samples",
    "tmp", "temp", "placeholder", "throwaway",
    "foo", "bar", "baz", "qux", "foobar",
    "helloworld",
})
_TOKEN_SPLIT = re.compile(r"[^a-z0-9]+")
_TESTNUM_RE = re.compile(r"^test\d+$")  # test100, test2, ...

# Placeholder / "just exploring" phrases in an issue title or body (or a repo
# description). Phrase-anchored so normal text mentioning "tests" is not caught.
_PLACEHOLDER_RE = re.compile(
    r"""
      \btest\s+issue\b
    | \btest\s+repo\b
    | \bthis\s+is\s+(?:just\s+)?a\s+test\b
    | \bjust\s+a\s+test\b
    | \bjust\s+testing\b
    | \btesting\s+(?:the\s+)?(?:tangled|programmatic|access|repo|issue|out|this)\b
    | \bhello,?\s+world\b
    | \bhallo\b
    | \blorem\s+ipsum\b
    | \bread[-\s]?only\s+mirror\b
    | \[read[-\s]?only\]
    | \bignore\s+(?:this|me|please)\b
    | \bplaceholder\b
    | \bexplor(?:e|ing)\s+(?:what\s+)?tangled\b
    | \basdf\b | \bqwerty\b
    """,
    re.IGNORECASE | re.VERBOSE,
)

# Gibberish thresholds: shortest string worth judging, and the distinct-character
# ratio below which a run of letters is treated as keyboard mashing.
_GIBBERISH_MIN_LEN = 6
_GIBBERISH_MAX_DISTINCT_RATIO = 0.4


def _tokens(text: str | None) -> set[str]:
    """Return the lowercase alphanumeric tokens of *text* (empty set for None/"")."""
    return {t for t in _TOKEN_SPLIT.split((text or "").lower()) if t}


def _is_gibberish(text: str | None) -> bool:
    """Return True for a single run of letters with very few distinct characters.

    Examples: 'adadadaddaaddada' or 'adwawdawd' — typical of throwaway repo
    descriptions.

    Only purely alphabetic strings are judged. Any long space-free string has a
    low distinct-character ratio simply because the alphabet is small, so
    without this check a description consisting of a URL or a hyphenated slug
    would be flagged and a real repo silently dropped.
    """
    t = (text or "").strip().lower()
    # isalpha() is False for "", and for anything containing whitespace, digits
    # or punctuation, so it also covers the "single run" requirement.
    if len(t) < _GIBBERISH_MIN_LEN or not t.isalpha():
        return False
    return len(set(t)) / len(t) < _GIBBERISH_MAX_DISTINCT_RATIO


def is_test_repo(name: str | None, description: str | None = "") -> bool:
    """Return True if the repo looks like a sandbox/test repo.

    A repo is flagged when any token of *name* is in ``_TEST_TOKENS`` or has the
    form ``test<digits>``, or when a non-empty *description* contains a
    placeholder phrase or is gibberish.
    """
    toks = _tokens(name)
    if toks & _TEST_TOKENS or any(_TESTNUM_RE.match(t) for t in toks):
        return True
    desc = (description or "").strip()
    if not desc:
        return False
    return bool(_PLACEHOLDER_RE.search(desc)) or _is_gibberish(desc)


def is_placeholder_issue(title: str | None, body: str | None = "") -> bool:
    """Return True if the issue title or body contains a placeholder phrase."""
    # Title and body are joined with a newline so a phrase cannot be formed by
    # the last word of the title running into the first word of the body
    # without any separator.
    blob = f"{title or ''}\n{body or ''}"
    return bool(_PLACEHOLDER_RE.search(blob))


def drop_issue(repo_name: str, repo_description: str, title: str, body: str) -> bool:
    """True if this issue should be excluded: its repo is a sandbox/test repo, or
    its content is a placeholder/test issue."""
    return is_test_repo(repo_name, repo_description) or is_placeholder_issue(title, body)