This repository has no description
1"""Quality heuristics for issue recommendations (pure: no DB, no network).
2
3Issues are ranked purely by body-embedding similarity, with no notion of whether
4an issue is a real contribution opportunity or a throwaway. A test/sandbox repo's
5issue, or a placeholder issue ("hello world", "test issue to explore tangled",
6"[READ-ONLY]"), can embed close to a user's interests and rank at the top.
7
8Our repo standard (REC_MIN_README_CHARS) can't be applied to issues — the issue
9corpus and the README corpus barely overlap, so almost no issue's parent repo has
10a README in the DB and a length gate would drop everything. Instead we judge the
11parent repo by name/description and the issue by title/body, matching the kinds of
12throwaway content observed in production.
13
14Keep these conservative: a false positive silently hides a real contribution.
15"""
16
17from __future__ import annotations
18
19import re
20
# Word tokens in a repo name that identify it as a scratchpad/sandbox. Names are
# lowercased and cut on runs of characters outside [a-z0-9] before lookup, so
# words that merely contain "test" ("latest", "fastest", "contest") never match.
_TEST_TOKENS = frozenset(
    (
        # explicit test repos
        "test",
        "tests",
        "testing",
        "tester",
        # scratch space
        "sandbox",
        "playground",
        "scratch",
        "scratchpad",
        # demos and samples
        "demo",
        "demos",
        "example",
        "examples",
        "sample",
        "samples",
        # temporary / disposable
        "tmp",
        "temp",
        "placeholder",
        "throwaway",
        # metasyntactic names
        "foo",
        "bar",
        "baz",
        "qux",
        "foobar",
        "helloworld",
    )
)

# Separator between name tokens: any run of characters outside [a-z0-9].
_TOKEN_SPLIT = re.compile(r"[^a-z0-9]+")

# Numbered test repos as a single token: "test100", "test2", ...
_TESTNUM_RE = re.compile(r"^test\d+$")
33
# Phrases that mark an issue title/body (or a repo description) as placeholder
# or "just exploring" content. Every alternative is a whole phrase or a
# distinctive word, so ordinary text that merely mentions "tests" does not
# match. Compiled case-insensitively; VERBOSE lets each alternative sit on its
# own line.
_PLACEHOLDER_RE = re.compile(
    r"""
      \btest\s+issue\b
    | \btest\s+repo\b
    | \bthis\s+is\s+(?:just\s+)?a\s+test\b
    | \bjust\s+a\s+test\b
    | \bjust\s+testing\b
    | \btesting\s+(?:the\s+)?(?:tangled|programmatic|access|repo|issue|out|this)\b
    | \bhello,?\s+world\b
    | \bhallo\b
    | \blorem\s+ipsum\b
    | \bread[-\s]?only\s+mirror\b
    | \[read[-\s]?only\]
    | \bignore\s+(?:this|me|please)\b
    | \bplaceholder\b
    | \bexplor(?:e|ing)\s+(?:what\s+)?tangled\b
    | \basdf\b | \bqwerty\b
    """,
    flags=re.VERBOSE | re.IGNORECASE,
)
56
57
def _tokens(text: str) -> set[str]:
    """Return the distinct lowercase ``[a-z0-9]`` tokens found in ``text``.

    Falsy input (``None`` or ``""``) yields an empty set.
    """
    lowered = (text or "").lower()
    pieces = _TOKEN_SPLIT.split(lowered)
    # Splitting leaves empty strings next to leading/trailing separators.
    return set(filter(None, pieces))
60
61
def _is_gibberish(text: str) -> bool:
    """Return True if ``text`` looks like keyboard mash.

    Keyboard mash here means a single run of ASCII letters, at least six
    characters long, with very few distinct characters, e.g.
    'adadadaddaaddada' or 'adwawdawd' -- typical of throwaway repo
    descriptions.

    Anything containing whitespace, digits, punctuation or non-ASCII
    characters is never treated as gibberish. The distinct/length ratio
    shrinks as a string grows, so without this restriction any long
    space-free description (a bare URL, tab-separated text) would be flagged
    and a real repository silently hidden.

    Falsy input (``None`` or ``""``) returns False.
    """
    run = (text or "").strip().lower()
    if len(run) < 6:
        # Too short to judge; short real words often repeat letters.
        return False
    if not (run.isascii() and run.isalpha()):
        # Not a single run of letters, so the ratio test does not apply.
        return False
    return len(set(run)) / len(run) < 0.4
69
70
def is_test_repo(name: str, description: str = "") -> bool:
    """Return True if the repo looks like a sandbox/test repository.

    A repo qualifies when its name contains a known scratchpad token or a
    numbered ``testN`` token, or when its non-empty description matches a
    placeholder phrase or is gibberish.
    """
    name_tokens = _tokens(name)
    if not name_tokens.isdisjoint(_TEST_TOKENS):
        return True
    for token in name_tokens:
        if _TESTNUM_RE.match(token):
            return True

    blurb = (description or "").strip()
    if not blurb:
        # No description to judge, and the name was clean.
        return False
    return bool(_PLACEHOLDER_RE.search(blurb)) or _is_gibberish(blurb)
79
80
def is_placeholder_issue(title: str, body: str = "") -> bool:
    """Return True if the issue's title or body contains a placeholder phrase.

    Title and body are joined with a newline and searched together; a falsy
    title or body is treated as an empty string.
    """
    combined = "\n".join((title or "", body or ""))
    return _PLACEHOLDER_RE.search(combined) is not None
84
85
def drop_issue(repo_name: str, repo_description: str, title: str, body: str) -> bool:
    """Return True if the issue should be excluded from recommendations.

    An issue is excluded when its parent repo is a sandbox/test repo, or when
    its own title/body is placeholder content. The repo is checked first.
    """
    if is_test_repo(repo_name, repo_description):
        return True
    return is_placeholder_issue(title, body)