# This repository has no description
from __future__ import annotations

import re
from typing import Any
from urllib.parse import urlencode

import httpx

# Base URL of the site being scraped and the path of its search page.
APPVIEW_BASE = "https://tangled.org"
SEARCH_PATH = "/search"

# Matches two-segment links of the form href="/owner/repo" and captures both
# segments. The pattern alone also matches site chrome and static assets;
# those are dropped afterwards by checking the first segment against
# SKIP_OWNERS.
REPO_HREF = re.compile(r'href="/([a-zA-Z0-9._-]+)/([a-zA-Z0-9._-]+)"')

# Matches the "Returned <n> of <m>" summary text (case-insensitive); group 2
# is the number after "of".
TOTAL_RE = re.compile(r"Returned\s+(\d+)\s+of\s+(\d+)", re.I)

# First path segments that are never treated as a repository owner.
SKIP_OWNERS = frozenset(
    {
        "static",
        "search",
        "login",
        "signup",
        "explore",
        "settings",
        "blog",
        "docs",
        "brand",
        "chat",
        "pwa-manifest.json",
    }
)
31
32
def parse_search_total(html: str) -> int | None:
    """Return the total reported by the search summary in *html*.

    Looks for the first "Returned <n> of <m>" text matched by ``TOTAL_RE``
    and returns ``<m>`` as an int. Returns ``None`` when no such text is
    present.
    """
    match = TOTAL_RE.search(html)
    # Group 2 is the number following "of".
    return int(match.group(2)) if match else None
38
39
def parse_repo_links(html: str) -> list[tuple[str, str]]:
    """Extract the distinct ``(owner, repo)`` pairs linked from *html*.

    Every ``href="/owner/repo"`` matched by ``REPO_HREF`` is considered.
    Links whose first segment is listed in ``SKIP_OWNERS`` or ends with
    ``.json`` are ignored. Each remaining pair is returned once, in order of
    first appearance.
    """
    candidates = (
        (owner, repo)
        for owner, repo in REPO_HREF.findall(html)
        if owner not in SKIP_OWNERS and not owner.endswith(".json")
    )
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(candidates))
51
52
def fetch_search_page(
    client: httpx.Client,
    *,
    offset: int = 0,
    limit: int = 100,
    sort: str = "newest",
    query: str = "",
) -> tuple[str, list[tuple[str, str]], int | None]:
    """Fetch one page of search results and parse it.

    Issues a GET to ``APPVIEW_BASE + SEARCH_PATH`` with the query string
    ``q``, ``sort``, ``offset`` and ``limit`` built from the keyword
    arguments.

    Args:
        client: Client used to perform the request.
        offset: Value sent as the ``offset`` query parameter.
        limit: Value sent as the ``limit`` query parameter.
        sort: Value sent as the ``sort`` query parameter.
        query: Value sent as the ``q`` query parameter.

    Returns:
        A ``(html, repo_links, total)`` tuple: the raw response text, the
        result of ``parse_repo_links`` on it, and the result of
        ``parse_search_total`` on it (``None`` if no total was found).

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an unsuccessful
        response, plus any error raised by ``client.get`` itself.
    """
    params = {"q": query, "sort": sort, "offset": offset, "limit": limit}
    url = f"{APPVIEW_BASE}{SEARCH_PATH}?{urlencode(params)}"
    resp = client.get(url)
    # Fail loudly on an error status instead of parsing an error page.
    resp.raise_for_status()
    html = resp.text
    return html, parse_repo_links(html), parse_search_total(html)