scraper/appview_client.py at main · char.tngl.sh/sunstead-backend

This repository has no description

sunstead-backend / scraper / appview_client.py

at main 1.6 kB View raw

Mark Pokidko Sunstead backend — Tangled Discover + AI-Solve (snapshot, no history) 2d ago

 1from __future__ import annotations
 2
 3import re
 4from typing import Any
 5from urllib.parse import urlencode
 6
 7import httpx
 8
 9APPVIEW_BASE = "https://tangled.org"
10SEARCH_PATH = "/search"
11
12# href="/owner/repo" — exclude site chrome and static assets
13REPO_HREF = re.compile(r'href="/([a-zA-Z0-9._-]+)/([a-zA-Z0-9._-]+)"')
14TOTAL_RE = re.compile(r"Returned\s+(\d+)\s+of\s+(\d+)", re.I)
15
16SKIP_OWNERS = frozenset(
17    {
18        "static",
19        "search",
20        "login",
21        "signup",
22        "explore",
23        "settings",
24        "blog",
25        "docs",
26        "brand",
27        "chat",
28        "pwa-manifest.json",
29    }
30)
31
32
def parse_search_total(html: str) -> int | None:
    """Return the overall result count reported on a search page.

    Searches *html* for the first ``Returned <n> of <m>`` phrase (as matched
    by ``TOTAL_RE``) and returns ``<m>`` as an integer. Returns ``None`` when
    the phrase does not occur.
    """
    found = TOTAL_RE.search(html)
    # Group 2 is the number after "of"; group 1 (the page's own count) is unused.
    return int(found.group(2)) if found else None
38
39
def parse_repo_links(html: str) -> list[tuple[str, str]]:
    """Collect the distinct ``(owner, repo)`` pairs linked from *html*.

    Every ``REPO_HREF`` match is considered. Pairs whose owner segment is in
    ``SKIP_OWNERS`` or ends with ``.json`` are dropped. The remaining pairs
    are returned in order of first appearance, each at most once.
    """
    candidates = (
        (owner, repo)
        for owner, repo in REPO_HREF.findall(html)
        if owner not in SKIP_OWNERS and not owner.endswith(".json")
    )
    # dict keys keep first-insertion order, so later duplicates are discarded
    # without disturbing the order in which pairs were first seen.
    return list(dict.fromkeys(candidates))
51
52
def fetch_search_page(
    client: httpx.Client,
    *,
    offset: int = 0,
    limit: int = 100,
    sort: str = "newest",
    query: str = "",
) -> tuple[str, list[tuple[str, str]], int | None]:
    """Fetch one page of search results and parse it.

    Args:
        client: HTTP client used to issue the GET request.
        offset: Value sent as the ``offset`` query parameter.
        limit: Value sent as the ``limit`` query parameter.
        sort: Value sent as the ``sort`` query parameter.
        query: Value sent as the ``q`` query parameter.

    Returns:
        A ``(html, links, total)`` tuple: the raw response text, the result of
        ``parse_repo_links`` on it, and the result of ``parse_search_total``
        on it.

    Raises:
        httpx.HTTPStatusError: Propagated from ``raise_for_status()`` when the
            response status indicates failure.
    """
    query_string = urlencode(
        {"q": query, "sort": sort, "offset": offset, "limit": limit}
    )
    response = client.get(f"{APPVIEW_BASE}{SEARCH_PATH}?{query_string}")
    response.raise_for_status()

    body = response.text
    links = parse_repo_links(body)
    total = parse_search_total(body)
    return body, links, total

Configure Feed

Configure Feed