This repository has no description
1from __future__ import annotations
2
3import json
4import subprocess
5from pathlib import Path
6from typing import Any
7
8from db import connect, set_crawl_state, upsert_lexicon
9from progress import banner, log, step
10
11REPO_ROOT = Path(__file__).resolve().parent.parent
12DEFAULT_CORE_GIT = "https://tangled.org/tangled.org/core.git"
13DEFAULT_CORE_DIR = REPO_ROOT / ".cache" / "tangled-core"
14LEXICONS_DIR = "lexicons"
15
16
17def _lexicon_type(definition: dict[str, Any]) -> str:
18 main = definition.get("defs", {}).get("main", {})
19 lex_type = main.get("type")
20 if lex_type:
21 return str(lex_type)
22 return "unknown"
23
24
def ensure_core_repo(core_dir: Path, git_url: str) -> Path:
    """Return the lexicons directory inside ``core_dir``, cloning it if needed.

    If ``core_dir / LEXICONS_DIR`` already exists and contains at least one
    ``*.json`` file (searched recursively), it is reused as-is. Otherwise
    anything already present at ``core_dir`` is removed and ``git_url`` is
    shallow-cloned (``--depth 1``) into ``core_dir``.

    Args:
        core_dir: Target directory for the checkout.
        git_url: Repository URL passed to ``git clone``.

    Returns:
        Path to the lexicons directory inside ``core_dir``.

    Raises:
        OSError: If a stale ``core_dir`` cannot be removed.
        subprocess.CalledProcessError: If ``git clone`` exits non-zero.
        RuntimeError: If the clone succeeds but has no lexicons directory.
    """
    # Function-scope import: keeps this block self-contained without touching
    # the module's import section.
    import shutil

    lexicons = core_dir / LEXICONS_DIR
    if lexicons.is_dir() and any(lexicons.rglob("*.json")):
        log("stage 0", f"Using existing lexicons at {lexicons}")
        return lexicons

    log("stage 0", "Cloning Tangled core lexicons (first run only)...")
    log("stage 0", f" git clone --depth 1 {git_url}")
    core_dir.parent.mkdir(parents=True, exist_ok=True)
    if core_dir.exists():
        log("stage 0", f" Removing incomplete clone at {core_dir}")
        # Remove in-process instead of shelling out to ``rm -rf`` so this also
        # works where no ``rm`` binary is available (e.g. Windows).
        if core_dir.is_dir() and not core_dir.is_symlink():
            shutil.rmtree(core_dir)
        else:
            # Plain file or symlink: rmtree refuses these, unlink handles them.
            core_dir.unlink()

    subprocess.run(
        ["git", "clone", "--depth", "1", git_url, str(core_dir)],
        check=True,
    )
    if not lexicons.is_dir():
        raise RuntimeError(f"Expected lexicons directory at {lexicons} after clone")
    log("stage 0", "Clone complete.")
    return lexicons
46
47
def collect_lexicon_files(lexicons_dir: Path) -> list[Path]:
    """Return every ``*.json`` path found recursively under ``lexicons_dir``, sorted."""
    matches = list(lexicons_dir.rglob("*.json"))
    matches.sort()
    return matches
50
51
def run_stage0(
    dsn: str,
    *,
    core_dir: Path = DEFAULT_CORE_DIR,
    git_url: str = DEFAULT_CORE_GIT,
) -> dict[str, int]:
    """Import every lexicon JSON file from the core checkout into the database.

    Obtains the lexicons directory via ``ensure_core_repo``, then parses each
    ``*.json`` file under it. Each definition that has an ``id`` is stored
    with ``upsert_lexicon``. Crawl state is recorded under the key
    ``stage0:lexicons`` as ``running`` before the loop and ``complete``
    afterwards, followed by ``conn.commit()``.

    Files are skipped (with a progress line, and without being counted) when
    they are not valid UTF-8 JSON, when their top-level value is not a JSON
    object, or when they have no ``id``.

    Args:
        dsn: Connection string handed to ``connect``.
        core_dir: Directory of the core repository checkout.
        git_url: Repository URL used if the checkout has to be cloned.

    Returns:
        Counts of imported lexicons keyed by ``records``, ``queries``,
        ``procedures``, ``tokens`` and ``other``.

    Raises:
        RuntimeError: If no lexicon JSON files are found.
    """
    banner("STAGE 0 — Load Tangled lexicons (schemas)")
    log("stage 0", "Lexicons are JSON schema specs — not live API endpoints.")
    log("stage 0", "This stage stores every sh.tangled.* definition for later validation.")

    lexicons_dir = ensure_core_repo(core_dir, git_url)
    files = collect_lexicon_files(lexicons_dir)
    if not files:
        raise RuntimeError(f"No lexicon JSON files found under {lexicons_dir}")

    total = len(files)
    log("stage 0", f"Found {total} lexicon files to import.")

    stats = {"records": 0, "queries": 0, "procedures": 0, "tokens": 0, "other": 0}
    # Lexicon type -> stats key; built once, any unlisted type counts as "other".
    bucket_for = {
        "record": "records",
        "query": "queries",
        "procedure": "procedures",
        "token": "tokens",
    }

    with connect(dsn) as conn:
        set_crawl_state(conn, key="stage0:lexicons", status="running")

        for i, path in enumerate(files, start=1):
            rel = path.relative_to(core_dir).as_posix()
            try:
                # Decode explicitly as UTF-8 so the result does not depend on
                # the platform's locale encoding.
                definition = json.loads(path.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, UnicodeDecodeError) as exc:
                step("stage 0", i, total, f"SKIP {rel} — invalid JSON: {exc}")
                continue

            # Valid JSON may still be a list/string/number, which has no
            # ``.get``; skip it rather than abort the whole import.
            if not isinstance(definition, dict):
                step("stage 0", i, total, f"SKIP {rel} — not a JSON object")
                continue

            nsid = definition.get("id")
            if not nsid:
                step("stage 0", i, total, f"SKIP {rel} — missing id field")
                continue

            lex_type = _lexicon_type(definition)
            upsert_lexicon(
                conn,
                nsid=nsid,
                lexicon_type=lex_type,
                definition=definition,
                source_path=rel,
            )

            stats[bucket_for.get(lex_type, "other")] += 1
            step("stage 0", i, total, f"{nsid} ({lex_type})")

        set_crawl_state(
            conn,
            key="stage0:lexicons",
            status="complete",
            meta={"file_count": total, **stats},
        )
        conn.commit()

    log("stage 0", "Done.")
    log(
        "stage 0",
        f" records={stats['records']} queries={stats['queries']} "
        f"procedures={stats['procedures']} tokens={stats['tokens']} other={stats['other']}",
    )
    return stats
119 return stats