This repository has no description
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 3.9 kB View raw
1from __future__ import annotations 2 3import json 4import subprocess 5from pathlib import Path 6from typing import Any 7 8from db import connect, set_crawl_state, upsert_lexicon 9from progress import banner, log, step 10 11REPO_ROOT = Path(__file__).resolve().parent.parent 12DEFAULT_CORE_GIT = "https://tangled.org/tangled.org/core.git" 13DEFAULT_CORE_DIR = REPO_ROOT / ".cache" / "tangled-core" 14LEXICONS_DIR = "lexicons" 15 16 17def _lexicon_type(definition: dict[str, Any]) -> str: 18 main = definition.get("defs", {}).get("main", {}) 19 lex_type = main.get("type") 20 if lex_type: 21 return str(lex_type) 22 return "unknown" 23 24 25def ensure_core_repo(core_dir: Path, git_url: str) -> Path: 26 lexicons = core_dir / LEXICONS_DIR 27 if lexicons.is_dir() and any(lexicons.rglob("*.json")): 28 log("stage 0", f"Using existing lexicons at {lexicons}") 29 return lexicons 30 31 log("stage 0", f"Cloning Tangled core lexicons (first run only)...") 32 log("stage 0", f" git clone --depth 1 {git_url}") 33 core_dir.parent.mkdir(parents=True, exist_ok=True) 34 if core_dir.exists(): 35 log("stage 0", f" Removing incomplete clone at {core_dir}") 36 subprocess.run(["rm", "-rf", str(core_dir)], check=True) 37 38 subprocess.run( 39 ["git", "clone", "--depth", "1", git_url, str(core_dir)], 40 check=True, 41 ) 42 if not lexicons.is_dir(): 43 raise RuntimeError(f"Expected lexicons directory at {lexicons} after clone") 44 log("stage 0", f"Clone complete.") 45 return lexicons 46 47 48def collect_lexicon_files(lexicons_dir: Path) -> list[Path]: 49 return sorted(lexicons_dir.rglob("*.json")) 50 51 52def run_stage0( 53 dsn: str, 54 *, 55 core_dir: Path = DEFAULT_CORE_DIR, 56 git_url: str = DEFAULT_CORE_GIT, 57) -> dict[str, int]: 58 banner("STAGE 0 — Load Tangled lexicons (schemas)") 59 log("stage 0", "Lexicons are JSON schema specs — not live API endpoints.") 60 log("stage 0", "This stage stores every sh.tangled.* definition for later validation.") 61 62 lexicons_dir = ensure_core_repo(core_dir, git_url) 63 files = collect_lexicon_files(lexicons_dir) 64 if not files: 65 raise RuntimeError(f"No lexicon JSON files found under {lexicons_dir}") 66 67 log("stage 0", f"Found {len(files)} lexicon files to import.") 68 69 stats = {"records": 0, "queries": 0, "procedures": 0, "tokens": 0, "other": 0} 70 71 with connect(dsn) as conn: 72 set_crawl_state(conn, key="stage0:lexicons", status="running") 73 74 for i, path in enumerate(files, start=1): 75 rel = path.relative_to(core_dir).as_posix() 76 try: 77 definition = json.loads(path.read_text()) 78 except json.JSONDecodeError as exc: 79 step("stage 0", i, len(files), f"SKIP {rel} — invalid JSON: {exc}") 80 continue 81 82 nsid = definition.get("id") 83 if not nsid: 84 step("stage 0", i, len(files), f"SKIP {rel} — missing id field") 85 continue 86 87 lex_type = _lexicon_type(definition) 88 upsert_lexicon( 89 conn, 90 nsid=nsid, 91 lexicon_type=lex_type, 92 definition=definition, 93 source_path=rel, 94 ) 95 96 bucket = { 97 "record": "records", 98 "query": "queries", 99 "procedure": "procedures", 100 "token": "tokens", 101 }.get(lex_type, "other") 102 stats[bucket] += 1 103 step("stage 0", i, len(files), f"{nsid} ({lex_type})") 104 105 set_crawl_state( 106 conn, 107 key="stage0:lexicons", 108 status="complete", 109 meta={"file_count": len(files), **stats}, 110 ) 111 conn.commit() 112 113 log("stage 0", "Done.") 114 log( 115 "stage 0", 116 f" records={stats['records']} queries={stats['queries']} " 117 f"procedures={stats['procedures']} tokens={stats['tokens']} other={stats['other']}", 118 ) 119 return stats