This repository has no description
1#!/usr/bin/env python3
2from __future__ import annotations
3
4import argparse
5import os
6import sys
7from pathlib import Path
8
9from dotenv import load_dotenv
10
# Directory two levels above this file (the parent of the directory that
# contains this script), with symlinks resolved.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Put that directory at the front of the import search path, once, so the
# project imports that follow can be resolved from it.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
14
15from daily_issue_scraper.pipeline import run_daily_sync
16from db import (
17 connect,
18 count_accounts_with_repos,
19 count_knots,
20 count_lexicons,
21 count_pds_accounts,
22 count_repos,
23 init_schema,
24 table_counts,
25)
26from progress import banner, die, log
27from stage0_lexicons import run_stage0
28from stage1_knots import run_stage1
29from stage2_network import run_stage2_network
30from stage2_pds import run_stage2, run_stage2_accounts_only, run_stage2_repos_only
31from check_readmes import run_check_readmes
32from embed_readmes import run_embed_readmes
33from fetch_collaborators import run_fetch_collaborators
34from fetch_issues import run_fetch_issues
35from embed_issues import run_embed_issues
36from backfill_repos_from_issues import run_backfill_repos_from_issues
37from stage4_repo_metadata import run_stage4
38
39
def load_env() -> None:
    """Load environment variables from a ``.env`` file.

    Two locations are checked in order: ``REPO_ROOT / ".env"`` and a ``.env``
    next to this script. The first one that exists is passed to
    ``load_dotenv`` and the choice is logged. When neither exists,
    ``load_dotenv()`` is called without a path so the library applies its own
    lookup.
    """
    search_paths = (REPO_ROOT / ".env", Path(__file__).parent / ".env")
    env_file = next((path for path in search_paths if path.exists()), None)

    if env_file is None:
        # No explicit file found; defer to load_dotenv's default behaviour.
        load_dotenv()
        return

    load_dotenv(env_file)
    log("setup", f"Loaded env from {env_file}")
47
48
def require_dsn() -> str:
    """Return the database connection string from ``DB_CONNECTION_STRING``.

    Leading and trailing whitespace is stripped. If the variable is unset or
    blank, ``die`` is called with instructions for adding it to the
    repo-root ``.env`` file.

    Returns:
        The stripped connection string.
    """
    dsn = os.environ.get("DB_CONNECTION_STRING", "").strip()
    if dsn:
        return dsn

    die(
        "DB_CONNECTION_STRING is not set.\n"
        "Add it to the repo-root .env file, e.g.:\n"
        " DB_CONNECTION_STRING=postgresql://user:pass@host:5432/postgres"
    )
    # NOTE(review): die() presumably terminates the process; the return is
    # kept so the (empty) value is still handed back if it ever does not.
    return dsn
58
59
def cmd_init(dsn: str) -> None:
    """Handle the ``init`` command: create the scraper tables.

    Prints a banner, delegates the schema work to ``init_schema`` and logs a
    completion message.

    Args:
        dsn: Database connection string passed through to ``init_schema``.
    """
    banner("INIT — Create scraper tables")
    init_schema(dsn)
    done_message = "Schema ready (stages 0–6 migrations applied)."
    log("init", done_message)
64
65
def cmd_status(dsn: str) -> None:
    """Handle the ``status`` command: log table counts and crawl state.

    All queries run on a single connection; the report is logged after the
    connection context has exited.

    Args:
        dsn: Database connection string passed to ``connect``.
    """
    banner("STATUS")

    # Gather every figure first so the connection is only held for the reads.
    with connect(dsn) as conn:
        lexicon_total = count_lexicons(conn)
        knot_total = count_knots(conn)
        account_total = count_pds_accounts(conn)
        accounts_with_repos = count_accounts_with_repos(conn)
        repo_total = count_repos(conn)
        reachable_row = conn.execute(
            "select count(*) as n from tangled_knots where reachable = true"
        ).fetchone()
        counts = table_counts(conn)
        crawl_states = conn.execute(
            "select key, status, meta, updated_at from tangled_crawl_state order by key"
        ).fetchall()

    # fetchone() may yield nothing; report zero reachable knots in that case.
    reachable_total = reachable_row["n"] if reachable_row else 0

    log("status", "── Stages 0–2 (implemented) ──")
    log("status", f" tangled_lexicons: {lexicon_total}")
    log("status", f" tangled_knots: {knot_total} ({reachable_total} reachable)")
    log("status", f" tangled_pds_accounts: {account_total} ({accounts_with_repos} with repos)")
    log("status", f" tangled_repos: {repo_total}")

    log("status", "── Stages 3–6 (schema ready, scrapers pending) ──")
    later_stage_tables = (
        "tangled_identities",
        "tangled_atproto_records",
        "tangled_backlinks",
        "tangled_xrpc_snapshots",
        "tangled_git_archives",
        "tangled_git_blobs",
        "tangled_readmes",
        "tangled_issues",
        "tangled_repo_collaborators",
    )
    for table in later_stage_tables:
        log("status", f" {table}: {counts[table]}")

    if not crawl_states:
        log("status", "No crawl runs recorded yet.")
        return

    log("status", "Crawl state:")
    for row in crawl_states:
        meta = row.get("meta") or {}
        # Only rows whose meta is a dict carrying account_count get the suffix.
        has_account_count = isinstance(meta, dict) and "account_count" in meta
        extra = f" accounts={meta['account_count']}" if has_account_count else ""
        log("status", f" {row['key']}: {row['status']}{extra} @ {row['updated_at']}")
109
110
def main(argv: list[str] | None = None) -> None:
    """Parse the command line and run the selected scraper command.

    After parsing, the environment is loaded and the connection string is
    required. ``init`` only creates the schema; every other command first
    calls ``init_schema`` and then runs its handler with the connection
    string.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.
    """
    command_names = [
        "init",
        "stage0",
        "stage1",
        "stage0-1",
        "stage2",
        "stage2-accounts",
        "stage2-repos",
        "stage2-network",
        "stage4",
        "check-readmes",
        "embed-readmes",
        "fetch-collaborators",
        "fetch-issues",
        "embed-issues",
        "backfill-repos-from-issues",
        "sync-daily",
        "status",
    ]
    command_help = " | ".join(
        [
            "init=tables",
            "stage0=lexicons",
            "stage1=knots",
            "stage2=full PDS crawl",
            "stage2-accounts=count/list accounts",
            "stage2-repos=scan repo records",
            "stage2-network=all repos (Bluesky+tngl via appview)",
            "stage4=deeper repo metadata (branches, tags, collaborators)",
            "check-readmes=fetch README from knot git for each repo",
            "embed-readmes=Gemini embeddings into tangled_readmes.embedding",
            "fetch-collaborators=listCollaborators for all repos",
            "fetch-issues=scrape issues from user PDSes",
            "embed-issues=Gemini embeddings into tangled_issues.embedding",
            "backfill-repos-from-issues=fetch repos referenced by issues but missing from tangled_repos",
            "sync-daily=run full daily sync pipeline",
        ]
    )

    parser = argparse.ArgumentParser(description="Scrape Tangled into Postgres.")
    parser.add_argument("command", choices=command_names, help=command_help)
    args = parser.parse_args(argv)

    load_env()
    dsn = require_dsn()

    # "init" is the only command that does not go through the shared
    # init_schema() call below; cmd_init performs it itself.
    if args.command == "init":
        cmd_init(dsn)
        return

    init_schema(dsn)

    def run_stage0_then_stage1(target_dsn: str) -> None:
        # Combined command: a blank line separates the two stages' output.
        run_stage0(target_dsn)
        print()
        run_stage1(target_dsn)

    handlers = {
        "stage0": run_stage0,
        "stage1": run_stage1,
        "stage0-1": run_stage0_then_stage1,
        "stage2": run_stage2,
        "stage2-accounts": run_stage2_accounts_only,
        "stage2-repos": run_stage2_repos_only,
        "stage2-network": run_stage2_network,
        "stage4": run_stage4,
        "check-readmes": run_check_readmes,
        "embed-readmes": run_embed_readmes,
        "fetch-collaborators": run_fetch_collaborators,
        "fetch-issues": run_fetch_issues,
        "embed-issues": run_embed_issues,
        "backfill-repos-from-issues": run_backfill_repos_from_issues,
        "sync-daily": run_daily_sync,
        "status": cmd_status,
    }

    handler = handlers.get(args.command)
    if handler is not None:
        handler(dsn)
195
196
if __name__ == "__main__":
    # Script entry point. On Ctrl-C, print a short notice to stderr and exit
    # with status 130; "from None" drops the KeyboardInterrupt as the
    # exception context.
    try:
        main()
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        raise SystemExit(130) from None
202 raise SystemExit(130) from None