This repository has no description
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 7.1 kB View raw
#!/usr/bin/env python3
"""Command-line entry point: scrape Tangled into Postgres.

Each sub-command maps to one stage/runner imported below. The database
connection string is read from the ``DB_CONNECTION_STRING`` environment
variable (optionally loaded from a ``.env`` file).
"""
from __future__ import annotations

import argparse
import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# Make the repository root importable before pulling in project modules.
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from daily_issue_scraper.pipeline import run_daily_sync
from db import (
    connect,
    count_accounts_with_repos,
    count_knots,
    count_lexicons,
    count_pds_accounts,
    count_repos,
    init_schema,
    table_counts,
)
from progress import banner, die, log
from stage0_lexicons import run_stage0
from stage1_knots import run_stage1
from stage2_network import run_stage2_network
from stage2_pds import run_stage2, run_stage2_accounts_only, run_stage2_repos_only
from check_readmes import run_check_readmes
from embed_readmes import run_embed_readmes
from fetch_collaborators import run_fetch_collaborators
from fetch_issues import run_fetch_issues
from embed_issues import run_embed_issues
from backfill_repos_from_issues import run_backfill_repos_from_issues
from stage4_repo_metadata import run_stage4


def load_env() -> None:
    """Load environment variables from the first ``.env`` file found.

    Looks at ``REPO_ROOT/.env`` first, then at ``.env`` next to this script.
    If neither exists, falls back to ``load_dotenv()`` with no explicit path.
    """
    for candidate in (REPO_ROOT / ".env", Path(__file__).parent / ".env"):
        if candidate.exists():
            load_dotenv(candidate)
            log("setup", f"Loaded env from {candidate}")
            return
    load_dotenv()


def require_dsn() -> str:
    """Return the stripped ``DB_CONNECTION_STRING`` environment variable.

    When the variable is missing or blank, ``die`` is called with setup
    instructions (presumably terminating the process — confirm in
    ``progress.die``).
    """
    dsn = os.getenv("DB_CONNECTION_STRING", "").strip()
    if not dsn:
        die(
            "DB_CONNECTION_STRING is not set.\n"
            "Add it to the repo-root .env file, e.g.:\n"
            " DB_CONNECTION_STRING=postgresql://user:pass@host:5432/postgres"
        )
    return dsn


def cmd_init(dsn: str) -> None:
    """Create/migrate the scraper tables and log completion."""
    banner("INIT — Create scraper tables")
    init_schema(dsn)
    log("init", "Schema ready (stages 0–6 migrations applied).")


def cmd_status(dsn: str) -> None:
    """Log row counts for the scraper tables and the recorded crawl state."""
    banner("STATUS")

    # Fetch everything eagerly so reporting happens after the connection
    # context has been exited.
    with connect(dsn) as conn:
        lex = count_lexicons(conn)
        knots = count_knots(conn)
        accounts = count_pds_accounts(conn)
        accounts_with_repos = count_accounts_with_repos(conn)
        repos = count_repos(conn)
        reachable = conn.execute(
            "select count(*) as n from tangled_knots where reachable = true"
        ).fetchone()
        counts = table_counts(conn)
        states = conn.execute(
            "select key, status, meta, updated_at from tangled_crawl_state order by key"
        ).fetchall()

    reachable_count = reachable["n"] if reachable else 0

    log("status", "── Stages 0–2 (implemented) ──")
    log("status", f" tangled_lexicons: {lex}")
    log("status", f" tangled_knots: {knots} ({reachable_count} reachable)")
    log("status", f" tangled_pds_accounts: {accounts} ({accounts_with_repos} with repos)")
    log("status", f" tangled_repos: {repos}")

    log("status", "── Stages 3–6 (schema ready, scrapers pending) ──")
    # NOTE(review): any column-alignment padding in these labels may have been
    # lost in the text this file was reconstructed from — confirm upstream.
    later_stage_tables = (
        "tangled_identities",
        "tangled_atproto_records",
        "tangled_backlinks",
        "tangled_xrpc_snapshots",
        "tangled_git_archives",
        "tangled_git_blobs",
        "tangled_readmes",
        "tangled_issues",
        "tangled_repo_collaborators",
    )
    for table in later_stage_tables:
        log("status", f" {table}: {counts[table]}")

    if not states:
        log("status", "No crawl runs recorded yet.")
        return

    log("status", "Crawl state:")
    for row in states:
        meta = row.get("meta") or {}
        extra = ""
        # Only surface the account count when the metadata carries one.
        if isinstance(meta, dict) and "account_count" in meta:
            extra = f" accounts={meta['account_count']}"
        log("status", f" {row['key']}: {row['status']}{extra} @ {row['updated_at']}")


def _run_stage0_and_1(dsn: str) -> None:
    """Run stage 0 then stage 1, separated by a blank line on stdout."""
    run_stage0(dsn)
    print()
    run_stage1(dsn)


def main(argv: list[str] | None = None) -> None:
    """Parse the command line and run the selected sub-command.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.
    """
    # Single source of truth for command names: insertion order drives the
    # argparse ``choices`` list, and the values are the handlers to run.
    handlers = {
        "init": cmd_init,
        "stage0": run_stage0,
        "stage1": run_stage1,
        "stage0-1": _run_stage0_and_1,
        "stage2": run_stage2,
        "stage2-accounts": run_stage2_accounts_only,
        "stage2-repos": run_stage2_repos_only,
        "stage2-network": run_stage2_network,
        "stage4": run_stage4,
        "check-readmes": run_check_readmes,
        "embed-readmes": run_embed_readmes,
        "fetch-collaborators": run_fetch_collaborators,
        "fetch-issues": run_fetch_issues,
        "embed-issues": run_embed_issues,
        "backfill-repos-from-issues": run_backfill_repos_from_issues,
        "sync-daily": run_daily_sync,
        "status": cmd_status,
    }

    parser = argparse.ArgumentParser(
        description="Scrape Tangled into Postgres.",
    )
    parser.add_argument(
        "command",
        choices=list(handlers),
        help=(
            "init=tables | stage0=lexicons | stage1=knots | stage2=full PDS crawl | "
            "stage2-accounts=count/list accounts | stage2-repos=scan repo records | "
            "stage2-network=all repos (Bluesky+tngl via appview) | "
            "stage4=deeper repo metadata (branches, tags, collaborators) | "
            "check-readmes=fetch README from knot git for each repo | "
            "embed-readmes=Gemini embeddings into tangled_readmes.embedding | "
            "fetch-collaborators=listCollaborators for all repos | "
            "fetch-issues=scrape issues from user PDSes | "
            "embed-issues=Gemini embeddings into tangled_issues.embedding | "
            "backfill-repos-from-issues=fetch repos referenced by issues but missing from tangled_repos | "
            "sync-daily=run full daily sync pipeline"
        ),
    )
    args = parser.parse_args(argv)

    load_env()
    dsn = require_dsn()

    # ``init`` applies the schema itself; every other command gets the schema
    # ensured up front before its handler runs.
    if args.command != "init":
        init_schema(dsn)

    handlers[args.command](dsn)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        # 130 = conventional exit status for termination by Ctrl-C (128 + SIGINT).
        raise SystemExit(130) from None