# 1. DB connection (you already have this in .env)
# DB_CONNECTION_STRING=postgresql://...

# 2. Python venv + deps
python3 -m venv scraper/.venv
source scraper/.venv/bin/activate
pip install -r scraper/requirements.txt

# 3. git is required on first run (stage 0 clones tangled.org/core lexicons)
git --version

## Run

source scraper/.venv/bin/activate

# Create tables
python scraper/scrape.py init

# Stage 0 — lexicons (~89 JSON files, prints each NSID)
python scraper/scrape.py stage0

# Stage 1 — probe knots
python scraper/scrape.py stage1

# Or both in one go
python scraper/scrape.py stage0-1

# Check counts
python scraper/scrape.py status

Progress is printed as timestamped lines, e.g.:

[12:34:56] [stage 0] (12/89) sh.tangled.repo (record)
[12:35:01] [stage 1]   OK  knot1.tangled.sh  version=1.14.0-alpha  owner=did:plc:...

## Knot configuration (optional)

# Explicit seed list (comma-separated hostnames)
export TANGLED_KNOT_SEEDS=knot1.tangled.sh,my.knot.example

# Auto-probe knot2..knot5 in addition to defaults
export TANGLED_KNOT_PROBE_MAX=5

# Extra hostnames
export TANGLED_KNOT_EXTRA=custom.knot.example

## Stage 2 — Discover repos via Tangled PDS

`sh.tangled.sync.listRepos` on knots currently returns 404 (not deployed yet),
so stage 2 discovers repos through the Tangled PDS at https://tngl.sh instead, in three phases:

| Phase | What | API |
|-------|------|-----|
| 1 | List all accounts | `com.atproto.sync.listRepos` |
| 2 | Repo records per account | `com.atproto.repo.listRecords` (`sh.tangled.repo`) |
| 3 | Enrich from knot | `sh.tangled.repo.describeRepo` |

~7,928 accounts on tngl.sh (as of testing). Full repo scan takes a while.

# Phase 1 only — count/list accounts (fast, ~10s)
python scraper/scrape.py stage2-accounts

# Phase 2 only — scan repo records (requires the accounts from phase 1 to be in the DB)
python scraper/scrape.py stage2-repos

# All phases in one run
python scraper/scrape.py stage2

python scraper/scrape.py status

## Optional env vars

# Test with first N accounts only
export TANGLED_STAGE2_ACCOUNT_LIMIT=50

# Resolve handles via plc.directory (slower)
export TANGLED_RESOLVE_HANDLES=1

# Skip knot describeRepo enrichment
export TANGLED_STAGE2_ENRICH_KNOTS=0

# Override PDS (default https://tngl.sh)
export TANGLED_PDS_URL=https://tngl.sh

## SQL tables created

tangled_lexicons — NSID → full lexicon JSON
tangled_knots — probed knot servers
tangled_pds_accounts — every account on tngl.sh PDS
tangled_repos — sh.tangled.repo records + optional knot metadata
tangled_crawl_state — run metadata per stage

Configure Feed

Configure Feed

Tangled scraper (stages 0–1)#

What this does / does NOT do#

Setup#

Run#

Knot configuration (optional)#

Stage 2 — Discover repos via Tangled PDS#

Optional env vars#

SQL tables created#