Sunstead trust scoring project
1"""Config: env-driven paths (all large artifacts under DATA_ROOT) + tuning knobs.
2
3PRD section 4.1: every large file lives on the external drive, routed through
4DATA_ROOT. We fail fast at startup if DATA_ROOT is set-but-unwritable, so a
5half-run never scatters the DuckDB file onto the system disk.
6"""
7
8from __future__ import annotations
9
10import os
11from dataclasses import dataclass, field
12from pathlib import Path
13
# Storage layout. Each location can be overridden through the environment
# variable of the same name; an unset or empty variable falls through to the
# default, which (apart from DATA_ROOT itself) hangs off DATA_ROOT.
#
# ponytail: with DATA_ROOT unset (dev) the root falls back to a repo-local
# ".data" directory, and ensure_data_root() prints a notice about it. The PRD's
# safety property ("never scatter onto system disk if the drive is gone") rests
# on the writability check in ensure_data_root(); in prod, point DATA_ROOT at
# /Volumes/EXT.
DATA_ROOT = Path(os.getenv("DATA_ROOT") or Path(__file__).resolve().parents[2] / ".data")
DUCKDB_PATH = Path(os.getenv("DUCKDB_PATH") or DATA_ROOT.joinpath("duckdb", "trust.duckdb"))
STAGING_DIR = Path(os.getenv("STAGING_DIR") or DATA_ROOT.joinpath("staging"))
MODEL_DIR = Path(os.getenv("MODEL_DIR") or DATA_ROOT.joinpath("models"))
LOG_DIR = Path(os.getenv("LOG_DIR") or DATA_ROOT.joinpath("logs"))
22
23
# Flips to True once the dev-fallback notice has been printed, so repeated
# calls to ensure_data_root() only announce it once per process.
_warned = False


def ensure_data_root() -> Path:
    """Create the DATA_ROOT subtree and verify DATA_ROOT is writable. Call at startup.

    Creates DATA_ROOT, the DuckDB file's parent directory, STAGING_DIR,
    MODEL_DIR and LOG_DIR, then writes and removes a small probe file directly
    under DATA_ROOT. When the DATA_ROOT environment variable is unset or
    empty, a one-time notice about the dev fallback location is printed first.

    Returns:
        DATA_ROOT, after the directories exist and the probe succeeded.

    Raises:
        SystemExit: if any directory cannot be created, or the probe file
            cannot be written or removed. The underlying OSError is chained
            as the cause.
    """
    global _warned
    if not os.environ.get("DATA_ROOT") and not _warned:
        print(f"[config] DATA_ROOT unset -> dev fallback {DATA_ROOT} (set DATA_ROOT for the external drive)")
        _warned = True

    probe = DATA_ROOT / ".write_probe"
    try:
        # Directory creation sits inside the try on purpose: an unmounted or
        # read-only drive usually fails right here, and that must surface as
        # the same fail-fast exit as a failed probe, not as a raw traceback.
        for d in (DATA_ROOT, DUCKDB_PATH.parent, STAGING_DIR, MODEL_DIR, LOG_DIR):
            d.mkdir(parents=True, exist_ok=True)
        probe.write_text("ok")
        probe.unlink()
    except OSError as e:
        raise SystemExit(
            f"DATA_ROOT not writable ({DATA_ROOT}): {e}. Is the external drive mounted?"
        ) from e
    return DATA_ROOT
42
43
# --- Tangled lexicon NSIDs -------------------------------------------------
# PRD 6.1: the defaults below are unconfirmed. Check them against the
# tangled.org core lexicons and a live Jetstream sample before trusting them;
# `python -m trust.ingest --probe` logs the real `collection` values seen on
# the wire. Both settings can be overridden via env without touching code.
# Known fact: Tangled records live under `sh.tangled.*`.
WANTED_COLLECTIONS = os.getenv("WANTED_COLLECTIONS", "sh.tangled.*,app.bsky.graph.*")
JETSTREAM_URL = os.getenv("JETSTREAM_URL", "wss://jetstream2.us-east.bsky.network/subscribe")
53
# Maps a needle (a fragment of a collection NSID) to our internal record kind.
# Needles are matched as substrings of the NSID. The shapes are best guesses;
# `--probe` tells you the truth.
# ponytail: a substring map instead of a lexicon parser; swap to exact NSIDs
# once they are confirmed.
COLLECTION_KINDS: dict[str, str] = {
    # Pull requests.
    "tangled.pull": "pull_request",
    "tangled.repo.pull": "pull_request",
    # Authoritative merge outcome (.merged/.closed/.open). This needle MUST be
    # more specific than the pull needles above: _kind() takes the longest
    # matching needle, so it wins over "tangled.repo.pull".
    "tangled.repo.pull.status": "pull_status",
    # Vouches and denouncements.
    "tangled.vouch": "vouch",
    "tangled.graph.vouch": "vouch",
    "tangled.denounce": "denounce",
    # CI activity.
    "tangled.pipeline": "ci",
    "tangled.spindle": "ci",
    "tangled.issue": "issue",
    # The real NSID is sh.tangled.feed.star; the ".feed." segment means a
    # plain "tangled.star" needle would never match.
    "tangled.feed.star": "star",
    # Jurisdiction attestation (6.13); CONFIRM NSID.
    "tangled.attestation": "attestation",
    "tangled.jurisdiction": "attestation",
    "bsky.graph.follow": "follow",
}
74
75
@dataclass
class GateConfig:
    """Thresholds for the fusion gate (PRD 6.7).

    T_LOW / T_HIGH are the score cut-offs that route an item; R_LOW / R_HIGH
    are the content-risk cut-offs. T_HIGH should be tuned from calibration so
    that the historical false-approval rate above it stays under the chosen
    budget.
    """

    # Below this -> needs_human.
    T_LOW: float = 0.30
    # Above this (and content clean) -> fast_lane.
    T_HIGH: float = 0.70
    # Content risk at this level is considered clean.
    R_LOW: float = 0.20
    # Content risk at this level is considered dangerous.
    R_HIGH: float = 0.60
85
86
@dataclass
class EigenConfig:
    """Tuning knobs for the eigen step: restart, iteration count, vouch weighting."""

    # Restart probability.
    alpha: float = 0.15
    # Iteration count (presumably a fixed number of solver passes; confirm
    # against the code that consumes it).
    iters: int = 50
    # Half-life, in days, of the time-decay applied to vouch weight.
    age_halflife_days: float = 180.0
    # A vouch that carries PR evidence weighs more by this factor.
    evidence_boost: float = 1.5
93
94
@dataclass
class ReviewConfig:
    """Model selection and size limits for the review step.

    Only ``model`` is read from the environment (CLAUDE_MODEL). The lookup runs
    once, when this class body is executed, so later changes to the variable
    do not affect the default.
    """

    model: str = os.getenv("CLAUDE_MODEL", "claude-sonnet-4-6")
    prepass_model: str = "claude-haiku-4-5-20251001"
    escalate_model: str = "claude-opus-4-8"
    # Token budget guard: cap on diff size, in characters.
    max_diff_chars: int = 24_000
    # Name of the environment variable expected to hold the API key.
    api_key_env: str = "ANTHROPIC_API_KEY"
102
103
@dataclass
class EmbedConfig:
    """Settings for Featherless (OpenAI-compatible) embeddings.

    Used by the diff/slop-similarity path. The model name and base URL are
    env-overridable, so a renamed model never needs a code edit. All
    environment lookups run once, when this class body is executed.
    """

    model: str = os.getenv("EMBED_MODEL", "Qwen/Qwen3-Embedding-4B")
    base_url: str = os.getenv("FEATHERLESS_BASE_URL", "https://api.featherless.ai/v1")
    # Name of the environment variable expected to hold the API key.
    api_key_env: str = "FEATHERLESS_API_KEY"
    # MRL truncation. None keeps the model-native dimension
    # (Qwen3-Embedding-4B = 2560); set EMBED_DIMENSIONS to store smaller
    # vectors in DuckDB. The server ignores it if unsupported.
    dimensions: int | None = None if not os.getenv("EMBED_DIMENSIONS") else int(os.environ["EMBED_DIMENSIONS"])
    # Inputs per request.
    batch: int = 32
    # Truncate giant diffs (matches the review token budget).
    max_chars: int = 24_000
    # Request timeout (presumably seconds; confirm against the client code).
    timeout: float = 60.0
118
119
@dataclass
class Config:
    """Top-level bundle of every tuning section plus the label-mining window.

    Each section uses a default factory, so every Config instance gets its own
    fresh sub-config objects rather than sharing one.
    """

    gate: GateConfig = field(default_factory=GateConfig)
    eigen: EigenConfig = field(default_factory=EigenConfig)
    review: ReviewConfig = field(default_factory=ReviewConfig)
    embed: EmbedConfig = field(default_factory=EmbedConfig)
    # PRD 6.3 label-mining N.
    clean_merge_window_days: int = 14


# Default configuration instance, built once when this module is imported.
CFG = Config()