Sunstead trust scoring project
0

Configure Feed

Select the types of activity you want to include in your feed.

at main 5.6 kB View raw
"""Config: env-driven paths (all large artifacts under DATA_ROOT) + tuning knobs.

PRD section 4.1: every large file lives on the external drive, routed through
DATA_ROOT. We fail fast at startup if DATA_ROOT is set-but-unwritable, so a
half-run never scatters the DuckDB file onto the system disk.
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from pathlib import Path


def _default_data_root() -> Path:
    """Return the dev fallback root: ``.data`` two directories above this file's folder.

    If the module sits fewer than three levels below the filesystem root, the
    shallowest available ancestor is used instead of raising ``IndexError``.
    """
    ancestors = Path(__file__).resolve().parents
    return ancestors[min(2, len(ancestors) - 1)] / ".data"


# ponytail: DATA_ROOT unset in dev -> fall back to repo-local .data with a warning.
# The PRD's safety property ("never scatter onto system disk if the drive is gone")
# is satisfied by the writability assert below; point DATA_ROOT at /Volumes/EXT in prod.
DATA_ROOT = Path(os.environ.get("DATA_ROOT") or _default_data_root())
DUCKDB_PATH = Path(os.environ.get("DUCKDB_PATH") or (DATA_ROOT / "duckdb" / "trust.duckdb"))
STAGING_DIR = Path(os.environ.get("STAGING_DIR") or (DATA_ROOT / "staging"))
MODEL_DIR = Path(os.environ.get("MODEL_DIR") or (DATA_ROOT / "models"))
LOG_DIR = Path(os.environ.get("LOG_DIR") or (DATA_ROOT / "logs"))


# Set once the "DATA_ROOT unset" notice has been printed, so it appears only once.
_warned = False


def ensure_data_root() -> Path:
    """Create the DATA_ROOT subtree and assert it is writable. Call at startup.

    Returns:
        The resolved ``DATA_ROOT`` path.

    Raises:
        SystemExit: if any directory cannot be created or the write probe
            fails (for example, the external drive is not mounted).
    """
    global _warned
    if not os.environ.get("DATA_ROOT") and not _warned:
        print(f"[config] DATA_ROOT unset -> dev fallback {DATA_ROOT} (set DATA_ROOT for the external drive)")
        _warned = True
    probe = DATA_ROOT / ".write_probe"
    try:
        # Directory creation is part of the writability check: a missing or
        # read-only mount fails here first, and must produce the same
        # fail-fast message as a failed probe write.
        for d in (DATA_ROOT, DUCKDB_PATH.parent, STAGING_DIR, MODEL_DIR, LOG_DIR):
            d.mkdir(parents=True, exist_ok=True)
        probe.write_text("ok")
        probe.unlink()
    except OSError as e:
        raise SystemExit(
            f"DATA_ROOT not writable ({DATA_ROOT}): {e}. Is the external drive mounted?"
        ) from e
    return DATA_ROOT


# --- Tangled lexicon NSIDs -------------------------------------------------
# PRD 6.1: CONFIRM these against tangled.org core lexicons + a live Jetstream
# sample before trusting them; `python -m trust.ingest --probe` logs the real
# `collection` values seen on the wire. Override via env without touching code.
# Known fact: Tangled records live under `sh.tangled.*`.
WANTED_COLLECTIONS = os.environ.get("WANTED_COLLECTIONS", "sh.tangled.*,app.bsky.graph.*")
JETSTREAM_URL = os.environ.get(
    "JETSTREAM_URL", "wss://jetstream2.us-east.bsky.network/subscribe"
)

# collection -> our internal record kind. Patterns are substring matches on the
# NSID. These are best-guess shapes; `--probe` tells you the truth. ponytail:
# substring map over a lexicon parser; swap to exact NSIDs once confirmed.
COLLECTION_KINDS: dict[str, str] = {
    "tangled.pull": "pull_request",
    "tangled.repo.pull": "pull_request",
    # Authoritative merge outcome (.merged/.closed/.open). MUST be more specific
    # than the pull rule above; _kind() takes the longest matching needle, so
    # this wins over "tangled.repo.pull".
    "tangled.repo.pull.status": "pull_status",
    "tangled.vouch": "vouch",
    "tangled.graph.vouch": "vouch",
    "tangled.denounce": "denounce",
    "tangled.pipeline": "ci",
    "tangled.spindle": "ci",
    "tangled.issue": "issue",
    "tangled.feed.star": "star",  # real NSID is sh.tangled.feed.star (".feed." breaks "tangled.star")
    "tangled.attestation": "attestation",  # jurisdiction attestation (6.13); CONFIRM NSID
    "tangled.jurisdiction": "attestation",
    "bsky.graph.follow": "follow",
}


@dataclass
class GateConfig:
    """Fusion gate thresholds (PRD 6.7). Tune T_HIGH from calibration so the
    historical false-approval rate above it stays under the chosen budget."""

    T_LOW: float = 0.30  # below -> needs_human
    T_HIGH: float = 0.70  # above (and content clean) -> fast_lane
    R_LOW: float = 0.20  # content risk considered clean
    R_HIGH: float = 0.60  # content risk considered dangerous


@dataclass
class EigenConfig:
    """Eigen scoring parameters: restart probability, iteration count, vouch weighting."""

    alpha: float = 0.15  # restart probability
    iters: int = 50
    age_halflife_days: float = 180.0  # vouch weight time-decay
    evidence_boost: float = 1.5  # vouch carrying PR evidence weighs more


@dataclass
class ReviewConfig:
    """Model names and limits for the review step.

    ``model`` is read from the ``CLAUDE_MODEL`` environment variable once, when
    this class is defined (i.e. at import time).
    """

    model: str = os.environ.get("CLAUDE_MODEL", "claude-sonnet-4-6")
    prepass_model: str = "claude-haiku-4-5-20251001"
    escalate_model: str = "claude-opus-4-8"
    max_diff_chars: int = 24_000  # token budget guard
    api_key_env: str = "ANTHROPIC_API_KEY"


@dataclass
class EmbedConfig:
    """Featherless (OpenAI-compatible) embeddings for the diff/slop-similarity path.
    Model + base_url are env-overridable so a renamed model never needs a code edit."""

    model: str = os.environ.get("EMBED_MODEL", "Qwen/Qwen3-Embedding-4B")
    base_url: str = os.environ.get("FEATHERLESS_BASE_URL", "https://api.featherless.ai/v1")
    api_key_env: str = "FEATHERLESS_API_KEY"
    # MRL truncation: None -> model-native dim (Qwen3-Embedding-4B = 2560). Set to
    # store smaller vectors in DuckDB. Server ignores it if unsupported.
    dimensions: int | None = int(os.environ["EMBED_DIMENSIONS"]) if os.environ.get("EMBED_DIMENSIONS") else None
    batch: int = 32  # inputs per request
    max_chars: int = 24_000  # truncate giant diffs (matches the review token budget)
    timeout: float = 60.0


@dataclass
class Config:
    """Top-level settings bundle; each section gets its own fresh default instance."""

    gate: GateConfig = field(default_factory=GateConfig)
    eigen: EigenConfig = field(default_factory=EigenConfig)
    review: ReviewConfig = field(default_factory=ReviewConfig)
    embed: EmbedConfig = field(default_factory=EmbedConfig)
    clean_merge_window_days: int = 14  # PRD 6.3 label-mining N


# Module-wide default configuration instance.
CFG = Config()