src/trust/config.py at main · veikka.tngl.sh/sunstead

Sunstead trust scoring project
sunstead / src / trust / config.py
at main 5.6 kB View raw
Veikka Silvekoski Update sunstead: new modules (embed, voice, content, diffs, merged, vouchsafe), web UI, docs, scorer Dockerfile 1d ago
3df319f5
  1"""Config: env-driven paths (all large artifacts under DATA_ROOT) + tuning knobs.
  2
  3PRD section 4.1: every large file lives on the external drive, routed through
  4DATA_ROOT. We fail fast at startup if DATA_ROOT is set-but-unwritable, so a
  5half-run never scatters the DuckDB file onto the system disk.
  6"""
  7
  8from __future__ import annotations
  9
 10import os
 11from dataclasses import dataclass, field
 12from pathlib import Path
 13
 14# ponytail: DATA_ROOT unset in dev -> fall back to repo-local .data with a warning.
 15# The PRD's safety property ("never scatter onto system disk if the drive is gone")
 16# is satisfied by the writability assert below; point DATA_ROOT at /Volumes/EXT in prod.
 17DATA_ROOT = Path(os.environ.get("DATA_ROOT") or (Path(__file__).resolve().parents[2] / ".data"))
 18DUCKDB_PATH = Path(os.environ.get("DUCKDB_PATH") or (DATA_ROOT / "duckdb" / "trust.duckdb"))
 19STAGING_DIR = Path(os.environ.get("STAGING_DIR") or (DATA_ROOT / "staging"))
 20MODEL_DIR = Path(os.environ.get("MODEL_DIR") or (DATA_ROOT / "models"))
 21LOG_DIR = Path(os.environ.get("LOG_DIR") or (DATA_ROOT / "logs"))
 22
 23
 24_warned = False
 25
 26
 27def ensure_data_root() -> Path:
 28    """Create the DATA_ROOT subtree and assert it is writable. Call at startup."""
 29    global _warned
 30    if not os.environ.get("DATA_ROOT") and not _warned:
 31        print(f"[config] DATA_ROOT unset -> dev fallback {DATA_ROOT} (set DATA_ROOT for the external drive)")
 32        _warned = True
 33    for d in (DATA_ROOT, DUCKDB_PATH.parent, STAGING_DIR, MODEL_DIR, LOG_DIR):
 34        d.mkdir(parents=True, exist_ok=True)
 35    probe = DATA_ROOT / ".write_probe"
 36    try:
 37        probe.write_text("ok")
 38        probe.unlink()
 39    except OSError as e:
 40        raise SystemExit(f"DATA_ROOT not writable ({DATA_ROOT}): {e}. Is the external drive mounted?")
 41    return DATA_ROOT
 42
 43
 44# --- Tangled lexicon NSIDs -------------------------------------------------
 45# PRD 6.1: CONFIRM these against tangled.org core lexicons + a live Jetstream
 46# sample before trusting them; `python -m trust.ingest --probe` logs the real
 47# `collection` values seen on the wire. Override via env without touching code.
 48# Known fact: Tangled records live under `sh.tangled.*`.
 49WANTED_COLLECTIONS = os.environ.get("WANTED_COLLECTIONS", "sh.tangled.*,app.bsky.graph.*")
 50JETSTREAM_URL = os.environ.get(
 51    "JETSTREAM_URL", "wss://jetstream2.us-east.bsky.network/subscribe"
 52)
 53
 54# collection -> our internal record kind. Patterns are substring matches on the
 55# NSID. These are best-guess shapes; `--probe` tells you the truth. ponytail:
 56# substring map over a lexicon parser; swap to exact NSIDs once confirmed.
 57COLLECTION_KINDS: dict[str, str] = {
 58    "tangled.pull": "pull_request",
 59    "tangled.repo.pull": "pull_request",
 60    # authoritative merge outcome (.merged/.closed/.open). MUST out-specific the pull rule
 61    # above; _kind() takes the longest matching needle so this wins over "tangled.repo.pull".
 62    "tangled.repo.pull.status": "pull_status",
 63    "tangled.vouch": "vouch",
 64    "tangled.graph.vouch": "vouch",
 65    "tangled.denounce": "denounce",
 66    "tangled.pipeline": "ci",
 67    "tangled.spindle": "ci",
 68    "tangled.issue": "issue",
 69    "tangled.feed.star": "star",   # real NSID is sh.tangled.feed.star (".feed." breaks "tangled.star")
 70    "tangled.attestation": "attestation",   # jurisdiction attestation (6.13); CONFIRM NSID
 71    "tangled.jurisdiction": "attestation",
 72    "bsky.graph.follow": "follow",
 73}
 74
 75
 76@dataclass
 77class GateConfig:
 78    """Fusion gate thresholds (PRD 6.7). Tune T_HIGH from calibration so the
 79    historical false-approval rate above it stays under the chosen budget."""
 80
 81    T_LOW: float = 0.30   # below -> needs_human
 82    T_HIGH: float = 0.70  # above (and content clean) -> fast_lane
 83    R_LOW: float = 0.20   # content risk considered clean
 84    R_HIGH: float = 0.60  # content risk considered dangerous
 85
 86
 87@dataclass
 88class EigenConfig:
 89    alpha: float = 0.15        # restart probability
 90    iters: int = 50
 91    age_halflife_days: float = 180.0   # vouch weight time-decay
 92    evidence_boost: float = 1.5        # vouch carrying PR evidence weighs more
 93
 94
 95@dataclass
 96class ReviewConfig:
 97    model: str = os.environ.get("CLAUDE_MODEL", "claude-sonnet-4-6")
 98    prepass_model: str = "claude-haiku-4-5-20251001"
 99    escalate_model: str = "claude-opus-4-8"
100    max_diff_chars: int = 24_000   # token budget guard
101    api_key_env: str = "ANTHROPIC_API_KEY"
102
103
def _embed_dimensions_from_env() -> int | None:
    """Return EMBED_DIMENSIONS as an int, or None when it is unset or empty."""
    raw = os.environ.get("EMBED_DIMENSIONS")
    if not raw:
        return None
    return int(raw)


@dataclass
class EmbedConfig:
    """Settings for Featherless (OpenAI-compatible) embeddings, used by the
    diff/slop-similarity path.

    The model and base URL defaults come from the environment, so a renamed
    model never requires a code edit. Environment values are read once, when
    this class is defined.
    """

    model: str = os.getenv("EMBED_MODEL", "Qwen/Qwen3-Embedding-4B")
    base_url: str = os.getenv("FEATHERLESS_BASE_URL", "https://api.featherless.ai/v1")
    api_key_env: str = "FEATHERLESS_API_KEY"
    # MRL truncation. None keeps the model-native dimension (2560 for
    # Qwen3-Embedding-4B); set it to store smaller vectors in DuckDB. The
    # server ignores the value if it does not support truncation.
    dimensions: int | None = _embed_dimensions_from_env()
    # Inputs per request.
    batch: int = 32
    # Truncate giant diffs (matches the review token budget).
    max_chars: int = 24_000
    timeout: float = 60.0
118
119
@dataclass
class Config:
    """Top-level bundle of every tunable section.

    Each section field uses a default factory, so every Config instance gets
    its own freshly built section objects.
    """

    gate: GateConfig = field(default_factory=GateConfig)
    eigen: EigenConfig = field(default_factory=EigenConfig)
    review: ReviewConfig = field(default_factory=ReviewConfig)
    embed: EmbedConfig = field(default_factory=EmbedConfig)
    # PRD 6.3 label-mining N.
    clean_merge_window_days: int = field(default=14)


# Module-level default configuration, built once when this module is imported.
CFG = Config()
Configure Feed

Configure Feed