···257257 '';
258258 };
259259 };
260260+261261+ debugSsh = {
262262+ enable = mkOption {
263263+ type = types.bool;
264264+ default = false;
265265+ description = ''
266266+ Enable the debug ssh server that lets authorized users ssh into a
267267+ failed microVM to debug it.
268268+ '';
269269+ };
270270+ listenAddr = mkOption {
271271+ type = types.str;
272272+ default = "0.0.0.0:2222";
273273+ example = "0.0.0.0:2225";
274274+ description = "Address for the debug ssh server to listen on.";
275275+ };
276276+ hostKeyPath = mkOption {
277277+ type = with types; nullOr path;
278278+ default = null;
279279+ example = "/var/lib/spindle/debug_ssh_host_key";
280280+ description = ''
281281+ Path to the ssh host key for the debug server. If null, one is generated
282282+ once and persisted next to the spindle db.
283283+ '';
284284+ };
285285+ gracePeriod = mkOption {
286286+ type = types.str;
287287+ default = "5m";
288288+ description = ''
289289+ How long a failed workflow's microVM is kept alive for the user to ssh in.
290290+ '';
291291+ };
292292+ };
260293 };
261294262295 nixCache = {
···377410 "SPINDLE_MICROVM_PIPELINES_CGROUP_PIDS_MAX=${toString cfg.pipelines.microvm.cgroup.pidsMax}"
378411 "SPINDLE_MICROVM_PIPELINES_CGROUP_SWAP_MAX_MIB=${toString cfg.pipelines.microvm.cgroup.swapMaxMiB}"
379412 "SPINDLE_MICROVM_PIPELINES_CGROUP_SUPERVISOR_MEMORY_MIN_MIB=${toString cfg.pipelines.microvm.cgroup.supervisorMinMiB}"
413413+ "SPINDLE_MICROVM_PIPELINES_DEBUG_SSH_ENABLED=${lib.boolToString cfg.pipelines.microvm.debugSsh.enable}"
414414+ "SPINDLE_MICROVM_PIPELINES_DEBUG_SSH_LISTEN_ADDR=${cfg.pipelines.microvm.debugSsh.listenAddr}"
415415+ "SPINDLE_MICROVM_PIPELINES_DEBUG_SSH_HOST_KEY_PATH=${optionalString (cfg.pipelines.microvm.debugSsh.hostKeyPath != null) (toString cfg.pipelines.microvm.debugSsh.hostKeyPath)}"
416416+ "SPINDLE_MICROVM_PIPELINES_DEBUG_SSH_GRACE_PERIOD=${cfg.pipelines.microvm.debugSsh.gracePeriod}"
380417 "SPINDLE_NIX_CACHE_READ_URLS=${concatStringsSep "," cfg.pipelines.nixCache.readUrls}"
381418 "SPINDLE_NIX_CACHE_TRUSTED_PUBLIC_KEYS=${concatStringsSep "," cfg.pipelines.nixCache.trustedPublicKeys}"
382419 "SPINDLE_NIX_CACHE_UPLOAD_URL=${cfg.pipelines.nixCache.uploadUrl}"
+2-1
shuttle/Cargo.toml
···88[dependencies]
99anyhow = "1"
1010base64 = "0.22"
1111-nix = { version = "0.31", features = ["fs", "process", "reboot", "signal", "user"] }
1111+nix = { version = "0.31", features = ["fs", "process", "reboot", "signal", "term", "user"] }
1212prost = "0.14"
1313prost-reflect = "0.16"
1414prost-protovalidate = "0.3"
1515once_cell = "1"
1616+pty-process = { version = "0.5.3", features = ["async"] }
1617serde = { version = "1", features = ["derive"] }
1718serde_json = "1"
1819tempfile = "3"
+99-14
shuttle/src/command.rs
···11use anyhow::{Context, Result};
22use nix::sys::signal::{Signal, kill};
33use nix::unistd::{Gid, Pid, Uid, User, getgrouplist, setgid, setgroups, setuid};
44-use std::ffi::{CString, OsStr, OsString};
44+use pty_process::{Command as PtyCommand, OwnedReadPty, OwnedWritePty, Size};
55+use std::ffi::{CString, OsString};
56use std::io;
67use std::os::unix::process::ExitStatusExt;
78use std::path::PathBuf;
···196197 // group won't actually let it access the sock.
197198 // https://github.com/rust-lang/rust/issues/90747
198199 if let (Some(uid), Some(gid)) = (spec.uid, spec.gid) {
199199- let username = User::from_uid(Uid::from_raw(uid))
200200- .ok()
201201- .flatten()
202202- .map(|u| u.name)
203203- .with_context(|| format!("lookup passwd entry for uid {uid}"))?;
204204- let cname = CString::new(username)
205205- .with_context(|| format!("username for uid {uid} contained a null byte"))?;
206206- // resolve groups beforehand so we don't have to read /etc/group in the pre_exec
207207- let groups =
208208- getgrouplist(&cname, Gid::from_raw(gid)).context("resolve supplementary groups")?;
200200+ let groups = resolve_supplementary_groups(uid, gid)?;
209201 // SAFETY: pre_exec runs between fork and execve in the child.
210202 // we only call async-signal-safe syscalls and we don't touch any
211203 // shared state, no allocator, no mutexes, no globals.
···223215 cmd.process_group(0);
224216225217 cmd.spawn()
226226- .with_context(|| format!("spawn {}", display_os(&spec.program)))
218218+ .with_context(|| format!("spawn {:?}", &spec.program))
219219+}
220220+221221+// resolve the supplementary group list up front so the pre_exec hook never has
222222+// to read /etc/group (which is not async-signal-safe) between fork and exec.
223223+fn resolve_supplementary_groups(uid: u32, gid: u32) -> Result<Vec<Gid>> {
224224+ let username = User::from_uid(Uid::from_raw(uid))
225225+ .ok()
226226+ .flatten()
227227+ .map(|u| u.name)
228228+ .with_context(|| format!("lookup passwd entry for uid {uid}"))?;
229229+ let cname = CString::new(username)
230230+ .with_context(|| format!("username for uid {uid} contained a null byte"))?;
231231+ getgrouplist(&cname, Gid::from_raw(gid)).context("resolve supplementary groups")
232232+}
233233+234234+pub fn spawn_pty(spec: Spec, rows: u16, cols: u16) -> Result<(OwnedReadPty, OwnedWritePty, Child)> {
235235+ let (pty, pts) = pty_process::open().context("open pty")?;
236236+ pty.resize(Size::new(rows, cols)).context("set pty size")?;
237237+238238+ let mut cmd = PtyCommand::new(&spec.program)
239239+ .args(&spec.args)
240240+ .envs(spec.env.iter().map(|(key, value)| (key, value)));
241241+ if let Some(cwd) = &spec.cwd {
242242+ cmd = cmd.current_dir(cwd);
243243+ }
244244+245245+ // drop privileges in the child. this RELIES on pty-process composing our
246246+ // pre_exec hook *after* its own session setup: it wraps us as `move || {
247247+ // session_leader()?; ours()?; }`, so setsid + TIOCSCTTY run first (while
248248+ // still privileged) and only then do we drop to the workflow user. that
249249+ // ordering is what we want and we depend on it. if pty-process ever ran our
250250+ // hook first, the session setup would happen post-drop. (it'd likely still
251251+ // work, since setsid/TIOCSCTTY on our own pty need no privilege, but it is
252252+ // not the behaviour we're assuming here)
253253+ // don't use .uid()/.gid() here, they clear supplementary groups (same reason as the plain spawn path above; see https://github.com/rust-lang/rust/issues/90747).
254254+ if let (Some(uid), Some(gid)) = (spec.uid, spec.gid) {
255255+ let groups = resolve_supplementary_groups(uid, gid)?;
256256+ // SAFETY: pre_exec runs between fork and execve in the child. every call
257257+ // below is async-signal-safe and touches no shared state.
258258+ cmd = unsafe {
259259+ cmd.pre_exec(move || {
260260+ setgroups(&groups).map_err(io::Error::from)?;
261261+ setgid(Gid::from_raw(gid)).map_err(io::Error::from)?;
262262+ setuid(Uid::from_raw(uid)).map_err(io::Error::from)?;
263263+ Ok(())
264264+ })
265265+ };
266266+ }
267267+268268+ // spawn consumes the slave (dup'd onto the child's 0/1/2 and then closed in
269269+ // the parent), so the master reports EOF once the shell and all its children
270270+ // have exited.
271271+ let child = cmd
272272+ .spawn(pts)
273273+ .with_context(|| format!("spawn pty shell {:?}", &spec.program))?;
274274+275275+ let (reader, writer) = pty.into_split();
276276+ Ok((reader, writer, child))
227277}
228278229279async fn wait_child(child: &mut Child, timeout: Option<Duration>) -> ExitResult {
···293343 })
294344}
295345296296-fn display_os(value: &OsStr) -> String {
297297- value.to_string_lossy().into_owned()
346346+#[cfg(test)]
347347+mod tests {
348348+ use super::*;
349349+ use tokio::io::AsyncReadExt;
350350+351351+ #[tokio::test]
352352+ async fn pty_runs_a_shell_and_reports_exit() {
353353+ let spec = Spec::new("/bin/sh")
354354+ .arg("-c")
355355+ .arg("printf 'hello pty'; exit 7");
356356+ let (mut reader, _writer, mut child) = spawn_pty(spec, 24, 80).expect("spawn pty");
357357+358358+ let mut output = Vec::new();
359359+ let mut chunk = [0u8; 1024];
360360+ loop {
361361+ match reader.read(&mut chunk).await {
362362+ Ok(0) => break,
363363+ Ok(n) => output.extend_from_slice(&chunk[..n]),
364364+ // linux signals slave-closed with EIO rather than EOF
365365+ Err(error) if error.raw_os_error() == Some(nix::libc::EIO) => break,
366366+ Err(error) => panic!("read pty master: {error}"),
367367+ }
368368+ }
369369+370370+ let status = child.wait().await.expect("wait child");
371371+ let text = String::from_utf8_lossy(&output);
372372+ assert!(text.contains("hello pty"), "unexpected output: {text:?}");
373373+ assert_eq!(status.code(), Some(7));
374374+ }
375375+376376+ #[tokio::test]
377377+ async fn pty_resize_succeeds() {
378378+ let spec = Spec::new("/bin/sh").arg("-c").arg("sleep 0.2");
379379+ let (_reader, writer, mut child) = spawn_pty(spec, 24, 80).expect("spawn pty");
380380+ writer.resize(Size::new(40, 120)).expect("resize");
381381+ let _ = child.wait().await;
382382+ }
298383}
···78787979 AgingThreshold time.Duration `env:"AGING_THRESHOLD, default=30s"`
80808181+ DebugSSH DebugSSH `env:",prefix=DEBUG_SSH_"`
8282+8183 EnableCgroups bool `env:"ENABLE_CGROUPS, default=false"`
8284 CgroupParent string `env:"CGROUP_PARENT, default=self"`
8385 CgroupPidsMax int64 `env:"CGROUP_PIDS_MAX, default=4096"`
8486 CgroupSwapMaxMiB *int64 `env:"CGROUP_SWAP_MAX_MIB"`
8587 // memory.min that will get assigned to the supervisor (spindle itself) cgroup
8688 CgroupSupervisorMemoryMinMiB int64 `env:"CGROUP_SUPERVISOR_MEMORY_MIN_MIB, default=512"`
8989+}
9090+9191+type DebugSSH struct {
9292+ Enabled bool `env:"ENABLED, default=false"`
9393+ ListenAddr string `env:"LISTEN_ADDR, default=0.0.0.0:2222"`
9494+ // path to private key; if empty, spindle will generate one next to the db
9595+ HostKeyPath string `env:"HOST_KEY_PATH"`
9696+ // how long to keep a failed wf alive after failure, for sshing in
9797+ GracePeriod time.Duration `env:"GRACE_PERIOD, default=5m"`
8798}
889989100type NixCache struct {
+5-19
spindle/engine/engine.go
···2020 ErrWorkflowFailed = errors.New("workflow failed")
2121)
22222323-type workflowFinalizer interface {
2424- FinalizeWorkflow(ctx context.Context, wid models.WorkflowId, wf *models.Workflow, wfLogger models.WorkflowLogger) error
2525-}
2626-2723func StartWorkflows(l *slog.Logger, vault secrets.Manager, cfg *config.Config, db *db.DB, n *notifier.Notifier, ctx context.Context, pipeline *models.Pipeline, pipelineId models.PipelineId) {
2824 l.Info("starting all workflows in parallel", "pipeline", pipelineId)
2925···5147 l.Info("using workflow timeout", "timeout", workflowTimeout)
52485349 for _, w := range wfs {
5454- wg.Add(1)
5555- go func() {
5656- defer wg.Done()
5757-5050+ wg.Go(func() {
5851 wid := models.WorkflowId{
5952 PipelineId: pipelineId,
6053 Name: w.Name,
···117110 }
118111 return
119112 }
113113+ // don't put this after the workflowTimeout deadline assignment
114114+ // below. engines that implement "ssh-after-fail" rely on the
115115+ // unbounded ctx for retaining the workflow after it fails.
120116 defer eng.DestroyWorkflow(ctx, wid)
121117122118 ctx, cancel := context.WithTimeout(ctx, workflowTimeout)
···155151 }
156152 }
157153158158- if finalizer, ok := eng.(workflowFinalizer); ok {
159159- if err := finalizer.FinalizeWorkflow(ctx, wid, &w, wfLogger); err != nil {
160160- dbErr := db.StatusFailed(wid, err.Error(), -1, n)
161161- if dbErr != nil {
162162- l.Error("failed to set workflow status to failed", "wid", wid, "err", dbErr)
163163- }
164164- return
165165- }
166166- }
167167-168154 err = db.StatusSuccess(wid, n)
169155 if err != nil {
170156 l.Error("failed to set workflow status to success", "wid", wid, "err", err)
171157 }
172172- }()
158158+ })
173159 }
174160 }
175161
+31-2
spindle/engines/microvm/README.md
···4141spindle expects, they should work. That is:
4242- a guest agent is present inside of the image and when that image boots it will
4343 get started,
4444-- `spindle-workflow` user exists,
4545-- and the work directory is configured (`/workspace`).
4444+- the `spindle-workflow` user exists, is unprivileged (non-zero uid/gid), and has
4545+ a usable login shell and home dir set in the image's passwd: workflow steps run
4646+ as this user, and the debug shell (see below) launches its passwd shell as a
4747+ login shell in its home dir. an unset or `nologin`/`false` shell breaks debug
4848+ ssh,
4949+- and the work directory is configured (`/workspace`, with `/workspace/repo` as
5050+ the per-step working dir).
46514752## Image discovery
4853···233238never made it to the destination store. The guest still only ever sees the same
234239HTTP binary-cache upload protocol over vsock; it never gets direct access to
235240SSH credentials or the destination store itself.
241241+242242+### Debug ssh
243243+244244+When a workflow fails, spindle can keep its microVM alive for a configured grace
245245+window (`MicroVMPipelines.DebugSSH`) and print an `ssh` invocation so you can poke at
246246+the failed VM interactively. Spindle acts as the ssh server itself and
247247+bridges a pty into the live guest over the agent's vsock; the guest stays
248248+keyless and never runs an ssh daemon.
249249+250250+Access mirrors a git push: the ssh username is the job id, and the offered
251251+public key is sent to the job's repo knot (`sh.tangled.repo.checkPushAllowed`).
252252+The session is accepted only if that key is allowed to push to the job's repo.
253253+254254+The shell is deliberately not configurable from either end. It always:
255255+- runs as the `spindle-workflow` user (the ssh username selects the *job*, not a
256256+ unix user),
257257+- uses that user's login shell from the image's passwd, launched as a login
258258+ shell (`-l`), and
259259+- starts in the dir where the repo was cloned to.
260260+261261+The only things the client influences are the terminal type and window size
262262+(forwarded from the ssh pty request, and on resize). This relies on the image
263263+configuring `spindle-workflow` properly per the expectations above; in
264264+particular, a missing or `nologin`/`false` login shell will not work.
···155155 CacheReadURLs []string
156156 CacheTrustedPublicKeys []string
157157 VM VMHandle
158158+ CID uint32
158159 Agent *AgentSession
159160 ReadCache *ReadCacheProxy
160161 UploadCache *UploadCacheProxy
161162 DNSProxy *DNSProxy
162163 WorkDir string
163164 NixOSToplevelCache nixosToplevelCacheStore
165165+ StartedAt time.Time // when the VM booted, for the max-lifetime cap
164166}
165167166168func (e *Engine) cleanupState(ctx context.Context, wid models.WorkflowId, state *workflowState) error {
167169 if state == nil {
168170 return nil
169171 }
172172+173173+ // stop advertising this VM for debug shells before we tear it down
174174+ e.unregisterDebugTarget(wid)
170175171176 ctx = context.WithoutCancel(ctx)
172177