// Monorepo for Tangled
// tangled.org

1package microvm
2
3import (
4 "context"
5 _ "embed"
6 "encoding/json"
7 "errors"
8 "fmt"
9 "log/slog"
10 "os"
11 "os/exec"
12 "path/filepath"
13 "strconv"
14 "strings"
15 "sync"
16 "time"
17
18 "github.com/digitalocean/go-qemu/qmp"
19 "github.com/google/uuid"
20)
21
// defaultQMPTimeout is the QMP connection deadline StartQEMU falls back to
// when the config carries no boot timeout.
const defaultQMPTimeout = 10 * time.Second

// Addressing for the QEMU user-mode ("inner") network handed to the guest.
// These are passed verbatim as the net/host/dns/dhcpstart options of the
// user netdev.
const (
	innerSlirpNet  = "10.0.3.0/24"
	innerSlirpHost = "10.0.3.2"
	innerSlirpDNS  = "10.0.3.3"
	innerSlirpDHCP = "10.0.3.15"
)

// Settings for the network namespace that QEMU runs in. They are not
// referenced in this part of the file; presumably the slirp4netns setup
// elsewhere in the package consumes them (verify against slirpNamespace).
const (
	outerSlirpCIDR = "10.0.2.0/24"
	netnsTapName   = "tap0"
	netnsMTU       = "65520"
)
32
33type QEMUConfig struct {
34 Image ImageSpec
35 BootTimeout time.Duration
36 CID uint32
37 EnableKVM bool
38 QEMULogPath string
39 QMPPath string
40 SerialLogPath string
41 WorkDir string
42 VolumePaths map[string]string
43 VolumeBaseName string
44 Cgroup CgroupLimits
45 Dev bool
46}
47
48type QEMUVMHandle struct {
49 cid uint32
50 Process *os.Process
51 qemuLogPath string
52 QMPMon *qmp.SocketMonitor
53 QMPPath string
54 serialLogPath string
55 workDir string
56
57 cmd *exec.Cmd
58 done chan struct{}
59 qemuLogFile *os.File
60 cgroup *CgroupHandle
61 slirpCmd *exec.Cmd
62 slirpExit *os.File
63 waitErr error
64 waitErrMu sync.Mutex
65}
66
// qemuRunner is a stateless type whose Validate and Start methods check host
// prerequisites and launch a VM through StartQEMU.
type qemuRunner struct{}
68
69func (qemuRunner) Validate(spec ImageSpec, enableKVM bool) error {
70 if _, err := exec.LookPath(spec.RunnerCmd()); err != nil {
71 return fmt.Errorf("required host command %q not found in PATH: %w", spec.RunnerCmd(), err)
72 }
73 if _, err := os.Stat("/dev/vhost-vsock"); err != nil {
74 return fmt.Errorf("microvm requires /dev/vhost-vsock for vhost-vsock-device: %w", err)
75 }
76 if enableKVM {
77 if _, err := os.Stat("/dev/kvm"); err != nil {
78 return fmt.Errorf("microvm KVM was requested but /dev/kvm is not accessible: %w", err)
79 }
80 }
81 if len(spec.NetworkInterfaces) > 0 {
82 if _, err := os.Stat("/dev/net/tun"); err != nil {
83 return fmt.Errorf("microvm slirp4netns networking requires /dev/net/tun: %w", err)
84 }
85 for _, cmd := range []string{"ip", "mount", "slirp4netns", "unshare"} {
86 if _, err := exec.LookPath(cmd); err != nil {
87 return fmt.Errorf("required host command %q not found in PATH: %w", cmd, err)
88 }
89 }
90 }
91 return nil
92}
93
94func (qemuRunner) Start(ctx context.Context, cfg VMConfig, volumePaths map[string]string, logger *slog.Logger) (VMHandle, error) {
95 bootTimeout := cfg.BootTimeout
96 if bootTimeout == 0 {
97 bootTimeout = 10 * time.Second
98 }
99 return StartQEMU(ctx, QEMUConfig{
100 Image: cfg.Image,
101 BootTimeout: bootTimeout,
102 CID: cfg.CID,
103 EnableKVM: cfg.EnableKVM,
104 WorkDir: cfg.WorkDir,
105 VolumePaths: volumePaths,
106 Cgroup: cfg.Cgroup,
107 Dev: cfg.Dev,
108 }, logger)
109}
110
111func StartQEMU(ctx context.Context, cfg QEMUConfig, logger *slog.Logger) (VMHandle, error) {
112 if logger == nil {
113 logger = slog.Default()
114 }
115
116 workDir := cfg.WorkDir
117
118 handle := &QEMUVMHandle{
119 workDir: workDir,
120 }
121
122 var ok bool
123 defer func() {
124 if !ok {
125 _ = handle.Close()
126 }
127 }()
128
129 cid := cfg.CID
130 if cid == 0 {
131 var err error
132 cid, err = AllocateCID()
133 if err != nil {
134 return nil, err
135 }
136 }
137 if cid < minGuestCID {
138 return nil, fmt.Errorf("guest CID must be >= %d", minGuestCID)
139 }
140 handle.cid = cid
141
142 volumePaths := cfg.VolumePaths
143
144 qemuLogPath := cfg.QEMULogPath
145 if qemuLogPath == "" {
146 qemuLogPath = filepath.Join(workDir, "qemu.log")
147 }
148 qemuLogFile, err := createParentedFile(qemuLogPath)
149 if err != nil {
150 return nil, err
151 }
152 handle.qemuLogPath = qemuLogPath
153 handle.qemuLogFile = qemuLogFile
154
155 serialLogPath := cfg.SerialLogPath
156 if serialLogPath == "" {
157 serialLogPath = filepath.Join(workDir, "serial.log")
158 }
159 if err := os.MkdirAll(filepath.Dir(serialLogPath), 0o755); err != nil {
160 return nil, fmt.Errorf("create serial log directory: %w", err)
161 }
162 handle.serialLogPath = serialLogPath
163
164 qmpPath := cfg.QMPPath
165 if qmpPath == "" {
166 qmpPath = filepath.Join(workDir, "qmp.sock")
167 }
168 handle.QMPPath = qmpPath
169
170 qemuCmd := cfg.Image.RunnerCmd()
171 qemuBinary, err := exec.LookPath(qemuCmd)
172 if err != nil {
173 return nil, fmt.Errorf("%s command not found in PATH: %w", qemuCmd, err)
174 }
175
176 args, err := qemuArgs(qemuArgsConfig{
177 Image: cfg.Image,
178 CID: cid,
179 EnableKVM: cfg.EnableKVM,
180 QMPPath: qmpPath,
181 SerialLogPath: serialLogPath,
182 VolumePaths: volumePaths,
183 })
184 if err != nil {
185 return nil, err
186 }
187
188 cmd, slirpNet, err := qemuCommand(ctx, qemuBinary, args, cfg.Image, workDir, cfg.Dev)
189 if err != nil {
190 return nil, err
191 }
192 cmd.Env = append(os.Environ(), "TMPDIR="+workDir)
193 cmd.Stdout = qemuLogFile
194 cmd.Stderr = qemuLogFile
195
196 cgroup, err := prepareCgroup(cfg.Cgroup, logger)
197 if err != nil {
198 return nil, err
199 }
200 handle.cgroup = cgroup
201
202 logger.Info("starting qemu microvm", "cid", cid, "workDir", workDir, "serialLog", serialLogPath, "qmp", qmpPath)
203 if err := cmd.Start(); err != nil {
204 return nil, fmt.Errorf("starting qemu: %w", err)
205 }
206 handle.cmd = cmd
207 handle.Process = cmd.Process
208 handle.done = make(chan struct{})
209 go func() {
210 err := cmd.Wait()
211 handle.waitErrMu.Lock()
212 handle.waitErr = err
213 handle.waitErrMu.Unlock()
214 close(handle.done)
215 }()
216
217 if err := cgroup.AddProcess(cmd.Process.Pid, logger); err != nil {
218 return nil, err
219 }
220
221 if slirpNet != nil {
222 handle.slirpCmd, handle.slirpExit, err = slirpNet.Start(ctx, qemuLogFile, logger)
223 if err != nil {
224 return nil, err
225 }
226 if handle.slirpCmd != nil && handle.slirpCmd.Process != nil {
227 if err := cgroup.AddProcess(handle.slirpCmd.Process.Pid, logger); err != nil {
228 return nil, err
229 }
230 }
231 }
232
233 qmpTimeout := cfg.BootTimeout
234 if qmpTimeout == 0 {
235 qmpTimeout = defaultQMPTimeout
236 }
237 if err := handle.waitForQMP(ctx, qmpTimeout); err != nil {
238 return nil, err
239 }
240
241 status, err := handle.QMPQueryStatus()
242 if err != nil {
243 return nil, err
244 }
245 if status != "running" {
246 return nil, fmt.Errorf("qemu guest not running (status: %s)", status)
247 }
248 logger.Info("qemu microvm running", "cid", cid, "status", status)
249
250 ok = true
251 return handle, nil
252}
253
254func (h *QEMUVMHandle) Wait() error {
255 if h == nil || h.done == nil {
256 return nil
257 }
258 <-h.done
259 h.waitErrMu.Lock()
260 defer h.waitErrMu.Unlock()
261 return h.waitErr
262}
263
264func (h *QEMUVMHandle) WaitContext(ctx context.Context) error {
265 if h == nil || h.done == nil {
266 return nil
267 }
268 select {
269 case <-h.done:
270 h.waitErrMu.Lock()
271 defer h.waitErrMu.Unlock()
272 return h.waitErr
273 case <-ctx.Done():
274 return ctx.Err()
275 }
276}
277
278func (h *QEMUVMHandle) Kill() error {
279 if h == nil || h.Process == nil {
280 return nil
281 }
282 return h.Process.Kill()
283}
284
285func (h *QEMUVMHandle) Shutdown(ctx context.Context) error {
286 if h == nil {
287 return nil
288 }
289 if h.QMPMon != nil {
290 if err := h.QMPSystemPowerdown(); err != nil {
291 return err
292 }
293 }
294 if h.done == nil {
295 return nil
296 }
297 select {
298 case <-h.done:
299 return h.Wait()
300 case <-ctx.Done():
301 _ = h.Kill()
302 _ = h.Wait()
303 return ctx.Err()
304 }
305}
306
307func (h *QEMUVMHandle) Close() error {
308 if h == nil {
309 return nil
310 }
311
312 var closeErr error
313 if h.QMPMon != nil {
314 closeErr = errors.Join(closeErr, h.QMPMon.Disconnect())
315 h.QMPMon = nil
316 }
317 if h.Process != nil {
318 _ = h.Process.Kill()
319 _ = h.Wait()
320 }
321 if h.slirpExit != nil {
322 _ = h.slirpExit.Close()
323 h.slirpExit = nil
324 }
325 if h.slirpCmd != nil && h.slirpCmd.Process != nil {
326 _ = h.slirpCmd.Process.Kill()
327 _ = h.slirpCmd.Wait()
328 h.slirpCmd = nil
329 }
330 if h.qemuLogFile != nil {
331 closeErr = errors.Join(closeErr, h.qemuLogFile.Close())
332 h.qemuLogFile = nil
333 }
334 if h.cgroup != nil {
335 closeErr = errors.Join(closeErr, h.cgroup.Close())
336 h.cgroup = nil
337 }
338 return closeErr
339}
340
341func (h *QEMUVMHandle) QMPRun(command qmp.Command) ([]byte, error) {
342 if h == nil || h.QMPMon == nil {
343 return nil, fmt.Errorf("qmp monitor is not connected")
344 }
345 data, err := json.Marshal(command)
346 if err != nil {
347 return nil, err
348 }
349 return h.QMPMon.Run(data)
350}
351
352func (h *QEMUVMHandle) QMPQueryStatus() (string, error) {
353 raw, err := h.QMPRun(qmp.Command{Execute: "query-status"})
354 if err != nil {
355 return "", fmt.Errorf("qmp query-status failed: %w", err)
356 }
357
358 var resp struct {
359 Return struct {
360 Status string `json:"status"`
361 } `json:"return"`
362 }
363 if err := json.Unmarshal(raw, &resp); err != nil {
364 return "", fmt.Errorf("qmp query-status parse: %w", err)
365 }
366 return resp.Return.Status, nil
367}
368
369func (h *QEMUVMHandle) QMPSystemPowerdown() error {
370 _, err := h.QMPRun(qmp.Command{Execute: "system_powerdown"})
371 return err
372}
373
374func (h *QEMUVMHandle) Logs() VMLogs {
375 if h == nil {
376 return VMLogs{}
377 }
378 return VMLogs{
379 Serial: h.serialLogPath,
380 Extra: map[string]string{
381 "qemu": h.qemuLogPath,
382 },
383 }
384}
385
386func (h *QEMUVMHandle) CID() uint32 {
387 if h == nil {
388 return 0
389 }
390 return h.cid
391}
392
393func (h *QEMUVMHandle) WorkDir() string {
394 if h == nil {
395 return ""
396 }
397 return h.workDir
398}
399
400func (h *QEMUVMHandle) OOMKilled() bool {
401 if h == nil {
402 return false
403 }
404 return h.cgroup.OOMKilled()
405}
406
407func (h *QEMUVMHandle) waitForQMP(ctx context.Context, timeout time.Duration) error {
408 qmpCtx, cancel := context.WithTimeout(ctx, timeout)
409 defer cancel()
410
411 var lastErr error
412 for {
413 mon, err := qmp.NewSocketMonitor("unix", h.QMPPath, 2*time.Second)
414 if err == nil {
415 if err = mon.Connect(); err == nil {
416 h.QMPMon = mon
417 return nil
418 }
419 _ = mon.Disconnect()
420 }
421 lastErr = err
422
423 select {
424 case <-qmpCtx.Done():
425 return fmt.Errorf("qmp connect timeout: %w", lastErr)
426 case <-h.done:
427 return fmt.Errorf("qemu exited before qmp was ready: %w", h.Wait())
428 case <-time.After(25 * time.Millisecond):
429 }
430 }
431}
432
433func qemuCommand(
434 ctx context.Context,
435 qemuBinary string,
436 args []string,
437 spec ImageSpec,
438 workDir string,
439 dev bool,
440) (*exec.Cmd, *slirpNamespace, error) {
441 if len(spec.NetworkInterfaces) == 0 {
442 return exec.CommandContext(ctx, qemuBinary, args...), nil, nil
443 }
444
445 ipPath, err := exec.LookPath("ip")
446 if err != nil {
447 return nil, nil, fmt.Errorf("ip command not found in PATH: %w", err)
448 }
449 mountPath, err := exec.LookPath("mount")
450 if err != nil {
451 return nil, nil, fmt.Errorf("mount command not found in PATH: %w", err)
452 }
453 unsharePath, err := exec.LookPath("unshare")
454 if err != nil {
455 return nil, nil, fmt.Errorf("unshare command not found in PATH: %w", err)
456 }
457
458 pidFile, resolvPath, wrapperPath, err := prepareQEMUNetnsFiles(workDir, dev)
459 if err != nil {
460 return nil, nil, err
461 }
462
463 cmdArgs := append([]string{
464 "--user",
465 "--map-root-user",
466 "--net",
467 "--mount",
468 "--propagation", "private",
469 "--",
470 wrapperPath,
471 pidFile,
472 ipPath,
473 mountPath,
474 resolvPath,
475 qemuBinary,
476 }, args...)
477
478 cmd := exec.CommandContext(ctx, unsharePath, cmdArgs...)
479
480 return cmd, &slirpNamespace{
481 spec: spec,
482 pidFile: pidFile,
483 dev: dev,
484 }, nil
485}
486
487func prepareQEMUNetnsFiles(workDir string, dev bool) (pidFile, resolvPath, wrapperPath string, err error) {
488 pidFile = filepath.Join(workDir, "qemu-netns.pid")
489 resolvPath = filepath.Join(workDir, "qemu-netns-resolv.conf")
490 wrapperPath = filepath.Join(workDir, "qemu-netns-wrapper")
491
492 // the guest resolves through shuttle on 127.0.0.1. keep qemu's slirp DNS
493 // pointed at an unroutable local resolver inside this network namespace so
494 // direct guest queries to 10.0.3.3 don't bypass the shuttle dns policy.
495 if err := os.WriteFile(resolvPath, []byte("nameserver 127.0.0.1\n"), 0o644); err != nil {
496 return "", "", "", fmt.Errorf("write qemu network namespace resolv.conf: %w", err)
497 }
498
499 if err := writeNetnsWrapper(wrapperPath, dev); err != nil {
500 return "", "", "", fmt.Errorf("write qemu network namespace wrapper: %w", err)
501 }
502
503 return pidFile, resolvPath, wrapperPath, nil
504}
505
506type qemuArgsConfig struct {
507 Image ImageSpec
508 CID uint32
509 EnableKVM bool
510 QMPPath string
511 SerialLogPath string
512 VolumePaths map[string]string
513}
514
515func qemuArgs(cfg qemuArgsConfig) ([]string, error) {
516 uuid := uuid.New()
517
518 b := newArgBuilder(64)
519
520 addQEMUMachineArgs(&b, cfg, uuid)
521 addQEMUStoreArgs(&b, cfg)
522
523 if cfg.EnableKVM {
524 addQEMUKVMArgs(&b, cfg.Image)
525 }
526
527 if err := addQEMUVolumeArgs(&b, cfg); err != nil {
528 return nil, err
529 }
530
531 if err := addQEMUNetworkArgs(&b, cfg.Image.NetworkInterfaces); err != nil {
532 return nil, err
533 }
534
535 b.Optf("-device", "vhost-vsock-device,guest-cid=%d", cfg.CID)
536
537 if len(cfg.Image.RunnerConfig.ExtraArgs) > 0 {
538 b.Add(cfg.Image.RunnerConfig.ExtraArgs...)
539 }
540
541 return b.Args(), nil
542}
543
544func addQEMUMachineArgs(b *argBuilder, cfg qemuArgsConfig, uuid uuid.UUID) {
545 if cfg.Image.RunnerConfig.Machine != "" {
546 b.Opt("-M", cfg.Image.RunnerConfig.Machine)
547 }
548 b.Optf("-m", "%dM", cfg.Image.MemoryMiB)
549 b.Opt("-smp", strconv.Itoa(cfg.Image.VCPUs))
550
551 b.Add(
552 "-nodefaults",
553 "-no-user-config",
554 "-no-reboot",
555 )
556
557 b.Opt("-kernel", cfg.Image.Kernel)
558 b.Opt("-initrd", cfg.Image.Initrd)
559
560 b.Opt("-device", "virtio-rng-device")
561
562 b.Optf("-smbios", "type=1,uuid=%s", uuid)
563 b.Opt("-serial", "file:"+cfg.SerialLogPath)
564
565 // use virtio console if requsted. this is faster than the serial UART logging
566 // because serial has a higher cost when being accesssed. we still have to
567 // support serial itself for early kernel boot but thats OK.
568 if cfg.Image.RunnerConfig.Console == "hvc0" {
569 b.Optf("-chardev", "file,id=virtiocon0,path=%s,append=on", cfg.SerialLogPath)
570 b.Add("-device", "virtio-serial-device")
571 b.Opt("-device", "virtconsole,chardev=virtiocon0")
572 }
573 b.Opt("-display", "none")
574 b.Opt("-monitor", "none")
575 b.Opt("-append", cfg.Image.BootArgs)
576
577 b.Opt("-sandbox", "on")
578 b.Optf("-qmp", "unix:%s,server,nowait", cfg.QMPPath)
579}
580
581func addQEMUStoreArgs(b *argBuilder, cfg qemuArgsConfig) {
582 drive := newOptionBuilder(8)
583 drive.KV("id", "store")
584 drive.KV("format", "raw")
585 drive.Add("read-only=on")
586 drive.KV("file", cfg.Image.StoreDisk)
587 drive.Add("if=none")
588 drive.Add("aio=io_uring")
589
590 b.Opt("-drive", drive.String())
591 b.Opt("-device", "virtio-blk-device,drive=store")
592}
593
594func addQEMUKVMArgs(b *argBuilder, image ImageSpec) {
595 b.Flag("-enable-kvm")
596 if image.RunnerConfig.CPU != "" {
597 b.Opt("-cpu", image.RunnerConfig.CPU)
598 }
599 b.Opt("-device", "i8042")
600}
601
602func addQEMUVolumeArgs(b *argBuilder, cfg qemuArgsConfig) error {
603 for index, volume := range cfg.Image.Volumes {
604 path := cfg.VolumePaths[volume.Image]
605 if path == "" {
606 return fmt.Errorf("missing prepared path for volume %q", volume.Image)
607 }
608
609 driveID := fmt.Sprintf("volume%d", index)
610
611 drive := newOptionBuilder(10)
612 drive.KV("id", driveID)
613 drive.KV("format", "raw")
614 drive.Add("read-only=off")
615 drive.KV("file", path)
616 drive.Add("if=none")
617 drive.Add("aio=io_uring")
618 drive.Add("discard=unmap")
619 drive.Add("cache=none")
620
621 b.Opt("-drive", drive.String())
622 b.Optf("-device", "virtio-blk-device,drive=%s", driveID)
623 }
624
625 return nil
626}
627
628func addQEMUNetworkArgs(b *argBuilder, interfaces []NetworkInterface) error {
629 for _, networkInterface := range interfaces {
630 if networkInterface.Type != "slirp4netns" {
631 return fmt.Errorf("unsupported microvm network interface type %q", networkInterface.Type)
632 }
633
634 netdevOpts := newOptionBuilder(6)
635 netdevOpts.Add("user")
636 netdevOpts.KV("id", networkInterface.ID)
637 netdevOpts.KV("net", innerSlirpNet)
638 netdevOpts.KV("host", innerSlirpHost)
639 netdevOpts.KV("dns", innerSlirpDNS)
640 netdevOpts.KV("dhcpstart", innerSlirpDHCP)
641
642 b.Opt("-netdev", netdevOpts.String())
643 b.Optf(
644 "-device", "virtio-net-device,netdev=%s,mac=%s",
645 networkInterface.ID, networkInterface.MAC,
646 )
647 }
648
649 return nil
650}
651
// waitForPIDFile polls path every 25ms until it contains non-blank text and
// returns that text with surrounding whitespace trimmed.
//
// A missing or still-empty file is retried. Any other read error is returned
// immediately. Polling stops after five seconds or when ctx is done,
// whichever comes first, and the context error is wrapped in the result. The
// file is read before the context is checked, so an already-populated file
// is returned even if ctx is already cancelled.
func waitForPIDFile(ctx context.Context, path string) (string, error) {
	deadlineCtx, stop := context.WithTimeout(ctx, 5*time.Second)
	defer stop()

	poll := time.NewTicker(25 * time.Millisecond)
	defer poll.Stop()

	for {
		raw, readErr := os.ReadFile(path)
		switch {
		case readErr == nil:
			if trimmed := strings.TrimSpace(string(raw)); trimmed != "" {
				return trimmed, nil
			}
		case !errors.Is(readErr, os.ErrNotExist):
			return "", fmt.Errorf("read qemu network namespace pid: %w", readErr)
		}

		select {
		case <-poll.C:
		case <-deadlineCtx.Done():
			return "", fmt.Errorf("waiting for qemu network namespace pid: %w", deadlineCtx.Err())
		}
	}
}