Monorepo for Tangled
tangled.org
1package microvm
2
import (
	"errors"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"syscall"

	cgroups "github.com/containerd/cgroups/v3"
	"github.com/containerd/cgroups/v3/cgroup2"
	"github.com/prometheus/procfs"
)
15
// Patterns used by sanitizeCgroupName to turn an arbitrary string into a
// cgroup directory name.
var (
	// cgroupInvalidChar matches any single character that is not an ASCII
	// letter, an ASCII digit, '-', '_' or '.'.
	cgroupInvalidChar = regexp.MustCompile(`[^a-zA-Z0-9\-_.]`)

	// cgroupConsecutiveSep matches a run of two or more separator characters
	// ('-', '_' or '.').
	cgroupConsecutiveSep = regexp.MustCompile(`[-_.]{2,}`)
)
20
const (
	// cgroupParentSelf is the parent setting that selects the cgroup the
	// current process already lives in.
	cgroupParentSelf = "self"

	// supervisorCgroupName is the name of the child cgroup that receives the
	// processes moved out of the parent cgroup.
	supervisorCgroupName = "supervisor"
)
25
26type CgroupLimits struct {
27 Enabled bool
28 Parent *CgroupParent
29 Name string
30 MemoryMaxMiB int64
31 SwapMaxMiB *int64
32 PidsMax int64
33}
34
35type CgroupParent struct {
36 root *cgroup2.Manager
37 mountpoint string
38 group string
39}
40
41type CgroupHandle struct {
42 manager *cgroup2.Manager
43}
44
45func initCgroupParent(parent string, supervisorMemoryMinMiB int64, logger *slog.Logger) (*CgroupParent, error) {
46 if parent == "" {
47 parent = cgroupParentSelf
48 }
49 if cgroups.Mode() != cgroups.Unified {
50 return nil, fmt.Errorf("microVM cgroups require cgroup v2 unified mode")
51 }
52
53 mountpoint, group, err := resolveCgroupParent(parent)
54 if err != nil {
55 return nil, err
56 }
57 if _, err := os.Stat(filepath.Join(mountpoint, strings.TrimPrefix(group, "/"))); err != nil {
58 return nil, fmt.Errorf("stat cgroup parent %q:%q: %w", mountpoint, group, err)
59 }
60
61 root, err := cgroup2.Load(group, cgroup2.WithMountpoint(mountpoint))
62 if err != nil {
63 return nil, fmt.Errorf("load cgroup parent %q:%q: %w", mountpoint, group, err)
64 }
65
66 if group != "/" {
67 if err := moveParentProcesses(root, supervisorMemoryMinMiB, logger); err != nil {
68 return nil, err
69 }
70 }
71
72 if logger != nil {
73 logger.Info("initialized microVM cgroup parent", "mountpoint", mountpoint, "group", group)
74 }
75 return &CgroupParent{root: root, mountpoint: mountpoint, group: group}, nil
76}
77
78func prepareCgroup(limits CgroupLimits, logger *slog.Logger) (*CgroupHandle, error) {
79 if !limits.Enabled {
80 return nil, nil
81 }
82 if limits.Parent == nil || limits.Parent.root == nil {
83 return nil, fmt.Errorf("cgroup parent is not initialized")
84 }
85 name := sanitizeCgroupName(limits.Name)
86 if name == "" {
87 return nil, fmt.Errorf("cgroup name is empty")
88 }
89
90 manager, err := limits.Parent.root.NewChild(name, cgroupResources(limits))
91 if err != nil {
92 return nil, fmt.Errorf("create cgroup %q: %w", name, err)
93 }
94
95 if logger != nil {
96 logger.Info("created microVM cgroup", "name", name, "parentGroup", limits.Parent.group)
97 }
98 return &CgroupHandle{manager: manager}, nil
99}
100
101func cgroupResources(limits CgroupLimits) *cgroup2.Resources {
102 resources := &cgroup2.Resources{}
103 if limits.MemoryMaxMiB > 0 || limits.SwapMaxMiB != nil {
104 memory := &cgroup2.Memory{}
105 if limits.MemoryMaxMiB > 0 {
106 maxBytes := limits.MemoryMaxMiB * 1024 * 1024
107 memory.Max = &maxBytes
108 }
109 if limits.SwapMaxMiB != nil {
110 swapBytes := *limits.SwapMaxMiB * 1024 * 1024
111 memory.Swap = &swapBytes
112 }
113 oomGroup := true
114 memory.OOMGroup = &oomGroup
115 resources.Memory = memory
116 }
117 if limits.PidsMax > 0 {
118 resources.Pids = &cgroup2.Pids{Max: limits.PidsMax}
119 }
120 return resources
121}
122
123func supervisorResources(memoryMinMiB int64) *cgroup2.Resources {
124 if memoryMinMiB <= 0 {
125 return nil
126 }
127 minBytes := memoryMinMiB * 1024 * 1024
128 return &cgroup2.Resources{
129 Memory: &cgroup2.Memory{Min: &minBytes},
130 }
131}
132
133func (h *CgroupHandle) AddProcess(pid int, logger *slog.Logger) error {
134 if h == nil || h.manager == nil {
135 return nil
136 }
137 if pid <= 0 {
138 return fmt.Errorf("invalid pid %d", pid)
139 }
140 if err := h.manager.AddProc(uint64(pid)); err != nil {
141 return fmt.Errorf("add pid %d to cgroup: %w", pid, err)
142 }
143 if logger != nil {
144 logger.Info("added process to microVM cgroup", "pid", pid)
145 }
146 return nil
147}
148
149func (h *CgroupHandle) Close() error {
150 if h == nil || h.manager == nil {
151 return nil
152 }
153 return h.manager.Delete()
154}
155
156func (h *CgroupHandle) OOMKilled() bool {
157 if h == nil || h.manager == nil {
158 return false
159 }
160 metrics, err := h.manager.Stat()
161 if err != nil || metrics == nil || metrics.MemoryEvents == nil {
162 return false
163 }
164 return metrics.MemoryEvents.OomKill > 0
165}
166
167func resolveCgroupParent(parent string) (string, string, error) {
168 mountpoint, err := cgroup2Mountpoint()
169 if err != nil {
170 return "", "", err
171 }
172
173 if parent == "" || parent == cgroupParentSelf {
174 group, err := selfCgroupV2Path()
175 if err != nil {
176 return "", "", err
177 }
178 return mountpoint, group, nil
179 }
180 if !filepath.IsAbs(parent) {
181 return "", "", fmt.Errorf("cgroup parent must be %q or an absolute delegated cgroupfs path: %q", cgroupParentSelf, parent)
182 }
183
184 cleanParent := filepath.Clean(parent)
185 rel, err := filepath.Rel(mountpoint, cleanParent)
186 if err != nil {
187 return "", "", fmt.Errorf("resolve cgroup parent %q relative to cgroup2 mount %q: %w", cleanParent, mountpoint, err)
188 }
189 if rel == ".." || strings.HasPrefix(rel, "../") {
190 return "", "", fmt.Errorf("cgroup parent %q is outside cgroup2 mount %q", cleanParent, mountpoint)
191 }
192 if rel == "." {
193 return mountpoint, "/", nil
194 }
195
196 group := "/" + filepath.ToSlash(rel)
197 if err := cgroup2.VerifyGroupPath(group); err != nil {
198 return "", "", fmt.Errorf("invalid cgroup parent path %q: %w", group, err)
199 }
200 return mountpoint, group, nil
201}
202
203func cgroup2Mountpoint() (string, error) {
204 mounts, err := procfs.GetMounts()
205 if err != nil {
206 return "", fmt.Errorf("read procfs mountinfo: %w", err)
207 }
208 for _, mount := range mounts {
209 if mount.FSType == "cgroup2" {
210 return mount.MountPoint, nil
211 }
212 }
213 return "", fmt.Errorf("cgroup v2 mountpoint not found")
214}
215
216func selfCgroupV2Path() (string, error) {
217 self, err := procfs.Self()
218 if err != nil {
219 return "", fmt.Errorf("open procfs self: %w", err)
220 }
221 groups, err := self.Cgroups()
222 if err != nil {
223 return "", fmt.Errorf("read procfs self cgroups: %w", err)
224 }
225 for _, group := range groups {
226 if group.HierarchyID != 0 {
227 continue
228 }
229 path := group.Path
230 if path == "" {
231 path = "/"
232 }
233 if err := cgroup2.VerifyGroupPath(path); err != nil {
234 return "", fmt.Errorf("invalid self cgroup path %q: %w", path, err)
235 }
236 return path, nil
237 }
238 return "", fmt.Errorf("current process has no cgroup v2 hierarchy entry")
239}
240
241func moveParentProcesses(parent *cgroup2.Manager, supervisorMemoryMinMiB int64, logger *slog.Logger) error {
242 procs, err := parent.Procs(false)
243 if err != nil {
244 return fmt.Errorf("list parent cgroup processes: %w", err)
245 }
246
247 // first create with empty resources
248 supervisor, err := parent.NewChild(supervisorCgroupName, &cgroup2.Resources{})
249 if err != nil {
250 return fmt.Errorf("create supervisor cgroup: %w", err)
251 }
252
253 // move procs
254 for _, pid := range procs {
255 if err := supervisor.AddProc(pid); err != nil {
256 return fmt.Errorf("move pid %d to supervisor cgroup: %w", pid, err)
257 }
258 }
259
260 // now apply resources. we can't do this while parent has procs still
261 if res := supervisorResources(supervisorMemoryMinMiB); res != nil {
262 // we use a "new" parent here, this is so we enable subtree_control.
263 // .Update() does not work here...
264 if _, err = parent.NewChild(supervisorCgroupName, res); err != nil {
265 return fmt.Errorf("apply supervisor cgroup resources: %w", err)
266 }
267 }
268
269 if logger != nil && len(procs) > 0 {
270 logger.Info("moved spindle processes to supervisor cgroup", "processes", len(procs))
271 }
272 return nil
273}
274
275func sanitizeCgroupName(name string) string {
276 name = cgroupInvalidChar.ReplaceAllLiteralString(name, "-")
277 name = cgroupConsecutiveSep.ReplaceAllLiteralString(name, "-")
278 return strings.Trim(name, "-_.")
279}