summaryrefslogtreecommitdiff
path: root/internal/probemanager/manager.go
blob: 8b15f946f1f4c960dc517d048c4dfedc6d584b37 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
package probemanager

import (
	"cmp"
	"errors"
	"fmt"
	"slices"
	"strings"
	"sync"
)

// Link abstracts an attached tracepoint link.
//
// The manager calls Destroy when a probe is detached or when the manager is
// closed. If Destroy returns an error during Detach, the manager keeps the
// link on the probe entry, so a later Detach will call Destroy on it again.
type Link interface {
	Destroy() error
}

// Program abstracts a loadable BPF program that can attach to a tracepoint.
//
// AttachTracepoint attaches the program to the tracepoint identified by
// category and name and returns the resulting Link. Within this file it is
// only ever called with the "syscalls" category.
type Program interface {
	AttachTracepoint(category, name string) (Link, error)
}

// Attacher resolves BPF programs by name.
//
// The manager asks for programs named "handle_" followed by the tracepoint
// name. Any error returned by GetProgram is wrapped and surfaced as the attach
// failure for that tracepoint.
type Attacher interface {
	GetProgram(name string) (Program, error)
}

// ProbeState is an immutable view used by callers/UI.
//
// Syscall is the key the probe was registered under, Active mirrors the
// probe's active flag at the time States was called, and Error carries the
// text of the probe's last recorded attach/detach error (empty when there is
// none).
type ProbeState struct {
	Syscall string
	Active  bool
	Error   string
}

// probeEntry is the manager's mutable record for one registered syscall.
//
// enterTP and exitTP are the tracepoint names stored by Register. enterLink
// and exitLink hold the currently attached links and are nil when detached.
// attachMu serializes Attach and Detach for this entry; the remaining fields
// are read and written while holding Manager.mu. active is true while at least
// one link is held, and lastErr is the most recent attach/detach error (reset
// to nil by a successful attach or detach).
type probeEntry struct {
	syscall string
	enterTP string
	exitTP  string

	enterLink Link
	exitLink  Link
	attachMu  sync.Mutex

	active  bool
	lastErr error
}

// Manager tracks probe attach/detach state for grouped syscall tracepoints.
//
// mu guards probes, closed, and the mutable fields of every probeEntry stored
// in probes. attacher is the program resolver handed to NewManager. closed is
// set by Close; once it is set, entryLocked rejects every lookup with
// "probe manager is closed", which makes Attach, Detach and Toggle fail.
type Manager struct {
	mu       sync.Mutex
	attacher Attacher
	probes   map[string]*probeEntry
	closed   bool
}

// NewManager returns an open Manager with no registered probes that resolves
// BPF programs through attacher.
//
// A nil attacher is accepted here, but every later Attach, Detach or Toggle
// will then fail with "probe manager has no attacher".
func NewManager(attacher Attacher) *Manager {
	m := new(Manager)
	m.attacher = attacher
	m.probes = map[string]*probeEntry{}
	return m
}

// Register records the enter/exit tracepoint names for the given syscall key.
//
// A nil receiver or an empty syscall makes the call a no-op. Registering a
// syscall that is already known keeps the existing entry (including any live
// links and state) and only overwrites its tracepoint names.
func (m *Manager) Register(syscall string, pair TracepointPair) {
	if m == nil {
		return
	}
	if syscall == "" {
		return
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	// Create the record on first sight, then update it through the map.
	if _, known := m.probes[syscall]; !known {
		m.probes[syscall] = &probeEntry{syscall: syscall}
	}
	rec := m.probes[syscall]
	rec.enterTP, rec.exitTP = pair.Enter, pair.Exit
}

// AttachAll registers and attaches all tracepoint pairs selected by shouldAttach.
//
// tpNames is grouped into per-syscall enter/exit pairs by GroupTracepoints.
// Every visited pair is registered; it is attached only when shouldAttach
// accepts its enter or its exit tracepoint name. A nil shouldAttach selects
// everything.
//
// Syscalls are processed in ascending key order, so the sequence of attach
// attempts, the order of onAttachError callbacks, and the error returned in
// strict mode are the same on every run for the same input.
//
// If onAttachError is non-nil, per-syscall attach failures are reported through
// the callback and AttachAll continues with the remaining tracepoints. This is
// the desired mode in production: when running a binary built on a newer kernel
// against an older one, some syscalls' tracepoints may be absent and the
// corresponding attach call returns ENOENT. The error is recorded on the
// probe entry (visible via States()) regardless of the callback.
//
// If onAttachError is nil, AttachAll preserves the strict legacy behavior and
// returns the first attach error to the caller. Tests rely on this mode. In
// that case the syscalls after the failing one are neither registered nor
// attached.
func (m *Manager) AttachAll(shouldAttach func(string) bool, tpNames []string, onAttachError func(syscall string, err error)) error {
	if m == nil {
		return errors.New("probe manager is nil")
	}
	if shouldAttach == nil {
		shouldAttach = func(string) bool { return true }
	}

	groups := GroupTracepoints(tpNames)

	// Map iteration order is unspecified in Go; sort the keys so that "first
	// error" and the callback order do not change from run to run.
	syscalls := make([]string, 0, len(groups))
	for syscall := range groups {
		syscalls = append(syscalls, syscall)
	}
	slices.Sort(syscalls)

	for _, syscall := range syscalls {
		pair := groups[syscall]
		m.Register(syscall, pair)
		if !shouldAttach(pair.Enter) && !shouldAttach(pair.Exit) {
			continue
		}
		if err := m.Attach(syscall); err != nil {
			if onAttachError == nil {
				return err
			}
			onAttachError(syscall, err)
		}
	}
	return nil
}

// Toggle switches a syscall probe to the opposite state: an active probe is
// detached, an inactive one is attached.
//
// It returns an error for a nil manager, an empty syscall, or when the entry
// lookup fails; otherwise it returns whatever Attach or Detach returns. The
// active flag is sampled under the manager lock and the lock is released
// before Attach/Detach runs.
func (m *Manager) Toggle(syscall string) error {
	switch {
	case m == nil:
		return errors.New("probe manager is nil")
	case syscall == "":
		return errors.New("syscall is required")
	}

	// Read the current state inside a small critical section.
	wasActive, err := func() (bool, error) {
		m.mu.Lock()
		defer m.mu.Unlock()
		rec, lookupErr := m.entryLocked(syscall)
		if lookupErr != nil {
			return false, lookupErr
		}
		return rec.active, nil
	}()
	if err != nil {
		return err
	}

	if !wasActive {
		return m.Attach(syscall)
	}
	return m.Detach(syscall)
}

// Attach attaches enter/exit tracepoints for a registered syscall.
//
// It returns an error when syscall is empty, when the manager is nil, or when
// the entry lookup fails (manager closed, no attacher, or unknown syscall).
// If the probe is already active the call is a no-op and returns nil.
// Otherwise both tracepoints are attached and the outcome, success or error,
// is recorded on the probe entry; an attach error is also returned.
func (m *Manager) Attach(syscall string) error {
	if syscall == "" {
		return errors.New("syscall is required")
	}
	// Guard against a nil receiver, matching Toggle, AttachAll and Close;
	// without it the lock below would dereference nil and panic.
	if m == nil {
		return errors.New("probe manager is nil")
	}

	m.mu.Lock()
	entry, err := m.entryLocked(syscall)
	m.mu.Unlock()
	if err != nil {
		return err
	}

	// The per-entry mutex serializes Attach and Detach on the same syscall.
	// It is taken without holding m.mu, so the attach work below does not
	// block operations on other probes.
	entry.attachMu.Lock()
	defer entry.attachMu.Unlock()

	// State may have changed while waiting for attachMu, so re-validate the
	// entry and snapshot the attach parameters under the manager lock.
	enterTP, exitTP, attacher, err := m.snapshotAttachParams(syscall, entry)
	if err != nil {
		return err
	}
	if attacher == nil {
		// snapshotAttachParams returns a nil attacher without an error only
		// when the probe is already active; nothing to do.
		return nil
	}

	enterLink, exitLink, attachErr := attachPair(attacher, enterTP, exitTP)
	return m.commitAttach(syscall, entry, enterLink, exitLink, attachErr)
}

// snapshotAttachParams looks the syscall up again under the manager lock and
// hands back what attachPair needs: the enter/exit tracepoint names and the
// attacher.
//
// A lookup failure is returned as err with zero values for everything else.
// When the probe is already active it returns a nil attacher together with a
// nil error, which callers treat as "nothing to attach".
func (m *Manager) snapshotAttachParams(syscall string, entry *probeEntry) (enterTP, exitTP string, attacher Attacher, err error) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// The entry passed in is replaced by a fresh lookup so that the closed /
	// attacher / registration checks run again at this point.
	if entry, err = m.entryLocked(syscall); err != nil {
		return "", "", nil, err
	}
	if !entry.active {
		return entry.enterTP, entry.exitTP, m.attacher, nil
	}
	return "", "", nil, nil
}

// commitAttach records the result of an attach attempt on the probe entry
// while holding the manager lock.
//
// The entry is looked up again first. If that lookup fails (for example the
// manager was closed while the attach was in flight), both links are destroyed
// and the lookup error is returned joined with any cleanup errors. If the
// attach itself failed, attachErr is stored as the entry's last error and
// returned, and the active flag is recomputed from the links the entry already
// holds. On success the new links replace the entry's links, the last error is
// cleared, and the entry is active when at least one link is non-nil.
func (m *Manager) commitAttach(syscall string, entry *probeEntry, enterLink, exitLink Link, attachErr error) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	entry, lookupErr := m.entryLocked(syscall)
	if lookupErr != nil {
		// Nobody will own these links; tear them down, enter first.
		enterCleanup := destroyLink(fmt.Sprintf("cleanup enter %s", syscall), enterLink)
		exitCleanup := destroyLink(fmt.Sprintf("cleanup exit %s", syscall), exitLink)
		return errors.Join(lookupErr, enterCleanup, exitCleanup)
	}

	if attachErr == nil {
		entry.enterLink, entry.exitLink = enterLink, exitLink
		entry.lastErr = nil
		entry.active = enterLink != nil || exitLink != nil
		return nil
	}

	entry.lastErr = attachErr
	entry.active = entry.enterLink != nil || entry.exitLink != nil
	return attachErr
}

// Detach detaches enter/exit tracepoints for a registered syscall.
//
// It returns an error when syscall is empty, when the manager is nil, or when
// the entry lookup fails (manager closed, no attacher, or unknown syscall).
// Otherwise both links held by the entry are destroyed. A link whose Destroy
// fails stays on the entry, and the combined failure is recorded as the
// entry's last error and returned.
func (m *Manager) Detach(syscall string) error {
	if syscall == "" {
		return errors.New("syscall is required")
	}
	// Guard against a nil receiver, matching Toggle, AttachAll and Close;
	// without it the lock below would dereference nil and panic.
	if m == nil {
		return errors.New("probe manager is nil")
	}

	m.mu.Lock()
	entry, err := m.entryLocked(syscall)
	m.mu.Unlock()
	if err != nil {
		return err
	}

	// The per-entry mutex serializes Attach and Detach on the same syscall.
	entry.attachMu.Lock()
	defer entry.attachMu.Unlock()

	// State may have changed while waiting for attachMu, so re-validate the
	// entry and snapshot its links under the manager lock.
	m.mu.Lock()
	entry, err = m.entryLocked(syscall)
	if err != nil {
		m.mu.Unlock()
		return err
	}
	enterLink := entry.enterLink
	exitLink := entry.exitLink
	m.mu.Unlock()

	// Destroy runs without m.mu held; only the bookkeeping in commitDetach
	// takes the manager lock again.
	enterErr, exitErr, errs := destroyLinkPair(syscall, enterLink, exitLink)
	return m.commitDetach(entry, enterErr, exitErr, errs)
}

// destroyLinkPair calls Destroy on the enter link and then on the exit link,
// skipping whichever is nil.
//
// Each link's error is returned on its own (enterErr, exitErr) so the caller
// can tell which side failed, and errs collects one formatted message per
// failure, in enter-then-exit order. A failing enter link does not prevent the
// exit link from being destroyed.
func destroyLinkPair(syscall string, enterLink, exitLink Link) (enterErr, exitErr error, errs []string) {
	// destroy tears down one optional link and logs a failure into errs.
	destroy := func(format string, link Link) error {
		if link == nil {
			return nil
		}
		failure := link.Destroy()
		if failure != nil {
			errs = append(errs, fmt.Sprintf(format, syscall, failure))
		}
		return failure
	}

	enterErr = destroy("detach enter %s: %v", enterLink)
	exitErr = destroy("detach exit %s: %v", exitLink)
	return enterErr, exitErr, errs
}

// commitDetach applies the outcome of a detach to the entry while holding the
// manager lock.
//
// A link is cleared from the entry only when its Destroy succeeded (its error
// is nil), and the entry stays active while either link remains. The messages
// in errs are joined with "; " into a single error that is both stored as the
// entry's last error and returned; with no messages the last error is cleared
// and nil is returned.
func (m *Manager) commitDetach(entry *probeEntry, enterErr, exitErr error, errs []string) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	if enterErr == nil {
		entry.enterLink = nil
	}
	if exitErr == nil {
		entry.exitLink = nil
	}
	entry.active = !(entry.enterLink == nil && entry.exitLink == nil)

	var combined error
	if len(errs) > 0 {
		combined = errors.New(strings.Join(errs, "; "))
	}
	entry.lastErr = combined
	return combined
}

// States returns a snapshot of every registered probe, sorted by syscall key
// so the order is stable across calls.
//
// A nil manager yields nil. Each element copies the entry's active flag and,
// when the entry has a last error, that error's message.
func (m *Manager) States() []ProbeState {
	if m == nil {
		return nil
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	snapshot := make([]ProbeState, len(m.probes))
	next := 0
	for name, rec := range m.probes {
		view := &snapshot[next]
		next++
		view.Syscall = name
		view.Active = rec.active
		if rec.lastErr != nil {
			view.Error = rec.lastErr.Error()
		}
	}

	bySyscall := func(a, b ProbeState) int {
		return cmp.Compare(a.Syscall, b.Syscall)
	}
	slices.SortFunc(snapshot, bySyscall)
	return snapshot
}

// ActiveCount reports how many registered probes are currently active and how
// many probes are registered in total. A nil manager reports (0, 0).
func (m *Manager) ActiveCount() (active, total int) {
	if m == nil {
		return 0, 0
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	// Count both figures in a single pass over the registry.
	for _, rec := range m.probes {
		total++
		if rec.active {
			active++
		}
	}
	return active, total
}

// IsActive reports whether the probe registered under syscall is currently
// active. It returns false for a nil manager, an empty syscall, or a syscall
// that was never registered.
func (m *Manager) IsActive(syscall string) bool {
	if m == nil || syscall == "" {
		return false
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	if rec, found := m.probes[syscall]; found {
		return rec.active
	}
	return false
}

// Close marks the manager closed and detaches every registered probe.
//
// Only the first call does any work; calling Close on a nil manager or on one
// that is already closed returns nil. Every entry is processed even if an
// earlier one fails, and the first detach error encountered is returned
// (later errors are recorded on their probe entries but not returned).
func (m *Manager) Close() error {
	if m == nil {
		return nil
	}

	entries, firstClose := m.snapshotAndMarkClosed()
	if !firstClose {
		return nil // already closed
	}

	var result error
	for i := range entries {
		err := m.detachProbeEntry(entries[i])
		if result == nil {
			// Keeps the earliest non-nil error; a nil err leaves result nil.
			result = err
		}
	}
	return result
}

// pairEntry groups a probe entry with its syscall name for use during Close.
//
// hasLinks records whether the entry held an enter or exit link at the moment
// snapshotAndMarkClosed took the snapshot; detachProbeEntry uses it to decide
// whether to take the entry's attachMu.
type pairEntry struct {
	syscall  string
	entry    *probeEntry
	hasLinks bool
}

// snapshotAndMarkClosed flips the manager to closed and, under the same lock
// acquisition, captures one pairEntry per registered probe.
//
// The boolean result is false (with a nil slice) when the manager was already
// closed, so only the first caller receives a snapshot. The slice order
// follows map iteration and is therefore unspecified.
func (m *Manager) snapshotAndMarkClosed() ([]pairEntry, bool) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if m.closed {
		return nil, false
	}

	snapshot := make([]pairEntry, 0, len(m.probes))
	for name, rec := range m.probes {
		linked := rec.enterLink != nil || rec.exitLink != nil
		snapshot = append(snapshot, pairEntry{syscall: name, entry: rec, hasLinks: linked})
	}

	m.closed = true
	return snapshot, true
}

// detachProbeEntry tears down the links of a single probe entry on behalf of
// Close.
//
// The entry's attachMu is taken only when the Close snapshot saw live links
// on it. Under the manager lock the entry's links are moved out and its state
// is reset (no links, inactive, no error); the links are then destroyed
// without the manager lock held, enter first. Both links are always destroyed,
// even if the first Destroy fails.
//
// The returned error is also recorded as the entry's last error. When exactly
// one Destroy fails, that error is returned unchanged; when both fail, the two
// errors are combined with errors.Join so neither is lost.
func (m *Manager) detachProbeEntry(item pairEntry) error {
	if item.hasLinks {
		item.entry.attachMu.Lock()
		defer item.entry.attachMu.Unlock()
	}

	m.mu.Lock()
	enterLink := item.entry.enterLink
	exitLink := item.entry.exitLink
	item.entry.enterLink = nil
	item.entry.exitLink = nil
	item.entry.active = false
	item.entry.lastErr = nil
	m.mu.Unlock()

	var enterErr, exitErr error
	if enterLink != nil {
		enterErr = enterLink.Destroy()
	}
	if exitLink != nil {
		exitErr = exitLink.Destroy()
	}

	// Previously the exit error was dropped whenever the enter link had also
	// failed; keep both, but hand back a lone error as-is so callers comparing
	// error values see no difference.
	errForSyscall := enterErr
	switch {
	case enterErr == nil:
		errForSyscall = exitErr
	case exitErr != nil:
		errForSyscall = errors.Join(enterErr, exitErr)
	}

	m.setLastError(item.syscall, errForSyscall)
	return errForSyscall
}

// entryLocked returns the probe entry registered under syscall. It reads
// manager state without locking, so callers are expected to hold m.mu.
//
// Checks run in this order and the first failure wins: the manager is closed,
// the manager has no attacher, the syscall was never registered.
func (m *Manager) entryLocked(syscall string) (*probeEntry, error) {
	switch {
	case m.closed:
		return nil, errors.New("probe manager is closed")
	case m.attacher == nil:
		return nil, errors.New("probe manager has no attacher")
	}

	if rec, found := m.probes[syscall]; found {
		return rec, nil
	}
	return nil, fmt.Errorf("unknown syscall %q", syscall)
}

// setLastError stores err (which may be nil) as the last error of the probe
// registered under syscall, taking the manager lock itself. Unknown syscalls
// are ignored.
func (m *Manager) setLastError(syscall string, err error) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if rec, known := m.probes[syscall]; known {
		rec.lastErr = err
	}
}

// attachPair attaches the enter tracepoint and then the exit tracepoint,
// returning both links on success.
//
// The pair is all-or-nothing: if the enter attach fails nothing else is
// attempted, and if the exit attach fails the enter link is destroyed again.
// In both failure cases the returned links are nil; a failure to clean up the
// enter link is joined onto the exit attach error.
func attachPair(attacher Attacher, enterTP, exitTP string) (Link, Link, error) {
	enter, enterErr := attachOne(attacher, enterTP)
	if enterErr != nil {
		return nil, nil, enterErr
	}

	exit, exitErr := attachOne(attacher, exitTP)
	if exitErr == nil {
		return enter, exit, nil
	}

	// Roll back the half-attached pair before reporting the failure.
	cleanupErr := destroyLink("cleanup enter link after exit attach failure", enter)
	return nil, nil, errors.Join(exitErr, cleanupErr)
}

// destroyLink destroys link if it is non-nil. A Destroy failure is returned
// wrapped as "<action>: <error>"; a nil link or a successful Destroy yields
// nil.
func destroyLink(action string, link Link) error {
	if link != nil {
		if err := link.Destroy(); err != nil {
			return fmt.Errorf("%s: %w", action, err)
		}
	}
	return nil
}

// attachOne attaches the program for a single tracepoint and returns its link.
//
// An empty tracepoint name means "nothing to attach" and yields (nil, nil)
// without touching the attacher. Otherwise the program named "handle_" plus
// the tracepoint name is resolved through attacher and attached in the
// "syscalls" category. Lookup and attach failures are wrapped with the
// program or tracepoint name.
func attachOne(attacher Attacher, tracepoint string) (Link, error) {
	if len(tracepoint) == 0 {
		return nil, nil
	}

	const progPrefix = "handle_"
	progName := progPrefix + tracepoint

	prog, lookupErr := attacher.GetProgram(progName)
	if lookupErr != nil {
		return nil, fmt.Errorf("get program %s: %w", progName, lookupErr)
	}

	link, attachErr := prog.AttachTracepoint("syscalls", tracepoint)
	if attachErr != nil {
		return nil, fmt.Errorf("attach %s: %w", tracepoint, attachErr)
	}
	return link, nil
}