summaryrefslogtreecommitdiff
path: root/cmd/ioworkload/scenario_priority.go
blob: 548df1b9c2ebee286049ab7c2fcdd1efde5bd0fa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package main

import (
	"fmt"
	"runtime"
	"syscall"

	"golang.org/x/sys/unix"
)

// prioProcess is PRIO_PROCESS (value 0): the "which" selector telling
// getpriority/setpriority that "who" identifies a process (rather than a
// process group or user). Paired with who == 0 it targets the caller itself, so
// the calls made with it in this file are entirely self-targeted and need no
// privilege.
const prioProcess = 0

// niceOffset is the bias the kernel applies to getpriority's return value. The
// raw getpriority(2) syscall never returns a negative number (that range is
// reserved for -errno), so instead of returning the nice value directly
// (-20..19) it returns 20 - nice (i.e. 1..40). The real nice value is therefore
// recovered as niceOffset - returned: a return of 20 is nice 0, a return of 1 is
// nice 19, and a return of 40 is nice -20. setpriority(2), by contrast, takes
// the real nice value (-20..19) directly, so the conversion must happen before
// the value is re-applied.
const niceOffset = 20

// priorityBasic exercises the SAFE, NON-DISRUPTIVE members of the priority pair
// (getpriority/setpriority), entirely self-targeted (PRIO_PROCESS, who 0 == the
// caller), so it changes no other process and leaves the caller's nice value
// unchanged:
//
//   - getpriority(PRIO_PROCESS, 0)  reads the current nice value (a pure read).
//   - setpriority(PRIO_PROCESS, 0, currentNice)  re-applies the EXACT nice value
//     just read back, so the priority is left unchanged.
//
// On Linux the nice value is a per-thread attribute and who == 0 addresses the
// calling thread. A goroutine may be rescheduled onto a different OS thread
// between two syscalls, so the goroutine is pinned to its thread for the
// duration of the round-trip; otherwise the value read from one thread could be
// written to another.
//
// Re-applying the current nice value needs no privilege: lowering the priority
// (a larger nice) is always allowed, and writing back the unchanged value is a
// no-op the kernel permits regardless of RLIMIT_NICE. This mirrors the safe
// sched_setaffinity round-trip in schedRoundtripAffinity (scenario_sched.go),
// where we likewise read a value and write the identical value straight back.
//
// Both syscalls classify as FamilyProcess with a KindNull enter (PRIO_PROCESS is
// an opcode, not an fd) and an UNCLASSIFIED return (getpriority returns a nice
// value, NOT a byte count), so the scenario exists purely to fire the enter
// tracepoints end-to-end.
//
// It returns a wrapped error naming the failing syscall, or nil on success.
func priorityBasic() error {
	// Keep the read and the write on the same OS thread (see doc comment).
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// getpriority returns 20 - nice (see niceOffset); recover the real nice.
	prio, err := unix.Getpriority(prioProcess, 0)
	if err != nil {
		return fmt.Errorf("getpriority(PRIO_PROCESS, 0): %w", err)
	}
	currentNice := niceOffset - prio

	// Re-apply the IDENTICAL nice value we just read: a no-op change that needs
	// no privilege.
	if err := unix.Setpriority(prioProcess, 0, currentNice); err != nil {
		return fmt.Errorf("setpriority(PRIO_PROCESS, 0, %d) (restore same nice): %w", currentNice, err)
	}
	return nil
}

// ioprioWhoProcess is IOPRIO_WHO_PROCESS (value 1): the "which" selector telling
// ioprio_get/ioprio_set that "who" identifies a single process or thread (rather
// than a process group or user). Paired with who == 0 it targets the caller
// itself, so the calls made with it in this file are entirely self-targeted and
// need no privilege.
const ioprioWhoProcess = 1

// ioprioClassShift is the bit position of the I/O-priority class within the
// 16-bit ioprio value: the top 3 bits hold the class, the low 13 bits the level.
// IOPRIO_PRIO_VALUE(class, level) == (class << ioprioClassShift) | level, and
// conversely the class of a packed value is value >> ioprioClassShift.
const ioprioClassShift = 13

// ioprioClassBE is IOPRIO_CLASS_BE (value 2), the best-effort scheduling class.
// Setting best-effort on SELF is unprivileged; the realtime class
// (IOPRIO_CLASS_RT == 1) would require CAP_SYS_ADMIN, so this file never uses
// it.
const ioprioClassBE = 2

// ioprioBasic exercises the SAFE, NON-DISRUPTIVE I/O-priority pair
// (ioprio_get/ioprio_set), the I/O analogues of getpriority/setpriority. There
// is no x/sys wrapper for these, so we issue the raw syscalls directly. Both are
// self-targeted (IOPRIO_WHO_PROCESS, who 0 == the caller), so they affect no
// other process:
//
//   - ioprio_get(IOPRIO_WHO_PROCESS, 0)  READS the current I/O priority. The
//     return is a packed (class<<13)|level value. When no explicit priority is
//     set the class is IOPRIO_CLASS_NONE (0); depending on the kernel the level
//     bits are then either 0 or non-zero (ionice shows the latter as
//     "none: prio 4").
//   - ioprio_set(IOPRIO_WHO_PROCESS, 0, value)  re-applies the value just read.
//     If ioprio_get reported class NONE (no explicit priority), we instead set
//     best-effort level 4 — a harmless, unprivileged choice. A class-NONE value
//     must not be echoed back verbatim: ioprio_set rejects class NONE with a
//     non-zero level with EINVAL. Best-effort needs no privilege; we
//     deliberately avoid the realtime class, which needs CAP_SYS_ADMIN.
//
// With who == 0 both calls operate on the calling thread, and a goroutine may be
// rescheduled onto a different OS thread between two syscalls, so the goroutine
// is pinned to its thread for the duration of the round-trip.
//
// Both syscalls classify as FamilyProcess with a KindNull enter (IOPRIO_WHO_*
// is an opcode, not an fd) and an UNCLASSIFIED return (the value is an
// I/O-priority word, NOT a byte count), so the scenario exists purely to fire
// the enter tracepoints end-to-end, mirroring priorityBasic above.
//
// It returns a wrapped error naming the failing syscall, or nil on success.
func ioprioBasic() error {
	const (
		// ioprioClassNone is IOPRIO_CLASS_NONE: no explicit I/O priority set.
		ioprioClassNone = 0
		// ioprioClassMask isolates the 3 class bits after shifting.
		ioprioClassMask = 0x7
	)

	// Keep the read and the write on the same OS thread (see doc comment).
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// ioprio_get returns -1/errno on failure, else the packed priority (>= 0).
	ret, _, errno := syscall.Syscall(unix.SYS_IOPRIO_GET, ioprioWhoProcess, 0, 0)
	if errno != 0 {
		return fmt.Errorf("ioprio_get(IOPRIO_WHO_PROCESS, 0): %w", errno)
	}

	ioprioValue := ret
	if (ioprioValue>>ioprioClassShift)&ioprioClassMask == ioprioClassNone {
		// No explicit I/O priority is set; choose a harmless best-effort value
		// (class BE, level 4) so ioprio_set has a valid, unprivileged argument
		// regardless of what level bits the kernel reported for class NONE.
		ioprioValue = uintptr(ioprioClassBE<<ioprioClassShift | 4)
	}

	// Re-apply the value we just read (or the safe best-effort default): a
	// non-disruptive, unprivileged self-update.
	_, _, errno = syscall.Syscall(unix.SYS_IOPRIO_SET, ioprioWhoProcess, 0, ioprioValue)
	if errno != 0 {
		return fmt.Errorf("ioprio_set(IOPRIO_WHO_PROCESS, 0, %#x): %w", ioprioValue, errno)
	}
	return nil
}