Skip to content

Commit b03277f

Browse files
committed
Fleshing out readme
1 parent 38cebd9 commit b03277f

File tree

5 files changed

+89
-59
lines changed

5 files changed

+89
-59
lines changed

.gitignore

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
*.o
22
main
33
.vscode
4-
ebpf-syscall-stats
5-
6-
ebpf-syscalls-stats
4+
minderbinder
75
bpf_x86_bpfel.go

Makefile

+5-5
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
all: ebpf-syscalls-stats
1+
all: minderbinder
22

33
clean:
4-
rm -f ebpf-syscalls-stats
4+
rm -f minderbinder
55
rm -f bpf_x86_bpfel.go
66
rm -f *.o
77

8-
ebpf-syscalls-stats: $(wildcard *.go) bpf_x86_bpfel.o
8+
minderbinder: $(wildcard *.go) bpf_x86_bpfel.o
99
go build
1010

1111
bpf_x86_bpfel.o: ebpf/main.c
@@ -14,8 +14,8 @@ bpf_x86_bpfel.o: ebpf/main.c
1414
#
1515
# Debugging targets
1616
#
17-
testload: ebpf-syscalls-stats
18-
sudo ./ebpf-syscalls-stats --testLoad
17+
testload: minderbinder
18+
sudo ./minderbinder --testLoad
1919

2020
watch:
2121
while true; do \

README.md

+76-2
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,79 @@ _“Yossarian also thought Milo was a jerk; but he also know that Milo was a gen
66

77
## What is this?
88
Minderbinder is a tool that uses eBPF to inject failures into running processes.
9-
Presently it can inject failures into **system calls** by attaching kprobes to the system call handler
10-
uand failures into **outgoing network traffic** by atta:wching traffic to the [TC subsystem](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/8/html/configuring_and_managing_networking/linux-traffic-control_configuring-and-managing-networking).
9+
Presently it can inject failures into **system calls** by attaching kprobes to the system call handler and failures into **outgoing network traffic** by attaching a traffic filter to the [TC subsystem](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/8/html/configuring_and_managing_networking/linux-traffic-control_configuring-and-managing-networking).
10+
11+
## What's it for?
12+
Minderbinder aims to make it easy to generically inject failures into processes. At the moment you can write a config.yaml that describes the failures to inject and the processes to inject them into, start minderbinder, and see what happens.
13+
14+
## Running Minderbinder
15+
Check out [config.yaml](config.yaml) for a complete example. Minderbinder supports two different interventions - `syscall` and `outgoing_network`:
16+
17+
```yaml
18+
agents_of_chaos:
19+
syscall:
20+
# Stop curl from using `openat`
21+
- name: break_curl_openat
22+
syscall: openat
23+
ret_code: -2 # ENOENT / no such file or directory
24+
targets:
25+
- process_name: curl
26+
delay_ms: 100 # Milliseconds to wait after the process starts. For openat, this gives the process a chance to start properly.
27+
failure_rate: 100
28+
outgoing_network:
29+
- name: break_wget_network
30+
targets:
31+
- process_name: wget
32+
delay_ms: 100 # Milliseconds. In this case, 100ms should be enough to get a DNS request through for the endpoint, before breaking the actual transfer to the HTTP server
33+
failure_rate: 100
34+
```
35+
36+
To run minderbinder, you specify the configuration file, and if you are using `outgoing_network`, the interface to attach to:
37+
```bash
38+
sudo ./minderbinder --interface enp67s0 config.yaml
39+
```
40+
41+
42+
## Big Picture
43+
44+
The long-term goal is to provide a back-end for existing unit test frameworks, so that we can write component tests that can trivially break the code under test in interesting, chaos-related fashions. Something like this:
45+
46+
```go
47+
func TestYourAPIHandler_DownstreamFailure(t *testing.T) {
48+
// Create a new request
49+
req := httptest.NewRequest(http.MethodGet, "/your-api-endpoint", nil)
50+
51+
// Record the response
52+
rec := httptest.NewRecorder()
53+
54+
// Failure configuration
55+
cfg := FailureConfig{
56+
OutgoingNetwork: [] OutgoingNetworkFailure {
57+
{
58+
Protocol: "TCP",
59+
DestPort: 443,
60+
FailureRate: 100
61+
}
62+
}
63+
}
64+
65+
// Wrap the actual handler call with Minderbinder. Because Minderbinder is injecting
66+
// failures into this process using eBPF, we don't need to elaborately craft stubs here;
67+
// we can set up the failure conditions declaratively through the config above.
68+
minderbinder := &Minderbinder{}
69+
minderbinder.WithFailures(cfg, func() (*http.Response, error) {
70+
// Call the API handler
71+
YourAPIHandler(rec, req)
72+
return nil
73+
})
74+
75+
// We should get a 502 / bad gateway back
76+
assert.Equal(t, http.StatusBadGateway, rec.Code)
77+
assert.Equal(t, "Downstream service failed\n", rec.Body.String())
78+
}
79+
80+
```
81+
82+
This gives us a mechanism to test the resilience of our applications and services in the face of failures. Traditionally we would do this either by extensively stubbing _all_ the interesting interfaces around the application and injecting failures, or by using some chaos engineering tool to inject failures into the entire aggregate system in a deployed cloud environment. Because Minderbinder leverages eBPF for the failure injection, the code needed for each supported language would be straightforward, as it would simply have to configure the native minderbinder component.
83+
84+
t

config.go

-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ type SystemIncomingNetworkConfig struct {
3939
}
4040

4141
type Target struct {
42-
ProcessID int `yaml:"process_id,omitempty"`
4342
ProcessName string `yaml:"process_name,omitempty"`
4443
}
4544

probes.go

+7-48
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ func (lm *LinkManager) Close() {
3333
}
3434

3535
for _, l := range lm.netfilter {
36-
netlink.FilterDel(l)
36+
err := netlink.FilterDel(l)
37+
if err != nil {
38+
log.Printf("Failed deleting TC filter")
39+
}
3740
}
3841

3942
lm.objs.Close()
@@ -61,7 +64,7 @@ func setupBpfAndProbes(config *Config, outgoingTrafficInterface string) (*LinkMa
6164
validateConfig(config, outgoingTrafficInterface)
6265

6366
// Attach everything
64-
tracepoints, err := attachTracepoints(&linkManager.objs, config)
67+
tracepoints, err := attachTracepoints(&linkManager.objs)
6568
if err != nil {
6669
return nil, err
6770
}
@@ -108,28 +111,6 @@ func timestampSinceBoot() (uint64, error) {
108111
// Validates config before we load it
109112
func validateConfig(config *Config, outgoingTrafficInterface string) {
110113

111-
// Explicitly block process ID targets for now. By supporting `comm` targeting only
112-
// it is easy for us to use 'delay' to track time-since-process-started, since we
113-
// see the process starting.
114-
// At some point we should implement this and remove this check!
115-
116-
for _, netConfig := range config.AgentsOfChaos.OutgoingNetwork {
117-
for _, target := range netConfig.Targets {
118-
if target.ProcessID != 0 && target.ProcessName == "" {
119-
log.Fatalf("Process ID not yet supported for OutgoingNetwork %s", netConfig.Name)
120-
}
121-
}
122-
}
123-
124-
for _, syscallConfig := range config.AgentsOfChaos.Syscall {
125-
for _, target := range syscallConfig.Targets {
126-
if target.ProcessID != 0 && target.ProcessName == "" {
127-
log.Fatalf("Process ID not yet supported for OutgoingNetwork %s", syscallConfig.Name)
128-
}
129-
130-
}
131-
}
132-
133114
// If we have any outgoing network configuration at all, we need an outgoing traffic interface
134115
if len(config.AgentsOfChaos.OutgoingNetwork) > 0 && outgoingTrafficInterface == "" {
135116
log.Fatalf("outgoing_network configurations provided, but no --interface argument given")
@@ -140,7 +121,7 @@ func loadProbeInfo(objs *bpfObjects, config *Config) error {
140121

141122
// Get current time in nanoseconds since boot. We will use this to anchor any "not before"
142123
// probes.
143-
currentTimeNs, err := timestampSinceBoot()
124+
_, err := timestampSinceBoot()
144125
if err != nil {
145126
return err
146127
}
@@ -177,29 +158,7 @@ func loadProbeInfo(objs *bpfObjects, config *Config) error {
177158
if err := objs.SyscallTargetConfig.Update(key, value, ebpf.UpdateAny); err != nil {
178159
return err
179160
}
180-
181-
} else if target.ProcessID != 0 {
182-
//
183-
// !! This whole path is blocked by the configuration loading for now !! .
184-
// !! We only support loading comm targeting. Delay after start logic needs some work !!
185-
//
186-
log.Printf("Loading syscall target: %s, pid=%d", syscall.Syscall, target.ProcessID)
187-
188-
key := uint32(target.ProcessID)
189-
190-
value := bpfSyscallTargetT{}
191-
value.FailureConfig.DelayAfterStartNs = currentTimeNs + (uint64(syscall.DelayMs) * 1_000_000)
192-
value.FailureConfig.FailureRatePercent = uint32(syscall.FailureRate)
193-
value.FailureConfig.InjectedRetCode = uint32(syscall.RetCode)
194-
value.FailureConfig.SyscallId = syscallId
195-
196-
if err := objs.SyscallTargets.Update(key, value, ebpf.UpdateAny); err != nil {
197-
return err
198-
}
199-
} else {
200-
log.Fatalf("Need to specify either a ProcessName or PID for syscall configuration named %s", syscall.Name)
201161
}
202-
203162
}
204163
}
205164

@@ -266,7 +225,7 @@ func getUniqueSyscalls(config *Config) []string {
266225

267226
}
268227

269-
func attachTracepoints(objs *bpfObjects, config *Config) ([]link.Link, error) {
228+
func attachTracepoints(objs *bpfObjects) ([]link.Link, error) {
270229
var tracepoints []link.Link
271230

272231
tpInfo := []struct {

0 commit comments

Comments
 (0)