Fleshing out readme

scottgerring · scottgerring · commit b03277f67c1b · 2024-08-29T10:25:15.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,5 @@
 *.o
 main
 .vscode
-ebpf-syscall-stats
-
-ebpf-syscalls-stats
+minderbinder
 bpf_x86_bpfel.go
diff --git a/Makefile b/Makefile
@@ -1,11 +1,11 @@
-all: ebpf-syscalls-stats
+all: minderbinder
 
 clean:
-	rm -f ebpf-syscalls-stats
+	rm -f minderbinder
 	rm -f bpf_x86_bpfel.go
 	rm -f *.o
 
-ebpf-syscalls-stats: $(wildcard *.go) bpf_x86_bpfel.o
+minderbinder: $(wildcard *.go) bpf_x86_bpfel.o
 	go build
 
 bpf_x86_bpfel.o: ebpf/main.c
@@ -14,8 +14,8 @@ bpf_x86_bpfel.o: ebpf/main.c
 # 
 # Debugging targets
 #
-testload: ebpf-syscalls-stats
-	sudo ./ebpf-syscalls-stats --testLoad
+testload: minderbinder
+	sudo ./minderbinder --testLoad
 
 watch:
 	while true; do \
diff --git a/README.md b/README.md
@@ -6,5 +6,79 @@ _“Yossarian also thought Milo was a jerk; but he also know that Milo was a gen
 
 ## What is this?
 Minderbinder is a tool that uses eBPF to inject failures into running processes. 
-Presently it can inject failures into **system calls** by attaching kprobes to the system call handler
-uand failures into **outgoing network traffic** by atta:wching traffic to the [TC subsystem](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/8/html/configuring_and_managing_networking/linux-traffic-control_configuring-and-managing-networking). 
+Presently it can inject failures into **system calls** by attaching kprobes to the system call handler and failures into **outgoing network traffic** by attaching a traffic filter to the [TC subsystem](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/8/html/configuring_and_managing_networking/linux-traffic-control_configuring-and-managing-networking). 
+
+## What's it for?
+Minderbinder aims to make it easy to generically inject failures into processes. At the moment you can write a config.yaml that describes the failures to inject and the processes to inject them into, start minderbinder, and see what happens. 
+
+## Running Minderbinder
+Check out [config.yaml](config.yaml) for a complete example. Minderbinder supports two different interventions - `syscall` and `outgoing_network`:
+
+```yaml
+agents_of_chaos:
+  syscall:  
+    # Stop curl from using `openat`
+    - name: break_curl_openat
+      syscall: openat
+      ret_code: -2 # ENOENT / no such file or directory
+      targets:
+        - process_name: curl
+      delay_ms: 100 # Milliseconds to wait after the process starts. For openat, this gives the process a chance to start properly.
+      failure_rate: 100
+  outgoing_network:
+    - name: break_wget_network
+      targets:
+        - process_name: wget
+      delay_ms: 100 # Milliseconds. In this case, 100ms should be enough to get a DNS request through for the endpoint, before breaking the actual transfer to the HTTP server
+      failure_rate: 100      
+```
+
+To run minderbinder, you specify the configuration file, and if you are using `outgoing_network`, the interface to attach to:
+ ```bash
+sudo ./minderbinder --interface enp67s0 config.yaml
+```
+
+
+## Big Picture
+
+The long-term goal is to provide a back-end for existing unit test frameworks, so that we can write component tests that can trivially break the code under test in interesting, chaos-related fashions. Something like this:
+
+```go
+func TestYourAPIHandler_DownstreamFailure(t *testing.T) {
+	// Create a new request
+	req := httptest.NewRequest(http.MethodGet, "/your-api-endpoint", nil)
+
+	// Record the response
+	rec := httptest.NewRecorder()
+
+	// Failure configuration
+	cfg := FailureConfig{
+		OutgoingNetwork: [] OutgoingNetworkFailure {
+            {
+                Protocol: "TCP",
+                DestPort: 443,
+                FailureRate: 100
+            }
+        }
+	}
+
+	// Wrap the actual handler call with Minderbinder. Because Minderbinder is injecting
+    // failures into this process using eBPF, we don't need to elaborately craft stubs here;
+    // we can set up the failures declaratively through the configuration above.
+    minderbinder := &Minderbinder{}
+	minderbinder.WithFailures(cfg, func() (*http.Response, error) {
+		// Call the API handler
+		YourAPIHandler(rec, req)
+		return nil
+	})
+
+	// We should get a 502 / bad gateway back
+	assert.Equal(t, http.StatusBadGateway, rec.Code)
+	assert.Equal(t, "Downstream service failed\n", rec.Body.String())
+}
+
+```
+
+This gives us a mechanism to test the resilience of our applications and services in the face of failures. Traditionally we would do this either by extensively stubbing _all_ the interesting interfaces around the application and injecting failures, or by using some chaos engineering tool to inject failures into the entire aggregate system in a deployed cloud environment. Because Minderbinder leverages eBPF for the failure injection, the code needed for each supported language would be straightforward, as it would simply have to configure the native minderbinder component.
+
+
diff --git a/config.go b/config.go
@@ -39,7 +39,6 @@ type SystemIncomingNetworkConfig struct {
 }
 
 type Target struct {
-	ProcessID   int    `yaml:"process_id,omitempty"`
 	ProcessName string `yaml:"process_name,omitempty"`
 }
 
diff --git a/probes.go b/probes.go
@@ -33,7 +33,10 @@ func (lm *LinkManager) Close() {
 	}
 
 	for _, l := range lm.netfilter {
-		netlink.FilterDel(l)
+		err := netlink.FilterDel(l)
+		if err != nil {
+			log.Printf("Failed deleting TC filter")
+		}
 	}
 
 	lm.objs.Close()
@@ -61,7 +64,7 @@ func setupBpfAndProbes(config *Config, outgoingTrafficInterface string) (*LinkMa
 	validateConfig(config, outgoingTrafficInterface)
 
 	// Attach everything
-	tracepoints, err := attachTracepoints(&linkManager.objs, config)
+	tracepoints, err := attachTracepoints(&linkManager.objs)
 	if err != nil {
 		return nil, err
 	}
@@ -108,28 +111,6 @@ func timestampSinceBoot() (uint64, error) {
 // Validates config before we load it
 func validateConfig(config *Config, outgoingTrafficInterface string) {
 
-	// Explicitly block process ID targets for now. By supporting `comm` targeting only
-	// it is easy for us to use 'delay' to track time-since-process-started, since we
-	// see the process starting.
-	// At some point we should implement this and remove this check!
-
-	for _, netConfig := range config.AgentsOfChaos.OutgoingNetwork {
-		for _, target := range netConfig.Targets {
-			if target.ProcessID != 0 && target.ProcessName == "" {
-				log.Fatalf("Process ID not yet supported for OutgoingNetwork %s", netConfig.Name)
-			}
-		}
-	}
-
-	for _, syscallConfig := range config.AgentsOfChaos.Syscall {
-		for _, target := range syscallConfig.Targets {
-			if target.ProcessID != 0 && target.ProcessName == "" {
-				log.Fatalf("Process ID not yet supported for OutgoingNetwork %s", syscallConfig.Name)
-			}
-
-		}
-	}
-
 	// If we have any outgoing network configuration at all, we need an outgoing traffic interface
 	if len(config.AgentsOfChaos.OutgoingNetwork) > 0 && outgoingTrafficInterface == "" {
 		log.Fatalf("outgoing_network configurations provided, but no --interface argument given")
@@ -140,7 +121,7 @@ func loadProbeInfo(objs *bpfObjects, config *Config) error {
 
 	// Get current time in nanoseconds since boot. We will use this to anchor any "not before"
 	// probes.
-	currentTimeNs, err := timestampSinceBoot()
+	_, err := timestampSinceBoot()
 	if err != nil {
 		return err
 	}
@@ -177,29 +158,7 @@ func loadProbeInfo(objs *bpfObjects, config *Config) error {
 				if err := objs.SyscallTargetConfig.Update(key, value, ebpf.UpdateAny); err != nil {
 					return err
 				}
-
-			} else if target.ProcessID != 0 {
-				//
-				// !! This whole path is blocked by the configuration loading for now !! .
-				// !! We only support loading comm targeting. Delay after start logic needs some work !!
-				//
-				log.Printf("Loading syscall target: %s, pid=%d", syscall.Syscall, target.ProcessID)
-
-				key := uint32(target.ProcessID)
-
-				value := bpfSyscallTargetT{}
-				value.FailureConfig.DelayAfterStartNs = currentTimeNs + (uint64(syscall.DelayMs) * 1_000_000)
-				value.FailureConfig.FailureRatePercent = uint32(syscall.FailureRate)
-				value.FailureConfig.InjectedRetCode = uint32(syscall.RetCode)
-				value.FailureConfig.SyscallId = syscallId
-
-				if err := objs.SyscallTargets.Update(key, value, ebpf.UpdateAny); err != nil {
-					return err
-				}
-			} else {
-				log.Fatalf("Need to specify either a ProcessName or PID for syscall configuration named %s", syscall.Name)
 			}
-
 		}
 	}
 
@@ -266,7 +225,7 @@ func getUniqueSyscalls(config *Config) []string {
 
 }
 
-func attachTracepoints(objs *bpfObjects, config *Config) ([]link.Link, error) {
+func attachTracepoints(objs *bpfObjects) ([]link.Link, error) {
 	var tracepoints []link.Link
 
 	tpInfo := []struct {

Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,6 @@ type SystemIncomingNetworkConfig struct {`
`39`	`39`	`}`
`40`	`40`
`41`	`41`	`type Target struct {`
`42`		- ProcessID int `yaml:"process_id,omitempty"`
`43`	`42`	ProcessName string `yaml:"process_name,omitempty"`
`44`	`43`	`}`
`45`	`44`