Skip to content

Commit 673a6ac

Browse files
committed
Handle processes whose main thread has exited
1 parent 76d5c10 commit 673a6ac

File tree

4 files changed

+45
-26
lines changed

4 files changed

+45
-26
lines changed

processmanager/manager.go

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ func New(ctx context.Context, includeTracers types.IncludedTracers, monitorInter
9595
interpreters: interpreters,
9696
exitEvents: make(map[libpf.PID]times.KTime),
9797
pidToProcessInfo: make(map[libpf.PID]*processInfo),
98+
pidMainThreadExit: make(libpf.Set[libpf.PID]),
9899
ebpf: ebpf,
99100
FileIDMapper: fileIDMapper,
100101
elfInfoCache: elfInfoCache,

processmanager/processinfo.go

+30-24
Original file line numberDiff line numberDiff line change
@@ -591,20 +591,12 @@ func (pm *ProcessManager) SynchronizeProcess(pr process.Process) {
591591
return
592592
}
593593
if len(mappings) == 0 {
594-
// Valid process without any (executable) mappings. All cases are
595-
// handled as process exit. Possible causes and reasoning:
596-
// 1. It is a kernel worker process. The eBPF does not send events from these,
597-
// but we can see kernel threads here during startup when tracer walks
598-
// /proc and tries to synchronize all PIDs it sees.
599-
// The PID should not exist anywhere, but we can still double check and
600-
// make sure the PID is not tracked.
601-
// 2. It is a normal process executing, but we just sampled it when the kernel
602-
// execve() is rebuilding the mappings and nothing is currently mapped.
603-
// In this case we can handle it as process exit because everything about
604-
// the process is changing: all mappings, comm, etc. If execve fails, we
605-
// reaped it early. If execve succeeds, we will get new synchronization
606-
// request soon, and handle it as a new process event.
607-
pm.processPIDExit(pid)
594+
// TODO: Verify that the main thread has really exited (e.g. /proc/PID/stat reports the zombie state 'Z') before recording it as a main-thread exit.
595+
log.Warnf("%v: main thread exit", pid)
596+
pm.mu.Lock()
597+
defer pm.mu.Unlock()
598+
pm.pidMainThreadExit[pid] = libpf.Void{}
599+
pm.ebpf.RemoveReportedPID(pid)
608600
return
609601
}
610602

@@ -630,23 +622,33 @@ func (pm *ProcessManager) SynchronizeProcess(pr process.Process) {
630622

631623
// CleanupPIDs executes a periodic synchronization of pidToProcessInfo table with system processes.
632624
// NOTE: Exported only for tracer.
633-
func (pm *ProcessManager) CleanupPIDs() {
625+
func (pm *ProcessManager) CleanupPIDs(fast bool) {
634626
deadPids := make([]libpf.PID, 0, 16)
635-
636627
pm.mu.RLock()
637-
for pid := range pm.pidToProcessInfo {
628+
defer func() {
629+
pm.mu.RUnlock()
630+
for _, pid := range deadPids {
631+
pm.processPIDExit(pid)
632+
}
633+
if len(deadPids) > 0 {
634+
log.Debugf("Cleaned up %d dead PIDs", len(deadPids))
635+
}
636+
}()
637+
638+
log.Warnf("Fast cleanup")
639+
for pid := range pm.pidMainThreadExit {
638640
if live, _ := proc.IsPIDLive(pid); !live {
639641
deadPids = append(deadPids, pid)
640642
}
641643
}
642-
pm.mu.RUnlock()
643-
644-
for _, pid := range deadPids {
645-
pm.processPIDExit(pid)
644+
if fast {
645+
return
646646
}
647-
648-
if len(deadPids) > 0 {
649-
log.Debugf("Cleaned up %d dead PIDs", len(deadPids))
647+
log.Warnf("Slow cleanup")
648+
for pid := range pm.pidToProcessInfo {
649+
if live, _ := proc.IsPIDLive(pid); !live {
650+
deadPids = append(deadPids, pid)
651+
}
650652
}
651653
}
652654

@@ -707,6 +709,10 @@ func (pm *ProcessManager) ProcessedUntil(traceCaptureKTime times.KTime) {
707709
}
708710

709711
delete(pm.pidToProcessInfo, pid)
712+
if _, ok := pm.pidMainThreadExit[pid]; ok {
713+
delete(pm.pidMainThreadExit, pid)
714+
log.Warnf("%v: cleanup", pid)
715+
}
710716

711717
for _, instance := range pm.interpreters[pid] {
712718
if err2 := instance.Detach(pm.ebpf, pid); err2 != nil {

processmanager/types.go

+3
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ type ProcessManager struct {
5353
// pidToProcessInfo keeps track of the executable memory mappings.
5454
pidToProcessInfo map[libpf.PID]*processInfo
5555

56+
// pidMainThreadExit keeps track of PIDs whose main thread (pid == tgid) has exited.
57+
pidMainThreadExit libpf.Set[libpf.PID]
58+
5659
// exitEvents records the pid exit time and is a list of pending exit events to be handled.
5760
exitEvents map[libpf.PID]times.KTime
5861

tracer/events.go

+11-2
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,23 @@ func (t *Tracer) StartPIDEventProcessor(ctx context.Context) {
4343

4444
// Process the PID events that are incoming in the Tracer channel.
4545
func (t *Tracer) processPIDEvents(ctx context.Context) {
46-
pidCleanupTicker := time.NewTicker(t.intervals.PIDCleanupInterval())
46+
var lastSlowPIDCleanup time.Time
47+
// TODO: Move this hard-coded fast-cleanup interval into times.go.
48+
pidCleanupTicker := time.NewTicker(5 * time.Second)
4749
defer pidCleanupTicker.Stop()
4850
for {
4951
select {
5052
case pid := <-t.pidEvents:
53+
log.Warnf("PID: %v", pid)
5154
t.processManager.SynchronizeProcess(process.New(pid))
5255
case <-pidCleanupTicker.C:
53-
t.processManager.CleanupPIDs()
56+
fast := true
57+
now := time.Now()
58+
if now.Sub(lastSlowPIDCleanup) >= t.intervals.PIDCleanupInterval() {
59+
fast = false
60+
lastSlowPIDCleanup = now
61+
}
62+
t.processManager.CleanupPIDs(fast)
5463
case <-ctx.Done():
5564
return
5665
}

0 commit comments

Comments
 (0)