Skip to content

Commit 2905324

Browse files
author
Miroslav Shipkovenski
committed
Fix race condition between zombie reaper and RPC command execution
The sidecar container had a race condition between the reapZombies goroutine and the RPC command handler (CmdExec.Run). Both called Wait4 on the same child processes: - reapZombies called Wait4(-1, WNOHANG) to reap any exited child. - CmdExec.Run called cmd.Run(), which internally calls cmd.Wait(), which calls waitid(P_PID, <pid>) on the specific child process. Since the kernel only stores one exit status per process and the first Wait4/waitid call to match a PID consumes it, the reaper could steal the exit status before cmd.Wait() collected it. This caused cmd.Wait() to fail with "waitid: no child processes" (ECHILD), which surfaced in kapp-controller logs as: Fetching resources: waitid: no child processes The fix introduces a centralized Reaper that is the sole caller of Wait4 in the sidecar process. CmdExec.Run now uses cmd.Start() instead of cmd.Run() and registers the child PID with the Reaper via a channel. The Reaper's Wait4(-1) loop dispatches exit statuses to tracked PIDs through their channels, and silently discards statuses for untracked PIDs (orphaned zombies). A mutex ensures cmd.Start() and PID registration are atomic with respect to the reaping loop, preventing the Reaper from consuming an exit status before the channel is registered. Made-with: Cursor ai-assisted=yes Made-with: Cursor Signed-off-by: Miroslav Shipkovenski <miroslav.shipkovenski@broadcom.com>
1 parent 84c5e28 commit 2905324

File tree

6 files changed

+368
-39
lines changed

6 files changed

+368
-39
lines changed

cmd/controller/sidecarexec.go

Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,17 @@
44
package main
55

66
import (
7-
"syscall"
8-
"time"
9-
10-
"carvel.dev/kapp-controller/pkg/exec"
117
"carvel.dev/kapp-controller/pkg/sidecarexec"
12-
"github.com/go-logr/logr"
138
"sigs.k8s.io/controller-runtime/pkg/log/zap"
149
)
1510

1611
func sidecarexecMain() {
1712
mainLog := zap.New(zap.UseDevMode(false)).WithName("kc-sidecarexec")
1813
mainLog.Info("start sidecarexec", "version", Version)
1914

20-
go reapZombies(mainLog)
15+
reaper := sidecarexec.NewReaper(mainLog)
16+
go reaper.Run()
2117

22-
localCmdRunner := exec.NewPlainCmdRunner()
2318
opts := sidecarexec.ServerOpts{
2419
AllowedCmdNames: []string{
2520
// Fetch (calls impgkg and others internally)
@@ -29,25 +24,10 @@ func sidecarexecMain() {
2924
},
3025
}
3126

32-
server := sidecarexec.NewServer(localCmdRunner, opts, mainLog)
27+
server := sidecarexec.NewServer(reaper, opts, mainLog)
3328

3429
err := server.Serve()
3530
if err != nil {
3631
mainLog.Error(err, "Serving RPC")
3732
}
3833
}
39-
40-
func reapZombies(log logr.Logger) {
41-
log.Info("starting zombie reaper")
42-
43-
for {
44-
var status syscall.WaitStatus
45-
46-
pid, _ := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
47-
if pid <= 0 {
48-
time.Sleep(1 * time.Second)
49-
} else {
50-
continue
51-
}
52-
}
53-
}

pkg/sidecarexec/cmd_exec.go

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ package sidecarexec
66
import (
77
"bytes"
88
"fmt"
9+
"io"
910
"os"
1011
goexec "os/exec"
11-
12-
"carvel.dev/kapp-controller/pkg/exec"
12+
"sync"
1313
)
1414

1515
// CmdInput describes a command to run.
@@ -31,11 +31,13 @@ type CmdOutput struct {
3131

3232
// CmdExec provides RPC interface for command execution.
3333
type CmdExec struct {
34-
local exec.CmdRunner
34+
reaper *Reaper
3535
allowedCmdNames map[string]struct{}
3636
}
3737

3838
// Run executes a command (out of a set of allowed ones).
39+
// It uses cmd.Start() and delegates process wait to the centralized Reaper
40+
// to avoid a race with the zombie reaping loop.
3941
func (r CmdExec) Run(input CmdInput, output *CmdOutput) error {
4042
if _, found := r.allowedCmdNames[input.Command]; !found {
4143
return fmt.Errorf("Command '%s' is not allowed", input.Command)
@@ -54,20 +56,57 @@ func (r CmdExec) Run(input CmdInput, output *CmdOutput) error {
5456
cmd.Dir = input.Dir
5557
}
5658

57-
var stdout, stderr bytes.Buffer
58-
cmd.Stdout = &stdout
59-
cmd.Stderr = &stderr
60-
61-
err := r.local.Run(cmd)
59+
stdoutPipe, err := cmd.StdoutPipe()
6260
if err != nil {
63-
output.Error = err.Error()
64-
output.ExitCode = -1
65-
if exitError, ok := err.(*goexec.ExitError); ok {
66-
output.ExitCode = exitError.ExitCode()
67-
}
61+
return fmt.Errorf("Creating stdout pipe: %s", err)
6862
}
63+
stderrPipe, err := cmd.StderrPipe()
64+
if err != nil {
65+
return fmt.Errorf("Creating stderr pipe: %s", err)
66+
}
67+
68+
// Hold the reaper lock across Start + Track to guarantee the reaping
69+
// loop cannot consume this child's exit status before we register it.
70+
r.reaper.Lock()
71+
if err := cmd.Start(); err != nil {
72+
r.reaper.Unlock()
73+
return fmt.Errorf("Starting command: %s", err)
74+
}
75+
statusCh := r.reaper.TrackLocked(cmd.Process.Pid)
76+
r.reaper.Unlock()
77+
78+
var stdout, stderr bytes.Buffer
79+
var wg sync.WaitGroup
80+
wg.Add(2)
81+
go func() {
82+
defer wg.Done()
83+
io.Copy(&stdout, stdoutPipe)
84+
}()
85+
go func() {
86+
defer wg.Done()
87+
io.Copy(&stderr, stderrPipe)
88+
}()
89+
90+
// Wait for all output to be read before checking exit status.
91+
// The pipes close when the process exits, so this will unblock.
92+
wg.Wait()
93+
94+
// Now wait for the Reaper to deliver the exit status.
95+
waitStatus := <-statusCh
96+
97+
// Tell Go's os/exec we handled the wait ourselves.
98+
cmd.Process.Release()
6999

70100
output.Stdout = stdout.Bytes()
71101
output.Stderr = stderr.Bytes()
102+
103+
if waitStatus.Signaled() {
104+
output.ExitCode = -1
105+
output.Error = fmt.Sprintf("process killed by signal %d", waitStatus.Signal())
106+
} else if waitStatus.Exited() && waitStatus.ExitStatus() != 0 {
107+
output.ExitCode = waitStatus.ExitStatus()
108+
output.Error = fmt.Sprintf("exit status %d", waitStatus.ExitStatus())
109+
}
110+
72111
return nil
73112
}

pkg/sidecarexec/export_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// Copyright 2024 The Carvel Authors.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package sidecarexec
5+
6+
import "syscall"
7+
8+
// CmdExec_ForTesting creates a CmdExec with the given reaper and allowed
9+
// command names. This is only available in test builds.
10+
func CmdExec_ForTesting(reaper *Reaper, allowedCmds []string) CmdExec {
11+
allowed := map[string]struct{}{}
12+
for _, cmd := range allowedCmds {
13+
allowed[cmd] = struct{}{}
14+
}
15+
return CmdExec{reaper: reaper, allowedCmdNames: allowed}
16+
}
17+
18+
// Dispatch_ForTesting exposes the reaper's dispatch method for unit tests.
19+
func Dispatch_ForTesting(r *Reaper, pid int, status syscall.WaitStatus) bool {
20+
return r.dispatch(pid, status)
21+
}

pkg/sidecarexec/reaper.go

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// Copyright 2024 The Carvel Authors.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package sidecarexec
5+
6+
import (
7+
"sync"
8+
"syscall"
9+
"time"
10+
11+
"github.com/go-logr/logr"
12+
)
13+
14+
// Reaper is a centralized zombie process reaper that eliminates the race
15+
// condition between a blanket Wait4(-1) and Go's os/exec cmd.Wait().
16+
// It is the sole goroutine calling Wait4, and dispatches exit statuses
17+
// back to callers that registered interest in specific PIDs.
18+
type Reaper struct {
19+
mu sync.Mutex
20+
tracked map[int]chan syscall.WaitStatus
21+
log logr.Logger
22+
}
23+
24+
// NewReaper creates a new Reaper. Start the reaping loop by calling Run
25+
// in a separate goroutine.
26+
func NewReaper(log logr.Logger) *Reaper {
27+
return &Reaper{
28+
tracked: make(map[int]chan syscall.WaitStatus),
29+
log: log.WithName("reaper"),
30+
}
31+
}
32+
33+
// Lock acquires the reaper's mutex. This must be held while starting a
34+
// process and calling TrackLocked, so that the reaping loop cannot
35+
// consume the process exit status before tracking is registered.
36+
func (r *Reaper) Lock() { r.mu.Lock() }
37+
38+
// Unlock releases the reaper's mutex.
39+
func (r *Reaper) Unlock() { r.mu.Unlock() }
40+
41+
// TrackLocked registers a PID so that when Wait4 reaps it, the wait status
42+
// is delivered to the returned channel. The caller MUST hold r.Lock() and
43+
// must have started the process while holding the lock, ensuring no race
44+
// between process start and tracking registration.
45+
func (r *Reaper) TrackLocked(pid int) <-chan syscall.WaitStatus {
46+
ch := make(chan syscall.WaitStatus, 1)
47+
r.tracked[pid] = ch
48+
return ch
49+
}
50+
51+
// dispatch delivers a wait status to a tracked PID's channel.
52+
// Returns true if the PID was being tracked (an RPC-managed process).
53+
func (r *Reaper) dispatch(pid int, status syscall.WaitStatus) bool {
54+
r.mu.Lock()
55+
ch, ok := r.tracked[pid]
56+
if ok {
57+
delete(r.tracked, pid)
58+
}
59+
r.mu.Unlock()
60+
61+
if ok {
62+
ch <- status
63+
}
64+
return ok
65+
}
66+
67+
// Run is the reaping loop and must be the only goroutine calling Wait4
68+
// in this process. It runs forever.
69+
func (r *Reaper) Run() {
70+
r.log.Info("starting zombie reaper")
71+
72+
for {
73+
var status syscall.WaitStatus
74+
75+
// The mutex is acquired here so that Wait4 cannot race with
76+
// a concurrent cmd.Start() + TrackLocked() sequence.
77+
r.mu.Lock()
78+
pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
79+
80+
if pid > 0 {
81+
ch, ok := r.tracked[pid]
82+
if ok {
83+
delete(r.tracked, pid)
84+
}
85+
r.mu.Unlock()
86+
87+
if ok {
88+
ch <- status
89+
r.log.V(1).Info("reaped tracked process", "pid", pid)
90+
} else {
91+
r.log.V(1).Info("reaped orphaned zombie", "pid", pid)
92+
}
93+
continue
94+
}
95+
r.mu.Unlock()
96+
97+
if pid == 0 || (pid == -1 && err == syscall.ECHILD) {
98+
time.Sleep(1 * time.Second)
99+
continue
100+
}
101+
102+
r.log.V(1).Info("wait4 unexpected error", "err", err)
103+
time.Sleep(1 * time.Second)
104+
}
105+
}

0 commit comments

Comments
 (0)