Skip to content

Commit 7512b9f

Browse files
authored
Revert "Revert "[supervisor] improve docker activation (#18143)" (#18236)"
This reverts commit 9b21205.
1 parent 0bb6aaa commit 7512b9f

File tree

5 files changed

+293
-74
lines changed

5 files changed

+293
-74
lines changed

components/supervisor/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ require (
2424
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
2525
github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3
2626
github.com/improbable-eng/grpc-web v0.14.0
27-
github.com/mailru/easygo v0.0.0-20190618140210-3c14a0dc985f
2827
github.com/prometheus/client_golang v1.14.0
2928
github.com/prometheus/client_model v0.3.0
3029
github.com/prometheus/common v0.42.0
@@ -116,6 +115,7 @@ require (
116115
github.com/mitchellh/go-homedir v1.1.0 // indirect
117116
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
118117
github.com/modern-go/reflect2 v1.0.2 // indirect
118+
github.com/moredure/easygo v0.0.0-20220122214504-21cd2ebdd15b
119119
github.com/opencontainers/go-digest v1.0.0 // indirect
120120
github.com/opencontainers/image-spec v1.0.2 // indirect
121121
github.com/opencontainers/runtime-spec v1.0.2 // indirect

components/supervisor/go.sum

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

components/supervisor/pkg/activation/activation.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ import (
1010
"os"
1111
"sync"
1212

13-
"github.com/mailru/easygo/netpoll"
13+
"github.com/gitpod-io/gitpod/common-go/log"
14+
"github.com/moredure/easygo/netpoll"
1415
"golang.org/x/xerrors"
1516
)
1617

@@ -19,16 +20,22 @@ type Callback func(socketFD *os.File) error
1920

2021
// Listen polls on the listener and calls callback when someone writes to it.
2122
func Listen(ctx context.Context, l net.Listener, activate Callback) error {
22-
poller, err := netpoll.New(nil)
23+
poller, err := netpoll.New(&netpoll.Config{
24+
OnWaitError: func(err error) {
25+
log.WithError(err).Error("netpoll: wait loop error")
26+
},
27+
})
2328
if err != nil {
2429
return err
2530
}
31+
defer poller.Close()
2632

2733
// Get netpoll descriptor with EventRead|EventEdgeTriggered.
2834
desc, err := netpoll.HandleListener(l, netpoll.EventRead|netpoll.EventEdgeTriggered)
2935
if err != nil {
3036
return err
3137
}
38+
defer desc.Close()
3239

3340
var (
3441
runc = make(chan bool, 1)
Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package supervisor
6+
7+
import (
8+
"context"
9+
"errors"
10+
"fmt"
11+
"io"
12+
"net"
13+
"os"
14+
"os/exec"
15+
"strings"
16+
"sync"
17+
"syscall"
18+
"time"
19+
20+
"github.com/cenkalti/backoff/v4"
21+
"golang.org/x/xerrors"
22+
23+
"github.com/gitpod-io/gitpod/common-go/analytics"
24+
"github.com/gitpod-io/gitpod/common-go/log"
25+
"github.com/gitpod-io/gitpod/common-go/process"
26+
"github.com/gitpod-io/gitpod/supervisor/api"
27+
"github.com/gitpod-io/gitpod/supervisor/pkg/activation"
28+
"github.com/gitpod-io/gitpod/supervisor/pkg/dropwriter"
29+
"github.com/gitpod-io/gitpod/supervisor/pkg/terminal"
30+
)
31+
32+
// if exit error happens one after another within dockerStartErrorBurstDuration then we're in the burst
33+
// and should stop trying after maxBurstDockerStartAttempts
34+
const (
35+
maxBurstDockerStartAttempts = 8
36+
dockerStartErrorBurstDuration = time.Minute * 1
37+
maxIntervalBetweenDockerStart = 15 * time.Second
38+
39+
logsDir = "/workspace/.gitpod/logs"
40+
dockerUpLogFilePath = logsDir + "/docker-up.log"
41+
)
42+
43+
var (
44+
failedForRetryError = errors.New("failed for retry start docker-up")
45+
)
46+
47+
func socketActivationForDocker(parentCtx context.Context, wg *sync.WaitGroup, term *terminal.Mux, cfg *Config, telemetry analytics.Writer, notifications *NotificationService) {
48+
defer wg.Done()
49+
50+
ctx, cancel := context.WithCancel(parentCtx)
51+
defer cancel()
52+
53+
logFile, err := openDockerUpLogFile()
54+
if err != nil {
55+
log.WithError(err).Error("docker-up: cannot open log file")
56+
} else {
57+
defer logFile.Close()
58+
}
59+
60+
var notificationSent bool
61+
var notificationDisabled = cfg.isHeadless() || logFile == nil
62+
notifyDockerUpFail := func() error {
63+
if notificationDisabled || notificationSent {
64+
return nil
65+
}
66+
notificationSent = true
67+
defer func() {
68+
notificationSent = false
69+
}()
70+
openLogs := "Open Logs"
71+
dontShowAgain := "Don't Show Again"
72+
resp, err := notifications.Notify(ctx, &api.NotifyRequest{
73+
Level: api.NotifyRequest_ERROR,
74+
Message: "Docker daemon is failing to start.",
75+
Actions: []string{openLogs, dontShowAgain},
76+
})
77+
if err != nil {
78+
return err
79+
}
80+
telemetry.Track(analytics.TrackMessage{
81+
Identity: analytics.Identity{UserID: cfg.OwnerId},
82+
Event: "gitpod_activate_docker_fail_notification",
83+
Properties: map[string]interface{}{
84+
"instanceId": cfg.WorkspaceInstanceID,
85+
"workspaceId": cfg.WorkspaceID,
86+
"debugWorkspace": cfg.isDebugWorkspace(),
87+
"action": resp.Action,
88+
},
89+
})
90+
if resp.Action == dontShowAgain {
91+
notificationDisabled = true
92+
return nil
93+
}
94+
if resp.Action != openLogs {
95+
return nil
96+
}
97+
gpPath, err := exec.LookPath("gp")
98+
if err != nil {
99+
return err
100+
}
101+
gpCmd := exec.CommandContext(ctx, gpPath, "open", dockerUpLogFilePath)
102+
gpCmd.Env = childProcEnvvars
103+
gpCmd.Stdout = os.Stdout
104+
gpCmd.Stderr = os.Stderr
105+
return gpCmd.Run()
106+
}
107+
108+
var stdout, stderr io.Writer
109+
110+
stdout = os.Stdout
111+
stderr = os.Stderr
112+
if cfg.WorkspaceLogRateLimit > 0 {
113+
limit := int64(cfg.WorkspaceLogRateLimit)
114+
stdout = dropwriter.Writer(stdout, dropwriter.NewBucket(limit*1024*3, limit*1024))
115+
stderr = dropwriter.Writer(stderr, dropwriter.NewBucket(limit*1024*3, limit*1024))
116+
log.WithField("limit_kb_per_sec", limit).Info("docker-up: rate limiting log output")
117+
}
118+
119+
if logFile != nil {
120+
stdout = io.MultiWriter(stdout, logFile)
121+
stderr = io.MultiWriter(stderr, logFile)
122+
}
123+
124+
for ctx.Err() == nil {
125+
err := listenToDockerSocket(ctx, term, cfg, telemetry, logFile, stdout, stderr)
126+
if ctx.Err() != nil {
127+
return
128+
}
129+
130+
if errors.Is(err, failedForRetryError) {
131+
go func() {
132+
notifyErr := notifyDockerUpFail()
133+
if notifyErr != nil && !errors.Is(notifyErr, context.Canceled) {
134+
log.WithError(notifyErr).Error("cannot notify about docker-up failure")
135+
}
136+
}()
137+
}
138+
139+
time.Sleep(1 * time.Second)
140+
}
141+
}
142+
143+
// listenToDockerSocket listens to the docker socket and starts the docker-up process
144+
// if it fails to start with sometimes retry then listener is closed to requests all incoming requests and prevent infinite try
145+
func listenToDockerSocket(parentCtx context.Context, term *terminal.Mux, cfg *Config, telemetry analytics.Writer, logFile *os.File, stdout, stderr io.Writer) error {
146+
ctx, cancel := context.WithCancel(parentCtx)
147+
defer cancel()
148+
149+
fn := "/var/run/docker.sock"
150+
l, err := net.Listen("unix", fn)
151+
if err != nil {
152+
return err
153+
}
154+
155+
go func() {
156+
<-ctx.Done()
157+
l.Close()
158+
}()
159+
160+
_ = os.Chown(fn, gitpodUID, gitpodGID)
161+
162+
var lastExitErrorTime time.Time
163+
burstAttempts := 0
164+
backoffStrategy := backoff.NewExponentialBackOff()
165+
backoffStrategy.MaxInterval = maxIntervalBetweenDockerStart
166+
167+
for ctx.Err() == nil {
168+
err = activation.Listen(ctx, l, func(socketFD *os.File) error {
169+
defer socketFD.Close()
170+
startTime := time.Now()
171+
telemetry.Track(analytics.TrackMessage{
172+
Identity: analytics.Identity{UserID: cfg.OwnerId},
173+
Event: "gitpod_activate_docker",
174+
Properties: map[string]interface{}{
175+
"instanceId": cfg.WorkspaceInstanceID,
176+
"workspaceId": cfg.WorkspaceID,
177+
"debugWorkspace": cfg.isDebugWorkspace(),
178+
},
179+
})
180+
181+
if logFile != nil {
182+
defer fmt.Fprintf(logFile, "\n======= Stop docker-up at %s =======\n", time.Now().Format(time.RFC3339))
183+
fmt.Fprintf(logFile, "======= Start docker-up at %s =======\n", startTime.Format(time.RFC3339))
184+
}
185+
186+
cmd := exec.CommandContext(ctx, "/usr/bin/docker-up")
187+
cmd.Env = append(os.Environ(), "LISTEN_FDS=1")
188+
cmd.ExtraFiles = []*os.File{socketFD}
189+
cmd.Stdout = stdout
190+
cmd.Stderr = stderr
191+
192+
err = cmd.Start()
193+
if err != nil {
194+
return err
195+
}
196+
ptyCtx, cancel := context.WithCancel(ctx)
197+
go func(ptyCtx context.Context) {
198+
select {
199+
case <-ctx.Done():
200+
_ = cmd.Process.Signal(syscall.SIGTERM)
201+
case <-ptyCtx.Done():
202+
}
203+
}(ptyCtx)
204+
err = cmd.Wait()
205+
cancel()
206+
return err
207+
})
208+
209+
if ctx.Err() != nil {
210+
return ctx.Err()
211+
}
212+
213+
var exitError *exec.ExitError
214+
if err != nil {
215+
exitError, _ = err.(*exec.ExitError)
216+
}
217+
if exitError != nil && exitError.ExitCode() > 0 {
218+
// docker-up or daemon exited with an error - we'll try again
219+
// it can be a transient condition like apt is locked by another process to install prerequisites
220+
// or permament like misconfigured env var, image which does not allow to install prerequisites at all
221+
// or any general issue with docker daemon startup
222+
if time.Since(lastExitErrorTime) <= dockerStartErrorBurstDuration {
223+
// we're in the exit error burst
224+
burstAttempts++
225+
} else {
226+
// no burst, reset the counter
227+
burstAttempts = 0
228+
backoffStrategy.Reset()
229+
}
230+
lastExitErrorTime = time.Now()
231+
232+
} else {
233+
// transient condition like a docker daemon get killed because of OOM
234+
burstAttempts = 0
235+
backoffStrategy.Reset()
236+
}
237+
238+
nextBackOff := backoffStrategy.NextBackOff()
239+
240+
if nextBackOff == backoff.Stop || burstAttempts >= maxBurstDockerStartAttempts {
241+
cancel()
242+
log.WithError(err).WithField("attempts", burstAttempts).Error("cannot activate docker after maximum attempts, stopping trying permanently")
243+
if logFile != nil {
244+
fmt.Fprintln(logFile, "Cannot activate docker after maximum attempts, stopping trying permanently.")
245+
fmt.Fprintln(logFile, "Please check logs above, fix the configuration, use `gp validate` to verify change, and commit to apply.")
246+
fmt.Fprintf(logFile, "If it does not help, please reach out to the support. Don't forget to share your workspace ID: %s.\n", cfg.WorkspaceID)
247+
}
248+
return failedForRetryError
249+
}
250+
251+
if err != nil &&
252+
!errors.Is(err, context.Canceled) &&
253+
!process.IsNotChildProcess(err) &&
254+
!strings.Contains(err.Error(), "signal: ") {
255+
// don't log typical transient errors
256+
log.WithError(err).WithField("attempts", burstAttempts).WithField("backoff", nextBackOff.String()).Error("cannot activate docker, retrying...")
257+
}
258+
time.Sleep(nextBackOff)
259+
}
260+
return ctx.Err()
261+
}
262+
263+
func openDockerUpLogFile() (*os.File, error) {
264+
if err := os.MkdirAll(logsDir, 0755); err != nil {
265+
return nil, xerrors.Errorf("cannot create logs dir: %w", err)
266+
}
267+
if err := os.Chown(logsDir, gitpodUID, gitpodGID); err != nil {
268+
return nil, xerrors.Errorf("cannot chown logs dir: %w", err)
269+
}
270+
logFile, err := os.OpenFile(dockerUpLogFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
271+
if err != nil {
272+
return nil, xerrors.Errorf("cannot open docker-up log file: %w", err)
273+
}
274+
275+
if err := os.Chown(dockerUpLogFilePath, gitpodUID, gitpodGID); err != nil {
276+
_ = logFile.Close()
277+
return nil, xerrors.Errorf("cannot chown docker-up log file: %w", err)
278+
}
279+
return logFile, nil
280+
}

0 commit comments

Comments
 (0)