mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-29 04:55:31 +00:00
logtail: be less aggressive about re-uploads (#8117)
The retry logic was pathological in the following ways: * If we restarted the logging service, any pending uploads would be placed in a retry-loop where it depended on backoff.Backoff, which was too aggresive. It would retry failures within milliseconds, taking at least 10 retries to hit a delay of 1 second. * In the event where a logstream was rate limited, the aggressive retry logic would severely exacerbate the problem since each retry would also log an error message. It is by chance that the rate of log error spam does not happen to exceed the rate limit itself. We modify the retry logic in the following ways: * We now respect the "Retry-After" header sent by the logging service. * Lacking a "Retry-After" header, we retry after a hard-coded period of 30 to 60 seconds. This avoids the thundering-herd effect when all nodes try reconnecting to the logging service at the same time after a restart. * We do not treat a status 400 as having been uploaded. This is simply not the behavior of the logging service. Updates #tailscale/corp#11213 Signed-off-by: Joe Tsai <joetsai@digital-static.net>
This commit is contained in:
parent
da90fab899
commit
84c99fe0d9
@ -283,7 +283,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
|
||||
tailscale.com/tka from tailscale.com/ipn/ipnlocal+
|
||||
W tailscale.com/tsconst from tailscale.com/net/interfaces
|
||||
tailscale.com/tsd from tailscale.com/cmd/tailscaled+
|
||||
tailscale.com/tstime from tailscale.com/wgengine/magicsock
|
||||
tailscale.com/tstime from tailscale.com/wgengine/magicsock+
|
||||
💣 tailscale.com/tstime/mono from tailscale.com/net/tstun+
|
||||
tailscale.com/tstime/rate from tailscale.com/wgengine/filter+
|
||||
tailscale.com/tsweb/varz from tailscale.com/cmd/tailscaled
|
||||
|
@ -13,19 +13,19 @@
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
mrand "math/rand"
|
||||
"net/http"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"tailscale.com/envknob"
|
||||
"tailscale.com/logtail/backoff"
|
||||
"tailscale.com/net/interfaces"
|
||||
"tailscale.com/net/netmon"
|
||||
"tailscale.com/net/sockstats"
|
||||
"tailscale.com/tstime"
|
||||
tslogger "tailscale.com/types/logger"
|
||||
"tailscale.com/types/logid"
|
||||
"tailscale.com/util/set"
|
||||
@ -128,9 +128,6 @@ func NewLogger(cfg Config, logf tslogger.Logf) *Logger {
|
||||
cfg.FlushDelayFn = func() time.Duration { return 0 }
|
||||
}
|
||||
|
||||
stdLogf := func(f string, a ...any) {
|
||||
fmt.Fprintf(cfg.Stderr, strings.TrimSuffix(f, "\n")+"\n", a...)
|
||||
}
|
||||
var urlSuffix string
|
||||
if !cfg.CopyPrivateID.IsZero() {
|
||||
urlSuffix = "?copyId=" + cfg.CopyPrivateID.String()
|
||||
@ -148,7 +145,6 @@ func NewLogger(cfg Config, logf tslogger.Logf) *Logger {
|
||||
sentinel: make(chan int32, 16),
|
||||
flushDelayFn: cfg.FlushDelayFn,
|
||||
timeNow: cfg.TimeNow,
|
||||
bo: backoff.NewBackoff("logtail", stdLogf, 30*time.Second),
|
||||
metricsDelta: cfg.MetricsDelta,
|
||||
sockstatsLabel: sockstats.LabelLogtailLogger,
|
||||
|
||||
@ -186,7 +182,6 @@ type Logger struct {
|
||||
flushPending atomic.Bool
|
||||
sentinel chan int32
|
||||
timeNow func() time.Time
|
||||
bo *backoff.Backoff
|
||||
zstdEncoder Encoder
|
||||
uploadCancel func()
|
||||
explainedRaw bool
|
||||
@ -373,23 +368,38 @@ func (l *Logger) uploading(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
for len(body) > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
uploaded, err := l.upload(ctx, body, origlen)
|
||||
var lastError string
|
||||
var numFailures int
|
||||
var firstFailure time.Time
|
||||
for len(body) > 0 && ctx.Err() == nil {
|
||||
retryAfter, err := l.upload(ctx, body, origlen)
|
||||
if err != nil {
|
||||
numFailures++
|
||||
firstFailure = time.Now()
|
||||
|
||||
if !l.internetUp() {
|
||||
fmt.Fprintf(l.stderr, "logtail: internet down; waiting\n")
|
||||
l.awaitInternetUp(ctx)
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(l.stderr, "logtail: upload: %v\n", err)
|
||||
}
|
||||
l.bo.BackOff(ctx, err)
|
||||
if uploaded {
|
||||
|
||||
// Only print the same message once.
|
||||
if currError := err.Error(); lastError != currError {
|
||||
fmt.Fprintf(l.stderr, "logtail: upload: %v\n", err)
|
||||
lastError = currError
|
||||
}
|
||||
|
||||
// Sleep for the specified retryAfter period,
|
||||
// otherwise default to some random value.
|
||||
if retryAfter <= 0 {
|
||||
retryAfter = time.Duration(30+mrand.Intn(30)) * time.Second
|
||||
}
|
||||
tstime.Sleep(ctx, retryAfter)
|
||||
} else {
|
||||
// Only print a success message after recovery.
|
||||
if numFailures > 0 {
|
||||
fmt.Fprintf(l.stderr, "logtail: upload succeeded after %d failures and %s\n", numFailures, time.Since(firstFailure).Round(time.Second))
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
@ -433,7 +443,7 @@ func (l *Logger) awaitInternetUp(ctx context.Context) {
|
||||
// upload uploads body to the log server.
|
||||
// origlen indicates the pre-compression body length.
|
||||
// origlen of -1 indicates that the body is not compressed.
|
||||
func (l *Logger) upload(ctx context.Context, body []byte, origlen int) (uploaded bool, err error) {
|
||||
func (l *Logger) upload(ctx context.Context, body []byte, origlen int) (retryAfter time.Duration, err error) {
|
||||
const maxUploadTime = 45 * time.Second
|
||||
ctx = sockstats.WithSockStats(ctx, l.sockstatsLabel, l.Logf)
|
||||
ctx, cancel := context.WithTimeout(ctx, maxUploadTime)
|
||||
@ -460,17 +470,16 @@ func (l *Logger) upload(ctx context.Context, body []byte, origlen int) (uploaded
|
||||
l.httpDoCalls.Add(1)
|
||||
resp, err := l.httpc.Do(req)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("log upload of %d bytes %s failed: %v", len(body), compressedNote, err)
|
||||
return 0, fmt.Errorf("log upload of %d bytes %s failed: %v", len(body), compressedNote, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
uploaded = resp.StatusCode == 400 // the server saved the logs anyway
|
||||
b, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
|
||||
return uploaded, fmt.Errorf("log upload of %d bytes %s failed %d: %q", len(body), compressedNote, resp.StatusCode, b)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
n, _ := strconv.Atoi(resp.Header.Get("Retry-After"))
|
||||
b, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<10))
|
||||
return time.Duration(n) * time.Second, fmt.Errorf("log upload of %d bytes %s failed %d: %s", len(body), compressedNote, resp.StatusCode, bytes.TrimSpace(b))
|
||||
}
|
||||
|
||||
return true, nil
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// Flush uploads all logs to the server. It blocks until complete or there is an
|
||||
|
Loading…
Reference in New Issue
Block a user