From d8953bf2ba140f5e5fdb46c8c93de5e80ab8e151 Mon Sep 17 00:00:00 2001 From: Denton Gentry Date: Mon, 28 Feb 2022 20:13:33 -0800 Subject: [PATCH] cmd/derpprobe: don't alert for smaller failures. There is a Cosmic Background level of DERP Unreachability, with individual nodes or regions becoming unreachable briefly and returning a short time later. This is due to hosting provider outages or just the Internet sloshing about. Returning a 500 error pages a human. Being awoken at 3am for a transient error is annoying. For relatively small levels of badness don't page a human, just post to Slack. If the outage impacts a significant fraction of the DERP fleet, then page a human. Signed-off-by: Denton Gentry --- cmd/derpprobe/derpprobe.go | 81 +++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/cmd/derpprobe/derpprobe.go b/cmd/derpprobe/derpprobe.go index bf66943c5..ca389aa38 100644 --- a/cmd/derpprobe/derpprobe.go +++ b/cmd/derpprobe/derpprobe.go @@ -19,7 +19,9 @@ import ( "log" "net" "net/http" + "os" "sort" + "strings" "sync" "time" @@ -54,7 +56,15 @@ var ( func main() { flag.Parse() + + // proactively load the DERP map. Nothing terrible happens if this fails, so we ignore + // the error. The Slack bot will print a notification that the DERP map was empty. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + _, _ = getDERPMap(ctx) + go probeLoop() + go slackLoop() log.Fatal(http.ListenAndServe(*listen, http.HandlerFunc(serve))) } @@ -138,10 +148,14 @@ func getOverallStatus() (o overallStatus) { func serve(w http.ResponseWriter, r *http.Request) { st := getOverallStatus() summary := "All good" - if len(st.bad) > 0 { + if (float64(len(st.bad)) / float64(len(st.bad)+len(st.good))) > 0.25 { + // This will generate an alert and page a human. + // It also ends up in Slack, but as part of the alert handling pipeline not + // because we generated a Slack notification from here. w.WriteHeader(500) summary = fmt.Sprintf("%d problems", len(st.bad)) } + io.WriteString(w, "\n") fmt.Fprintf(w, "

derp probe

\n%s:\n") } +func notifySlack(text string) error { + type SlackRequestBody struct { + Text string `json:"text"` + } + + slackBody, err := json.Marshal(SlackRequestBody{Text: text}) + if err != nil { + return err + } + + webhookUrl := os.Getenv("SLACK_WEBHOOK") + if webhookUrl == "" { + return errors.New("No SLACK_WEBHOOK configured") + } + + req, err := http.NewRequest("POST", webhookUrl, bytes.NewReader(slackBody)) + if err != nil { + return err + } + req.Header.Add("Content-Type", "application/json") + + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + return errors.New(resp.Status) + } + + body, _ := io.ReadAll(resp.Body) + if string(body) != "ok" { + return errors.New("Non-ok response returned from Slack") + } + return nil +} + +// We only page a human if it looks like there is a significant outage across multiple regions. +// To Slack, we report all failures great and small. +func slackLoop() { + inBadState := false + for { + time.Sleep(time.Second * 30) + st := getOverallStatus() + + if len(st.bad) > 0 && !inBadState { + err := notifySlack(strings.Join(st.bad, "\n")) + if err == nil { + inBadState = true + } else { + log.Printf("%d problems, notify Slack failed: %v", len(st.bad), err) + } + } + + if len(st.bad) == 0 && inBadState { + err := notifySlack("All DERPs recovered.") + if err == nil { + inBadState = false + } + } + } +} + func sortedRegions(dm *tailcfg.DERPMap) []*tailcfg.DERPRegion { ret := make([]*tailcfg.DERPRegion, 0, len(dm.Regions)) for _, r := range dm.Regions {