cmd/derpprobe: don't alert for smaller failures.

There is a Cosmic Background level of DERP Unreachability,
with individual nodes or regions becoming unreachable briefly
and returning a short time later. This is due to hosting provider
outages or just the Internet sloshing about.

Returning a 500 error pages a human. Being awoken at 3am for
a transient error is annoying.

For relatively small levels of badness don't page a human,
just post to Slack. If the outage impacts a significant fraction
of the DERP fleet, then page a human.

Signed-off-by: Denton Gentry <dgentry@tailscale.com>
This commit is contained in:
Denton Gentry 2022-02-28 20:13:33 -08:00 committed by Dave Anderson
parent 84a2dc3a7e
commit d8953bf2ba

View File

@ -19,7 +19,9 @@
"log"
"net"
"net/http"
"os"
"sort"
"strings"
"sync"
"time"
@ -54,7 +56,15 @@
func main() {
flag.Parse()
// proactively load the DERP map. Nothing terrible happens if this fails, so we ignore
// the error. The Slack bot will print a notification that the DERP map was empty.
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
_, _ = getDERPMap(ctx)
go probeLoop()
go slackLoop()
log.Fatal(http.ListenAndServe(*listen, http.HandlerFunc(serve)))
}
@ -138,10 +148,14 @@ func getOverallStatus() (o overallStatus) {
func serve(w http.ResponseWriter, r *http.Request) {
st := getOverallStatus()
summary := "All good"
if len(st.bad) > 0 {
if (float64(len(st.bad)) / float64(len(st.bad)+len(st.good))) > 0.25 {
// This will generate an alert and page a human.
// It also ends up in Slack, but as part of the alert handling pipeline not
// because we generated a Slack notification from here.
w.WriteHeader(500)
summary = fmt.Sprintf("%d problems", len(st.bad))
}
io.WriteString(w, "<html><head><style>.bad { font-weight: bold; color: #700; }</style></head>\n")
fmt.Fprintf(w, "<body><h1>derp probe</h1>\n%s:<ul>", summary)
for _, s := range st.bad {
@ -153,6 +167,71 @@ func serve(w http.ResponseWriter, r *http.Request) {
io.WriteString(w, "</ul></body></html>\n")
}
func notifySlack(text string) error {
type SlackRequestBody struct {
Text string `json:"text"`
}
slackBody, err := json.Marshal(SlackRequestBody{Text: text})
if err != nil {
return err
}
webhookUrl := os.Getenv("SLACK_WEBHOOK")
if webhookUrl == "" {
return errors.New("No SLACK_WEBHOOK configured")
}
req, err := http.NewRequest("POST", webhookUrl, bytes.NewReader(slackBody))
if err != nil {
return err
}
req.Header.Add("Content-Type", "application/json")
client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return errors.New(resp.Status)
}
body, _ := io.ReadAll(resp.Body)
if string(body) != "ok" {
return errors.New("Non-ok response returned from Slack")
}
return nil
}
// We only page a human if it looks like there is a significant outage across multiple regions.
// To Slack, we report all failures great and small.
func slackLoop() {
inBadState := false
for {
time.Sleep(time.Second * 30)
st := getOverallStatus()
if len(st.bad) > 0 && !inBadState {
err := notifySlack(strings.Join(st.bad, "\n"))
if err == nil {
inBadState = true
} else {
log.Printf("%d problems, notify Slack failed: %v", len(st.bad), err)
}
}
if len(st.bad) == 0 && inBadState {
err := notifySlack("All DERPs recovered.")
if err == nil {
inBadState = false
}
}
}
}
func sortedRegions(dm *tailcfg.DERPMap) []*tailcfg.DERPRegion {
ret := make([]*tailcfg.DERPRegion, 0, len(dm.Regions))
for _, r := range dm.Regions {