mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-29 04:55:31 +00:00
cmd/derpprobe: don't alert for smaller failures.
There is a Cosmic Background level of DERP Unreachability, with individual nodes or regions becoming unreachable briefly and returning a short time later. This is due to hosting provider outages or just the Internet sloshing about. Returning a 500 error pages a human. Being awoken at 3am for a transient error is annoying. For relatively small levels of badness don't page a human, just post to Slack. If the outage impacts a significant fraction of the DERP fleet, then page a human. Signed-off-by: Denton Gentry <dgentry@tailscale.com>
This commit is contained in:
parent
84a2dc3a7e
commit
d8953bf2ba
@ -19,7 +19,9 @@
|
||||
"log"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -54,7 +56,15 @@
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
// proactively load the DERP map. Nothing terrible happens if this fails, so we ignore
|
||||
// the error. The Slack bot will print a notification that the DERP map was empty.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
_, _ = getDERPMap(ctx)
|
||||
|
||||
go probeLoop()
|
||||
go slackLoop()
|
||||
log.Fatal(http.ListenAndServe(*listen, http.HandlerFunc(serve)))
|
||||
}
|
||||
|
||||
@ -138,10 +148,14 @@ func getOverallStatus() (o overallStatus) {
|
||||
func serve(w http.ResponseWriter, r *http.Request) {
|
||||
st := getOverallStatus()
|
||||
summary := "All good"
|
||||
if len(st.bad) > 0 {
|
||||
if (float64(len(st.bad)) / float64(len(st.bad)+len(st.good))) > 0.25 {
|
||||
// This will generate an alert and page a human.
|
||||
// It also ends up in Slack, but as part of the alert handling pipeline not
|
||||
// because we generated a Slack notification from here.
|
||||
w.WriteHeader(500)
|
||||
summary = fmt.Sprintf("%d problems", len(st.bad))
|
||||
}
|
||||
|
||||
io.WriteString(w, "<html><head><style>.bad { font-weight: bold; color: #700; }</style></head>\n")
|
||||
fmt.Fprintf(w, "<body><h1>derp probe</h1>\n%s:<ul>", summary)
|
||||
for _, s := range st.bad {
|
||||
@ -153,6 +167,71 @@ func serve(w http.ResponseWriter, r *http.Request) {
|
||||
io.WriteString(w, "</ul></body></html>\n")
|
||||
}
|
||||
|
||||
func notifySlack(text string) error {
|
||||
type SlackRequestBody struct {
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
slackBody, err := json.Marshal(SlackRequestBody{Text: text})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
webhookUrl := os.Getenv("SLACK_WEBHOOK")
|
||||
if webhookUrl == "" {
|
||||
return errors.New("No SLACK_WEBHOOK configured")
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", webhookUrl, bytes.NewReader(slackBody))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Add("Content-Type", "application/json")
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return errors.New(resp.Status)
|
||||
}
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if string(body) != "ok" {
|
||||
return errors.New("Non-ok response returned from Slack")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// We only page a human if it looks like there is a significant outage across multiple regions.
|
||||
// To Slack, we report all failures great and small.
|
||||
func slackLoop() {
|
||||
inBadState := false
|
||||
for {
|
||||
time.Sleep(time.Second * 30)
|
||||
st := getOverallStatus()
|
||||
|
||||
if len(st.bad) > 0 && !inBadState {
|
||||
err := notifySlack(strings.Join(st.bad, "\n"))
|
||||
if err == nil {
|
||||
inBadState = true
|
||||
} else {
|
||||
log.Printf("%d problems, notify Slack failed: %v", len(st.bad), err)
|
||||
}
|
||||
}
|
||||
|
||||
if len(st.bad) == 0 && inBadState {
|
||||
err := notifySlack("All DERPs recovered.")
|
||||
if err == nil {
|
||||
inBadState = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sortedRegions(dm *tailcfg.DERPMap) []*tailcfg.DERPRegion {
|
||||
ret := make([]*tailcfg.DERPRegion, 0, len(dm.Regions))
|
||||
for _, r := range dm.Regions {
|
||||
|
Loading…
Reference in New Issue
Block a user