mirror of
https://github.com/tailscale/tailscale.git
synced 2025-04-21 06:01:42 +00:00
cmd/derpprobe: don't alert for smaller failures.
There is a Cosmic Background level of DERP Unreachability: individual nodes or regions become unreachable briefly and return a short time later, because of hosting provider outages or just the Internet sloshing about. When the probe's status page returns a 500 error, a human gets paged, and being awoken at 3am for a transient error is annoying. For relatively small levels of badness, don't page a human; just post to Slack. If the outage impacts a significant fraction of the DERP fleet (more than 25% of probes failing), then page a human. Signed-off-by: Denton Gentry <dgentry@tailscale.com>
This commit is contained in:
parent
84a2dc3a7e
commit
d8953bf2ba
@ -19,7 +19,9 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"os"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -54,7 +56,15 @@ var (
|
|||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
||||||
|
// proactively load the DERP map. Nothing terrible happens if this fails, so we ignore
|
||||||
|
// the error. The Slack bot will print a notification that the DERP map was empty.
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_, _ = getDERPMap(ctx)
|
||||||
|
|
||||||
go probeLoop()
|
go probeLoop()
|
||||||
|
go slackLoop()
|
||||||
log.Fatal(http.ListenAndServe(*listen, http.HandlerFunc(serve)))
|
log.Fatal(http.ListenAndServe(*listen, http.HandlerFunc(serve)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -138,10 +148,14 @@ func getOverallStatus() (o overallStatus) {
|
|||||||
func serve(w http.ResponseWriter, r *http.Request) {
|
func serve(w http.ResponseWriter, r *http.Request) {
|
||||||
st := getOverallStatus()
|
st := getOverallStatus()
|
||||||
summary := "All good"
|
summary := "All good"
|
||||||
if len(st.bad) > 0 {
|
if (float64(len(st.bad)) / float64(len(st.bad)+len(st.good))) > 0.25 {
|
||||||
|
// This will generate an alert and page a human.
|
||||||
|
// It also ends up in Slack, but as part of the alert handling pipeline not
|
||||||
|
// because we generated a Slack notification from here.
|
||||||
w.WriteHeader(500)
|
w.WriteHeader(500)
|
||||||
summary = fmt.Sprintf("%d problems", len(st.bad))
|
summary = fmt.Sprintf("%d problems", len(st.bad))
|
||||||
}
|
}
|
||||||
|
|
||||||
io.WriteString(w, "<html><head><style>.bad { font-weight: bold; color: #700; }</style></head>\n")
|
io.WriteString(w, "<html><head><style>.bad { font-weight: bold; color: #700; }</style></head>\n")
|
||||||
fmt.Fprintf(w, "<body><h1>derp probe</h1>\n%s:<ul>", summary)
|
fmt.Fprintf(w, "<body><h1>derp probe</h1>\n%s:<ul>", summary)
|
||||||
for _, s := range st.bad {
|
for _, s := range st.bad {
|
||||||
@ -153,6 +167,71 @@ func serve(w http.ResponseWriter, r *http.Request) {
|
|||||||
io.WriteString(w, "</ul></body></html>\n")
|
io.WriteString(w, "</ul></body></html>\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// notifySlack posts text as a JSON {"text": ...} message to the Slack webhook
// whose URL is read from the SLACK_WEBHOOK environment variable.
//
// It returns a non-nil error when:
//   - SLACK_WEBHOOK is unset or empty,
//   - the request cannot be built or sent (the client gives up after 10s),
//   - the response status is anything other than 200, or
//   - the response body cannot be read or is not exactly "ok".
func notifySlack(text string) error {
	// Check configuration first so an unconfigured probe does no further work.
	webhookURL := os.Getenv("SLACK_WEBHOOK")
	if webhookURL == "" {
		return errors.New("No SLACK_WEBHOOK configured")
	}

	type slackRequestBody struct {
		Text string `json:"text"`
	}
	slackBody, err := json.Marshal(slackRequestBody{Text: text})
	if err != nil {
		return err
	}

	req, err := http.NewRequest(http.MethodPost, webhookURL, bytes.NewReader(slackBody))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return errors.New(resp.Status)
	}

	// Only a body of exactly "ok" counts as success, so there is no reason to
	// buffer more than a small, bounded amount of whatever the endpoint sends.
	body, err := io.ReadAll(io.LimitReader(resp.Body, 1<<10))
	if err != nil {
		return fmt.Errorf("reading Slack response: %w", err)
	}
	if string(body) != "ok" {
		return errors.New("Non-ok response returned from Slack")
	}
	return nil
}
|
||||||
|
|
||||||
|
// We only page a human if it looks like there is a significant outage across multiple regions.
|
||||||
|
// To Slack, we report all failures great and small.
|
||||||
|
func slackLoop() {
|
||||||
|
inBadState := false
|
||||||
|
for {
|
||||||
|
time.Sleep(time.Second * 30)
|
||||||
|
st := getOverallStatus()
|
||||||
|
|
||||||
|
if len(st.bad) > 0 && !inBadState {
|
||||||
|
err := notifySlack(strings.Join(st.bad, "\n"))
|
||||||
|
if err == nil {
|
||||||
|
inBadState = true
|
||||||
|
} else {
|
||||||
|
log.Printf("%d problems, notify Slack failed: %v", len(st.bad), err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(st.bad) == 0 && inBadState {
|
||||||
|
err := notifySlack("All DERPs recovered.")
|
||||||
|
if err == nil {
|
||||||
|
inBadState = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func sortedRegions(dm *tailcfg.DERPMap) []*tailcfg.DERPRegion {
|
func sortedRegions(dm *tailcfg.DERPMap) []*tailcfg.DERPRegion {
|
||||||
ret := make([]*tailcfg.DERPRegion, 0, len(dm.Regions))
|
ret := make([]*tailcfg.DERPRegion, 0, len(dm.Regions))
|
||||||
for _, r := range dm.Regions {
|
for _, r := range dm.Regions {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user