Compare commits

...

3 Commits

Author SHA1 Message Date
Kristoffer Dalby
c8ebbede54 Simplify map session management (#1931)
This PR removes the complicated session management introduced in https://github.com/juanfont/headscale/pull/1791, which kept track of the sessions in a map in addition to the channels already tracked in the notifier.

Instead of trying to close the mapsession, it is now replaced by the new one and closed afterwards, so all new updates go to the right place.

The map session serve function is also split into a streaming and a non-streaming version for better readability.

RemoveNode in the notifier will not remove a node if the channel does not match the one that was passed in (e.g. because it has been replaced with a new one).

A new tuning parameter has been added to set the timeout before the notifier gives up on sending an update to a node.

A keep-alive resetter has been added so that sending keep-alives is delayed when a node has just received an update; a minimal sketch of the idea follows below.
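The resetter itself lives in the streaming map session code, which is not among the hunks shown further down, so here is a minimal, hypothetical Go sketch of the idea only; the type and method names are illustrative and not headscale's actual API:

```go
package keepalive

import "time"

// resettableKeepAlive fires on C() when a keep-alive should be sent.
// Calling Reset after a real update postpones the next keep-alive.
type resettableKeepAlive struct {
	interval time.Duration
	timer    *time.Timer
}

func newResettableKeepAlive(interval time.Duration) *resettableKeepAlive {
	return &resettableKeepAlive{interval: interval, timer: time.NewTimer(interval)}
}

// Reset is called right after an update has been written to the node,
// so the node is not immediately pinged again with a keep-alive.
func (k *resettableKeepAlive) Reset() {
	if !k.timer.Stop() {
		// Drain the channel if the timer already fired.
		select {
		case <-k.timer.C:
		default:
		}
	}
	k.timer.Reset(k.interval)
}

// C exposes the underlying timer channel; the streaming loop selects on it.
func (k *resettableKeepAlive) C() <-chan time.Time {
	return k.timer.C
}
```

The streaming loop would select on C() alongside the update channel and call Reset() right after writing a map response, so a keep-alive only fires after a full interval of silence.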

In addition it adds a number of debug environment flags that can be set (see the sketch after this list):

- `HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS`: make certain metrics include a per-`node.id` label; not recommended in production.
- `HEADSCALE_DEBUG_PROFILING_ENABLED`: enable profiling
- `HEADSCALE_DEBUG_PROFILING_PATH`: where to store profiles
- `HEADSCALE_DEBUG_DUMP_CONFIG`: calls `spew.Dump` on the config object at startup
- `HEADSCALE_DEBUG_DEADLOCK`: enable go-deadlock to dump goroutines if it looks like a deadlock has occurred; enabled in integration tests.
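These knobs are read once at process start with `tailscale.com/envknob`, as the hscontrol and notifier diffs further down show. A minimal, self-contained sketch of that pattern (the variable names here are illustrative, not headscale's):

```go
package main

import (
	"fmt"

	"tailscale.com/envknob"
)

// Debug knobs are read once at startup, mirroring the envknob pattern
// used in the hscontrol and notifier diffs below.
var (
	debugHighCardinalityMetrics = envknob.Bool("HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS")
	profilingEnabled            = envknob.Bool("HEADSCALE_DEBUG_PROFILING_ENABLED")
	profilingPath               = envknob.String("HEADSCALE_DEBUG_PROFILING_PATH")
	dumpConfig                  = envknob.Bool("HEADSCALE_DEBUG_DUMP_CONFIG")
	debugDeadlock               = envknob.Bool("HEADSCALE_DEBUG_DEADLOCK")
)

func main() {
	fmt.Println("high-cardinality metrics:", debugHighCardinalityMetrics)
	fmt.Println("profiling enabled:", profilingEnabled, "path:", profilingPath)
	fmt.Println("dump config:", dumpConfig)
	fmt.Println("go-deadlock enabled:", debugDeadlock)
}
```

Set any of them in the environment before starting the process, e.g. by exporting `HEADSCALE_DEBUG_DUMP_CONFIG=true`.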

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
2024-05-24 10:15:34 +02:00
Stefan Majer
8185a70dc7 Fix typos (#1860)
* Fix typos

* trigger GitHub actions

* remove kdiff3 orig files

* fix unicode

* remove unnecessary function call

* remove unnecessary comment

* remove unnecessary comment

---------

Co-authored-by: ohdearaugustin <ohdearaugustin@users.noreply.github.com>
2024-05-19 23:49:27 +02:00
ohdearaugustin
2dc62e981e move debug image to distroless (#1950) 2024-05-19 11:17:37 +02:00
34 changed files with 469 additions and 328 deletions

View File

@@ -135,7 +135,7 @@ kos:
 - id: ghcr-debug
   repository: ghcr.io/juanfont/headscale
   bare: true
-  base_image: "debian:12"
+  base_image: gcr.io/distroless/base-debian12:debug
   build: headscale
   main: ./cmd/headscale
   env:
@@ -160,7 +160,7 @@ kos:
 - id: dockerhub-debug
   build: headscale
-  base_image: "debian:12"
+  base_image: gcr.io/distroless/base-debian12:debug
   repository: headscale/headscale
   bare: true
   platforms:

View File

@@ -26,7 +26,7 @@ after improving the test harness as part of adopting [#1460](https://github.com/
 - Code reorganisation, a lot of code has moved, please review the following PRs accordingly [#1473](https://github.com/juanfont/headscale/pull/1473)
 - Change the structure of database configuration, see [config-example.yaml](./config-example.yaml) for the new structure. [#1700](https://github.com/juanfont/headscale/pull/1700)
   - Old structure has been remove and the configuration _must_ be converted.
-  - Adds additional configuration for PostgreSQL for setting max open, idle conection and idle connection lifetime.
+  - Adds additional configuration for PostgreSQL for setting max open, idle connection and idle connection lifetime.
 - API: Machine is now Node [#1553](https://github.com/juanfont/headscale/pull/1553)
 - Remove support for older Tailscale clients [#1611](https://github.com/juanfont/headscale/pull/1611)
   - The latest supported client is 1.38
@@ -70,7 +70,7 @@ after improving the test harness as part of adopting [#1460](https://github.com/
 ### Changes
 - Add environment flags to enable pprof (profiling) [#1382](https://github.com/juanfont/headscale/pull/1382)
-  - Profiles are continously generated in our integration tests.
+  - Profiles are continuously generated in our integration tests.
 - Fix systemd service file location in `.deb` packages [#1391](https://github.com/juanfont/headscale/pull/1391)
 - Improvements on Noise implementation [#1379](https://github.com/juanfont/headscale/pull/1379)
 - Replace node filter logic, ensuring nodes with access can see eachother [#1381](https://github.com/juanfont/headscale/pull/1381)
@@ -161,7 +161,7 @@ after improving the test harness as part of adopting [#1460](https://github.com/
 - SSH ACLs status:
   - Support `accept` and `check` (SSH can be enabled and used for connecting and authentication)
   - Rejecting connections **are not supported**, meaning that if you enable SSH, then assume that _all_ `ssh` connections **will be allowed**.
-    - If you decied to try this feature, please carefully managed permissions by blocking port `22` with regular ACLs or do _not_ set `--ssh` on your clients.
+    - If you decided to try this feature, please carefully managed permissions by blocking port `22` with regular ACLs or do _not_ set `--ssh` on your clients.
   - We are currently improving our testing of the SSH ACLs, help us get an overview by testing and giving feedback.
   - This feature should be considered dangerous and it is disabled by default. Enable by setting `HEADSCALE_EXPERIMENTAL_FEATURE_SSH=1`.
@@ -211,7 +211,7 @@ after improving the test harness as part of adopting [#1460](https://github.com/
 ### Changes
 - Updated dependencies (including the library that lacked armhf support) [#722](https://github.com/juanfont/headscale/pull/722)
-- Fix missing group expansion in function `excludeCorretlyTaggedNodes` [#563](https://github.com/juanfont/headscale/issues/563)
+- Fix missing group expansion in function `excludeCorrectlyTaggedNodes` [#563](https://github.com/juanfont/headscale/issues/563)
 - Improve registration protocol implementation and switch to NodeKey as main identifier [#725](https://github.com/juanfont/headscale/pull/725)
 - Add ability to connect to PostgreSQL via unix socket [#734](https://github.com/juanfont/headscale/pull/734)
@@ -231,7 +231,7 @@ after improving the test harness as part of adopting [#1460](https://github.com/
 - Fix send on closed channel crash in polling [#542](https://github.com/juanfont/headscale/pull/542)
 - Fixed spurious calls to setLastStateChangeToNow from ephemeral nodes [#566](https://github.com/juanfont/headscale/pull/566)
 - Add command for moving nodes between namespaces [#362](https://github.com/juanfont/headscale/issues/362)
-- Added more configuration parameters for OpenID Connect (scopes, free-form paramters, domain and user allowlist)
+- Added more configuration parameters for OpenID Connect (scopes, free-form parameters, domain and user allowlist)
 - Add command to set tags on a node [#525](https://github.com/juanfont/headscale/issues/525)
 - Add command to view tags of nodes [#356](https://github.com/juanfont/headscale/issues/356)
 - Add --all (-a) flag to enable routes command [#360](https://github.com/juanfont/headscale/issues/360)
@@ -279,10 +279,10 @@ after improving the test harness as part of adopting [#1460](https://github.com/
 - Fix a bug were the same IP could be assigned to multiple hosts if joined in quick succession [#346](https://github.com/juanfont/headscale/pull/346)
 - Simplify the code behind registration of machines [#366](https://github.com/juanfont/headscale/pull/366)
-  - Nodes are now only written to database if they are registrated successfully
+  - Nodes are now only written to database if they are registered successfully
 - Fix a limitation in the ACLs that prevented users to write rules with `*` as source [#374](https://github.com/juanfont/headscale/issues/374)
 - Reduce the overhead of marshal/unmarshal for Hostinfo, routes and endpoints by using specific types in Machine [#371](https://github.com/juanfont/headscale/pull/371)
-- Apply normalization function to FQDN on hostnames when hosts registers and retrieve informations [#363](https://github.com/juanfont/headscale/issues/363)
+- Apply normalization function to FQDN on hostnames when hosts registers and retrieve information [#363](https://github.com/juanfont/headscale/issues/363)
 - Fix a bug that prevented the use of `tailscale logout` with OIDC [#508](https://github.com/juanfont/headscale/issues/508)
 - Added Tailscale repo HEAD and unstable releases channel to the integration tests targets [#513](https://github.com/juanfont/headscale/pull/513)

View File

@@ -1,21 +1,43 @@
-# This Dockerfile and the images produced are for testing headscale,
-# and are in no way endorsed by Headscale's maintainers as an
-# official nor supported release or distribution.
-
-FROM golang:latest
-
-RUN apt-get update \
-    && apt-get install -y dnsutils git iptables ssh ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN useradd --shell=/bin/bash --create-home ssh-it-user
-
+# Copyright (c) Tailscale Inc & AUTHORS
+# SPDX-License-Identifier: BSD-3-Clause
+
+# This Dockerfile is more or less lifted from tailscale/tailscale
+# to ensure a similar build process when testing the HEAD of tailscale.
+
+FROM golang:1.22-alpine AS build-env
+
+WORKDIR /go/src
+
+RUN apk add --no-cache git
+
+# Replace `RUN git...` with `COPY` and a local checked out version of Tailscale in `./tailscale`
+# to test specific commits of the Tailscale client. This is useful when trying to find out why
+# something specific broke between two versions of Tailscale with for example `git bisect`.
+# COPY ./tailscale .
 RUN git clone https://github.com/tailscale/tailscale.git
-
-WORKDIR /go/tailscale
-
-RUN git checkout main \
-    && sh build_dist.sh tailscale.com/cmd/tailscale \
-    && sh build_dist.sh tailscale.com/cmd/tailscaled \
-    && cp tailscale /usr/local/bin/ \
-    && cp tailscaled /usr/local/bin/
+
+WORKDIR /go/src/tailscale
+
+# see build_docker.sh
+ARG VERSION_LONG=""
+ENV VERSION_LONG=$VERSION_LONG
+ARG VERSION_SHORT=""
+ENV VERSION_SHORT=$VERSION_SHORT
+ARG VERSION_GIT_HASH=""
+ENV VERSION_GIT_HASH=$VERSION_GIT_HASH
+ARG TARGETARCH
+
+RUN GOARCH=$TARGETARCH go install -ldflags="\
+      -X tailscale.com/version.longStamp=$VERSION_LONG \
+      -X tailscale.com/version.shortStamp=$VERSION_SHORT \
+      -X tailscale.com/version.gitCommitStamp=$VERSION_GIT_HASH" \
+      -v ./cmd/tailscale ./cmd/tailscaled ./cmd/containerboot
+
+FROM alpine:3.18
+RUN apk add --no-cache ca-certificates iptables iproute2 ip6tables curl
+
+COPY --from=build-env /go/bin/* /usr/local/bin/
+# For compat with the previous run.sh, although ideally you should be
+# using build_docker.sh which sets an entrypoint for the image.
+RUN mkdir /tailscale && ln -s /usr/local/bin/containerboot /tailscale/run.sh

View File

@@ -99,7 +99,7 @@ Please read the [CONTRIBUTING.md](./CONTRIBUTING.md) file.
 ### Requirements

-To contribute to headscale you would need the lastest version of [Go](https://golang.org)
+To contribute to headscale you would need the latest version of [Go](https://golang.org)
 and [Buf](https://buf.build)(Protobuf generator).

 We recommend using [Nix](https://nixos.org/) to setup a development environment. This can

View File

@@ -105,7 +105,7 @@ derp:
 automatically_add_embedded_derp_region: true
 # For better connection stability (especially when using an Exit-Node and DNS is not working),
-# it is possible to optionall add the public IPv4 and IPv6 address to the Derp-Map using:
+# it is possible to optionally add the public IPv4 and IPv6 address to the Derp-Map using:
 ipv4: 1.2.3.4
 ipv6: 2001:db8::1
@@ -199,7 +199,7 @@ log:
 format: text
 level: info
-# Path to a file containg ACL policies.
+# Path to a file containing ACL policies.
 # ACLs can be defined as YAML or HUJSON.
 # https://tailscale.com/kb/1018/acls/
 acl_policy_path: ""

View File

@@ -14,7 +14,7 @@ If the node is already registered, it can advertise exit capabilities like this:
 $ sudo tailscale set --advertise-exit-node
 ```

-To use a node as an exit node, IP forwarding must be enabled on the node. Check the official [Tailscale documentation](https://tailscale.com/kb/1019/subnets/?tab=linux#enable-ip-forwarding) for how to enable IP fowarding.
+To use a node as an exit node, IP forwarding must be enabled on the node. Check the official [Tailscale documentation](https://tailscale.com/kb/1019/subnets/?tab=linux#enable-ip-forwarding) for how to enable IP forwarding.

 ## On the control server

View File

@@ -36,7 +36,7 @@ We don't know. We might be working on it. If you want to help, please send us a
 Please be aware that there are a number of reasons why we might not accept specific contributions:

 - It is not possible to implement the feature in a way that makes sense in a self-hosted environment.
-- Given that we are reverse-engineering Tailscale to satify our own curiosity, we might be interested in implementing the feature ourselves.
+- Given that we are reverse-engineering Tailscale to satisfy our own curiosity, we might be interested in implementing the feature ourselves.
 - You are not sending unit and integration tests with it.

 ## Do you support Y method of deploying Headscale?

View File

@@ -58,12 +58,12 @@ A solution could be to consider a headscale server (in it's entirety) as a
 tailnet.

 For personal users the default behavior could either allow all communications
-between all namespaces (like tailscale) or dissallow all communications between
+between all namespaces (like tailscale) or disallow all communications between
 namespaces (current behavior).

 For businesses and organisations, viewing a headscale instance a single tailnet
 would allow users (namespace) to talk to each other with the ACLs. As described
-in tailscale's documentation [[1]], a server should be tagged and personnal
+in tailscale's documentation [[1]], a server should be tagged and personal
 devices should be tied to a user. Translated in headscale's terms each user can
 have multiple devices and all those devices should be in the same namespace.
 The servers should be tagged and used as such.
@@ -88,7 +88,7 @@ the ability to rules in either format (HuJSON or YAML).
 Let's build an example use case for a small business (It may be the place where
 ACL's are the most useful).

-We have a small company with a boss, an admin, two developper and an intern.
+We have a small company with a boss, an admin, two developer and an intern.
 The boss should have access to all servers but not to the users hosts. Admin
 should also have access to all hosts except that their permissions should be
@@ -173,7 +173,7 @@ need to add the following ACLs
     "ports": ["prod:*", "dev:*", "internal:*"]
   },
-  // admin have access to adminstration port (lets only consider port 22 here)
+  // admin have access to administration port (lets only consider port 22 here)
   {
     "action": "accept",
     "users": ["group:admin"],

View File

@@ -1,13 +1,13 @@
 # Controlling `headscale` with remote CLI

-## Prerequisit
+## Prerequisite

 - A workstation to run `headscale` (could be Linux, macOS, other supported platforms)
 - A `headscale` server (version `0.13.0` or newer)
 - Access to create API keys (local access to the `headscale` server)
 - `headscale` _must_ be served over TLS/HTTPS
   - Remote access does _not_ support unencrypted traffic.
-- Port `50443` must be open in the firewall (or port overriden by `grpc_listen_addr` option)
+- Port `50443` must be open in the firewall (or port overridden by `grpc_listen_addr` option)

 ## Goal
@@ -97,4 +97,4 @@ Checklist:
 - Make sure you use version `0.13.0` or newer.
 - Verify that your TLS certificate is valid and trusted
 - If you do not have access to a trusted certificate (e.g. from Let's Encrypt), add your self signed certificate to the trust store of your OS or
-  - Set `HEADSCALE_CLI_INSECURE` to 0 in your environement
+  - Set `HEADSCALE_CLI_INSECURE` to 0 in your environment

View File

@@ -115,7 +115,7 @@ The following Caddyfile is all that is necessary to use Caddy as a reverse proxy
 }
 ```

-Caddy v2 will [automatically](https://caddyserver.com/docs/automatic-https) provision a certficate for your domain/subdomain, force HTTPS, and proxy websockets - no further configuration is necessary.
+Caddy v2 will [automatically](https://caddyserver.com/docs/automatic-https) provision a certificate for your domain/subdomain, force HTTPS, and proxy websockets - no further configuration is necessary.

 For a slightly more complex configuration which utilizes Docker containers to manage Caddy, Headscale, and Headscale-UI, [Guru Computing's guide](https://blog.gurucomputing.com.au/smart-vpns-with-headscale/) is an excellent reference.

View File

@@ -30,7 +30,7 @@ describing how to make `headscale` run properly in a server environment.
 cd headscale

 # optionally checkout a release
-# option a. you can find offical relase at https://github.com/juanfont/headscale/releases/latest
+# option a. you can find official release at https://github.com/juanfont/headscale/releases/latest
 # option b. get latest tag, this may be a beta release
 latestTag=$(git describe --tags `git rev-list --tags --max-count=1`)
@@ -57,7 +57,7 @@ describing how to make `headscale` run properly in a server environment.
 cd headscale

 # optionally checkout a release
-# option a. you can find offical relase at https://github.com/juanfont/headscale/releases/latest
+# option a. you can find official release at https://github.com/juanfont/headscale/releases/latest
 # option b. get latest tag, this may be a beta release
 latestTag=$(git describe --tags `git rev-list --tags --max-count=1`)

View File

@@ -31,7 +31,7 @@
 # When updating go.mod or go.sum, a new sha will need to be calculated,
 # update this if you have a mismatch after doing a change to thos files.
-vendorHash = "sha256-wXfKeiJaGe6ahOsONrQhvbuMN8flQ13b0ZjxdbFs1e8=";
+vendorHash = "sha256-EorT2AVwA3usly/LcNor6r5UIhLCdj3L4O4ilgTIC2o=";

 subPackages = ["cmd/headscale"];

go.mod
View File

@@ -29,6 +29,7 @@ require (
 	github.com/puzpuzpuz/xsync/v3 v3.1.0
 	github.com/rs/zerolog v1.32.0
 	github.com/samber/lo v1.39.0
+	github.com/sasha-s/go-deadlock v0.3.1
 	github.com/spf13/cobra v1.8.0
 	github.com/spf13/viper v1.18.2
 	github.com/stretchr/testify v1.9.0
@@ -155,6 +156,7 @@ require (
 	github.com/opencontainers/image-spec v1.1.0 // indirect
 	github.com/opencontainers/runc v1.1.12 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
+	github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect
 	github.com/pierrec/lz4/v4 v4.1.21 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect

go.sum
View File

@@ -367,6 +367,8 @@ github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaR
 github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
 github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM=
 github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
+github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 h1:q2e307iGHPdTGp0hoxKjt1H5pDo6utceo3dQVK3I5XQ=
+github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5/go.mod h1:jvVRKCrJTQWu0XVbaOlby/2lO20uSCHEMzzplHXte1o=
 github.com/philip-bui/grpc-zerolog v1.0.1 h1:EMacvLRUd2O1K0eWod27ZP5CY1iTNkhBDLSN+Q4JEvA=
 github.com/philip-bui/grpc-zerolog v1.0.1/go.mod h1:qXbiq/2X4ZUMMshsqlWyTHOcw7ns+GZmlqZZN05ZHcQ=
 github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
@@ -423,6 +425,8 @@ github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6g
 github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ=
 github.com/samber/lo v1.39.0 h1:4gTz1wUhNYLhFSKl6O+8peW0v2F4BCY034GRpU9WnuA=
 github.com/samber/lo v1.39.0/go.mod h1:+m/ZKRl6ClXCE2Lgf3MsQlWfh4bn1bz6CXEOxnEXnEA=
+github.com/sasha-s/go-deadlock v0.3.1 h1:sqv7fDNShgjcaxkO0JNcOAlr8B9+cV5Ey/OB71efZx0=
+github.com/sasha-s/go-deadlock v0.3.1/go.mod h1:F73l+cr82YSh10GxyRI6qZiCgK64VaZjwesgfQ1/iLM=
 github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
 github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
 github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=

View File

@@ -19,6 +19,7 @@ import (
 	"time"

 	"github.com/coreos/go-oidc/v3/oidc"
+	"github.com/davecgh/go-spew/spew"
 	"github.com/gorilla/mux"
 	grpcMiddleware "github.com/grpc-ecosystem/go-grpc-middleware"
 	grpcRuntime "github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
@@ -104,16 +105,15 @@ type Headscale struct {
 	registrationCache  *cache.Cache
 	pollNetMapStreamWG sync.WaitGroup
-	mapSessions  map[types.NodeID]*mapSession
-	mapSessionMu sync.Mutex
 }

 var (
-	profilingEnabled = envknob.Bool("HEADSCALE_PROFILING_ENABLED")
+	profilingEnabled = envknob.Bool("HEADSCALE_DEBUG_PROFILING_ENABLED")
+	profilingPath    = envknob.String("HEADSCALE_DEBUG_PROFILING_PATH")
 	tailsqlEnabled   = envknob.Bool("HEADSCALE_DEBUG_TAILSQL_ENABLED")
 	tailsqlStateDir  = envknob.String("HEADSCALE_DEBUG_TAILSQL_STATE_DIR")
 	tailsqlTSKey     = envknob.String("TS_AUTHKEY")
+	dumpConfig       = envknob.Bool("HEADSCALE_DEBUG_DUMP_CONFIG")
 )

 func NewHeadscale(cfg *types.Config) (*Headscale, error) {
@@ -138,7 +138,6 @@ func NewHeadscale(cfg *types.Config) (*Headscale, error) {
 		registrationCache:  registrationCache,
 		pollNetMapStreamWG: sync.WaitGroup{},
 		nodeNotifier:       notifier.NewNotifier(cfg),
-		mapSessions:        make(map[types.NodeID]*mapSession),
 	}

 	app.db, err = db.NewHeadscaleDatabase(
@@ -330,7 +329,7 @@ func (h *Headscale) grpcAuthenticationInterceptor(ctx context.Context,
 	// Check if the request is coming from the on-server client.
 	// This is not secure, but it is to maintain maintainability
 	// with the "legacy" database-based client
-	// It is also neede for grpc-gateway to be able to connect to
+	// It is also needed for grpc-gateway to be able to connect to
 	// the server
 	client, _ := peer.FromContext(ctx)
@@ -502,14 +501,14 @@ func (h *Headscale) createRouter(grpcMux *grpcRuntime.ServeMux) *mux.Router {
 // Serve launches the HTTP and gRPC server service Headscale and the API.
 func (h *Headscale) Serve() error {
-	if _, enableProfile := os.LookupEnv("HEADSCALE_PROFILING_ENABLED"); enableProfile {
-		if profilePath, ok := os.LookupEnv("HEADSCALE_PROFILING_PATH"); ok {
-			err := os.MkdirAll(profilePath, os.ModePerm)
+	if profilingEnabled {
+		if profilingPath != "" {
+			err := os.MkdirAll(profilingPath, os.ModePerm)
 			if err != nil {
 				log.Fatal().Err(err).Msg("failed to create profiling directory")
 			}

-			defer profile.Start(profile.ProfilePath(profilePath)).Stop()
+			defer profile.Start(profile.ProfilePath(profilingPath)).Stop()
 		} else {
 			defer profile.Start().Stop()
 		}
@@ -517,6 +516,10 @@ func (h *Headscale) Serve() error {
 	var err error

+	if dumpConfig {
+		spew.Dump(h.cfg)
+	}
+
 	// Fetch an initial DERP Map before we start serving
 	h.DERPMap = derp.GetDERPMap(h.cfg.DERP)
 	h.mapper = mapper.NewMapper(h.db, h.cfg, h.DERPMap, h.nodeNotifier)
@@ -729,19 +732,6 @@ func (h *Headscale) Serve() error {
 		w.WriteHeader(http.StatusOK)
 		w.Write([]byte(h.nodeNotifier.String()))
 	})
-	debugMux.HandleFunc("/debug/mapresp", func(w http.ResponseWriter, r *http.Request) {
-		h.mapSessionMu.Lock()
-		defer h.mapSessionMu.Unlock()
-
-		var b strings.Builder
-		b.WriteString("mapresponders:\n")
-		for k, v := range h.mapSessions {
-			fmt.Fprintf(&b, "\t%d: %p\n", k, v)
-		}
-
-		w.WriteHeader(http.StatusOK)
-		w.Write([]byte(b.String()))
-	})
 	debugMux.Handle("/metrics", promhttp.Handler())

 	debugHTTPServer := &http.Server{
@@ -822,17 +812,6 @@ func (h *Headscale) Serve() error {
 		expireNodeCancel()
 		expireEphemeralCancel()
-
-		trace("closing map sessions")
-		wg := sync.WaitGroup{}
-		for _, mapSess := range h.mapSessions {
-			wg.Add(1)
-			go func() {
-				mapSess.close()
-				wg.Done()
-			}()
-		}
-		wg.Wait()

 		trace("waiting for netmap stream to close")
 		h.pollNetMapStreamWG.Wait()

View File

@@ -661,7 +661,7 @@ func GenerateGivenName(
 }

 func DeleteExpiredEphemeralNodes(tx *gorm.DB,
-	inactivityThreshhold time.Duration,
+	inactivityThreshold time.Duration,
 ) ([]types.NodeID, []types.NodeID) {
 	users, err := ListUsers(tx)
 	if err != nil {
@@ -679,7 +679,7 @@ func DeleteExpiredEphemeralNodes(tx *gorm.DB,
 	for idx, node := range nodes {
 		if node.IsEphemeral() && node.LastSeen != nil &&
 			time.Now().
-				After(node.LastSeen.Add(inactivityThreshhold)) {
+				After(node.LastSeen.Add(inactivityThreshold)) {
 			expired = append(expired, node.ID)

 			log.Info().

View File

@@ -393,7 +393,7 @@ func (s *Suite) TestSetTags(c *check.C) {
 	c.Assert(err, check.IsNil)
 	c.Assert(node.ForcedTags, check.DeepEquals, types.StringList(sTags))

-	// assign duplicat tags, expect no errors but no doubles in DB
+	// assign duplicate tags, expect no errors but no doubles in DB
 	eTags := []string{"tag:bar", "tag:test", "tag:unknown", "tag:test"}
 	err = db.SetTags(node.ID, eTags)
 	c.Assert(err, check.IsNil)

View File

@@ -83,7 +83,7 @@ func CreatePreAuthKey(
 		if !seenTags[tag] {
 			if err := tx.Save(&types.PreAuthKeyACLTag{PreAuthKeyID: key.ID, Tag: tag}).Error; err != nil {
 				return nil, fmt.Errorf(
-					"failed to ceate key tag in the database: %w",
+					"failed to create key tag in the database: %w",
 					err,
 				)
 			}

View File

@@ -204,7 +204,7 @@ func DERPProbeHandler(
 	}
 }

-// DERPBootstrapDNSHandler implements the /bootsrap-dns endpoint
+// DERPBootstrapDNSHandler implements the /bootstrap-dns endpoint
 // Described in https://github.com/tailscale/tailscale/issues/1405,
 // this endpoint provides a way to help a client when it fails to start up
 // because its DNS are broken.

View File

@@ -7,8 +7,23 @@ import (
 	"github.com/gorilla/mux"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
+	"tailscale.com/envknob"
 )

+var debugHighCardinalityMetrics = envknob.Bool("HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS")
+
+var mapResponseLastSentSeconds *prometheus.GaugeVec
+
+func init() {
+	if debugHighCardinalityMetrics {
+		mapResponseLastSentSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: prometheusNamespace,
+			Name:      "mapresponse_last_sent_seconds",
+			Help:      "last sent metric to node.id",
+		}, []string{"type", "id"})
+	}
+}
+
 const prometheusNamespace = "headscale"

 var (
@@ -37,16 +52,16 @@ var (
 		Name:      "mapresponse_readonly_requests_total",
 		Help:      "total count of readonly requests received",
 	}, []string{"status"})
-	mapResponseSessions = promauto.NewGauge(prometheus.GaugeOpts{
+	mapResponseEnded = promauto.NewCounterVec(prometheus.CounterOpts{
 		Namespace: prometheusNamespace,
-		Name:      "mapresponse_current_sessions_total",
-		Help:      "total count open map response sessions",
-	})
-	mapResponseRejected = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: prometheusNamespace,
-		Name:      "mapresponse_rejected_new_sessions_total",
-		Help:      "total count of new mapsessions rejected",
+		Name:      "mapresponse_ended_total",
+		Help:      "total count of new mapsessions ended",
 	}, []string{"reason"})
+	mapResponseClosed = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: prometheusNamespace,
+		Name:      "mapresponse_closed_total",
+		Help:      "total count of calls to mapresponse close",
+	}, []string{"return"})
 	httpDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
 		Namespace: prometheusNamespace,
 		Name:      "http_duration_seconds",

View File

@@ -231,62 +231,12 @@ func (ns *noiseServer) NoisePollNetMapHandler(
 		return
 	}

 	sess := ns.headscale.newMapSession(req.Context(), mapRequest, writer, node)

 	sess.tracef("a node sending a MapRequest with Noise protocol")

-	// If a streaming mapSession exists for this node, close it
-	// and start a new one.
-	if sess.isStreaming() {
-		sess.tracef("aquiring lock to check stream")
-		ns.headscale.mapSessionMu.Lock()
-		if _, ok := ns.headscale.mapSessions[node.ID]; ok {
-			// NOTE/TODO(kradalby): From how I understand the protocol, when
-			// a client connects with stream=true, and already has a streaming
-			// connection open, the correct way is to close the current channel
-			// and replace it. However, I cannot manage to get that working with
-			// some sort of lock/block happening on the cancelCh in the streaming
-			// session.
-			// Not closing the channel and replacing it puts us in a weird state
-			// which keeps a ghost stream open, receiving keep alives, but no updates.
-			//
-			// Typically a new connection is opened when one exists as a client which
-			// is already authenticated reconnects (e.g. down, then up). The client will
-			// start auth and streaming at the same time, and then cancel the streaming
-			// when the auth has finished successfully, opening a new connection.
-			//
-			// As a work-around to not replacing, abusing the clients "resilience"
-			// by reject the new connection which will cause the client to immediately
-			// reconnect and "fix" the issue, as the other connection typically has been
-			// closed, meaning there is nothing to replace.
-			//
-			// sess.infof("node has an open stream(%p), replacing with %p", oldSession, sess)
-			// oldSession.close()
-			defer ns.headscale.mapSessionMu.Unlock()
-			sess.infof("node has an open stream(%p), rejecting new stream", sess)
-			mapResponseRejected.WithLabelValues("exists").Inc()
-			return
-		}
-
-		ns.headscale.mapSessions[node.ID] = sess
-		mapResponseSessions.Inc()
-		ns.headscale.mapSessionMu.Unlock()
-		sess.tracef("releasing lock to check stream")
-	}
-
-	sess.serve()
-
-	if sess.isStreaming() {
-		sess.tracef("aquiring lock to remove stream")
-		ns.headscale.mapSessionMu.Lock()
-		defer ns.headscale.mapSessionMu.Unlock()
-
-		delete(ns.headscale.mapSessions, node.ID)
-		mapResponseSessions.Dec()
-
-		sess.tracef("releasing lock to remove stream")
-	}
+	if !sess.isStreaming() {
+		sess.serve()
+	} else {
+		sess.serveLongPoll()
+	}
 }

View File

@@ -3,22 +3,43 @@ package notifier
 import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
+	"tailscale.com/envknob"
 )

 const prometheusNamespace = "headscale"

+var debugHighCardinalityMetrics = envknob.Bool("HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS")
+
+var notifierUpdateSent *prometheus.CounterVec
+
+func init() {
+	if debugHighCardinalityMetrics {
+		notifierUpdateSent = promauto.NewCounterVec(prometheus.CounterOpts{
+			Namespace: prometheusNamespace,
+			Name:      "notifier_update_sent_total",
+			Help:      "total count of update sent on nodes channel",
+		}, []string{"status", "type", "trigger", "id"})
+	} else {
+		notifierUpdateSent = promauto.NewCounterVec(prometheus.CounterOpts{
+			Namespace: prometheusNamespace,
+			Name:      "notifier_update_sent_total",
+			Help:      "total count of update sent on nodes channel",
+		}, []string{"status", "type", "trigger"})
+	}
+}
+
 var (
+	notifierWaitersForLock = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: prometheusNamespace,
+		Name:      "notifier_waiters_for_lock",
+		Help:      "gauge of waiters for the notifier lock",
+	}, []string{"type", "action"})
 	notifierWaitForLock = promauto.NewHistogramVec(prometheus.HistogramOpts{
 		Namespace: prometheusNamespace,
 		Name:      "notifier_wait_for_lock_seconds",
 		Help:      "histogram of time spent waiting for the notifier lock",
 		Buckets:   []float64{0.001, 0.01, 0.1, 0.3, 0.5, 1, 3, 5, 10},
 	}, []string{"action"})
-	notifierUpdateSent = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: prometheusNamespace,
-		Name:      "notifier_update_sent_total",
-		Help:      "total count of update sent on nodes channel",
-	}, []string{"status", "type", "trigger"})
 	notifierUpdateReceived = promauto.NewCounterVec(prometheus.CounterOpts{
 		Namespace: prometheusNamespace,
 		Name:      "notifier_update_received_total",
@@ -29,4 +50,19 @@ var (
 		Name: "notifier_open_channels_total",
 		Help: "total count open channels in notifier",
 	})
+	notifierBatcherWaitersForLock = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: prometheusNamespace,
+		Name:      "notifier_batcher_waiters_for_lock",
+		Help:      "gauge of waiters for the notifier batcher lock",
+	}, []string{"type", "action"})
+	notifierBatcherChanges = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: prometheusNamespace,
+		Name:      "notifier_batcher_changes_pending",
+		Help:      "gauge of full changes pending in the notifier batcher",
+	}, []string{})
+	notifierBatcherPatches = promauto.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: prometheusNamespace,
+		Name:      "notifier_batcher_patches_pending",
+		Help:      "gauge of patches pending in the notifier batcher",
+	}, []string{})
 )

View File

@@ -11,25 +11,40 @@ import (
 	"github.com/juanfont/headscale/hscontrol/types"
 	"github.com/puzpuzpuz/xsync/v3"
 	"github.com/rs/zerolog/log"
+	"github.com/sasha-s/go-deadlock"
+	"tailscale.com/envknob"
 	"tailscale.com/tailcfg"
 	"tailscale.com/util/set"
 )

+var debugDeadlock = envknob.Bool("HEADSCALE_DEBUG_DEADLOCK")
+var debugDeadlockTimeout = envknob.RegisterDuration("HEADSCALE_DEBUG_DEADLOCK_TIMEOUT")
+
+func init() {
+	deadlock.Opts.Disable = !debugDeadlock
+	if debugDeadlock {
+		deadlock.Opts.DeadlockTimeout = debugDeadlockTimeout()
+		deadlock.Opts.PrintAllCurrentGoroutines = true
+	}
+}
+
 type Notifier struct {
-	l         sync.RWMutex
+	l         deadlock.Mutex
 	nodes     map[types.NodeID]chan<- types.StateUpdate
 	connected *xsync.MapOf[types.NodeID, bool]
 	b         *batcher
+	cfg       *types.Config
 }

 func NewNotifier(cfg *types.Config) *Notifier {
 	n := &Notifier{
 		nodes:     make(map[types.NodeID]chan<- types.StateUpdate),
 		connected: xsync.NewMapOf[types.NodeID, bool](),
+		cfg:       cfg,
 	}
 	b := newBatcher(cfg.Tuning.BatchChangeDelay, n)
 	n.b = b
+	// TODO(kradalby): clean this up
 	go b.doWork()

 	return n
 }
@@ -39,59 +54,75 @@ func (n *Notifier) Close() {
 	n.b.close()
 }

-func (n *Notifier) AddNode(nodeID types.NodeID, c chan<- types.StateUpdate) {
-	log.Trace().Caller().Uint64("node.id", nodeID.Uint64()).Msg("acquiring lock to add node")
-	defer log.Trace().
-		Caller().
-		Uint64("node.id", nodeID.Uint64()).
-		Msg("releasing lock to add node")
+func (n *Notifier) tracef(nID types.NodeID, msg string, args ...any) {
+	log.Trace().
+		Uint64("node.id", nID.Uint64()).
+		Int("open_chans", len(n.nodes)).Msgf(msg, args...)
+}

+func (n *Notifier) AddNode(nodeID types.NodeID, c chan<- types.StateUpdate) {
 	start := time.Now()
+	notifierWaitersForLock.WithLabelValues("lock", "add").Inc()
 	n.l.Lock()
 	defer n.l.Unlock()
+	notifierWaitersForLock.WithLabelValues("lock", "add").Dec()
 	notifierWaitForLock.WithLabelValues("add").Observe(time.Since(start).Seconds())

+	// If a channel exists, it means the node has opened a new
+	// connection. Close the old channel and replace it.
+	if curr, ok := n.nodes[nodeID]; ok {
+		n.tracef(nodeID, "channel present, closing and replacing")
+		close(curr)
+	}
+
 	n.nodes[nodeID] = c
 	n.connected.Store(nodeID, true)

-	log.Trace().
-		Uint64("node.id", nodeID.Uint64()).
-		Int("open_chans", len(n.nodes)).
-		Msg("Added new channel")
+	n.tracef(nodeID, "added new channel")
 	notifierNodeUpdateChans.Inc()
 }

-func (n *Notifier) RemoveNode(nodeID types.NodeID) {
-	log.Trace().Caller().Uint64("node.id", nodeID.Uint64()).Msg("acquiring lock to remove node")
-	defer log.Trace().
-		Caller().
-		Uint64("node.id", nodeID.Uint64()).
-		Msg("releasing lock to remove node")
-
+// RemoveNode removes a node and a given channel from the notifier.
+// It checks that the channel is the same as currently being updated
+// and ignores the removal if it is not.
+// RemoveNode reports if the node/chan was removed.
+func (n *Notifier) RemoveNode(nodeID types.NodeID, c chan<- types.StateUpdate) bool {
 	start := time.Now()
+	notifierWaitersForLock.WithLabelValues("lock", "remove").Inc()
 	n.l.Lock()
 	defer n.l.Unlock()
+	notifierWaitersForLock.WithLabelValues("lock", "remove").Dec()
 	notifierWaitForLock.WithLabelValues("remove").Observe(time.Since(start).Seconds())

 	if len(n.nodes) == 0 {
-		return
+		return true
 	}

+	// If the channel exist, but it does not belong
+	// to the caller, ignore.
+	if curr, ok := n.nodes[nodeID]; ok {
+		if curr != c {
+			n.tracef(nodeID, "channel has been replaced, not removing")
+			return false
+		}
+	}
+
 	delete(n.nodes, nodeID)
 	n.connected.Store(nodeID, false)

-	log.Trace().
-		Uint64("node.id", nodeID.Uint64()).
-		Int("open_chans", len(n.nodes)).
-		Msg("Removed channel")
+	n.tracef(nodeID, "removed channel")
 	notifierNodeUpdateChans.Dec()
+
+	return true
 }

 // IsConnected reports if a node is connected to headscale and has a
 // poll session open.
 func (n *Notifier) IsConnected(nodeID types.NodeID) bool {
-	n.l.RLock()
-	defer n.l.RUnlock()
+	notifierWaitersForLock.WithLabelValues("lock", "conncheck").Inc()
+	n.l.Lock()
+	defer n.l.Unlock()
+	notifierWaitersForLock.WithLabelValues("lock", "conncheck").Dec()

 	if val, ok := n.connected.Load(nodeID); ok {
 		return val
@@ -130,15 +161,11 @@ func (n *Notifier) NotifyByNodeID(
 	update types.StateUpdate,
 	nodeID types.NodeID,
 ) {
-	log.Trace().Caller().Str("type", update.Type.String()).Msg("acquiring lock to notify")
-	defer log.Trace().
-		Caller().
-		Str("type", update.Type.String()).
-		Msg("releasing lock, finished notifying")
-
 	start := time.Now()
-	n.l.RLock()
-	defer n.l.RUnlock()
+	notifierWaitersForLock.WithLabelValues("lock", "notify").Inc()
+	n.l.Lock()
+	defer n.l.Unlock()
+	notifierWaitersForLock.WithLabelValues("lock", "notify").Dec()
 	notifierWaitForLock.WithLabelValues("notify").Observe(time.Since(start).Seconds())

 	if c, ok := n.nodes[nodeID]; ok {
@@ -150,50 +177,94 @@ func (n *Notifier) NotifyByNodeID(
 				Any("origin", types.NotifyOriginKey.Value(ctx)).
 				Any("origin-hostname", types.NotifyHostnameKey.Value(ctx)).
 				Msgf("update not sent, context cancelled")
+			if debugHighCardinalityMetrics {
+				notifierUpdateSent.WithLabelValues("cancelled", update.Type.String(), types.NotifyOriginKey.Value(ctx), nodeID.String()).Inc()
+			} else {
 				notifierUpdateSent.WithLabelValues("cancelled", update.Type.String(), types.NotifyOriginKey.Value(ctx)).Inc()
+			}

 			return
 		case c <- update:
-			log.Trace().
-				Uint64("node.id", nodeID.Uint64()).
-				Any("origin", ctx.Value("origin")).
-				Any("origin-hostname", ctx.Value("hostname")).
-				Msgf("update successfully sent on chan")
+			n.tracef(nodeID, "update successfully sent on chan, origin: %s, origin-hostname: %s", ctx.Value("origin"), ctx.Value("hostname"))
+			if debugHighCardinalityMetrics {
+				notifierUpdateSent.WithLabelValues("ok", update.Type.String(), types.NotifyOriginKey.Value(ctx), nodeID.String()).Inc()
+			} else {
 				notifierUpdateSent.WithLabelValues("ok", update.Type.String(), types.NotifyOriginKey.Value(ctx)).Inc()
+			}
 		}
 	}
 }

 func (n *Notifier) sendAll(update types.StateUpdate) {
 	start := time.Now()
-	n.l.RLock()
-	defer n.l.RUnlock()
+	notifierWaitersForLock.WithLabelValues("lock", "send-all").Inc()
+	n.l.Lock()
+	defer n.l.Unlock()
+	notifierWaitersForLock.WithLabelValues("lock", "send-all").Dec()
 	notifierWaitForLock.WithLabelValues("send-all").Observe(time.Since(start).Seconds())

-	for _, c := range n.nodes {
-		c <- update
-		notifierUpdateSent.WithLabelValues("ok", update.Type.String(), "send-all").Inc()
+	for id, c := range n.nodes {
+		// Whenever an update is sent to all nodes, there is a chance that the node
+		// has disconnected and the goroutine that was supposed to consume the update
+		// has shut down the channel and is waiting for the lock held here in RemoveNode.
+		// This means that there is potential for a deadlock which would stop all updates
+		// going out to clients. This timeout prevents that from happening by moving on to the
+		// next node if the context is cancelled. Afther sendAll releases the lock, the add/remove
+		// call will succeed and the update will go to the correct nodes on the next call.
+		ctx, cancel := context.WithTimeout(context.Background(), n.cfg.Tuning.NotifierSendTimeout)
+		defer cancel()
+
+		select {
+		case <-ctx.Done():
+			log.Error().
+				Err(ctx.Err()).
+				Uint64("node.id", id.Uint64()).
+				Msgf("update not sent, context cancelled")
+			if debugHighCardinalityMetrics {
+				notifierUpdateSent.WithLabelValues("cancelled", update.Type.String(), "send-all", id.String()).Inc()
+			} else {
+				notifierUpdateSent.WithLabelValues("cancelled", update.Type.String(), "send-all").Inc()
+			}
+
+			return
+		case c <- update:
+			if debugHighCardinalityMetrics {
+				notifierUpdateSent.WithLabelValues("ok", update.Type.String(), "send-all", id.String()).Inc()
+			} else {
+				notifierUpdateSent.WithLabelValues("ok", update.Type.String(), "send-all").Inc()
+			}
+		}
 	}
 }

 func (n *Notifier) String() string {
-	n.l.RLock()
-	defer n.l.RUnlock()
+	notifierWaitersForLock.WithLabelValues("lock", "string").Inc()
+	n.l.Lock()
+	defer n.l.Unlock()
+	notifierWaitersForLock.WithLabelValues("lock", "string").Dec()

 	var b strings.Builder
-	b.WriteString("chans:\n")
+	fmt.Fprintf(&b, "chans (%d):\n", len(n.nodes))

-	for k, v := range n.nodes {
-		fmt.Fprintf(&b, "\t%d: %p\n", k, v)
+	var keys []types.NodeID
+	n.connected.Range(func(key types.NodeID, value bool) bool {
+		keys = append(keys, key)
+		return true
+	})
+	sort.Slice(keys, func(i, j int) bool {
+		return keys[i] < keys[j]
+	})
+
+	for _, key := range keys {
+		fmt.Fprintf(&b, "\t%d: %p\n", key, n.nodes[key])
 	}

 	b.WriteString("\n")
-	b.WriteString("connected:\n")
+	fmt.Fprintf(&b, "connected (%d):\n", len(n.nodes))

-	n.connected.Range(func(k types.NodeID, v bool) bool {
-		fmt.Fprintf(&b, "\t%d: %t\n", k, v)
-		return true
-	})
+	for _, key := range keys {
+		val, _ := n.connected.Load(key)
+		fmt.Fprintf(&b, "\t%d: %t\n", key, val)
+	}

 	return b.String()
 }
@@ -230,13 +301,16 @@ func (b *batcher) close() {
 // addOrPassthrough adds the update to the batcher, if it is not a
 // type that is currently batched, it will be sent immediately.
 func (b *batcher) addOrPassthrough(update types.StateUpdate) {
+	notifierBatcherWaitersForLock.WithLabelValues("lock", "add").Inc()
 	b.mu.Lock()
 	defer b.mu.Unlock()
+	notifierBatcherWaitersForLock.WithLabelValues("lock", "add").Dec()

 	switch update.Type {
 	case types.StatePeerChanged:
 		b.changedNodeIDs.Add(update.ChangeNodes...)
 		b.nodesChanged = true
+		notifierBatcherChanges.WithLabelValues().Set(float64(b.changedNodeIDs.Len()))

 	case types.StatePeerChangedPatch:
 		for _, newPatch := range update.ChangePatches {
@@ -248,6 +322,7 @@ func (b *batcher) addOrPassthrough(update types.StateUpdate) {
 			}
 		}

 		b.patchesChanged = true
+		notifierBatcherPatches.WithLabelValues().Set(float64(len(b.patches)))

 	default:
 		b.n.sendAll(update)
@@ -257,8 +332,10 @@ func (b *batcher) addOrPassthrough(update types.StateUpdate) {
 // flush sends all the accumulated patches to all
 // nodes in the notifier.
 func (b *batcher) flush() {
+	notifierBatcherWaitersForLock.WithLabelValues("lock", "flush").Inc()
 	b.mu.Lock()
 	defer b.mu.Unlock()
+	notifierBatcherWaitersForLock.WithLabelValues("lock", "flush").Dec()

 	if b.nodesChanged || b.patchesChanged {
 		var patches []*tailcfg.PeerChange
@@ -296,8 +373,10 @@ func (b *batcher) flush() {
 		}

 		b.changedNodeIDs = set.Slice[types.NodeID]{}
+		notifierBatcherChanges.WithLabelValues().Set(0)
 		b.nodesChanged = false
 		b.patches = make(map[types.NodeID]tailcfg.PeerChange, len(b.patches))
+		notifierBatcherPatches.WithLabelValues().Set(0)
 		b.patchesChanged = false
 	}
 }

View File

@@ -227,7 +227,7 @@ func TestBatcher(t *testing.T) {
 			ch := make(chan types.StateUpdate, 30)
 			defer close(ch)
 			n.AddNode(1, ch)
-			defer n.RemoveNode(1)
+			defer n.RemoveNode(1, ch)
 			for _, u := range tt.updates {
 				n.NotifyAll(context.Background(), u)
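The test now passes the channel to `RemoveNode`, matching the behaviour described in the PR: a node is only deregistered if the stored channel is the one being closed, so a replacement channel from a rapid reconnect is left untouched. A small sketch of that idea with a plain map (the `notifier` type and `NodeID` here are illustrative stand-ins, not headscale's actual implementation):

```go
package main

import (
	"fmt"
	"sync"
)

type NodeID uint64

// notifier is an illustrative channel registry keyed by node ID.
type notifier struct {
	mu    sync.Mutex
	nodes map[NodeID]chan string
}

// RemoveNode only deletes the registration if the stored channel is the one
// the caller hands in; a newer registration from a reconnect is kept.
func (n *notifier) RemoveNode(id NodeID, ch chan string) bool {
	n.mu.Lock()
	defer n.mu.Unlock()

	if cur, ok := n.nodes[id]; ok && cur == ch {
		delete(n.nodes, id)
		return true
	}
	return false
}

func main() {
	old := make(chan string, 1)
	replacement := make(chan string, 1)

	n := &notifier{nodes: map[NodeID]chan string{1: replacement}}

	fmt.Println(n.RemoveNode(1, old))         // false: a newer channel is registered
	fmt.Println(n.RemoveNode(1, replacement)) // true: channels match, node removed
}
```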

View File

@@ -532,7 +532,7 @@ func (s *Suite) TestRuleInvalidGeneration(c *check.C) {
 				"example-host-2:80"
 			],
 			"deny": [
-				"exapmle-host-2:100"
+				"example-host-2:100"
 			],
 		},
 		{
@@ -635,7 +635,7 @@ func Test_expandGroup(t *testing.T) {
 			wantErr: false,
 		},
 		{
-			name: "InexistantGroup",
+			name: "InexistentGroup",
 			field: field{
 				pol: ACLPolicy{
 					Groups: Groups{
@@ -2604,7 +2604,7 @@ func Test_getFilteredByACLPeers(t *testing.T) {
 		{
 			name: "all hosts can talk to each other",
 			args: args{
-				nodes: types.Nodes{ // list of all nodess in the database
+				nodes: types.Nodes{ // list of all nodes in the database
 					&types.Node{
 						ID:   1,
 						IPv4: iap("100.64.0.1"),
@@ -2651,7 +2651,7 @@ func Test_getFilteredByACLPeers(t *testing.T) {
 		{
 			name: "One host can talk to another, but not all hosts",
 			args: args{
-				nodes: types.Nodes{ // list of all nodess in the database
+				nodes: types.Nodes{ // list of all nodes in the database
 					&types.Node{
 						ID:   1,
 						IPv4: iap("100.64.0.1"),
@@ -2693,7 +2693,7 @@ func Test_getFilteredByACLPeers(t *testing.T) {
 		{
 			name: "host cannot directly talk to destination, but return path is authorized",
 			args: args{
-				nodes: types.Nodes{ // list of all nodess in the database
+				nodes: types.Nodes{ // list of all nodes in the database
 					&types.Node{
 						ID:   1,
 						IPv4: iap("100.64.0.1"),
@@ -2735,7 +2735,7 @@ func Test_getFilteredByACLPeers(t *testing.T) {
 		{
 			name: "rules allows all hosts to reach one destination",
 			args: args{
-				nodes: types.Nodes{ // list of all nodess in the database
+				nodes: types.Nodes{ // list of all nodes in the database
 					&types.Node{
 						ID:   1,
 						IPv4: iap("100.64.0.1"),
@@ -2777,7 +2777,7 @@ func Test_getFilteredByACLPeers(t *testing.T) {
 		{
 			name: "rules allows all hosts to reach one destination, destination can reach all hosts",
 			args: args{
-				nodes: types.Nodes{ // list of all nodess in the database
+				nodes: types.Nodes{ // list of all nodes in the database
 					&types.Node{
 						ID:   1,
 						IPv4: iap("100.64.0.1"),
@@ -2824,7 +2824,7 @@ func Test_getFilteredByACLPeers(t *testing.T) {
 		{
 			name: "rule allows all hosts to reach all destinations",
 			args: args{
-				nodes: types.Nodes{ // list of all nodess in the database
+				nodes: types.Nodes{ // list of all nodes in the database
 					&types.Node{
 						ID:   1,
 						IPv4: iap("100.64.0.1"),
@@ -2871,7 +2871,7 @@ func Test_getFilteredByACLPeers(t *testing.T) {
 		{
 			name: "without rule all communications are forbidden",
 			args: args{
-				nodes: types.Nodes{ // list of all nodess in the database
+				nodes: types.Nodes{ // list of all nodes in the database
 					&types.Node{
 						ID:   1,
 						IPv4: iap("100.64.0.1"),

View File

@@ -9,13 +9,13 @@ import (
 	"net/netip"
 	"sort"
 	"strings"
-	"sync"
 	"time"
 	"github.com/juanfont/headscale/hscontrol/db"
 	"github.com/juanfont/headscale/hscontrol/mapper"
 	"github.com/juanfont/headscale/hscontrol/types"
 	"github.com/rs/zerolog/log"
+	"github.com/sasha-s/go-deadlock"
 	xslices "golang.org/x/exp/slices"
 	"gorm.io/gorm"
 	"tailscale.com/tailcfg"
@@ -29,11 +29,6 @@ type contextKey string
 const nodeNameContextKey = contextKey("nodeName")
-type sessionManager struct {
-	mu   sync.RWMutex
-	sess map[types.NodeID]*mapSession
-}
-
 type mapSession struct {
 	h   *Headscale
 	req tailcfg.MapRequest
@@ -41,12 +36,13 @@ type mapSession struct {
 	capVer tailcfg.CapabilityVersion
 	mapper *mapper.Mapper
-	serving   bool
-	servingMu sync.Mutex
+	cancelChMu deadlock.Mutex
 	ch           chan types.StateUpdate
 	cancelCh     chan struct{}
+	cancelChOpen bool
+	keepAlive       time.Duration
 	keepAliveTicker *time.Ticker
 	node *types.Node
@@ -77,6 +73,8 @@ func (h *Headscale) newMapSession(
 		}
 	}
+	ka := keepAliveInterval + (time.Duration(rand.IntN(9000)) * time.Millisecond)
+
 	return &mapSession{
 		h:   h,
 		ctx: ctx,
@@ -86,13 +84,12 @@ func (h *Headscale) newMapSession(
 		capVer: req.Version,
 		mapper: h.mapper,
-		// serving indicates if a client is being served.
-		serving: false,
-
 		ch:           updateChan,
 		cancelCh:     make(chan struct{}),
+		cancelChOpen: true,
-		keepAliveTicker: time.NewTicker(keepAliveInterval + (time.Duration(rand.IntN(9000)) * time.Millisecond)),
+		keepAlive:       ka,
+		keepAliveTicker: nil,
 		// Loggers
 		warnf: warnf,
@@ -103,15 +100,23 @@ func (h *Headscale) newMapSession(
 }
 func (m *mapSession) close() {
-	m.servingMu.Lock()
-	defer m.servingMu.Unlock()
+	m.cancelChMu.Lock()
+	defer m.cancelChMu.Unlock()
-	if !m.serving {
+	if !m.cancelChOpen {
+		mapResponseClosed.WithLabelValues("chanclosed").Inc()
 		return
 	}
-	m.tracef("mapSession (%p) sending message on cancel chan")
-	m.cancelCh <- struct{}{}
-	m.tracef("mapSession (%p) sent message on cancel chan")
+	m.tracef("mapSession (%p) sending message on cancel chan", m)
+	select {
+	case m.cancelCh <- struct{}{}:
+		mapResponseClosed.WithLabelValues("sent").Inc()
+		m.tracef("mapSession (%p) sent message on cancel chan", m)
+	case <-time.After(30 * time.Second):
+		mapResponseClosed.WithLabelValues("timeout").Inc()
+		m.tracef("mapSession (%p) timed out sending close message", m)
+	}
 }
 func (m *mapSession) isStreaming() bool {
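The rewritten `close()` no longer blocks forever on the unbuffered cancel channel; a `select` with a `time.After` fallback means a dead receiver cannot wedge the caller. A stripped-down version of the same pattern (the 30-second timeout mirrors the diff; everything else is illustrative):

```go
package main

import (
	"fmt"
	"time"
)

// signalOrTimeout tries to deliver a cancel signal, but gives up after the
// timeout if nobody is reading from the channel anymore.
func signalOrTimeout(cancelCh chan struct{}, timeout time.Duration) string {
	select {
	case cancelCh <- struct{}{}:
		return "sent"
	case <-time.After(timeout):
		return "timeout"
	}
}

func main() {
	ch := make(chan struct{}) // unbuffered: a send blocks until someone receives

	// With a receiver present, the signal is delivered.
	go func() { <-ch }()
	fmt.Println(signalOrTimeout(ch, time.Second))

	// Without a receiver, the send falls through to the timeout branch.
	fmt.Println(signalOrTimeout(ch, 100*time.Millisecond))
}
```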
@@ -126,40 +131,12 @@ func (m *mapSession) isReadOnlyUpdate() bool {
 	return !m.req.Stream && m.req.OmitPeers && m.req.ReadOnly
 }
-// handlePoll ensures the node gets the appropriate updates from either
-// polling or immediate responses.
-//
-//nolint:gocyclo
-func (m *mapSession) serve() {
-	// Register with the notifier if this is a streaming
-	// session
-	if m.isStreaming() {
-		// defers are called in reverse order,
-		// so top one is executed last.
-
-		// Failover the node's routes if any.
-		defer m.infof("node has disconnected, mapSession: %p", m)
-		defer m.pollFailoverRoutes("node closing connection", m.node)
-		defer m.h.updateNodeOnlineStatus(false, m.node)
-		defer m.h.nodeNotifier.RemoveNode(m.node.ID)
-
-		defer func() {
-			m.servingMu.Lock()
-			defer m.servingMu.Unlock()
-
-			m.serving = false
-			close(m.cancelCh)
-		}()
-
-		m.serving = true
-
-		m.h.nodeNotifier.AddNode(m.node.ID, m.ch)
-		m.h.updateNodeOnlineStatus(true, m.node)
-
-		m.infof("node has connected, mapSession: %p", m)
-	}
+func (m *mapSession) resetKeepAlive() {
+	m.keepAliveTicker.Reset(m.keepAlive)
+}
+// serve handles non-streaming requests.
+func (m *mapSession) serve() {
 	// TODO(kradalby): A set todos to harden:
 	// - func to tell the stream to die, readonly -> false, !stream && omitpeers -> false, true
@@ -196,13 +173,43 @@ func (m *mapSession) serve() {
 		return
 	}
+}
+
+// serveLongPoll ensures the node gets the appropriate updates from either
+// polling or immediate responses.
+//
+//nolint:gocyclo
+func (m *mapSession) serveLongPoll() {
+	// Clean up the session when the client disconnects
+	defer func() {
+		m.cancelChMu.Lock()
+		m.cancelChOpen = false
+		close(m.cancelCh)
+		m.cancelChMu.Unlock()
+
+		// only update node status if the node channel was removed.
+		// in principal, it will be removed, but the client rapidly
+		// reconnects, the channel might be of another connection.
+		// In that case, it is not closed and the node is still online.
+		if m.h.nodeNotifier.RemoveNode(m.node.ID, m.ch) {
+			// Failover the node's routes if any.
+			m.h.updateNodeOnlineStatus(false, m.node)
+			m.pollFailoverRoutes("node closing connection", m.node)
+		}
+
+		m.infof("node has disconnected, mapSession: %p, chan: %p", m, m.ch)
+	}()
 	// From version 68, all streaming requests can be treated as read only.
+	// TODO: Remove when we drop support for 1.48
 	if m.capVer < 68 {
 		// Error has been handled/written to client in the func
 		// return
 		err := m.handleSaveNode()
 		if err != nil {
 			mapResponseWriteUpdatesInStream.WithLabelValues("error").Inc()
+
+			m.close()
 			return
 		}
 		mapResponseWriteUpdatesInStream.WithLabelValues("ok").Inc()
@@ -224,6 +231,13 @@ func (m *mapSession) serve() {
 	ctx, cancel := context.WithCancel(context.WithValue(m.ctx, nodeNameContextKey, m.node.Hostname))
 	defer cancel()
+	m.keepAliveTicker = time.NewTicker(m.keepAlive)
+
+	m.h.nodeNotifier.AddNode(m.node.ID, m.ch)
+	go m.h.updateNodeOnlineStatus(true, m.node)
+
+	m.infof("node has connected, mapSession: %p, chan: %p", m, m.ch)
+
 	// Loop through updates and continuously send them to the
 	// client.
 	for {
@@ -231,13 +245,21 @@ func (m *mapSession) serve() {
 		select {
 		case <-m.cancelCh:
 			m.tracef("poll cancelled received")
-			return
-		case <-ctx.Done():
-			m.tracef("poll context done")
+			mapResponseEnded.WithLabelValues("cancelled").Inc()
 			return
-		// Consume all updates sent to node
-		case update := <-m.ch:
+		case <-ctx.Done():
+			m.tracef("poll context done")
+			mapResponseEnded.WithLabelValues("done").Inc()
+			return
+
+		// Consume updates sent to node
+		case update, ok := <-m.ch:
+			if !ok {
+				m.tracef("update channel closed, streaming session is likely being replaced")
+				return
+			}
+
 			m.tracef("received stream update: %s %s", update.Type.String(), update.Message)
 			mapResponseUpdateReceived.WithLabelValues(update.Type.String()).Inc()
@@ -303,15 +325,13 @@ func (m *mapSession) serve() {
 				return
 			}
-			// log.Trace().Str("node", m.node.Hostname).TimeDiff("timeSpent", time.Now(), startMapResp).Str("mkey", m.node.MachineKey.String()).Int("type", int(update.Type)).Msg("finished making map response")
-
 			// Only send update if there is change
 			if data != nil {
 				startWrite := time.Now()
 				_, err = m.w.Write(data)
 				if err != nil {
 					mapResponseSent.WithLabelValues("error", updateType).Inc()
-					m.errf(err, "Could not write the map response, for mapSession: %p", m)
+					m.errf(err, "could not write the map response(%s), for mapSession: %p", update.Type.String(), m)
 					return
 				}
@@ -324,8 +344,12 @@ func (m *mapSession) serve() {
 				log.Trace().Str("node", m.node.Hostname).TimeDiff("timeSpent", time.Now(), startWrite).Str("mkey", m.node.MachineKey.String()).Msg("finished writing mapresp to node")
+				if debugHighCardinalityMetrics {
+					mapResponseLastSentSeconds.WithLabelValues(updateType, m.node.ID.String()).Set(float64(time.Now().Unix()))
+				}
 				mapResponseSent.WithLabelValues("ok", updateType).Inc()
 				m.tracef("update sent")
+				m.resetKeepAlive()
 			}
 		case <-m.keepAliveTicker.C:
@@ -348,6 +372,9 @@ func (m *mapSession) serve() {
 				return
 			}
+			if debugHighCardinalityMetrics {
+				mapResponseLastSentSeconds.WithLabelValues("keepalive", m.node.ID.String()).Set(float64(time.Now().Unix()))
+			}
 			mapResponseSent.WithLabelValues("ok", "keepalive").Inc()
 		}
 	}
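The `m.resetKeepAlive()` call after every successful write implements the "keep alive resetter" mentioned in the PR description: the keep-alive timer is pushed back whenever real data has just gone out, so the client is not pinged immediately after an update. A minimal sketch of that pattern with `time.Ticker` (intervals and names are illustrative):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	keepAlive := 300 * time.Millisecond
	ticker := time.NewTicker(keepAlive)
	defer ticker.Stop()

	updates := make(chan string, 3)
	updates <- "peer-change"

	deadline := time.After(time.Second)
	for {
		select {
		case u := <-updates:
			fmt.Println("sent update:", u)
			// Data just went out, so delay the next keep-alive by a full interval.
			ticker.Reset(keepAlive)
		case <-ticker.C:
			fmt.Println("sent keep-alive")
		case <-deadline:
			return
		}
	}
}
```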
@@ -404,16 +431,6 @@ func (h *Headscale) updateNodeOnlineStatus(online bool, node *types.Node) {
 	}, node.ID)
 }
-func closeChanWithLog[C chan []byte | chan struct{} | chan types.StateUpdate](channel C, node, name string) {
-	log.Trace().
-		Str("handler", "PollNetMap").
-		Str("node", node).
-		Str("channel", "Done").
-		Msg(fmt.Sprintf("Closing %s channel", name))
-
-	close(channel)
-}
-
 func (m *mapSession) handleEndpointUpdate() {
 	m.tracef("received endpoint update")
@@ -425,6 +442,17 @@ func (m *mapSession) handleEndpointUpdate() {
 	m.node.ApplyPeerChange(&change)
 	sendUpdate, routesChanged := hostInfoChanged(m.node.Hostinfo, m.req.Hostinfo)
+	// The node might not set NetInfo if it has not changed and if
+	// the full HostInfo object is overrwritten, the information is lost.
+	// If there is no NetInfo, keep the previous one.
+	// From 1.66 the client only sends it if changed:
+	// https://github.com/tailscale/tailscale/commit/e1011f138737286ecf5123ff887a7a5800d129a2
+	// TODO(kradalby): evaulate if we need better comparing of hostinfo
+	// before we take the changes.
+	if m.req.Hostinfo.NetInfo == nil {
+		m.req.Hostinfo.NetInfo = m.node.Hostinfo.NetInfo
+	}
 	m.node.Hostinfo = m.req.Hostinfo
 	logTracePeerChange(m.node.Hostname, sendUpdate, &change)
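The Hostinfo handling above keeps the previously stored NetInfo when an incoming update omits it, because newer clients only send NetInfo when it changes and a blind overwrite would drop it. The same guard, illustrated with small stand-in structs rather than the real tailcfg types:

```go
package main

import "fmt"

// Stand-ins for the real Hostinfo/NetInfo types, just to show the guard.
type NetInfo struct{ PreferredDERP int }

type Hostinfo struct {
	Hostname string
	NetInfo  *NetInfo
}

// applyHostinfo overwrites the stored Hostinfo with the incoming one, but
// carries the old NetInfo forward if the update did not include one.
func applyHostinfo(stored *Hostinfo, incoming Hostinfo) *Hostinfo {
	if incoming.NetInfo == nil && stored != nil {
		incoming.NetInfo = stored.NetInfo
	}
	return &incoming
}

func main() {
	stored := &Hostinfo{Hostname: "node-1", NetInfo: &NetInfo{PreferredDERP: 999}}

	// Update without NetInfo: the previous DERP information survives.
	updated := applyHostinfo(stored, Hostinfo{Hostname: "node-1-renamed"})
	fmt.Println(updated.Hostname, updated.NetInfo.PreferredDERP)
}
```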

View File

@@ -171,6 +171,7 @@ type LogConfig struct {
 }
 type Tuning struct {
+	NotifierSendTimeout            time.Duration
 	BatchChangeDelay               time.Duration
 	NodeMapSessionBufferedChanSize int
 }
@@ -232,6 +233,7 @@ func LoadConfig(path string, isFile bool) error {
 	viper.SetDefault("ephemeral_node_inactivity_timeout", "120s")
+	viper.SetDefault("tuning.notifier_send_timeout", "800ms")
 	viper.SetDefault("tuning.batch_change_delay", "800ms")
 	viper.SetDefault("tuning.node_mapsession_buffered_chan_size", 30)
@@ -768,6 +770,7 @@ func GetHeadscaleConfig() (*Config, error) {
 		// TODO(kradalby): Document these settings when more stable
 		Tuning: Tuning{
+			NotifierSendTimeout:            viper.GetDuration("tuning.notifier_send_timeout"),
 			BatchChangeDelay:               viper.GetDuration("tuning.batch_change_delay"),
 			NodeMapSessionBufferedChanSize: viper.GetInt("tuning.node_mapsession_buffered_chan_size"),
 		},
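The new `tuning.notifier_send_timeout` key is the tuning parameter from the PR description: how long the notifier will try to deliver an update to a node before giving up. A hedged sketch of how such a value could be read with viper and used as a per-send deadline (only the key name and its 800ms default come from the diff; the rest is illustrative):

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/spf13/viper"
)

func main() {
	// Default mirrors the diff; an operator can override it in the config file.
	viper.SetDefault("tuning.notifier_send_timeout", "800ms")

	timeout := viper.GetDuration("tuning.notifier_send_timeout")

	// Illustrative use: bound how long a single notification send may take.
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	select {
	case <-time.After(50 * time.Millisecond): // pretend the send succeeded quickly
		fmt.Println("update delivered")
	case <-ctx.Done():
		fmt.Println("gave up sending update:", ctx.Err())
	}
}
```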

View File

@@ -43,6 +43,10 @@ func (id NodeID) Uint64() uint64 {
 	return uint64(id)
 }
+func (id NodeID) String() string {
+	return strconv.FormatUint(id.Uint64(), util.Base10)
+}
+
 // Node is a Headscale client.
 type Node struct {
 	ID NodeID `gorm:"primary_key"`
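`NodeID` gains a `String()` method so the numeric ID can be used wherever a string is needed, for example as the per-node label of the optional high-cardinality metrics. A tiny, self-contained equivalent using `strconv` directly (`util.Base10` in the diff is presumably just the base-10 constant):

```go
package main

import (
	"fmt"
	"strconv"
)

type NodeID uint64

func (id NodeID) Uint64() uint64 { return uint64(id) }

// String renders the ID in base 10, mirroring strconv.FormatUint(id.Uint64(), 10).
func (id NodeID) String() string {
	return strconv.FormatUint(id.Uint64(), 10)
}

func main() {
	id := NodeID(42)
	fmt.Println(id.String()) // "42", usable as a metric label value
}
```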

View File

@@ -1,6 +1,7 @@
 package integration
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"net/netip"
@@ -15,6 +16,7 @@ import (
 	"github.com/rs/zerolog/log"
 	"github.com/samber/lo"
 	"github.com/stretchr/testify/assert"
+	"golang.org/x/sync/errgroup"
 	"tailscale.com/client/tailscale/apitype"
 	"tailscale.com/types/key"
 )
@@ -335,14 +337,14 @@ func TestTaildrop(t *testing.T) {
 	IntegrationSkip(t)
 	t.Parallel()
-	retry := func(times int, sleepInverval time.Duration, doWork func() error) error {
+	retry := func(times int, sleepInterval time.Duration, doWork func() error) error {
 		var err error
 		for attempts := 0; attempts < times; attempts++ {
 			err = doWork()
 			if err == nil {
 				return nil
 			}
-			time.Sleep(sleepInverval)
+			time.Sleep(sleepInterval)
 		}
 		return err
@@ -793,7 +795,7 @@ func TestNodeOnlineStatus(t *testing.T) {
 				continue
 			}
-			// All peers of this nodess are reporting to be
+			// All peers of this nodes are reporting to be
 			// connected to the control server
 			assert.Truef(
 				t,
@@ -829,24 +831,10 @@ func TestPingAllByIPManyUpDown(t *testing.T) {
 		"user2": len(MustTestVersions),
 	}
-	headscaleConfig := map[string]string{
-		"HEADSCALE_DERP_URLS":                    "",
-		"HEADSCALE_DERP_SERVER_ENABLED":          "true",
-		"HEADSCALE_DERP_SERVER_REGION_ID":        "999",
-		"HEADSCALE_DERP_SERVER_REGION_CODE":      "headscale",
-		"HEADSCALE_DERP_SERVER_REGION_NAME":      "Headscale Embedded DERP",
-		"HEADSCALE_DERP_SERVER_STUN_LISTEN_ADDR": "0.0.0.0:3478",
-		"HEADSCALE_DERP_SERVER_PRIVATE_KEY_PATH": "/tmp/derp.key",
-
-		// Envknob for enabling DERP debug logs
-		"DERP_DEBUG_LOGS":        "true",
-		"DERP_PROBER_DEBUG_LOGS": "true",
-	}
-
 	err = scenario.CreateHeadscaleEnv(spec,
 		[]tsic.Option{},
-		hsic.WithTestName("pingallbyip"),
-		hsic.WithConfigEnv(headscaleConfig),
+		hsic.WithTestName("pingallbyipmany"),
+		hsic.WithEmbeddedDERPServerOnly(),
 		hsic.WithTLS(),
 		hsic.WithHostnameAsServerURL(),
 	)
@@ -870,19 +858,35 @@ func TestPingAllByIPManyUpDown(t *testing.T) {
 	success := pingAllHelper(t, allClients, allAddrs)
 	t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps))
+	wg, _ := errgroup.WithContext(context.Background())
+
 	for run := range 3 {
 		t.Logf("Starting DownUpPing run %d", run+1)
 		for _, client := range allClients {
-			t.Logf("taking down %q", client.Hostname())
-			client.Down()
+			c := client
+			wg.Go(func() error {
+				t.Logf("taking down %q", c.Hostname())
+				return c.Down()
+			})
+		}
+
+		if err := wg.Wait(); err != nil {
+			t.Fatalf("failed to take down all nodes: %s", err)
 		}
 		time.Sleep(5 * time.Second)
 		for _, client := range allClients {
-			t.Logf("bringing up %q", client.Hostname())
-			client.Up()
+			c := client
+			wg.Go(func() error {
+				t.Logf("bringing up %q", c.Hostname())
+				return c.Up()
+			})
+		}
+
+		if err := wg.Wait(); err != nil {
+			t.Fatalf("failed to take down all nodes: %s", err)
 		}
 		time.Sleep(5 * time.Second)
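The test now drives `Down()`/`Up()` through an errgroup, so all clients are taken down (or brought back) concurrently and the first error aborts the run. A compact, standalone sketch of that pattern (the worker body is a placeholder for the real client calls):

```go
package main

import (
	"fmt"

	"golang.org/x/sync/errgroup"
)

func main() {
	hosts := []string{"ts-1", "ts-2", "ts-3"}

	var wg errgroup.Group
	for _, h := range hosts {
		h := h // capture the loop variable for the closure (pre-Go 1.22 habit)
		wg.Go(func() error {
			// Placeholder for client.Down()/client.Up(); return an error to abort.
			fmt.Println("taking down", h)
			return nil
		})
	}

	// Wait returns the first non-nil error from any goroutine.
	if err := wg.Wait(); err != nil {
		fmt.Println("failed:", err)
	}
}
```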

View File

@@ -286,9 +286,13 @@ func New(
 	}
 	env := []string{
-		"HEADSCALE_PROFILING_ENABLED=1",
-		"HEADSCALE_PROFILING_PATH=/tmp/profile",
+		"HEADSCALE_DEBUG_PROFILING_ENABLED=1",
+		"HEADSCALE_DEBUG_PROFILING_PATH=/tmp/profile",
 		"HEADSCALE_DEBUG_DUMP_MAPRESPONSE_PATH=/tmp/mapresponses",
+		"HEADSCALE_DEBUG_DEADLOCK=1",
+		"HEADSCALE_DEBUG_DEADLOCK_TIMEOUT=5s",
+		"HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS=1",
+		"HEADSCALE_DEBUG_DUMP_CONFIG=1",
 	}
 	for key, value := range hsic.env {
 		env = append(env, fmt.Sprintf("%s=%s", key, value))
@@ -397,7 +401,7 @@ func (t *HeadscaleInContainer) Shutdown() error {
 		)
 	}
-	err = t.SaveMetrics("/tmp/control/metrics.txt")
+	err = t.SaveMetrics(fmt.Sprintf("/tmp/control/%s_metrics.txt", t.hostname))
 	if err != nil {
 		log.Printf(
 			"Failed to metrics from control: %s",
@@ -747,7 +751,7 @@ func createCertificate(hostname string) ([]byte, []byte, error) {
 			Locality: []string{"Leiden"},
 		},
 		NotBefore: time.Now(),
-		NotAfter:  time.Now().Add(60 * time.Minute),
+		NotAfter:  time.Now().Add(60 * time.Hour),
 		IsCA:      true,
 		ExtKeyUsage: []x509.ExtKeyUsage{
 			x509.ExtKeyUsageClientAuth,

View File

@@ -51,8 +51,11 @@ var (
 	tailscaleVersions2021 = map[string]bool{
 		"head":     true,
 		"unstable": true,
-		"1.60": true, // CapVer: 82
-		"1.58": true, // CapVer: 82
+		"1.66": true, // CapVer: not checked
+		"1.64": true, // CapVer: not checked
+		"1.62": true, // CapVer: not checked
+		"1.60": true, // CapVer: not checked
+		"1.58": true, // CapVer: not checked
 		"1.56": true, // CapVer: 82
 		"1.54": true, // CapVer: 79
 		"1.52": true, // CapVer: 79
@@ -423,11 +426,13 @@ func (s *Scenario) WaitForTailscaleSync() error {
 	if err != nil {
 		for _, user := range s.users {
 			for _, client := range user.Clients {
-				peers, _ := client.PrettyPeers()
+				peers, allOnline, _ := client.FailingPeersAsString()
+				if !allOnline {
 					log.Println(peers)
+				}
 			}
 		}
 	}
 	return err
 }
@@ -450,7 +455,7 @@ func (s *Scenario) WaitForTailscaleSyncWithPeerCount(peerCount int) error {
 	return nil
 }
-// CreateHeadscaleEnv is a conventient method returning a complete Headcale
+// CreateHeadscaleEnv is a convenient method returning a complete Headcale
 // test environment with nodes of all versions, joined to the server with X
 // users.
 func (s *Scenario) CreateHeadscaleEnv(

View File

@@ -36,5 +36,8 @@ type TailscaleClient interface {
 	Ping(hostnameOrIP string, opts ...tsic.PingOption) error
 	Curl(url string, opts ...tsic.CurlOption) (string, error)
 	ID() string
-	PrettyPeers() (string, error)
+
+	// FailingPeersAsString returns a formatted-ish multi-line-string of peers in the client
+	// and a bool indicating if the clients online count and peer count is equal.
+	FailingPeersAsString() (string, bool, error)
 }

View File

@@ -691,15 +691,18 @@ func (t *TailscaleInContainer) FQDN() (string, error) {
 	return status.Self.DNSName, nil
 }
-// PrettyPeers returns a formatted-ish table of peers in the client.
-func (t *TailscaleInContainer) PrettyPeers() (string, error) {
+// FailingPeersAsString returns a formatted-ish multi-line-string of peers in the client
+// and a bool indicating if the clients online count and peer count is equal.
+func (t *TailscaleInContainer) FailingPeersAsString() (string, bool, error) {
 	status, err := t.Status()
 	if err != nil {
-		return "", fmt.Errorf("failed to get FQDN: %w", err)
+		return "", false, fmt.Errorf("failed to get FQDN: %w", err)
 	}
-	str := fmt.Sprintf("Peers of %s\n", t.hostname)
-	str += "Hostname\tOnline\tLastSeen\n"
+	var b strings.Builder
+
+	fmt.Fprintf(&b, "Peers of %s\n", t.hostname)
+	fmt.Fprint(&b, "Hostname\tOnline\tLastSeen\n")
 	peerCount := len(status.Peers())
 	onlineCount := 0
@@ -711,12 +714,12 @@ func (t *TailscaleInContainer) PrettyPeers() (string, error) {
 			onlineCount++
 		}
-		str += fmt.Sprintf("%s\t%t\t%s\n", peer.HostName, peer.Online, peer.LastSeen)
+		fmt.Fprintf(&b, "%s\t%t\t%s\n", peer.HostName, peer.Online, peer.LastSeen)
 	}
-	str += fmt.Sprintf("Peer Count: %d, Online Count: %d\n\n", peerCount, onlineCount)
+	fmt.Fprintf(&b, "Peer Count: %d, Online Count: %d\n\n", peerCount, onlineCount)
-	return str, nil
+	return b.String(), peerCount == onlineCount, nil
 }
 // WaitForNeedsLogin blocks until the Tailscale (tailscaled) instance has

View File

@@ -331,7 +331,7 @@ func dockertestMaxWait() time.Duration {
 // 	return timeout
 // }
-// pingAllNegativeHelper is intended to have 1 or more nodes timeing out from the ping,
+// pingAllNegativeHelper is intended to have 1 or more nodes timing out from the ping,
 // it counts failures instead of successes.
 // func pingAllNegativeHelper(t *testing.T, clients []TailscaleClient, addrs []string) int {
 // 	t.Helper()