mirror of
https://github.com/tailscale/tailscale.git
synced 2024-12-12 11:14:40 +00:00
d4bfe34ba7
The Go zstd package is not friendly for stateless zstd compression. Passing around multiple zstd.Encoder just for stateless compression is a waste of memory since the memory is never freed and seldom used if no compression operations are happening.

For performance, we pool the relevant Encoder/Decoder with the specific options set.

Functionally, this package is a wrapper over the Go zstd package with a more ergonomic API for stateless operations.

This package can be used to clean up various pre-existing zstd.Encoder pools or one-off handlers spread throughout our codebases.

Performance:

    BenchmarkEncode/Best 1690 610926 ns/op 25.78 MB/s 1 B/op 0 allocs/op
        zstd_test.go:137: memory: 50.336 MiB
        zstd_test.go:138: ratio: 3.269x
    BenchmarkEncode/Better 10000 100939 ns/op 156.04 MB/s 0 B/op 0 allocs/op
        zstd_test.go:137: memory: 20.399 MiB
        zstd_test.go:138: ratio: 3.131x
    BenchmarkEncode/Default 15775 74976 ns/op 210.08 MB/s 105 B/op 0 allocs/op
        zstd_test.go:137: memory: 1.586 MiB
        zstd_test.go:138: ratio: 3.064x
    BenchmarkEncode/Fastest 23222 53977 ns/op 291.81 MB/s 26 B/op 0 allocs/op
        zstd_test.go:137: memory: 599.458 KiB
        zstd_test.go:138: ratio: 2.898x
    BenchmarkEncode/FastestLowMemory 23361 50789 ns/op 310.13 MB/s 15 B/op 0 allocs/op
        zstd_test.go:137: memory: 334.458 KiB
        zstd_test.go:138: ratio: 2.898x
    BenchmarkEncode/FastestNoChecksum 23086 50253 ns/op 313.44 MB/s 26 B/op 0 allocs/op
        zstd_test.go:137: memory: 599.458 KiB
        zstd_test.go:138: ratio: 2.900x
    BenchmarkDecode/Checksum 70794 17082 ns/op 300.96 MB/s 4 B/op 0 allocs/op
        zstd_test.go:163: memory: 316.438 KiB
    BenchmarkDecode/NoChecksum 74935 15990 ns/op 321.51 MB/s 4 B/op 0 allocs/op
        zstd_test.go:163: memory: 316.438 KiB
    BenchmarkDecode/LowMemory 71043 16739 ns/op 307.13 MB/s 0 B/op 0 allocs/op
        zstd_test.go:163: memory: 79.347 KiB

We can see that the options are taking effect where compression ratio improves with higher levels and compression speed diminishes. We can also see that LowMemory takes effect where the pooled coder object references less memory than other cases. We can see that the pooling is taking effect as there are 0 amortized allocations.

Additional performance:

    BenchmarkEncodeParallel/zstd-24 1857 619264 ns/op 1796 B/op 49 allocs/op
    BenchmarkEncodeParallel/zstdframe-24 1954 532023 ns/op 4293 B/op 49 allocs/op
    BenchmarkDecodeParallel/zstd-24 5288 197281 ns/op 2516 B/op 49 allocs/op
    BenchmarkDecodeParallel/zstdframe-24 6441 196254 ns/op 2513 B/op 49 allocs/op

In concurrent usage, handling the pooling in this package has a marginal benefit over the zstd package, which relies on a Go channel as the pooling mechanism. In particular, coders can be freed by the GC when not in use. Coders can be shared throughout the program if they use this package instead of multiple independent pools doing the same thing. The allocations are unrelated to pooling as they're caused by the spawning of goroutines.

Updates #cleanup
Updates tailscale/corp#18514
Updates tailscale/corp#17653
Updates tailscale/corp#18005

Signed-off-by: Joe Tsai <joetsai@digital-static.net>
210 lines
5.6 KiB
Go
210 lines
5.6 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
package zstdframe
|
|
|
|
import (
|
|
"math/bits"
|
|
"math/rand/v2"
|
|
"os"
|
|
"runtime"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
|
|
"github.com/klauspost/compress/zstd"
|
|
"tailscale.com/util/must"
|
|
)
|
|
|
|
// Use the concatenation of all Go source files in zstdframe as testdata.
|
|
var src = func() (out []byte) {
|
|
for _, de := range must.Get(os.ReadDir(".")) {
|
|
if strings.HasSuffix(de.Name(), ".go") {
|
|
out = append(out, must.Get(os.ReadFile(de.Name()))...)
|
|
}
|
|
}
|
|
return out
|
|
}()
|
|
// Package-level output buffers reused by the benchmarks in this file.
var (
	// dst receives the output of the single-goroutine benchmarks.
	dst []byte
	// dsts holds one output buffer per goroutine in the parallel benchmarks.
	dsts [][]byte
)
|
|
|
|
// zstdEnc is identical to getEncoder without options,
|
|
// except it relies on concurrency managed by the zstd package itself.
|
|
var zstdEnc = must.Get(zstd.NewWriter(nil,
|
|
zstd.WithEncoderConcurrency(runtime.NumCPU()),
|
|
zstd.WithSingleSegment(true),
|
|
zstd.WithZeroFrames(true),
|
|
zstd.WithEncoderLevel(zstd.SpeedDefault),
|
|
zstd.WithEncoderCRC(true),
|
|
zstd.WithLowerEncoderMem(false)))
|
|
|
|
// zstdDec is identical to getDecoder without options,
|
|
// except it relies on concurrency managed by the zstd package itself.
|
|
var zstdDec = must.Get(zstd.NewReader(nil,
|
|
zstd.WithDecoderConcurrency(runtime.NumCPU()),
|
|
zstd.WithDecoderMaxMemory(1<<63),
|
|
zstd.IgnoreChecksum(false),
|
|
zstd.WithDecoderLowmem(false)))
|
|
|
|
var coders = []struct {
|
|
name string
|
|
appendEncode func([]byte, []byte) []byte
|
|
appendDecode func([]byte, []byte) ([]byte, error)
|
|
}{{
|
|
name: "zstd",
|
|
appendEncode: func(dst, src []byte) []byte { return zstdEnc.EncodeAll(src, dst) },
|
|
appendDecode: func(dst, src []byte) ([]byte, error) { return zstdDec.DecodeAll(src, dst) },
|
|
}, {
|
|
name: "zstdframe",
|
|
appendEncode: func(dst, src []byte) []byte { return AppendEncode(dst, src) },
|
|
appendDecode: func(dst, src []byte) ([]byte, error) { return AppendDecode(dst, src) },
|
|
}}
|
|
|
|
func TestDecodeMaxSize(t *testing.T) {
|
|
var enc, dec []byte
|
|
zeros := make([]byte, 1<<16, 2<<16)
|
|
check := func(encSize, maxDecSize int) {
|
|
var gotErr, wantErr error
|
|
enc = AppendEncode(enc[:0], zeros[:encSize])
|
|
|
|
// Directly calling zstd.Decoder.DecodeAll may not trigger size check
|
|
// since it only operates on closest power-of-two.
|
|
dec, gotErr = func() ([]byte, error) {
|
|
d := getDecoder(MaxDecodedSize(uint64(maxDecSize)))
|
|
defer putDecoder(d)
|
|
return d.Decoder.DecodeAll(enc, dec[:0]) // directly call zstd.Decoder.DecodeAll
|
|
}()
|
|
if encSize > 1<<(64-bits.LeadingZeros64(uint64(maxDecSize)-1)) {
|
|
wantErr = zstd.ErrDecoderSizeExceeded
|
|
}
|
|
if gotErr != wantErr {
|
|
t.Errorf("DecodeAll(AppendEncode(%d), %d) error = %v, want %v", encSize, maxDecSize, gotErr, wantErr)
|
|
}
|
|
|
|
// Calling AppendDecode should perform the exact size check.
|
|
dec, gotErr = AppendDecode(dec[:0], enc, MaxDecodedSize(uint64(maxDecSize)))
|
|
if encSize > maxDecSize {
|
|
wantErr = zstd.ErrDecoderSizeExceeded
|
|
}
|
|
if gotErr != wantErr {
|
|
t.Errorf("AppendDecode(AppendEncode(%d), %d) error = %v, want %v", encSize, maxDecSize, gotErr, wantErr)
|
|
}
|
|
}
|
|
|
|
rn := rand.New(rand.NewPCG(0, 0))
|
|
for n := 1 << 10; n <= len(zeros); n <<= 1 {
|
|
nl := rn.IntN(n + 1)
|
|
check(nl, nl)
|
|
check(nl, nl-1)
|
|
check(nl, (n+nl)/2)
|
|
check(nl, n)
|
|
check((n+nl)/2, n)
|
|
check(n-1, n-1)
|
|
check(n-1, n)
|
|
check(n-1, n+1)
|
|
check(n, n-1)
|
|
check(n, n)
|
|
check(n, n+1)
|
|
check(n+1, n-1)
|
|
check(n+1, n)
|
|
check(n+1, n+1)
|
|
}
|
|
}
|
|
|
|
func BenchmarkEncode(b *testing.B) {
|
|
options := []struct {
|
|
name string
|
|
opts []Option
|
|
}{
|
|
{name: "Best", opts: []Option{BestCompression}},
|
|
{name: "Better", opts: []Option{BetterCompression}},
|
|
{name: "Default", opts: []Option{DefaultCompression}},
|
|
{name: "Fastest", opts: []Option{FastestCompression}},
|
|
{name: "FastestLowMemory", opts: []Option{FastestCompression, LowMemory(true)}},
|
|
{name: "FastestNoChecksum", opts: []Option{FastestCompression, WithChecksum(false)}},
|
|
}
|
|
for _, bb := range options {
|
|
b.Run(bb.name, func(b *testing.B) {
|
|
b.ReportAllocs()
|
|
b.SetBytes(int64(len(src)))
|
|
for i := 0; i < b.N; i++ {
|
|
dst = AppendEncode(dst[:0], src, bb.opts...)
|
|
}
|
|
})
|
|
if testing.Verbose() {
|
|
ratio := float64(len(src)) / float64(len(dst))
|
|
b.Logf("ratio: %0.3fx", ratio)
|
|
}
|
|
}
|
|
}
|
|
|
|
func BenchmarkDecode(b *testing.B) {
|
|
options := []struct {
|
|
name string
|
|
opts []Option
|
|
}{
|
|
{name: "Checksum", opts: []Option{WithChecksum(true)}},
|
|
{name: "NoChecksum", opts: []Option{WithChecksum(false)}},
|
|
{name: "LowMemory", opts: []Option{LowMemory(true)}},
|
|
}
|
|
src := AppendEncode(nil, src)
|
|
for _, bb := range options {
|
|
b.Run(bb.name, func(b *testing.B) {
|
|
b.ReportAllocs()
|
|
b.SetBytes(int64(len(src)))
|
|
for i := 0; i < b.N; i++ {
|
|
dst = must.Get(AppendDecode(dst[:0], src, bb.opts...))
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func BenchmarkEncodeParallel(b *testing.B) {
|
|
numCPU := runtime.NumCPU()
|
|
for _, coder := range coders {
|
|
dsts = dsts[:0]
|
|
for i := 0; i < numCPU; i++ {
|
|
dsts = append(dsts, coder.appendEncode(nil, src))
|
|
}
|
|
b.Run(coder.name, func(b *testing.B) {
|
|
b.ReportAllocs()
|
|
for i := 0; i < b.N; i++ {
|
|
var group sync.WaitGroup
|
|
for j := 0; j < numCPU; j++ {
|
|
group.Add(1)
|
|
go func(j int) {
|
|
defer group.Done()
|
|
dsts[j] = coder.appendEncode(dsts[j][:0], src)
|
|
}(j)
|
|
}
|
|
group.Wait()
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func BenchmarkDecodeParallel(b *testing.B) {
|
|
numCPU := runtime.NumCPU()
|
|
for _, coder := range coders {
|
|
dsts = dsts[:0]
|
|
src := AppendEncode(nil, src)
|
|
for i := 0; i < numCPU; i++ {
|
|
dsts = append(dsts, must.Get(coder.appendDecode(nil, src)))
|
|
}
|
|
b.Run(coder.name, func(b *testing.B) {
|
|
b.ReportAllocs()
|
|
for i := 0; i < b.N; i++ {
|
|
var group sync.WaitGroup
|
|
for j := 0; j < numCPU; j++ {
|
|
group.Add(1)
|
|
go func(j int) {
|
|
defer group.Done()
|
|
dsts[j] = must.Get(coder.appendDecode(dsts[j][:0], src))
|
|
}(j)
|
|
}
|
|
group.Wait()
|
|
}
|
|
})
|
|
}
|
|
}
|