util/stringsx: add package for extra string functions, like CompareFold

Noted as useful during review of #14448.

Updates #14457

Change-Id: I0f16f08d5b05a8e9044b19ef6c02d3dab497f131
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2024-12-22 20:38:20 -08:00 committed by Brad Fitzpatrick
parent 4267d0fc5b
commit 9e2819b5d4
2 changed files with 130 additions and 0 deletions

52
util/stringsx/stringsx.go Normal file
View File

@ -0,0 +1,52 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
// Package stringsx provides additional string manipulation functions
// that aren't in the standard library's strings package or go4.org/mem.
package stringsx
import (
"unicode"
"unicode/utf8"
)
// CompareFold returns -1, 0, or 1 depending on whether a < b, a == b, or a > b,
// like cmp.Compare, but case insensitively.
//
// The strings are compared rune by rune after mapping each rune through
// unicode.ToLower. No multi-rune case folding is applied, so, for example,
// "ß" and "ss" do not compare as equal. If one string is a case-insensitive
// prefix of the other, the shorter string sorts first.
func CompareFold(a, b string) int {
	// Byte offsets of the next undecoded rune in a and b.
	ia, ib := 0, 0
	for ia < len(a) && ib < len(b) {
		ra, wa := nextRuneLower(a[ia:])
		rb, wb := nextRuneLower(b[ib:])
		if ra < rb {
			return -1
		}
		if ra > rb {
			return 1
		}
		// Both remaining slices are non-empty here, so each decoded width
		// is at least 1 and the loop always makes progress.
		ia += wa
		ib += wb
	}
	// At least one string is exhausted and everything compared so far was
	// equal, so the string with bytes left over (if any) is the greater one.
	switch {
	case ia == len(a) && ib == len(b):
		return 0
	case ia == len(a):
		return -1
	default:
		return 1
	}
}

// nextRuneLower returns the next rune in the string, lowercased, along with its
// original (consumed) width in bytes. If the string is empty, it returns
// (utf8.RuneError, 0). If the string starts with an invalid UTF-8 encoding, it
// returns (utf8.RuneError, 1).
func nextRuneLower(s string) (r rune, width int) {
	r, width = utf8.DecodeRuneInString(s)
	return unicode.ToLower(r), width
}

View File

@ -0,0 +1,78 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package stringsx
import (
"cmp"
"strings"
"testing"
)
// TestCompareFold checks CompareFold against the ordering of the two inputs
// after strings.ToLower, and then verifies that running CompareFold over the
// whole table performs no allocations.
func TestCompareFold(t *testing.T) {
	type pair struct {
		a, b string
	}
	pairs := []pair{
		// Basic ASCII cases
		{"", ""},
		{"a", "a"},
		{"a", "A"},
		{"A", "a"},
		{"a", "b"},
		{"b", "a"},
		{"abc", "ABC"},
		{"ABC", "abc"},
		{"abc", "abd"},
		{"abd", "abc"},
		// Length differences
		{"abc", "ab"},
		{"ab", "abc"},
		// Unicode cases
		{"世界", "世界"},
		{"Hello世界", "hello世界"},
		{"世界Hello", "世界hello"},
		{"世界", "世界x"},
		{"世界x", "世界"},
		// Special case folding examples
		{"ß", "ss"}, // German sharp s
		{"fi", "fi"}, // fi ligature
		{"Σ", "σ"}, // Greek sigma
		{"İ", "i\u0307"}, // Turkish dotted I
		// Mixed cases
		{"HelloWorld", "helloworld"},
		{"HELLOWORLD", "helloworld"},
		{"helloworld", "HELLOWORLD"},
		{"HelloWorld", "helloworld"},
		{"helloworld", "HelloWorld"},
		// Edge cases
		{" ", " "},
		{"1", "1"},
		{"123", "123"},
		{"!@#", "!@#"},
	}

	// Reference results, kept so the allocation check below does not need
	// to call strings.ToLower (which may allocate) inside the measured run.
	expected := []int{}
	for _, p := range pairs {
		got := CompareFold(p.a, p.b)
		want := cmp.Compare(strings.ToLower(p.a), strings.ToLower(p.b))
		if got != want {
			t.Errorf("CompareFold(%q, %q) = %v, want %v", p.a, p.b, got, want)
		}
		expected = append(expected, want)
	}

	// Re-run every case under the allocation counter.
	runAll := func() {
		for i, p := range pairs {
			if CompareFold(p.a, p.b) != expected[i] {
				panic("unexpected")
			}
		}
	}
	if n := testing.AllocsPerRun(1000, runAll); n > 0 {
		t.Errorf("allocs = %v; want 0", int(n))
	}
}