util/deephash: require pointer in API (#5467)

The entry logic of Hash has extra complexity to make sure
we always have an addressable value on hand.
If not, we heap allocate the input.
For this reason we document that there are performance benefits
to always providing a pointer.
Rather than merely documenting this expectation, enforce it at compile time through generics.

Also, delete the unused HasherForType function.
It's an interesting use of generics, but not well tested.
We can resurrect it from code history if there's a need for it.

Signed-off-by: Joe Tsai <joetsai@digital-static.net>
This commit is contained in:
Joe Tsai 2022-08-27 16:08:31 -07:00 committed by GitHub
parent c5b1565337
commit ab7e6f3f11
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 97 additions and 122 deletions

View File

@ -6,7 +6,7 @@
// without looping. The hash is only valid within the lifetime of a program.
// Users should not store the hash on disk or send it over the network.
// The hash is sufficiently strong and unique such that
// Hash(x) == Hash(y) is an appropriate replacement for x == y.
// Hash(&x) == Hash(&y) is an appropriate replacement for x == y.
//
// The definition of equality is identical to reflect.DeepEqual except:
// - Floating-point values are compared based on the raw bits,
@ -65,6 +65,33 @@ type hasher struct {
visitStack visitStack
}
// hasherPool recycles hasher values across Hash calls so the SHA-256
// state and scratch buffers are not reallocated on every call.
var hasherPool = &sync.Pool{
	New: func() any { return new(hasher) },
}
// reset prepares h for reuse by a new Hash call,
// lazily allocating the underlying SHA-256 state on first use.
func (h *hasher) reset() {
	if h.Block512.Hash == nil {
		h.Block512.Hash = sha256.New()
	}
	h.Block512.Reset()
}
// hashType hashes a reflect.Type.
// The hash is only consistent within the lifetime of a program.
func (h *hasher) hashType(t reflect.Type) {
	// Rely on every reflect.Type being backed by a unique *reflect.rtype
	// pointer and hash that pointer's address. A safer alternative would
	// be a global sync.Map from reflect.Type to a unique index, but such
	// a map holds memory that can never be garbage collected.
	h.HashUint64(uint64(reflect.ValueOf(t).Pointer()))
}
// sum finalizes the hash state and returns it as a comparable Sum.
func (h *hasher) sum() (s Sum) {
	h.Sum(s.sum[:0])
	return s
}
// Sum is an opaque checksum type that is comparable.
type Sum struct {
sum [sha256.Size]byte
@ -89,97 +116,57 @@ func initSeed() {
seed = uint64(time.Now().UnixNano())
}
// Reset prepares h for reuse,
// lazily allocating the underlying SHA-256 state on first use.
func (h *hasher) Reset() {
	if h.Block512.Hash == nil {
		h.Block512.Hash = sha256.New()
	}
	h.Block512.Reset()
}
// sum finalizes the hash state and returns it as a comparable Sum.
func (h *hasher) sum() (s Sum) {
	h.Sum(s.sum[:0])
	return s
}
// hasherPool recycles hasher values across Hash calls so the SHA-256
// state and scratch buffers are not reallocated on every call.
var hasherPool = &sync.Pool{
	New: func() any { return new(hasher) },
}
// Hash returns the hash of v.
// For performance, this should be a non-nil pointer.
func Hash(v any) (s Sum) {
func Hash[T any](v *T) Sum {
h := hasherPool.Get().(*hasher)
defer hasherPool.Put(h)
h.Reset()
h.reset()
seedOnce.Do(initSeed)
h.HashUint64(seed)
rv := reflect.ValueOf(v)
if rv.IsValid() {
var t reflect.Type
var p pointer
if rv.Kind() == reflect.Pointer && !rv.IsNil() {
t = rv.Type().Elem()
p = pointerOf(rv)
} else {
t = rv.Type()
va := reflect.New(t).Elem()
va.Set(rv)
p = pointerOf(va.Addr())
}
// Always treat the Hash input as an interface (it is), including hashing
// its type, otherwise two Hash calls of different types could hash to the
// same bytes off the different types and get equivalent Sum values. This is
// the same thing that we do for reflect.Kind Interface in hashValue, but
// the initial reflect.ValueOf from an interface value effectively strips
// the interface box off so we have to do it at the top level by hand.
h.hashType(t)
ti := getTypeInfo(t)
ti.hasher()(h, p)
// Always treat the Hash input as if it were an interface by including
// a hash of the type. This ensures that hashing of two different types
// but with the same value structure produces different hashes.
t := reflect.TypeOf(v).Elem()
h.hashType(t)
if v == nil {
h.HashUint8(0) // indicates nil
} else {
h.HashUint8(1) // indicates visiting pointer element
p := pointerOf(reflect.ValueOf(v))
hash := getTypeInfo(t).hasher()
hash(h, p)
}
return h.sum()
}
// HasherForType is like Hash, but it returns a Hash func that's specialized for
// the provided reflect type, avoiding a map lookup per value.
func HasherForType[T any]() func(T) Sum {
var zeroT T
t := reflect.TypeOf(zeroT)
ti := getTypeInfo(t)
var tiElem *typeInfo
if t.Kind() == reflect.Pointer {
tiElem = getTypeInfo(t.Elem())
}
// HasherForType returns a hash that is specialized for the provided type.
func HasherForType[T any]() func(*T) Sum {
var v *T
seedOnce.Do(initSeed)
return func(v T) (s Sum) {
t := reflect.TypeOf(v).Elem()
hash := getTypeInfo(t).hasher()
return func(v *T) (s Sum) {
// This logic is identical to Hash, but pull out a few statements.
h := hasherPool.Get().(*hasher)
defer hasherPool.Put(h)
h.Reset()
h.reset()
h.HashUint64(seed)
rv := reflect.ValueOf(v)
if rv.IsValid() {
if rv.Kind() == reflect.Pointer && !rv.IsNil() {
p := pointerOf(rv)
h.hashType(t.Elem())
tiElem.hasher()(h, p)
} else {
va := reflect.New(t).Elem()
va.Set(rv)
p := pointerOf(va.Addr())
h.hashType(t)
ti.hasher()(h, p)
}
h.hashType(t)
if v == nil {
h.HashUint8(0) // indicates nil
} else {
h.HashUint8(1) // indicates visiting pointer element
p := pointerOf(reflect.ValueOf(v))
hash(h, p)
}
return h.sum()
}
}
// Update sets last to the hash of v and reports whether its value changed.
func Update(last *Sum, v any) (changed bool) {
func Update[T any](last *Sum, v *T) (changed bool) {
sum := Hash(v)
changed = sum != *last
if changed {
@ -233,9 +220,9 @@ func genTypeHasher(ti *typeInfo) typeHasherFunc {
// Types with specific hashing.
switch t {
case timeTimeType:
return (*hasher).hashTimev
return hashTime
case netipAddrType:
return (*hasher).hashAddrv
return hashAddr
}
// Types that can have their memory representation directly hashed.
@ -245,7 +232,7 @@ func genTypeHasher(ti *typeInfo) typeHasherFunc {
switch t.Kind() {
case reflect.String:
return (*hasher).hashString
return hashString
case reflect.Array:
return makeArrayHasher(t)
case reflect.Slice:
@ -263,14 +250,7 @@ func genTypeHasher(ti *typeInfo) typeHasherFunc {
}
}
// hashString hashes the string that p points to as a length prefix
// followed by the contents, so adjacent strings frame unambiguously.
func (h *hasher) hashString(p pointer) {
	s := *p.asString()
	h.HashUint64(uint64(len(s)))
	h.HashString(s)
}
// hashTimev hashes v, of kind time.Time.
func (h *hasher) hashTimev(p pointer) {
func hashTime(h *hasher, p pointer) {
// Include the zone offset (but not the name) to keep
// Hash(t1) == Hash(t2) being semantically equivalent to
// t1.Format(time.RFC3339Nano) == t2.Format(time.RFC3339Nano).
@ -281,8 +261,7 @@ func (h *hasher) hashTimev(p pointer) {
h.HashUint32(uint32(offset))
}
// hashAddrv hashes v, of type netip.Addr.
func (h *hasher) hashAddrv(p pointer) {
func hashAddr(h *hasher, p pointer) {
// The formatting of netip.Addr covers the
// IP version, the address, and the optional zone name (for v6).
// This is equivalent to a1.MarshalBinary() == a2.MarshalBinary().
@ -304,6 +283,12 @@ func (h *hasher) hashAddrv(p pointer) {
}
}
// hashString hashes the string that p points to as a length prefix
// followed by the contents, so adjacent strings frame unambiguously.
func hashString(h *hasher, p pointer) {
	s := *p.asString()
	h.HashUint64(uint64(len(s)))
	h.HashString(s)
}
func makeMemHasher(n uintptr) typeHasherFunc {
return func(h *hasher, p pointer) {
h.HashBytes(p.asMemory(n))
@ -448,7 +433,7 @@ func makeMapHasher(t reflect.Type) typeHasherFunc {
for iter := v.MapRange(); iter.Next(); {
k.SetIterKey(iter)
e.SetIterValue(iter)
mh.h.Reset()
mh.h.reset()
hashKey(&mh.h, pointerOf(k.Addr()))
hashValue(&mh.h, pointerOf(e.Addr()))
mh.sum.xor(mh.h.sum())
@ -567,14 +552,3 @@ func (c *valueCache) get(t reflect.Type) reflect.Value {
}
return v
}
// hashType hashes a reflect.Type.
// The hash is only consistent within the lifetime of a program.
func (h *hasher) hashType(t reflect.Type) {
	// This approach relies on reflect.Type always being backed by a unique
	// *reflect.rtype pointer. A safer approach is to use a global sync.Map
	// that maps reflect.Type to some arbitrary and unique index.
	// While safer, it requires global state with memory that can never be GC'd.
	rtypeAddr := reflect.ValueOf(t).Pointer() // address of *reflect.rtype
	h.HashUint64(uint64(rtypeAddr)) // pointer identity stands in for type identity
}

View File

@ -160,7 +160,7 @@ type scalars struct {
}
for _, tt := range tests {
gotEq := Hash(tt.in[0]) == Hash(tt.in[1])
gotEq := Hash(&tt.in[0]) == Hash(&tt.in[1])
if gotEq != tt.wantEq {
t.Errorf("(Hash(%T %v) == Hash(%T %v)) = %v, want %v", tt.in[0], tt.in[0], tt.in[1], tt.in[1], gotEq, tt.wantEq)
}
@ -171,11 +171,11 @@ func TestDeepHash(t *testing.T) {
// v contains the types of values we care about for our current callers.
// Mostly we're just testing that we don't panic on handled types.
v := getVal()
hash1 := Hash(v)
t.Logf("hash: %v", hash1)
for i := 0; i < 20; i++ {
hash2 := Hash(getVal())
v := getVal()
hash2 := Hash(v)
if hash1 != hash2 {
t.Error("second hash didn't match")
}
@ -186,7 +186,7 @@ func TestDeepHash(t *testing.T) {
func TestIssue4868(t *testing.T) {
m1 := map[int]string{1: "foo"}
m2 := map[int]string{1: "bar"}
if Hash(m1) == Hash(m2) {
if Hash(&m1) == Hash(&m2) {
t.Error("bogus")
}
}
@ -194,7 +194,7 @@ func TestIssue4868(t *testing.T) {
func TestIssue4871(t *testing.T) {
m1 := map[string]string{"": "", "x": "foo"}
m2 := map[string]string{}
if h1, h2 := Hash(m1), Hash(m2); h1 == h2 {
if h1, h2 := Hash(&m1), Hash(&m2); h1 == h2 {
t.Errorf("bogus: h1=%x, h2=%x", h1, h2)
}
}
@ -202,7 +202,7 @@ func TestIssue4871(t *testing.T) {
func TestNilVsEmptymap(t *testing.T) {
m1 := map[string]string(nil)
m2 := map[string]string{}
if h1, h2 := Hash(m1), Hash(m2); h1 == h2 {
if h1, h2 := Hash(&m1), Hash(&m2); h1 == h2 {
t.Errorf("bogus: h1=%x, h2=%x", h1, h2)
}
}
@ -210,7 +210,7 @@ func TestNilVsEmptymap(t *testing.T) {
func TestMapFraming(t *testing.T) {
m1 := map[string]string{"foo": "", "fo": "o"}
m2 := map[string]string{}
if h1, h2 := Hash(m1), Hash(m2); h1 == h2 {
if h1, h2 := Hash(&m1), Hash(&m2); h1 == h2 {
t.Errorf("bogus: h1=%x, h2=%x", h1, h2)
}
}
@ -218,23 +218,25 @@ func TestMapFraming(t *testing.T) {
func TestQuick(t *testing.T) {
initSeed()
err := quick.Check(func(v, w map[string]string) bool {
return (Hash(v) == Hash(w)) == reflect.DeepEqual(v, w)
return (Hash(&v) == Hash(&w)) == reflect.DeepEqual(v, w)
}, &quick.Config{MaxCount: 1000, Rand: rand.New(rand.NewSource(int64(seed)))})
if err != nil {
t.Fatalf("seed=%v, err=%v", seed, err)
}
}
func getVal() any {
return &struct {
WGConfig *wgcfg.Config
RouterConfig *router.Config
MapFQDNAddrs map[dnsname.FQDN][]netip.Addr
MapFQDNAddrPorts map[dnsname.FQDN][]netip.AddrPort
MapDiscoPublics map[key.DiscoPublic]bool
MapResponse *tailcfg.MapResponse
FilterMatch filter.Match
}{
// tailscaleTypes aggregates the Tailscale-specific types that these
// tests care about hashing; getVal returns a populated instance.
// Mostly this exists to check that deephash does not panic on the
// kinds of values real callers pass in.
type tailscaleTypes struct {
	WGConfig         *wgcfg.Config
	RouterConfig     *router.Config
	MapFQDNAddrs     map[dnsname.FQDN][]netip.Addr
	MapFQDNAddrPorts map[dnsname.FQDN][]netip.AddrPort
	MapDiscoPublics  map[key.DiscoPublic]bool
	MapResponse      *tailcfg.MapResponse
	FilterMatch      filter.Match
}
func getVal() *tailscaleTypes {
return &tailscaleTypes{
&wgcfg.Config{
Name: "foo",
Addresses: []netip.Prefix{netip.PrefixFrom(netip.AddrFrom16([16]byte{3: 3}).Unmap(), 5)},
@ -600,23 +602,23 @@ func TestMapCycle(t *testing.T) {
a["self"] = a
b := make(M) // cylic graph of 1 node
b["self"] = b
ha := Hash(a)
hb := Hash(b)
ha := Hash(&a)
hb := Hash(&b)
c.Assert(ha, qt.Equals, hb)
c1 := make(M) // cyclic graph of 2 nodes
c2 := make(M) // cyclic graph of 2 nodes
c1["peer"] = c2
c2["peer"] = c1
hc1 := Hash(c1)
hc2 := Hash(c2)
hc1 := Hash(&c1)
hc2 := Hash(&c2)
c.Assert(hc1, qt.Equals, hc2)
c.Assert(ha, qt.Not(qt.Equals), hc1)
c.Assert(hb, qt.Not(qt.Equals), hc2)
c3 := make(M) // graph of 1 node pointing to cyclic graph of 2 nodes
c3["child"] = c1
hc3 := Hash(c3)
hc3 := Hash(&c3)
c.Assert(hc1, qt.Not(qt.Equals), hc3)
}
@ -732,9 +734,8 @@ func ptrTo[T any](v T) *T { return &v }
func BenchmarkHashPacketFilter(b *testing.B) {
b.ReportAllocs()
hash := HasherForType[*[]tailcfg.FilterRule]()
for i := 0; i < b.N; i++ {
sink = hash(&filterRules)
sink = Hash(&filterRules)
}
}
@ -815,7 +816,7 @@ func BenchmarkTailcfgNode(b *testing.B) {
func TestExhaustive(t *testing.T) {
seen := make(map[Sum]bool)
for i := 0; i < 100000; i++ {
s := Hash(i)
s := Hash(&i)
if seen[s] {
t.Fatalf("hash collision %v", i)
}