tailscale/util/deephash/deephash.go

// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package deephash hashes a Go value recursively, in a predictable order,
// without looping. The hash is only valid within the lifetime of a program.
// Users should not store the hash on disk or send it over the network.
// The hash is sufficiently strong and unique such that
// Hash(x) == Hash(y) is an appropriate replacement for x == y.
//
// This package, like most of the tailscale.com Go module, should be
// considered Tailscale-internal; we make no API promises.
package deephash

import (
	"bufio"
	"crypto/sha256"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"hash"
	"math"
	"reflect"
	"strconv"
	"sync"
	"time"
)

const scratchSize = 128

// hasher is reusable state for hashing a value.
// Get one via hasherPool.
type hasher struct {
	h       hash.Hash
	bw      *bufio.Writer
	scratch [scratchSize]byte
	visited map[uintptr]bool
}

// newHasher initializes a new hasher, for use by hasherPool.
func newHasher() *hasher {
	h := &hasher{
		h:       sha256.New(),
		visited: map[uintptr]bool{},
	}
	h.bw = bufio.NewWriterSize(h.h, h.h.BlockSize())
	return h
}

// setBufioWriter switches the bufio writer to w after flushing
// any output to the old one. It then also returns the old one, so
// the caller can switch back to it.
func (h *hasher) setBufioWriter(w *bufio.Writer) (old *bufio.Writer) {
	old = h.bw
	old.Flush()
	h.bw = w
	return old
}

// Sum is an opaque checksum type that is comparable.
type Sum struct {
	sum [sha256.Size]byte
}

func (s Sum) String() string {
	return hex.EncodeToString(s.sum[:])
}

var (
	once sync.Once
	seed uint64
)

// Hash returns the hash of v.
func (h *hasher) Hash(v interface{}) (hash Sum) {
	h.bw.Flush()
	h.h.Reset()
	once.Do(func() {
		seed = uint64(time.Now().UnixNano())
	})
	h.uint(seed)
	h.print(reflect.ValueOf(v))
	h.bw.Flush()
	// Sum into scratch & copy out, as hash.Hash is an interface
	// so the slice necessarily escapes, and there's no sha256
	// concrete type exported and we don't want the 'hash' result
	// parameter to escape to the heap:
	h.h.Sum(h.scratch[:0])
	copy(hash.sum[:], h.scratch[:])
	return
}

var hasherPool = &sync.Pool{
	New: func() interface{} { return newHasher() },
}

// Hash returns the hash of v.
func Hash(v interface{}) Sum {
	h := hasherPool.Get().(*hasher)
	defer hasherPool.Put(h)
	for k := range h.visited {
		delete(h.visited, k)
	}
	return h.Hash(v)
}

// Update sets last to the hash of v and reports whether its value changed.
func Update(last *Sum, v ...interface{}) (changed bool) {
	sum := Hash(v)
	if sum == *last {
		// unchanged.
		return false
	}
	*last = sum
	return true
}

var appenderToType = reflect.TypeOf((*appenderTo)(nil)).Elem()

type appenderTo interface {
	AppendTo([]byte) []byte
}

func (h *hasher) uint(i uint64) {
	binary.BigEndian.PutUint64(h.scratch[:8], i)
	h.bw.Write(h.scratch[:8])
}

func (h *hasher) int(i int) {
	binary.BigEndian.PutUint64(h.scratch[:8], uint64(i))
	h.bw.Write(h.scratch[:8])
}

var uint8Type = reflect.TypeOf(byte(0))

// print hashes v into w.
// It reports whether it was able to do so without hitting a cycle.
func (h *hasher) print(v reflect.Value) (acyclic bool) {
	if !v.IsValid() {
		return true
	}

	w := h.bw
	visited := h.visited

	if v.CanInterface() {
		// Use AppendTo methods, if available and cheap.
		if v.CanAddr() && v.Type().Implements(appenderToType) {
			a := v.Addr().Interface().(appenderTo)
			size := h.scratch[:8]
			record := a.AppendTo(size)
			binary.LittleEndian.PutUint64(record, uint64(len(record)-len(size)))
			w.Write(record)
			return true
		}
	}

	// Generic handling.
	switch v.Kind() {
	default:
		panic(fmt.Sprintf("unhandled kind %v for type %v", v.Kind(), v.Type()))
	case reflect.Ptr:
		ptr := v.Pointer()
		if visited[ptr] {
			return false
		}
		visited[ptr] = true
		return h.print(v.Elem())
	case reflect.Struct:
		acyclic = true
		w.WriteString("struct")
		h.int(v.NumField())
		for i, n := 0, v.NumField(); i < n; i++ {
			h.int(i)
			if !h.print(v.Field(i)) {
				acyclic = false
			}
		}
		return acyclic
	case reflect.Slice, reflect.Array:
		vLen := v.Len()
		if v.Kind() == reflect.Slice {
			h.int(vLen)
		}
		if v.Type().Elem() == uint8Type && v.CanInterface() {
			if vLen > 0 && vLen <= scratchSize {
				// If it fits in scratch, avoid the Interface allocation.
				// It seems tempting to do this for all sizes, doing
				// scratchSize bytes at a time, but reflect.Slice seems
				// to allocate, so it's not a win.
				n := reflect.Copy(reflect.ValueOf(&h.scratch).Elem(), v)
				w.Write(h.scratch[:n])
				return true
			}
			fmt.Fprintf(w, "%s", v.Interface())
			return true
		}
		acyclic = true
		for i := 0; i < vLen; i++ {
			h.int(i)
			if !h.print(v.Index(i)) {
				acyclic = false
			}
		}
		return acyclic
	case reflect.Interface:
		if v.IsNil() {
			w.WriteByte(0) // indicates nil
			return true
		}
		v = v.Elem()

		w.WriteByte(1) // indicates visiting interface value
		h.hashType(v.Type())
		return h.print(v)
	case reflect.Map:
		// TODO(bradfitz): ideally we'd avoid these map
		// operations to detect cycles if we knew from the map
		// element type that there no way to form a cycle,
		// which is the common case. Notably, we don't care
		// about hashing the same map+contents twice in
		// different parts of the tree. In fact, we should
		// ideally. (And this prevents it) We should only stop
		// hashing when there's a cycle.  What we should
		// probably do is make sure we enumerate the data
		// structure tree is a fixed order and then give each
		// pointer an increasing number, and when we hit a
		// dup, rather than emitting nothing, we should emit a
		// "value #12" reference. Which implies that all things
		// emit to the bufio.Writer should be type-tagged so
		// we can distinguish loop references without risk of
		// collisions.
		ptr := v.Pointer()
		if visited[ptr] {
			return false
		}
		visited[ptr] = true

		if h.hashMapAcyclic(v) {
			return true
		}
		return h.hashMapFallback(v)
	case reflect.String:
		h.int(v.Len())
		w.WriteString(v.String())
	case reflect.Bool:
		w.Write(strconv.AppendBool(h.scratch[:0], v.Bool()))
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		w.Write(strconv.AppendInt(h.scratch[:0], v.Int(), 10))
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
		h.uint(v.Uint())
	case reflect.Float32, reflect.Float64:
		w.Write(strconv.AppendUint(h.scratch[:0], math.Float64bits(v.Float()), 10))
	case reflect.Complex64, reflect.Complex128:
		fmt.Fprintf(w, "%v", v.Complex())
	}
	return true
}

type mapHasher struct {
	xbuf [sha256.Size]byte // XOR'ed accumulated buffer
	ebuf [sha256.Size]byte // scratch buffer
	s256 hash.Hash         // sha256 hash.Hash
	bw   *bufio.Writer     // to hasher into ebuf
	val  valueCache        // re-usable values for map iteration
	iter *reflect.MapIter  // re-usable map iterator
}

func (mh *mapHasher) Reset() {
	for i := range mh.xbuf {
		mh.xbuf[i] = 0
	}
}

func (mh *mapHasher) startEntry() {
	for i := range mh.ebuf {
		mh.ebuf[i] = 0
	}
	mh.bw.Flush()
	mh.s256.Reset()
}

func (mh *mapHasher) endEntry() {
	mh.bw.Flush()
	for i, b := range mh.s256.Sum(mh.ebuf[:0]) {
		mh.xbuf[i] ^= b
	}
}

var mapHasherPool = &sync.Pool{
	New: func() interface{} {
		mh := new(mapHasher)
		mh.s256 = sha256.New()
		mh.bw = bufio.NewWriter(mh.s256)
		mh.val = make(valueCache)
		mh.iter = new(reflect.MapIter)
		return mh
	},
}

type valueCache map[reflect.Type]reflect.Value

func (c valueCache) get(t reflect.Type) reflect.Value {
	v, ok := c[t]
	if !ok {
		v = reflect.New(t).Elem()
		c[t] = v
	}
	return v
}

// hashMapAcyclic is the faster sort-free version of map hashing. If
// it detects a cycle it returns false and guarantees that nothing was
// written to w.
func (h *hasher) hashMapAcyclic(v reflect.Value) (acyclic bool) {
	mh := mapHasherPool.Get().(*mapHasher)
	defer mapHasherPool.Put(mh)
	mh.Reset()
	iter := mapIter(mh.iter, v)
	defer mapIter(mh.iter, reflect.Value{}) // avoid pinning v from mh.iter when we return

	// Temporarily switch to the map hasher's bufio.Writer.
	oldw := h.setBufioWriter(mh.bw)
	defer h.setBufioWriter(oldw)

	k := mh.val.get(v.Type().Key())
	e := mh.val.get(v.Type().Elem())
	for iter.Next() {
		key := iterKey(iter, k)
		val := iterVal(iter, e)
		mh.startEntry()
		if !h.print(key) {
			return false
		}
		if !h.print(val) {
			return false
		}
		mh.endEntry()
	}
	oldw.Write(mh.xbuf[:])
	return true
}

func (h *hasher) hashMapFallback(v reflect.Value) (acyclic bool) {
	acyclic = true
	sm := newSortedMap(v)
	w := h.bw
	fmt.Fprintf(w, "map[%d]{\n", len(sm.Key))
	for i, k := range sm.Key {
		if !h.print(k) {
			acyclic = false
		}
		w.WriteString(": ")
		if !h.print(sm.Value[i]) {
			acyclic = false
		}
		w.WriteString("\n")
	}
	w.WriteString("}\n")
	return acyclic
}

// hashType hashes a reflect.Type.
// The hash is only consistent within the lifetime of a program.
func (h *hasher) hashType(t reflect.Type) {
	// This approach relies on reflect.Type always being backed by a unique
	// *reflect.rtype pointer. A safer approach is to use a global sync.Map
	// that maps reflect.Type to some arbitrary and unique index.
	// While safer, it requires global state with memory that can never be GC'd.
	rtypeAddr := reflect.ValueOf(t).Pointer() // address of *reflect.rtype
	h.uint(uint64(rtypeAddr))
}