taildrop: switch hashing to be streaming based (#9861)

While the previous logic was correct, it did not perform well.
Resuming is a dance between the client and server, where
1. the client requests hashes for a partial file,
2. the server then computes those hashes,
3. the client computes hashes locally and compares them,
4. goto 1 while the partial file still has data.

While step 2 is running, the client is sitting idle.
While step 3 is running, the server is sitting idle.
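
For illustration only, the pre-streaming handshake looks roughly
like the loop below. This is a sketch, not the actual taildrop
code; fetchChecksums and hashLocal are hypothetical helpers
standing in for the peerapi hash request and the local hashing:

package resume

import "bytes"

// Checksum stands in for one block hash; the real types live in the
// taildrop package.
type Checksum []byte

// findResumeOffset mirrors steps 1-4 above. fetchChecksums and hashLocal
// are hypothetical: the first asks the remote peer for its hashes of a
// byte range, the second hashes the same range of the local file.
func findResumeOffset(
	fileSize, blockSize int64,
	fetchChecksums func(offset, length int64) ([]Checksum, error),
	hashLocal func(offset, length int64) ([]Checksum, error),
) (offset int64, err error) {
	batch := 16 * blockSize
	for offset < fileSize {
		// Steps 1+2: the client asks, then sits idle until the server
		// has hashed the entire requested range.
		remote, err := fetchChecksums(offset, batch)
		if err != nil || len(remote) == 0 {
			return offset, err
		}
		// Step 3: now the server sits idle while the client hashes
		// the same range and compares.
		local, err := hashLocal(offset, batch)
		if err != nil {
			return offset, err
		}
		for i, cs := range remote {
			if i >= len(local) || !bytes.Equal(cs, local[i]) {
				return offset + int64(i)*blockSize, nil // first mismatch
			}
		}
		offset += int64(len(remote)) * blockSize // step 4: goto 1
	}
	return offset, nil
}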

By streaming each block hash to the client immediately after the
server computes it, the client can start checking that hash while
the server works on the next one (in a pipelined manner).
This performs dramatically better and also uses less memory,
since we no longer need to hold a list of hashes, but only
need to handle one hash at a time.
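
As a sketch of what the server half of that pipeline can look like
(not the real taildrop/peerapi handler; the blockChecksum type, the
SHA-256 hash, and the 64 KiB block size are assumptions made for
the example):

package streamhash

import (
	"crypto/sha256"
	"encoding/json"
	"io"
	"net/http"
)

const blockSize = 64 << 10 // assumed block size for this sketch

// blockChecksum is a stand-in for taildrop.BlockChecksum; the real type
// and its JSON encoding may differ.
type blockChecksum struct {
	Hash []byte `json:"hash"`
}

// serveChecksums hashes the partial file block by block and writes each
// checksum to the response as soon as it is computed, so the client can
// start comparing while the server hashes the next block. No slice of
// hashes is ever accumulated in memory.
func serveChecksums(w http.ResponseWriter, partial io.Reader) error {
	enc := json.NewEncoder(w)
	flusher, _ := w.(http.Flusher)
	buf := make([]byte, blockSize)
	for {
		n, err := io.ReadFull(partial, buf)
		if n > 0 {
			sum := sha256.Sum256(buf[:n])
			if err := enc.Encode(blockChecksum{Hash: sum[:]}); err != nil {
				return err
			}
			if flusher != nil {
				flusher.Flush() // push this hash to the client right away
			}
		}
		switch err {
		case nil:
			// full block read; keep going
		case io.EOF, io.ErrUnexpectedEOF:
			return nil // reached the end of the partial file
		default:
			return err
		}
	}
}

The diff below shows the client half: each decoded
taildrop.BlockChecksum is handed to taildrop.ResumeReader as soon
as it arrives.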

There are two detriments to this approach:
* The HTTP API relies on a JSON stream,
  which is not a standard REST-like pattern
  (see the sketch after this list).
  However, since we implement both client and server,
  this is fine.
* While the stream is ongoing, the server holds an open file handle
  for the file being hashed.
  On really slow streams, this could hold a file open forever.
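
To make the first point concrete: the response body is a sequence
of JSON objects, one per block (newline-separated when produced by
json.Encoder), rather than a single JSON document, and a
json.Decoder consumes such a stream one value at a time. A minimal
client-side sketch, reusing the hypothetical blockChecksum type and
imports from the sketch above:

// readChecksums decodes one checksum at a time and hands it to the caller,
// mirroring how the client code in the diff below feeds each decoded
// taildrop.BlockChecksum into taildrop.ResumeReader.
func readChecksums(body io.Reader, handle func(blockChecksum) error) error {
	dec := json.NewDecoder(body)
	for {
		var cs blockChecksum
		switch err := dec.Decode(&cs); err {
		case nil:
			if err := handle(cs); err != nil {
				return err
			}
		case io.EOF:
			return nil // server finished hashing the partial file
		default:
			return err
		}
	}
}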

Updates tailscale/corp#14772

Signed-off-by: Joe Tsai <joetsai@digital-static.net>
Co-authored-by: Rhea Ghosh <rhea@tailscale.com>

Author: Joe Tsai
Date: 2023-10-17 17:53:40 -07:00
Committed by: GitHub
Parent: 7971333603
Commit: 3f27087e9d
4 changed files with 121 additions and 144 deletions

@@ -1320,38 +1320,36 @@ func (h *Handler) serveFilePut(w http.ResponseWriter, r *http.Request) {
 	// Before we PUT a file we check to see if there are any existing partial file and if so,
 	// we resume the upload from where we left off by sending the remaining file instead of
 	// the full file.
-	offset, remainingBody, err := taildrop.ResumeReader(r.Body, func(offset, length int64) (taildrop.FileChecksums, error) {
-		client := &http.Client{
-			Transport: h.b.Dialer().PeerAPITransport(),
-			Timeout:   10 * time.Second,
-		}
-		req, err := http.NewRequestWithContext(r.Context(), "GET", dstURL.String()+"/v0/put/"+filenameEscaped, nil)
-		if err != nil {
-			return taildrop.FileChecksums{}, err
-		}
-		rangeHdr, ok := httphdr.FormatRange([]httphdr.Range{{Start: offset, Length: length}})
-		if !ok {
-			return taildrop.FileChecksums{}, fmt.Errorf("invalid offset and length")
-		}
-		req.Header.Set("Range", rangeHdr)
-		switch resp, err := client.Do(req); {
-		case err != nil:
-			return taildrop.FileChecksums{}, err
-		case resp.StatusCode == http.StatusMethodNotAllowed || resp.StatusCode == http.StatusNotFound:
-			return taildrop.FileChecksums{}, nil // implies remote peer on older version
-		case resp.StatusCode != http.StatusOK:
-			return taildrop.FileChecksums{}, fmt.Errorf("unexpected status code %d", resp.StatusCode)
-		default:
-			var checksums taildrop.FileChecksums
-			err = json.NewDecoder(resp.Body).Decode(&checksums)
-			return checksums, err
-		}
-	})
+	var offset int64
+	var resumeDuration time.Duration
+	remainingBody := io.Reader(r.Body)
+	client := &http.Client{
+		Transport: h.b.Dialer().PeerAPITransport(),
+		Timeout:   10 * time.Second,
+	}
+	req, err := http.NewRequestWithContext(r.Context(), "GET", dstURL.String()+"/v0/put/"+filenameEscaped, nil)
 	if err != nil {
-		// ResumeReader ensures that the returned offset and reader are consistent
-		// even if an error is encountered. Thus, we can still proceed.
-		h.logf("reader could not be fully resumed: %v", err)
+		http.Error(w, "bogus peer URL", http.StatusInternalServerError)
+		return
 	}
+	switch resp, err := client.Do(req); {
+	case err != nil:
+		h.logf("could not fetch remote hashes: %v", err)
+	case resp.StatusCode == http.StatusMethodNotAllowed || resp.StatusCode == http.StatusNotFound:
+		// noop; implies older peerapi without resume support
+	case resp.StatusCode != http.StatusOK:
+		h.logf("fetch remote hashes status code: %d", resp.StatusCode)
+	default:
+		resumeStart := time.Now()
+		dec := json.NewDecoder(resp.Body)
+		offset, remainingBody, err = taildrop.ResumeReader(r.Body, func() (out taildrop.BlockChecksum, err error) {
+			err = dec.Decode(&out)
+			return out, err
+		})
+		if err != nil {
+			h.logf("reader could not be fully resumed: %v", err)
+		}
+		resumeDuration = time.Since(resumeStart).Round(time.Millisecond)
+	}
 	outReq, err := http.NewRequestWithContext(r.Context(), "PUT", "http://peer/v0/put/"+filenameEscaped, remainingBody)
@@ -1361,6 +1359,7 @@ func (h *Handler) serveFilePut(w http.ResponseWriter, r *http.Request) {
 	}
 	outReq.ContentLength = r.ContentLength
 	if offset > 0 {
+		h.logf("resuming put at offset %d after %v", offset, resumeDuration)
 		rangeHdr, _ := httphdr.FormatRange([]httphdr.Range{{offset, 0}})
 		outReq.Header.Set("Range", rangeHdr)
 		if outReq.ContentLength >= 0 {