From 38093219fd10b0cfc17a129088b69e97cb904317 Mon Sep 17 00:00:00 2001
From: Arceliar
Date: Sun, 2 Dec 2018 14:46:58 -0600
Subject: [PATCH 1/2] dimensionless way to track how often nodes are faster
 than the current parent

---
 src/yggdrasil/switch.go | 87 +++++++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 33 deletions(-)

diff --git a/src/yggdrasil/switch.go b/src/yggdrasil/switch.go
index 28f83122..60ba02a8 100644
--- a/src/yggdrasil/switch.go
+++ b/src/yggdrasil/switch.go
@@ -21,7 +21,7 @@ import (
 const switch_timeout = time.Minute
 const switch_updateInterval = switch_timeout / 2
 const switch_throttle = switch_updateInterval / 2
-const switch_max_time = time.Hour
+const switch_faster_threshold = 2880 // 1 update per 30 seconds for 24 hours
 
 // The switch locator represents the topology and network state dependent info about a node, minus the signatures that go with it.
 // Nodes will pick the best root they see, provided that the root continues to push out updates with new timestamps.
@@ -119,13 +119,13 @@ func (x *switchLocator) isAncestorOf(y *switchLocator) bool {
 
 // Information about a peer, used by the switch to build the tree and eventually make routing decisions.
 type peerInfo struct {
-	key       sigPubKey     // ID of this peer
-	locator   switchLocator // Should be able to respond with signatures upon request
-	degree    uint64        // Self-reported degree
-	time      time.Time     // Time this node was last seen
-	firstSeen time.Time
-	port      switchPort    // Interface number of this peer
-	msg       switchMsg     // The wire switchMsg used
+	key     sigPubKey     // ID of this peer
+	locator switchLocator // Should be able to respond with signatures upon request
+	degree  uint64        // Self-reported degree
+	time    time.Time     // Time this node was last seen
+	faster  uint16        // Counter of how often a node is faster than the current parent, penalized extra if slower
+	port    switchPort    // Interface number of this peer
+	msg     switchMsg     // The wire switchMsg used
 }
 
 // This is just a uint64 with a named type for clarity reasons.
@@ -252,7 +252,7 @@ func (t *switchTable) forgetPeer(port switchPort) {
 		return
 	}
 	for _, info := range t.data.peers {
-		t.unlockedHandleMsg(&info.msg, info.port)
+		t.unlockedHandleMsg(&info.msg, info.port, true)
 	}
 }
 
@@ -326,7 +326,7 @@ func (t *switchTable) checkRoot(msg *switchMsg) bool {
 func (t *switchTable) handleMsg(msg *switchMsg, fromPort switchPort) {
 	t.mutex.Lock()
 	defer t.mutex.Unlock()
-	t.unlockedHandleMsg(msg, fromPort)
+	t.unlockedHandleMsg(msg, fromPort, false)
 }
 
 // This updates the switch with information about a peer.
@@ -334,7 +334,8 @@ func (t *switchTable) handleMsg(msg *switchMsg, fromPort switchPort) {
 // That happens if this node is already our parent, or is advertising a better root, or is advertising a better path to the same root, etc...
 // There are a lot of very delicate order-sensitive checks here, so it's best to just read the code if you need to understand what it's doing.
 // It's very important not to change the order of the statements in the case function unless you're absolutely sure that it's safe, including safe if used alongside nodes that used the previous order.
-func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort) {
+// Set the third arg to true if you're reprocessing an old message, e.g. to find a new parent after one disconnects, to avoid updating some timing-related things.
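+// In short: instead of the old firstSeen/switch_max_time cost comparison, each peer gets a dimensionless "faster" counter, incremented when its copy of a root update arrives before the current parent's and decremented by more when it arrives after.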
+func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort, reprocessing bool) {
 	// TODO directly use a switchMsg instead of switchMessage + sigs
 	now := time.Now()
 	// Set up the sender peerInfo
@@ -350,10 +351,7 @@ func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort) {
 	}
 	sender.msg = *msg
 	oldSender, isIn := t.data.peers[fromPort]
-	if !isIn {
-		oldSender.firstSeen = now
-	}
-	sender.firstSeen = oldSender.firstSeen
+	sender.faster = oldSender.faster
 	sender.port = fromPort
 	sender.time = now
 	// Decide what to do
@@ -374,9 +372,36 @@ func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort) {
 	doUpdate := false
 	if !equiv(&sender.locator, &oldSender.locator) {
 		doUpdate = true
-		sender.firstSeen = now
 	}
+	// Check if faster than the current parent, and update sender.faster accordingly
+	switch {
+	case reprocessing:
+		// Don't change anything if we're just reprocessing old messages.
+	case !isIn:
+		// Not known, sender.faster == 0, but set it explicitly just to make that obvious to the reader.
+		sender.faster = 0
+	case msg.Root != oldSender.locator.root:
+		// This is a new root.
+		// Honestly not sure if we should reset or do something else. For now, we'll just leave it alone.
+	case sender.port == t.parent:
+		// This is the current parent. If roots change, there's a good chance that they're still the best route to the root, so we probably don't want them to converge towards 0.
+		// If we leave them alone, then when a different node becomes the parent, this one only gets penalized by a couple of points, so leaving this alone for now hopefully shouldn't cause much flapping.
+	case sender.locator.tstamp <= t.data.locator.tstamp:
+		// This timestamp came in slower than our parent's, so we should penalize them by more than we reward faster nodes.
+		if sender.faster > 1 {
+			sender.faster -= 2
+		} else {
+			// If exactly 1, don't let it roll under
+			sender.faster = 0
+		}
+	default:
+		// They sent us an update faster than our parent did, so reward them.
+		// FIXME make sure this can't ever roll over. It shouldn't be possible, we'd switch to them as a parent first, but still...
+		sender.faster++
+	}
+	// Update sender
 	t.data.peers[fromPort] = sender
+	// Decide if we should also update our root info to make the sender our parent
 	updateRoot := false
 	oldParent, isIn := t.data.peers[t.parent]
 	noParent := !isIn
@@ -391,20 +416,8 @@ func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort) {
 		}
 		return true
 	}()
-	// Get the time we've known about the sender (or old parent's) current coords, up to a maximum of `switch_max_time`.
-	sTime := now.Sub(sender.firstSeen)
-	if sTime > switch_max_time {
-		sTime = switch_max_time
-	}
-	pTime := now.Sub(oldParent.firstSeen)
-	if pTime > switch_max_time {
-		pTime = switch_max_time
-	}
-	// Really want to compare sLen/sTime and pLen/pTime
-	// Cross multiplied to avoid divide-by-zero
-	cost := float64(len(sender.locator.coords)) * pTime.Seconds()
-	pCost := float64(len(t.data.locator.coords)) * sTime.Seconds()
 	dropTstamp, isIn := t.drop[sender.locator.root]
+	// Decide if we need to update info about the root or change parents.
 	switch {
 	case !noLoop:
 		// This route loops, so we can't use the sender as our parent.
@@ -420,8 +433,16 @@ func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort) {
 	case noParent:
 		// We currently have no working parent, and at this point in the switch statement, anything is better than nothing.
 		updateRoot = true
-	case cost < pCost:
-		// The sender has a better combination of path length and reliability than the current parent.
+	case sender.faster > switch_faster_threshold:
+		// The sender is reliably faster than the current parent.
+		updateRoot = true
+	case reprocessing && len(sender.locator.coords) < len(oldParent.locator.coords):
+		// We're reprocessing old messages to find a new parent.
+		// That means we're in the middle of a route flap.
+		// We don't know how often each node is faster than the others, only relative to the old parent.
+		// If any of them was faster than the old parent, then we'd probably already be using them.
+		// So the best we can really do is pick the shortest route and hope it's OK as a starting point.
+		// TODO: Find some way to reliably store the relative order between all peers. Basically a p-by-p "faster" matrix, more likely a faster port->uint map per peer, but preferably not literally that, since it'd be tedious to manage and would probably slow down updates.
+		updateRoot = true
 	case sender.port != t.parent:
 		// Ignore further cases if the sender isn't our parent.
 	case !equiv(&sender.locator, &t.data.locator):
 		// Special case:
 		// If coords changed, then this may now be a worse parent than before.
 		// Re-parent the node (de-parent and reprocess the message).
 		// Then reprocess *all* messages to look for a better parent.
 		// This is so we don't keep using this node as our parent if there's something better.
 		t.parent = 0
-		t.unlockedHandleMsg(msg, fromPort)
+		t.unlockedHandleMsg(msg, fromPort, true)
 		for _, info := range t.data.peers {
-			t.unlockedHandleMsg(&info.msg, info.port)
+			t.unlockedHandleMsg(&info.msg, info.port, true)
 		}
 	case now.Sub(t.time) < switch_throttle:
 		// We've already gotten an update from this root recently, so ignore this one to avoid flooding.

From dcfe55dae8682aeb59336c189a0d837dcc0ec67b Mon Sep 17 00:00:00 2001
From: Arceliar
Date: Sun, 2 Dec 2018 16:36:25 -0600
Subject: [PATCH 2/2] store 'faster' relationships between all pairs of peers,
 to make fallback easier when a parent goes offline

---
 src/yggdrasil/switch.go | 111 +++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 53 deletions(-)

diff --git a/src/yggdrasil/switch.go b/src/yggdrasil/switch.go
index 60ba02a8..9c1bfe90 100644
--- a/src/yggdrasil/switch.go
+++ b/src/yggdrasil/switch.go
@@ -18,10 +18,12 @@ import (
 	"time"
 )
 
-const switch_timeout = time.Minute
-const switch_updateInterval = switch_timeout / 2
-const switch_throttle = switch_updateInterval / 2
-const switch_faster_threshold = 2880 // 1 update per 30 seconds for 24 hours
+const (
+	switch_timeout          = time.Minute
+	switch_updateInterval   = switch_timeout / 2
+	switch_throttle         = switch_updateInterval / 2
+	switch_faster_threshold = 240 // Number of switch updates before switching to a faster parent
+)
 
 // The switch locator represents the topology and network state dependent info about a node, minus the signatures that go with it.
 // Nodes will pick the best root they see, provided that the root continues to push out updates with new timestamps.
@@ -119,13 +121,13 @@ func (x *switchLocator) isAncestorOf(y *switchLocator) bool {
 
 // Information about a peer, used by the switch to build the tree and eventually make routing decisions.
 type peerInfo struct {
-	key     sigPubKey     // ID of this peer
-	locator switchLocator // Should be able to respond with signatures upon request
-	degree  uint64        // Self-reported degree
-	time    time.Time     // Time this node was last seen
-	faster  uint16        // Counter of how often a node is faster than the current parent, penalized extra if slower
-	port    switchPort    // Interface number of this peer
-	msg     switchMsg     // The wire switchMsg used
+	key     sigPubKey             // ID of this peer
+	locator switchLocator         // Should be able to respond with signatures upon request
+	degree  uint64                // Self-reported degree
+	time    time.Time             // Time this node was last seen
+	faster  map[switchPort]uint64 // Counter of how often a node is faster than each other peer, penalized extra if slower
+	port    switchPort            // Interface number of this peer
+	msg     switchMsg             // The wire switchMsg used
 }
 
 // This is just a uint64 with a named type for clarity reasons.
@@ -350,8 +352,6 @@ func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort, rep
 		prevKey = hop.Next
 	}
 	sender.msg = *msg
-	oldSender, isIn := t.data.peers[fromPort]
-	sender.faster = oldSender.faster
 	sender.port = fromPort
 	sender.time = now
 	// Decide what to do
@@ -370,34 +370,39 @@ func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort, rep
 		return true
 	}
 	doUpdate := false
+	oldSender := t.data.peers[fromPort]
 	if !equiv(&sender.locator, &oldSender.locator) {
 		doUpdate = true
 	}
-	// Check if faster than the current parent, and update sender.faster accordingly
-	switch {
-	case reprocessing:
-		// Don't change anything if we're just reprocessing old messages.
-	case !isIn:
-		// Not known, sender.faster == 0, but set it explicitly just to make that obvious to the reader.
-		sender.faster = 0
-	case msg.Root != oldSender.locator.root:
-		// This is a new root.
-		// Honestly not sure if we should reset or do something else. For now, we'll just leave it alone.
-	case sender.port == t.parent:
-		// This is the current parent. If roots change, there's a good chance that they're still the best route to the root, so we probably don't want them to converge towards 0.
-		// If we leave them alone, then when a different node becomes the parent, this one only gets penalized by a couple of points, so leaving this alone for now hopefully shouldn't cause much flapping.
-	case sender.locator.tstamp <= t.data.locator.tstamp:
-		// This timestamp came in slower than our parent's, so we should penalize them by more than we reward faster nodes.
-		if sender.faster > 1 {
-			sender.faster -= 2
-		} else {
-			// If exactly 1, don't let it roll under
-			sender.faster = 0
+	// Update the matrix of peer "faster" thresholds
+	if reprocessing {
+		sender.faster = oldSender.faster
+	} else {
+		sender.faster = make(map[switchPort]uint64, len(oldSender.faster))
+		for port, peer := range t.data.peers {
+			if port == fromPort {
+				continue
+			}
+			switch {
+			case msg.Root != peer.locator.root:
+				// Different roots, blindly guess that the relationships will stay the same?
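+				// Note: entries in t.data.peers are keyed by their own port, so peer.port == port here, and oldSender.faster[peer.port] is the same entry as oldSender.faster[port].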
+				sender.faster[port] = oldSender.faster[peer.port]
+			case sender.locator.tstamp <= peer.locator.tstamp:
+				// Slower than this node, penalize (more than the reward amount)
+				if oldSender.faster[port] > 1 {
+					sender.faster[port] = oldSender.faster[peer.port] - 2
+				} else {
+					sender.faster[port] = 0
+				}
+			default:
+				// We were faster than this node, so increment, as long as we don't overflow because of it
+				if oldSender.faster[peer.port] < switch_faster_threshold {
+					sender.faster[port] = oldSender.faster[peer.port] + 1
+				} else {
+					sender.faster[port] = switch_faster_threshold
+				}
+			}
+		}
 	}
-	default:
-		// They sent us an update faster than our parent did, so reward them.
-		// FIXME make sure this can't ever roll over. It shouldn't be possible, we'd switch to them as a parent first, but still...
-		sender.faster++
-	}
 	// Update sender
 	t.data.peers[fromPort] = sender
@@ -433,30 +438,30 @@ func (t *switchTable) unlockedHandleMsg(msg *switchMsg, fromPort switchPort, rep
 	case noParent:
 		// We currently have no working parent, and at this point in the switch statement, anything is better than nothing.
 		updateRoot = true
-	case sender.faster > switch_faster_threshold:
+	case sender.faster[t.parent] >= switch_faster_threshold:
 		// The sender is reliably faster than the current parent.
 		updateRoot = true
-	case reprocessing && len(sender.locator.coords) < len(oldParent.locator.coords):
-		// We're reprocessing old messages to find a new parent.
-		// That means we're in the middle of a route flap.
-		// We don't know how often each node is faster than the others, only relative to the old parent.
-		// If any of them was faster than the old parent, then we'd probably already be using them.
-		// So the best we can really do is pick the shortest route and hope it's OK as a starting point.
-		// TODO: Find some way to reliably store the relative order between all peers. Basically a p-by-p "faster" matrix, more likely a faster port->uint map per peer, but preferably not literally that, since it'd be tedious to manage and would probably slow down updates.
+	case reprocessing && sender.faster[t.parent] > oldParent.faster[sender.port]:
+		// The sender seems to be reliably faster than the current parent, so switch to them instead.
 		updateRoot = true
 	case sender.port != t.parent:
 		// Ignore further cases if the sender isn't our parent.
-	case !equiv(&sender.locator, &t.data.locator):
+	case !reprocessing && !equiv(&sender.locator, &t.data.locator):
 		// Special case:
-		// If coords changed, then this may now be a worse parent than before.
-		// Re-parent the node (de-parent and reprocess the message).
-		// Then reprocess *all* messages to look for a better parent.
-		// This is so we don't keep using this node as our parent if there's something better.
+		// If coords changed, then we need to penalize this node somehow, to prevent flapping.
+		// First, reset all faster-related info to 0.
+		// Then, de-parent the node and reprocess all messages to find a new parent.
 		t.parent = 0
-		t.unlockedHandleMsg(msg, fromPort, true)
-		for _, info := range t.data.peers {
-			t.unlockedHandleMsg(&info.msg, info.port, true)
+		sender.faster = nil
+		for _, peer := range t.data.peers {
+			if peer.port == sender.port {
+				continue
+			}
+			delete(peer.faster, sender.port)
+			t.unlockedHandleMsg(&peer.msg, peer.port, true)
 		}
+		// Process the sender last, to avoid keeping them as a parent if at all possible.
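+		// (Handling the other peers first gives one of them the chance to claim the vacant parent slot via the noParent case before the sender is reconsidered.)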
+		t.unlockedHandleMsg(&sender.msg, sender.port, true)
 	case now.Sub(t.time) < switch_throttle:
 		// We've already gotten an update from this root recently, so ignore this one to avoid flooding.
 	case sender.locator.tstamp > t.data.locator.tstamp: