Federation sending tweaks to make store-and-forward (s&f) more robust

This commit is contained in:
Devon Hudson 2023-01-28 13:42:40 -07:00
parent ab8a3e44ef
commit 0d99625b86
No known key found for this signature in database
GPG key ID: CD06B18E77F6A628
3 changed files with 44 additions and 27 deletions

View file

@ -410,34 +410,49 @@ func (oq *destinationQueue) nextTransaction(
defer cancel()
relayServers := oq.statistics.KnownRelayServers()
if oq.statistics.AssumedOffline() && len(relayServers) > 0 {
sendMethod = statistics.SendViaRelay
relaySuccess := false
logrus.Infof("Sending to relay servers: %v", relayServers)
// TODO : how to pass through actual userID here?!?!?!?!
userID, userErr := gomatrixserverlib.NewUserID("@user:"+string(oq.destination), false)
if userErr != nil {
return userErr, sendMethod
}
// Attempt sending to each known relay server.
for _, relayServer := range relayServers {
_, relayErr := oq.client.P2PSendTransactionToRelay(ctx, *userID, t, relayServer)
if relayErr != nil {
err = relayErr
} else {
// If sending to one of the relay servers succeeds, consider the send successful.
relaySuccess = true
}
}
// Clear the error if sending to any of the relay servers succeeded.
if relaySuccess {
err = nil
}
} else {
hasRelayServers := len(relayServers) > 0
shouldSendToRelays := oq.statistics.AssumedOffline() && hasRelayServers
if !shouldSendToRelays {
sendMethod = statistics.SendDirect
_, err = oq.client.SendTransaction(ctx, t)
} else {
// Try sending directly to the destination first in case they came back online.
sendMethod = statistics.SendDirect
_, err = oq.client.SendTransaction(ctx, t)
if err != nil {
// The destination is still offline, try sending to relays.
sendMethod = statistics.SendViaRelay
relaySuccess := false
logrus.Infof("Sending %q to relay servers: %v", t.TransactionID, relayServers)
// TODO : how to pass through actual userID here?!?!?!?!
userID, userErr := gomatrixserverlib.NewUserID("@user:"+string(oq.destination), false)
if userErr != nil {
return userErr, sendMethod
}
// Attempt sending to each known relay server.
for _, relayServer := range relayServers {
_, relayErr := oq.client.P2PSendTransactionToRelay(ctx, *userID, t, relayServer)
if relayErr != nil {
err = relayErr
} else {
// If sending to one of the relay servers succeeds, consider the send successful.
relaySuccess = true
// TODO: what if the destination comes back online but cannot reach its relay?
// How do we sync with the destination in that case?
// One option: add a "relay success" flag to events in the database and, once
// the node is seen back online, directly resend the backlog of events that
// carry that flag. This could lead to duplicate events, but only for events
// that we sent ourselves, and it would give a much more consistent experience.
}
}
// Clear the error if sending to any of the relay servers succeeded.
if relaySuccess {
err = nil
}
}
}
switch errResponse := err.(type) {
case nil:

View file

@ -164,6 +164,8 @@ func (s *ServerStatistics) Success(method SendMethod) {
logrus.WithError(err).Errorf("Failed to remove %q from blacklist", s.serverName)
}
}
s.removeAssumedOffline()
}
}

View file

@ -49,7 +49,7 @@ func (c *FederationAPI) Defaults(opts DefaultOpts) {
c.Database.Defaults(10)
}
c.FederationMaxRetries = 16
c.P2PFederationRetriesUntilAssumedOffline = 2
c.P2PFederationRetriesUntilAssumedOffline = 1
c.DisableTLSValidation = false
c.DisableHTTPKeepalives = false
if opts.Generate {