Handle errors differently in the DeviceListUpdater (#2695)

`If a device list update goes missing, the server resyncs on the next
one` was failing because a previous test would receive a `waitTime` of
1h, resulting in the test timing out.
This now tries to handle the returned errors differently, e.g. by using
the default `waitTime` of 2s. Also doesn't try further users in the
list, if one of the errors would cause a longer `waitTime`.
This commit is contained in:
Till 2022-09-07 11:44:27 +02:00 committed by GitHub
parent 847032df36
commit 440eb0f3a2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 44 additions and 19 deletions

View file

@ -53,9 +53,9 @@ func (d *Database) AssociateEDUWithDestination(
// Keep EDUs for at least x minutes before deleting them
expiresAt = gomatrixserverlib.AsTimestamp(time.Now().Add(duration))
}
// We forcibly set m.direct_to_device events to 0, as we always want them
// to be delivered. (required for E2EE)
if eduType == gomatrixserverlib.MDirectToDevice {
// We forcibly set m.direct_to_device and m.device_list_update events
// to 0, as we always want them to be delivered. (required for E2EE)
if eduType == gomatrixserverlib.MDirectToDevice || eduType == gomatrixserverlib.MDeviceListUpdate {
expiresAt = 0
}
return d.Writer.Do(d.DB, nil, func(txn *sql.Tx) error {

View file

@ -24,7 +24,7 @@ import (
"net/url"
"strings"
opentracing "github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go/ext"
)
@ -81,8 +81,8 @@ func PostJSON[reqtype, restype any, errtype error](
return fmt.Errorf("HTTP %d from %s (no response body)", res.StatusCode, apiURL)
}
var reserr errtype
if err = json.Unmarshal(body, reserr); err != nil {
return fmt.Errorf("HTTP %d from %s", res.StatusCode, apiURL)
if err = json.Unmarshal(body, &reserr); err != nil {
return fmt.Errorf("HTTP %d from %s - %w", res.StatusCode, apiURL, err)
}
return reserr
}

View file

@ -19,9 +19,11 @@ import (
"encoding/json"
"fmt"
"hash/fnv"
"net"
"sync"
"time"
"github.com/matrix-org/gomatrix"
"github.com/matrix-org/gomatrixserverlib"
"github.com/matrix-org/util"
"github.com/prometheus/client_golang/prometheus"
@ -388,6 +390,8 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
return waitTime, true
}
failCount := 0
userLoop:
for _, userID := range userIDs {
if ctx.Err() != nil {
// we've timed out, give up and go to the back of the queue to let another server be processed.
@ -397,19 +401,35 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
res, err := u.fedClient.GetUserDevices(ctx, serverName, userID)
if err != nil {
failCount += 1
fcerr, ok := err.(*fedsenderapi.FederationClientError)
if ok {
if fcerr.RetryAfter > 0 {
waitTime = fcerr.RetryAfter
} else if fcerr.Blacklisted {
switch e := err.(type) {
case *fedsenderapi.FederationClientError:
if e.RetryAfter > 0 {
waitTime = e.RetryAfter
break userLoop
} else if e.Blacklisted {
waitTime = time.Hour * 8
} else {
// For all other errors (DNS resolution, network etc.) wait 1 hour.
waitTime = time.Hour
break userLoop
}
} else {
waitTime = time.Hour
logger.WithError(err).WithField("user_id", userID).Debug("GetUserDevices returned unknown error type")
case net.Error:
// Use the default waitTime, if it's a timeout.
// It probably doesn't make sense to try further users.
if !e.Timeout() {
waitTime = time.Minute * 10
logrus.WithError(e).Error("GetUserDevices returned net.Error")
break userLoop
}
case gomatrix.HTTPError:
// The remote server returned an error, give it some time to recover
if e.Code >= 500 {
waitTime = time.Minute * 10
logrus.WithError(e).Error("GetUserDevices returned gomatrix.HTTPError")
break userLoop
}
default:
// Something else failed
waitTime = time.Minute * 10
logger.WithError(err).WithField("user_id", userID).Debugf("GetUserDevices returned unknown error type: %T", err)
break userLoop
}
continue
}
@ -437,7 +457,11 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
}
}
if failCount > 0 {
logger.WithField("total", len(userIDs)).WithField("failed", failCount).WithField("wait", waitTime).Warn("Failed to query device keys for some users")
logger.WithFields(logrus.Fields{
"total": len(userIDs),
"failed": failCount,
"skipped": len(userIDs) - failCount,
}).Warn("Failed to query device keys for some users")
}
for _, userID := range userIDs {
// always clear the channel to unblock Update calls regardless of success/failure

View file

@ -742,3 +742,4 @@ User in private room doesn't appear in user directory
User joining then leaving public room appears and dissappears from directory
User in remote room doesn't appear in user directory after server left room
User in shared private room does appear in user directory until leave
If a device list update goes missing, the server resyncs on the next one