From 440eb0f3a2e5aecddebe3e093d5740ce5c4505c2 Mon Sep 17 00:00:00 2001 From: Till <2353100+S7evinK@users.noreply.github.com> Date: Wed, 7 Sep 2022 11:44:27 +0200 Subject: [PATCH] Handle errors differently in the `DeviceListUpdater` (#2695) `If a device list update goes missing, the server resyncs on the next one` was failing because a previous test would receive a `waitTime` of 1h, resulting in the test timing out. This now tries to handle the returned errors differently, e.g. by using the default `waitTime` of 2s. Also doesn't try further users in the list, if one of the errors would cause a longer `waitTime`. --- federationapi/storage/shared/storage_edus.go | 6 +-- internal/httputil/http.go | 6 +-- keyserver/internal/device_list_update.go | 48 +++++++++++++++----- sytest-whitelist | 3 +- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/federationapi/storage/shared/storage_edus.go b/federationapi/storage/shared/storage_edus.go index ce9632ed3..e0c740c11 100644 --- a/federationapi/storage/shared/storage_edus.go +++ b/federationapi/storage/shared/storage_edus.go @@ -53,9 +53,9 @@ func (d *Database) AssociateEDUWithDestination( // Keep EDUs for at least x minutes before deleting them expiresAt = gomatrixserverlib.AsTimestamp(time.Now().Add(duration)) } - // We forcibly set m.direct_to_device events to 0, as we always want them - // to be delivered. (required for E2EE) - if eduType == gomatrixserverlib.MDirectToDevice { + // We forcibly set m.direct_to_device and m.device_list_update events + // to 0, as we always want them to be delivered. (required for E2EE) + if eduType == gomatrixserverlib.MDirectToDevice || eduType == gomatrixserverlib.MDeviceListUpdate { expiresAt = 0 } return d.Writer.Do(d.DB, nil, func(txn *sql.Tx) error { diff --git a/internal/httputil/http.go b/internal/httputil/http.go index 1e07ee33c..ad26de512 100644 --- a/internal/httputil/http.go +++ b/internal/httputil/http.go @@ -24,7 +24,7 @@ import ( "net/url" "strings" - opentracing "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go" "github.com/opentracing/opentracing-go/ext" ) @@ -81,8 +81,8 @@ func PostJSON[reqtype, restype any, errtype error]( return fmt.Errorf("HTTP %d from %s (no response body)", res.StatusCode, apiURL) } var reserr errtype - if err = json.Unmarshal(body, reserr); err != nil { - return fmt.Errorf("HTTP %d from %s", res.StatusCode, apiURL) + if err = json.Unmarshal(body, &reserr); err != nil { + return fmt.Errorf("HTTP %d from %s - %w", res.StatusCode, apiURL, err) } return reserr } diff --git a/keyserver/internal/device_list_update.go b/keyserver/internal/device_list_update.go index 304b67b23..8530f9078 100644 --- a/keyserver/internal/device_list_update.go +++ b/keyserver/internal/device_list_update.go @@ -19,9 +19,11 @@ import ( "encoding/json" "fmt" "hash/fnv" + "net" "sync" "time" + "github.com/matrix-org/gomatrix" "github.com/matrix-org/gomatrixserverlib" "github.com/matrix-org/util" "github.com/prometheus/client_golang/prometheus" @@ -388,6 +390,8 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam return waitTime, true } failCount := 0 + +userLoop: for _, userID := range userIDs { if ctx.Err() != nil { // we've timed out, give up and go to the back of the queue to let another server be processed. @@ -397,19 +401,35 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam res, err := u.fedClient.GetUserDevices(ctx, serverName, userID) if err != nil { failCount += 1 - fcerr, ok := err.(*fedsenderapi.FederationClientError) - if ok { - if fcerr.RetryAfter > 0 { - waitTime = fcerr.RetryAfter - } else if fcerr.Blacklisted { + switch e := err.(type) { + case *fedsenderapi.FederationClientError: + if e.RetryAfter > 0 { + waitTime = e.RetryAfter + break userLoop + } else if e.Blacklisted { waitTime = time.Hour * 8 - } else { - // For all other errors (DNS resolution, network etc.) wait 1 hour. - waitTime = time.Hour + break userLoop } - } else { - waitTime = time.Hour - logger.WithError(err).WithField("user_id", userID).Debug("GetUserDevices returned unknown error type") + case net.Error: + // Use the default waitTime, if it's a timeout. + // It probably doesn't make sense to try further users. + if !e.Timeout() { + waitTime = time.Minute * 10 + logrus.WithError(e).Error("GetUserDevices returned net.Error") + break userLoop + } + case gomatrix.HTTPError: + // The remote server returned an error, give it some time to recover + if e.Code >= 500 { + waitTime = time.Minute * 10 + logrus.WithError(e).Error("GetUserDevices returned gomatrix.HTTPError") + break userLoop + } + default: + // Something else failed + waitTime = time.Minute * 10 + logger.WithError(err).WithField("user_id", userID).Debugf("GetUserDevices returned unknown error type: %T", err) + break userLoop } continue } @@ -437,7 +457,11 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam } } if failCount > 0 { - logger.WithField("total", len(userIDs)).WithField("failed", failCount).WithField("wait", waitTime).Warn("Failed to query device keys for some users") + logger.WithFields(logrus.Fields{ + "total": len(userIDs), + "failed": failCount, + "skipped": len(userIDs) - failCount, + }).Warn("Failed to query device keys for some users") } for _, userID := range userIDs { // always clear the channel to unblock Update calls regardless of success/failure diff --git a/sytest-whitelist b/sytest-whitelist index 5c8896b99..fc10bd3cc 100644 --- a/sytest-whitelist +++ b/sytest-whitelist @@ -741,4 +741,5 @@ Newly joined room includes presence in incremental sync User in private room doesn't appear in user directory User joining then leaving public room appears and dissappears from directory User in remote room doesn't appear in user directory after server left room -User in shared private room does appear in user directory until leave \ No newline at end of file +User in shared private room does appear in user directory until leave +If a device list update goes missing, the server resyncs on the next one \ No newline at end of file