Make 'Device list doesn't change if remote server is down' pass (#1268)
- As a last resort, query the DB when all possible remote query endpoints have been exhausted, but keep the server's entry in `failures` so clients can detect that this is stale data.
- Unblock `DeviceListUpdater.Update` on failures rather than timing out.
- Use a mutex when writing directly to `res`, not just for failures.
parent 4c4732a9c9 · commit 20c8f252a7
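The first bullet relies on the `failures` map in the `/keys/query` response: keys returned for a user whose server is listed there came from the local cache, not the remote server. A minimal sketch of the client-side check, assuming a response shaped like Dendrite's `api.QueryKeysResponse` (the struct and field names below are illustrative, not the real types):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative response shape: "failures" is keyed by server name,
// "device_keys" by user ID then device ID, as in /keys/query.
type queryKeysResponse struct {
	Failures   map[string]interface{}                `json:"failures"`
	DeviceKeys map[string]map[string]json.RawMessage `json:"device_keys"`
}

func main() {
	body := []byte(`{
		"failures": {"remote.example": {"message": "connection refused"}},
		"device_keys": {"@alice:remote.example": {"DEVICE1": {}}}
	}`)
	var res queryKeysResponse
	if err := json.Unmarshal(body, &res); err != nil {
		panic(err)
	}
	// If the user's server appears in failures, any keys we got back for
	// that user's devices came from the cache and may be stale.
	if _, failed := res.Failures["remote.example"]; failed {
		fmt.Println("keys for remote.example users are cached and may be stale")
	}
}
```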
@@ -342,10 +342,12 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerName
 		if err != nil {
 			logger.WithError(err).WithField("user_id", userID).Error("fetched device list but failed to store/emit it")
 			hasFailures = true
-		} else {
-			u.clearChannel(userID)
 		}
 	}
+	for _, userID := range userIDs {
+		// always clear the channel to unblock Update calls regardless of success/failure
+		u.clearChannel(userID)
+	}
 	return hasFailures
 }
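This hunk is the "unblock `Update`" fix: `clearChannel` now runs for every user whether or not storing the device list succeeded, so callers blocked in `DeviceListUpdater.Update` return promptly instead of waiting for a timeout. A self-contained sketch of the pattern, under hypothetical names (Dendrite's real updater differs in detail):

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// updater sketches the notify-channel pattern behind clearChannel: Update
// blocks on a per-user channel that the background worker closes once it
// has processed that user, whether or not processing succeeded.
type updater struct {
	mu       sync.Mutex
	userChan map[string]chan struct{}
}

func (u *updater) channelFor(userID string) chan struct{} {
	u.mu.Lock()
	defer u.mu.Unlock()
	ch, ok := u.userChan[userID]
	if !ok {
		ch = make(chan struct{})
		u.userChan[userID] = ch
	}
	return ch
}

// clearChannel unblocks every waiter for userID, regardless of outcome.
func (u *updater) clearChannel(userID string) {
	u.mu.Lock()
	defer u.mu.Unlock()
	if ch, ok := u.userChan[userID]; ok {
		close(ch)
		delete(u.userChan, userID)
	}
}

// Update waits until the worker signals userID, or the context expires.
func (u *updater) Update(ctx context.Context, userID string) error {
	ch := u.channelFor(userID)
	select {
	case <-ch:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	u := &updater{userChan: make(map[string]chan struct{})}
	go func() {
		time.Sleep(10 * time.Millisecond) // pretend to poke the remote server (and fail)
		u.clearChannel("@alice:remote")   // signal even on failure, so Update returns
	}()
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(u.Update(ctx, "@alice:remote")) // prints <nil>, not a timeout
}
```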
@@ -318,12 +318,39 @@ func (a *KeyInternalAPI) queryRemoteKeys(
 	// allows us to wait until all federation servers have been poked
 	var wg sync.WaitGroup
 	wg.Add(len(domainToDeviceKeys))
-	// mutex for failures
-	var failMu sync.Mutex
+	// mutex for writing directly to res (e.g failures)
+	var respMu sync.Mutex
 
 	// fan out
 	for domain, deviceKeys := range domainToDeviceKeys {
-		go func(serverName string, devKeys map[string][]string) {
+		go a.queryRemoteKeysOnServer(ctx, domain, deviceKeys, &wg, &respMu, timeout, resultCh, res)
+	}
+
+	// Close the result channel when the goroutines have quit so the for .. range exits
+	go func() {
+		wg.Wait()
+		close(resultCh)
+	}()
+
+	for result := range resultCh {
+		for userID, nest := range result.DeviceKeys {
+			res.DeviceKeys[userID] = make(map[string]json.RawMessage)
+			for deviceID, deviceKey := range nest {
+				keyJSON, err := json.Marshal(deviceKey)
+				if err != nil {
+					continue
+				}
+				res.DeviceKeys[userID][deviceID] = keyJSON
+			}
+		}
+	}
+}
+
+func (a *KeyInternalAPI) queryRemoteKeysOnServer(
+	ctx context.Context, serverName string, devKeys map[string][]string, wg *sync.WaitGroup,
+	respMu *sync.Mutex, timeout time.Duration, resultCh chan<- *gomatrixserverlib.RespQueryKeys,
+	res *api.QueryKeysResponse,
+) {
 	defer wg.Done()
 	fedCtx, cancel := context.WithTimeout(ctx, timeout)
 	defer cancel()
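This hunk lifts the per-server body out of the anonymous goroutine into a named `queryRemoteKeysOnServer` method, so the fan-out, the channel-closing goroutine, and the fan-in collector now read top to bottom in `queryRemoteKeys`. Reduced to a runnable toy, the shape is:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	servers := []string{"a.example", "b.example", "c.example"}
	// Buffered so workers never block even if the collector is slow.
	resultCh := make(chan string, len(servers))

	var wg sync.WaitGroup
	wg.Add(len(servers))
	// fan out: one goroutine per server
	for _, s := range servers {
		go func(server string) {
			defer wg.Done()
			resultCh <- "keys from " + server
		}(s)
	}

	// Close the channel once every worker has called wg.Done(), so the
	// range below terminates.
	go func() {
		wg.Wait()
		close(resultCh)
	}()

	// fan in: collect until the channel is closed
	for r := range resultCh {
		fmt.Println(r)
	}
}
```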
@@ -351,7 +378,9 @@ func (a *KeyInternalAPI) queryRemoteKeys(
 	}
 	// refresh entries from DB: unlike remoteKeysFromDatabase we know we previously had no device info for this
 	// user so the fact that we're populating all devices here isn't a problem so long as we have devices.
+	respMu.Lock()
 	err = a.populateResponseWithDeviceKeysFromDatabase(ctx, res, userID, nil)
+	respMu.Unlock()
 	if err != nil {
 		logrus.WithFields(logrus.Fields{
 			logrus.ErrorKey: err,
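The new lock is needed because `populateResponseWithDeviceKeysFromDatabase` writes into maps on the shared `res`, and several `queryRemoteKeysOnServer` goroutines can reach this path at once; unsynchronized map writes crash the Go runtime. A tiny demonstration of the hazard the mutex prevents:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	m := make(map[string]int)
	var mu sync.Mutex
	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			// Without this lock the runtime can abort with
			// "fatal error: concurrent map writes" (run with -race
			// to see the data race flagged deterministically).
			mu.Lock()
			m[fmt.Sprintf("key%d", n)] = n
			mu.Unlock()
		}(i)
	}
	wg.Wait()
	fmt.Println(len(m)) // 8
}
```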
@@ -367,36 +396,24 @@ func (a *KeyInternalAPI) queryRemoteKeys(
 		return
 	}
 	queryKeysResp, err := a.FedClient.QueryKeys(fedCtx, gomatrixserverlib.ServerName(serverName), devKeys)
-	if err != nil {
-		failMu.Lock()
+	if err == nil {
+		resultCh <- &queryKeysResp
+		return
+	}
+	respMu.Lock()
 	res.Failures[serverName] = map[string]interface{}{
 		"message": err.Error(),
 	}
-		failMu.Unlock()
-		return
-	}
-	resultCh <- &queryKeysResp
-	}(domain, deviceKeys)
-	}
-
-	// Close the result channel when the goroutines have quit so the for .. range exits
-	go func() {
-		wg.Wait()
-		close(resultCh)
-	}()
-
-	for result := range resultCh {
-		for userID, nest := range result.DeviceKeys {
-			res.DeviceKeys[userID] = make(map[string]json.RawMessage)
-			for deviceID, deviceKey := range nest {
-				keyJSON, err := json.Marshal(deviceKey)
-				if err != nil {
-					continue
-				}
-				res.DeviceKeys[userID][deviceID] = keyJSON
-			}
-		}
-	}
+
+	// last ditch, use the cache only. This is good for when clients hit /keys/query and the remote server
+	// is down, better to return something than nothing at all. Clients can know about the failure by
+	// inspecting the failures map though so they can know it's a cached response.
+	for userID, dkeys := range devKeys {
+		// drop the error as it's already a failure at this point
+		_ = a.populateResponseWithDeviceKeysFromDatabase(ctx, res, userID, dkeys)
+	}
+	respMu.Unlock()
 }
 
 func (a *KeyInternalAPI) populateResponseWithDeviceKeysFromDatabase(
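This last code hunk inverts the error check and implements the commit's "last resort": on federation failure, record the server under `res.Failures` and then serve cached device keys from the database, all under the same `respMu`. The control flow as a standalone sketch (`fetch` and `cacheLookup` are hypothetical stand-ins for the federation call and the DB query):

```go
package main

import (
	"errors"
	"fmt"
)

// queryWithFallback returns fresh keys when the remote fetch succeeds;
// otherwise it records the failure and serves whatever the cache has.
func queryWithFallback(
	server string,
	fetch func() (map[string]string, error),
	cacheLookup func() map[string]string,
	failures map[string]interface{},
) map[string]string {
	keys, err := fetch()
	if err == nil {
		return keys
	}
	// Keep the failure visible so callers know the data below is cached.
	failures[server] = map[string]interface{}{"message": err.Error()}
	return cacheLookup()
}

func main() {
	failures := map[string]interface{}{}
	keys := queryWithFallback(
		"remote.example",
		func() (map[string]string, error) { return nil, errors.New("connection refused") },
		func() map[string]string { return map[string]string{"DEVICE1": "cached key"} },
		failures,
	)
	fmt.Println(keys, failures) // stale keys plus the failure entry
}
```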
@@ -148,6 +148,7 @@ Get left notifs in sync and /keys/changes when other user leaves
 Can query remote device keys using POST after notification
 Server correctly resyncs when client query keys and there is no remote cache
 Server correctly resyncs when server leaves and rejoins a room
+Device list doesn't change if remote server is down
 Can add account data
 Can add account data to room
 Can get account data without syncing