bugfix: fix race condition when updating presence via /sync (#2470)

* bugfix: fix race condition when updating presence via /sync

Previously when presence is updated via /sync, we would send the presence update
asyncly via NATS. This created a race condition:
 - If the presence update is processed quickly, the /sync which triggered the presence
   update would see an online presence.
 - If the presence update was processed slowly, the /sync which triggered the presence
   update would see an offline presence.

This is the root cause behind the flakey sytest: 'User sees their own presence in a sync'.

The fix is to ensure we update the database/advance the stream position synchronously
for local users.

* Bugfix for test
This commit is contained in:
kegsay 2022-05-17 15:53:08 +01:00 committed by GitHub
parent ac92e04772
commit b3162755a9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 103 additions and 20 deletions

View file

@ -138,9 +138,12 @@ func (s *PresenceConsumer) onMessage(ctx context.Context, msg *nats.Msg) bool {
presence := msg.Header.Get("presence")
timestamp := msg.Header.Get("last_active_ts")
fromSync, _ := strconv.ParseBool(msg.Header.Get("from_sync"))
logrus.Debugf("syncAPI received presence event: %+v", msg.Header)
if fromSync { // do not process local presence changes; we already did this synchronously.
return true
}
ts, err := strconv.Atoi(timestamp)
if err != nil {
return true
@ -151,15 +154,19 @@ func (s *PresenceConsumer) onMessage(ctx context.Context, msg *nats.Msg) bool {
newMsg := msg.Header.Get("status_msg")
statusMsg = &newMsg
}
// OK is already checked, so no need to do it again
// already checked, so no need to check error
p, _ := types.PresenceFromString(presence)
pos, err := s.db.UpdatePresence(ctx, userID, p, statusMsg, gomatrixserverlib.Timestamp(ts), fromSync)
if err != nil {
s.EmitPresence(ctx, userID, p, statusMsg, ts, fromSync)
return true
}
func (s *PresenceConsumer) EmitPresence(ctx context.Context, userID string, presence types.Presence, statusMsg *string, ts int, fromSync bool) {
pos, err := s.db.UpdatePresence(ctx, userID, presence, statusMsg, gomatrixserverlib.Timestamp(ts), fromSync)
if err != nil {
logrus.WithError(err).WithField("user", userID).WithField("presence", presence).Warn("failed to updated presence for user")
return
}
s.stream.Advance(pos)
s.notifier.OnNewPresence(types.StreamingToken{PresencePosition: pos}, userID)
return true
}

View file

@ -53,19 +53,24 @@ type RequestPool struct {
streams *streams.Streams
Notifier *notifier.Notifier
producer PresencePublisher
consumer PresenceConsumer
}
type PresencePublisher interface {
SendPresence(userID string, presence types.Presence, statusMsg *string) error
}
type PresenceConsumer interface {
EmitPresence(ctx context.Context, userID string, presence types.Presence, statusMsg *string, ts int, fromSync bool)
}
// NewRequestPool makes a new RequestPool
func NewRequestPool(
db storage.Database, cfg *config.SyncAPI,
userAPI userapi.SyncUserAPI, keyAPI keyapi.SyncKeyAPI,
rsAPI roomserverAPI.SyncRoomserverAPI,
streams *streams.Streams, notifier *notifier.Notifier,
producer PresencePublisher, enableMetrics bool,
producer PresencePublisher, consumer PresenceConsumer, enableMetrics bool,
) *RequestPool {
if enableMetrics {
prometheus.MustRegister(
@ -83,6 +88,7 @@ func NewRequestPool(
streams: streams,
Notifier: notifier,
producer: producer,
consumer: consumer,
}
go rp.cleanLastSeen()
go rp.cleanPresence(db, time.Minute*5)
@ -160,6 +166,13 @@ func (rp *RequestPool) updatePresence(db storage.Presence, presence string, user
logrus.WithError(err).Error("Unable to publish presence message from sync")
return
}
// now synchronously update our view of the world. It's critical we do this before calculating
// the /sync response else we may not return presence: online immediately.
rp.consumer.EmitPresence(
context.Background(), userID, presenceID, newPresence.ClientFields.StatusMsg,
int(gomatrixserverlib.AsTimestamp(time.Now())), true,
)
}
func (rp *RequestPool) updateLastSeen(req *http.Request, device *userapi.Device) {

View file

@ -38,6 +38,12 @@ func (d dummyDB) MaxStreamPositionForPresence(ctx context.Context) (types.Stream
return 0, nil
}
type dummyConsumer struct{}
func (d dummyConsumer) EmitPresence(ctx context.Context, userID string, presence types.Presence, statusMsg *string, ts int, fromSync bool) {
}
func TestRequestPool_updatePresence(t *testing.T) {
type args struct {
presence string
@ -45,6 +51,7 @@ func TestRequestPool_updatePresence(t *testing.T) {
sleep time.Duration
}
publisher := &dummyPublisher{}
consumer := &dummyConsumer{}
syncMap := sync.Map{}
tests := []struct {
@ -101,6 +108,7 @@ func TestRequestPool_updatePresence(t *testing.T) {
rp := &RequestPool{
presence: &syncMap,
producer: publisher,
consumer: consumer,
cfg: &config.SyncAPI{
Matrix: &config.Global{
JetStream: config.JetStream{

View file

@ -64,8 +64,17 @@ func AddPublicRoutes(
Topic: cfg.Matrix.JetStream.Prefixed(jetstream.OutputPresenceEvent),
JetStream: js,
}
presenceConsumer := consumers.NewPresenceConsumer(
base.ProcessContext, cfg, js, natsClient, syncDB,
notifier, streams.PresenceStreamProvider,
userAPI,
)
requestPool := sync.NewRequestPool(syncDB, cfg, userAPI, keyAPI, rsAPI, streams, notifier, federationPresenceProducer, base.EnableMetrics)
requestPool := sync.NewRequestPool(syncDB, cfg, userAPI, keyAPI, rsAPI, streams, notifier, federationPresenceProducer, presenceConsumer, base.EnableMetrics)
if err = presenceConsumer.Start(); err != nil {
logrus.WithError(err).Panicf("failed to start presence consumer")
}
userAPIStreamEventProducer := &producers.UserAPIStreamEventProducer{
JetStream: js,
@ -131,15 +140,6 @@ func AddPublicRoutes(
logrus.WithError(err).Panicf("failed to start receipts consumer")
}
presenceConsumer := consumers.NewPresenceConsumer(
base.ProcessContext, cfg, js, natsClient, syncDB,
notifier, streams.PresenceStreamProvider,
userAPI,
)
if err = presenceConsumer.Start(); err != nil {
logrus.WithError(err).Panicf("failed to start presence consumer")
}
routing.Setup(
base.PublicClientAPIMux, requestPool, syncDB, userAPI,
rsAPI, cfg, base.Caches,

View file

@ -19,6 +19,7 @@ import (
userapi "github.com/matrix-org/dendrite/userapi/api"
"github.com/matrix-org/gomatrixserverlib"
"github.com/nats-io/nats.go"
"github.com/tidwall/gjson"
)
type syncRoomserverAPI struct {
@ -256,6 +257,60 @@ func testSyncAPICreateRoomSyncEarly(t *testing.T, dbType test.DBType) {
}
}
// Test that if we hit /sync we get back presence: online, regardless of whether messages get delivered
// via NATS. Regression test for a flakey test "User sees their own presence in a sync"
func TestSyncAPIUpdatePresenceImmediately(t *testing.T) {
test.WithAllDatabases(t, func(t *testing.T, dbType test.DBType) {
testSyncAPIUpdatePresenceImmediately(t, dbType)
})
}
func testSyncAPIUpdatePresenceImmediately(t *testing.T, dbType test.DBType) {
user := test.NewUser(t)
alice := userapi.Device{
ID: "ALICEID",
UserID: user.ID,
AccessToken: "ALICE_BEARER_TOKEN",
DisplayName: "Alice",
AccountType: userapi.AccountTypeUser,
}
base, close := testrig.CreateBaseDendrite(t, dbType)
base.Cfg.Global.Presence.EnableOutbound = true
base.Cfg.Global.Presence.EnableInbound = true
defer close()
jsctx, _ := base.NATS.Prepare(base.ProcessContext, &base.Cfg.Global.JetStream)
defer jetstream.DeleteAllStreams(jsctx, &base.Cfg.Global.JetStream)
AddPublicRoutes(base, &syncUserAPI{accounts: []userapi.Device{alice}}, &syncRoomserverAPI{}, &syncKeyAPI{})
w := httptest.NewRecorder()
base.PublicClientAPIMux.ServeHTTP(w, test.NewRequest(t, "GET", "/_matrix/client/v3/sync", test.WithQueryParams(map[string]string{
"access_token": alice.AccessToken,
"timeout": "0",
"set_presence": "online",
})))
if w.Code != 200 {
t.Fatalf("got HTTP %d want %d", w.Code, 200)
}
var res types.Response
if err := json.NewDecoder(w.Body).Decode(&res); err != nil {
t.Errorf("failed to decode response body: %s", err)
}
if len(res.Presence.Events) != 1 {
t.Fatalf("expected 1 presence events, got: %+v", res.Presence.Events)
}
if res.Presence.Events[0].Sender != alice.UserID {
t.Errorf("sender: got %v want %v", res.Presence.Events[0].Sender, alice.UserID)
}
if res.Presence.Events[0].Type != "m.presence" {
t.Errorf("type: got %v want %v", res.Presence.Events[0].Type, "m.presence")
}
if gjson.ParseBytes(res.Presence.Events[0].Content).Get("presence").Str != "online" {
t.Errorf("content: not online, got %v", res.Presence.Events[0].Content)
}
}
func toNATSMsgs(t *testing.T, base *base.BaseDendrite, input []*gomatrixserverlib.HeaderedEvent) []*nats.Msg {
result := make([]*nats.Msg, len(input))
for i, ev := range input {