Roomserver input API queuing using NATS

This commit is contained in:
Neil Alexander 2021-11-03 11:41:51 +00:00
parent 9eb89515d2
commit 6b835b83bf
No known key found for this signature in database
GPG key ID: A02A2019A2BB0944
14 changed files with 88 additions and 182 deletions

View file

@ -58,7 +58,7 @@ func NewInternalAPI(
}, },
}, },
} }
consumer, _ := jetstream.SetupConsumerProducer(&base.Cfg.Global.JetStream) _, consumer, _ := jetstream.SetupConsumerProducer(&base.Cfg.Global.JetStream)
// Create a connection to the appservice postgres DB // Create a connection to the appservice postgres DB
appserviceDB, err := storage.NewDatabase(&base.Cfg.AppServiceAPI.Database) appserviceDB, err := storage.NewDatabase(&base.Cfg.AppServiceAPI.Database)

View file

@ -49,7 +49,7 @@ func AddPublicRoutes(
extRoomsProvider api.ExtraPublicRoomsProvider, extRoomsProvider api.ExtraPublicRoomsProvider,
mscCfg *config.MSCs, mscCfg *config.MSCs,
) { ) {
_, producer := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream) _, _, producer := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream)
syncProducer := &producers.SyncAPIProducer{ syncProducer := &producers.SyncAPIProducer{
Producer: producer, Producer: producer,

View file

@ -42,7 +42,7 @@ func NewInternalAPI(
) api.EDUServerInputAPI { ) api.EDUServerInputAPI {
cfg := &base.Cfg.EDUServer cfg := &base.Cfg.EDUServer
_, producer := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream) _, _, producer := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream)
return &input.EDUServerInputAPI{ return &input.EDUServerInputAPI{
Cache: eduCache, Cache: eduCache,

View file

@ -61,7 +61,7 @@ func NewInternalAPI(
FailuresUntilBlacklist: cfg.FederationMaxRetries, FailuresUntilBlacklist: cfg.FederationMaxRetries,
} }
consumer, _ := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream) _, consumer, _ := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream)
queues := queue.NewOutgoingQueues( queues := queue.NewOutgoingQueues(
federationSenderDB, base.ProcessContext, federationSenderDB, base.ProcessContext,

1
go.mod
View file

@ -6,6 +6,7 @@ replace github.com/nats-io/nats.go => github.com/neilalexander/nats.go v1.11.1-0
require ( require (
github.com/Arceliar/ironwood v0.0.0-20210619124114-6ad55cae5031 github.com/Arceliar/ironwood v0.0.0-20210619124114-6ad55cae5031
github.com/Arceliar/phony v0.0.0-20210209235338-dde1a8dca979
github.com/DATA-DOG/go-sqlmock v1.5.0 github.com/DATA-DOG/go-sqlmock v1.5.0
github.com/HdrHistogram/hdrhistogram-go v1.0.1 // indirect github.com/HdrHistogram/hdrhistogram-go v1.0.1 // indirect
github.com/Masterminds/semver/v3 v3.1.1 github.com/Masterminds/semver/v3 v3.1.1

View file

@ -40,7 +40,7 @@ func AddInternalRoutes(router *mux.Router, intAPI api.KeyInternalAPI) {
func NewInternalAPI( func NewInternalAPI(
base *setup.BaseDendrite, cfg *config.KeyServer, fedClient fedsenderapi.FederationClient, base *setup.BaseDendrite, cfg *config.KeyServer, fedClient fedsenderapi.FederationClient,
) api.KeyInternalAPI { ) api.KeyInternalAPI {
consumer, producer := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream) _, consumer, producer := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream)
db, err := storage.NewDatabase(&cfg.Database) db, err := storage.NewDatabase(&cfg.Database)
if err != nil { if err != nil {

View file

@ -16,6 +16,8 @@ import (
"github.com/matrix-org/dendrite/roomserver/storage" "github.com/matrix-org/dendrite/roomserver/storage"
"github.com/matrix-org/dendrite/setup/config" "github.com/matrix-org/dendrite/setup/config"
"github.com/matrix-org/gomatrixserverlib" "github.com/matrix-org/gomatrixserverlib"
"github.com/nats-io/nats.go"
"github.com/sirupsen/logrus"
) )
// RoomserverInternalAPI is an implementation of api.RoomserverInternalAPI // RoomserverInternalAPI is an implementation of api.RoomserverInternalAPI
@ -39,13 +41,15 @@ type RoomserverInternalAPI struct {
KeyRing gomatrixserverlib.JSONVerifier KeyRing gomatrixserverlib.JSONVerifier
fsAPI fsAPI.FederationSenderInternalAPI fsAPI fsAPI.FederationSenderInternalAPI
asAPI asAPI.AppServiceQueryAPI asAPI asAPI.AppServiceQueryAPI
OutputRoomEventTopic string // Kafka topic for new output room events InputRoomEventTopic string // JetStream topic for new input room events
OutputRoomEventTopic string // JetStream topic for new output room events
PerspectiveServerNames []gomatrixserverlib.ServerName PerspectiveServerNames []gomatrixserverlib.ServerName
} }
func NewRoomserverAPI( func NewRoomserverAPI(
cfg *config.RoomServer, roomserverDB storage.Database, producer sarama.SyncProducer, cfg *config.RoomServer, roomserverDB storage.Database,
outputRoomEventTopic string, caches caching.RoomServerCaches, consumer nats.JetStreamContext, producer sarama.SyncProducer,
inputRoomEventTopic, outputRoomEventTopic string, caches caching.RoomServerCaches,
keyRing gomatrixserverlib.JSONVerifier, perspectiveServerNames []gomatrixserverlib.ServerName, keyRing gomatrixserverlib.JSONVerifier, perspectiveServerNames []gomatrixserverlib.ServerName,
) *RoomserverInternalAPI { ) *RoomserverInternalAPI {
serverACLs := acls.NewServerACLs(roomserverDB) serverACLs := acls.NewServerACLs(roomserverDB)
@ -64,13 +68,18 @@ func NewRoomserverAPI(
}, },
Inputer: &input.Inputer{ Inputer: &input.Inputer{
DB: roomserverDB, DB: roomserverDB,
InputRoomEventTopic: inputRoomEventTopic,
OutputRoomEventTopic: outputRoomEventTopic, OutputRoomEventTopic: outputRoomEventTopic,
Consumer: consumer,
Producer: producer, Producer: producer,
ServerName: cfg.Matrix.ServerName, ServerName: cfg.Matrix.ServerName,
ACLs: serverACLs, ACLs: serverACLs,
}, },
// perform-er structs get initialised when we have a federation sender to use // perform-er structs get initialised when we have a federation sender to use
} }
if err := a.Inputer.Start(); err != nil {
logrus.WithError(err).Panic("failed to start roomserver input API")
}
return a return a
} }

View file

@ -19,8 +19,8 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"sync" "sync"
"time"
"github.com/Arceliar/phony"
"github.com/Shopify/sarama" "github.com/Shopify/sarama"
"github.com/getsentry/sentry-go" "github.com/getsentry/sentry-go"
"github.com/matrix-org/dendrite/internal/hooks" "github.com/matrix-org/dendrite/internal/hooks"
@ -28,10 +28,10 @@ import (
"github.com/matrix-org/dendrite/roomserver/api" "github.com/matrix-org/dendrite/roomserver/api"
"github.com/matrix-org/dendrite/roomserver/storage" "github.com/matrix-org/dendrite/roomserver/storage"
"github.com/matrix-org/gomatrixserverlib" "github.com/matrix-org/gomatrixserverlib"
"github.com/nats-io/nats.go"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"github.com/tidwall/gjson" "github.com/tidwall/gjson"
"go.uber.org/atomic"
) )
var keyContentFields = map[string]string{ var keyContentFields = map[string]string{
@ -42,48 +42,62 @@ var keyContentFields = map[string]string{
type Inputer struct { type Inputer struct {
DB storage.Database DB storage.Database
Consumer nats.JetStreamContext
Producer sarama.SyncProducer Producer sarama.SyncProducer
ServerName gomatrixserverlib.ServerName ServerName gomatrixserverlib.ServerName
ACLs *acls.ServerACLs ACLs *acls.ServerACLs
InputRoomEventTopic string
OutputRoomEventTopic string OutputRoomEventTopic string
workers sync.Map // room ID -> *inputWorker workers sync.Map // room ID -> *phony.Inbox
} }
type inputTask struct { // onMessage is called when a new event arrives in the roomserver input stream.
ctx context.Context func (r *Inputer) Start() error {
event *api.InputRoomEvent _, err := r.Consumer.Subscribe(
wg *sync.WaitGroup r.InputRoomEventTopic,
err error // written back by worker, only safe to read when all tasks are done func(msg *nats.Msg) {
} _ = msg.InProgress()
var inputRoomEvent api.InputRoomEvent
type inputWorker struct { if err := json.Unmarshal(msg.Data, &inputRoomEvent); err != nil {
r *Inputer _ = msg.Nak()
running atomic.Bool return
input *fifoQueue
}
// Guarded by a CAS on w.running
func (w *inputWorker) start() {
defer w.running.Store(false)
for {
select {
case <-w.input.wait():
task, ok := w.input.pop()
if !ok {
continue
} }
roomserverInputBackpressure.With(prometheus.Labels{ inbox, _ := r.workers.LoadOrStore(msg.Header.Get("room_id"), &phony.Inbox{})
"room_id": task.event.Event.RoomID(), inbox.(*phony.Inbox).Act(nil, func() {
}).Dec() if _, err := r.processRoomEvent(context.TODO(), &inputRoomEvent); err != nil {
hooks.Run(hooks.KindNewEventReceived, task.event.Event) sentry.CaptureException(err)
_, task.err = w.r.processRoomEvent(task.ctx, task.event) _ = msg.Nak()
if task.err == nil {
hooks.Run(hooks.KindNewEventPersisted, task.event.Event)
} else { } else {
sentry.CaptureException(task.err) hooks.Run(hooks.KindNewEventPersisted, inputRoomEvent.Event)
_ = msg.Ack()
} }
task.wg.Done() })
case <-time.After(time.Second * 5): },
nats.ManualAck(),
)
return err
}
// InputRoomEvents implements api.RoomserverInternalAPI
func (r *Inputer) InputRoomEvents(
_ context.Context,
request *api.InputRoomEventsRequest,
response *api.InputRoomEventsResponse,
) {
var err error
for _, e := range request.InputRoomEvents {
msg := &nats.Msg{
Subject: r.InputRoomEventTopic,
Header: nats.Header{},
}
msg.Header.Set("room_id", e.Event.RoomID())
msg.Data, err = json.Marshal(e)
if err != nil {
response.ErrMsg = err.Error()
return
}
if _, err = r.Consumer.PublishMsg(msg); err != nil {
response.ErrMsg = err.Error()
return return
} }
} }
@ -156,67 +170,3 @@ var roomserverInputBackpressure = prometheus.NewGaugeVec(
}, },
[]string{"room_id"}, []string{"room_id"},
) )
// InputRoomEvents implements api.RoomserverInternalAPI
func (r *Inputer) InputRoomEvents(
_ context.Context,
request *api.InputRoomEventsRequest,
response *api.InputRoomEventsResponse,
) {
// Create a wait group. Each task that we dispatch will call Done on
// this wait group so that we know when all of our events have been
// processed.
wg := &sync.WaitGroup{}
wg.Add(len(request.InputRoomEvents))
tasks := make([]*inputTask, len(request.InputRoomEvents))
for i, e := range request.InputRoomEvents {
// Work out if we are running per-room workers or if we're just doing
// it on a global basis (e.g. SQLite).
roomID := "global"
if r.DB.SupportsConcurrentRoomInputs() {
roomID = e.Event.RoomID()
}
// Look up the worker, or create it if it doesn't exist. This channel
// is buffered to reduce the chance that we'll be blocked by another
// room - the channel will be quite small as it's just pointer types.
w, _ := r.workers.LoadOrStore(roomID, &inputWorker{
r: r,
input: newFIFOQueue(),
})
worker := w.(*inputWorker)
// Create a task. This contains the input event and a reference to
// the wait group, so that the worker can notify us when this specific
// task has been finished.
tasks[i] = &inputTask{
ctx: context.Background(),
event: &request.InputRoomEvents[i],
wg: wg,
}
// Send the task to the worker.
if worker.running.CAS(false, true) {
go worker.start()
}
worker.input.push(tasks[i])
roomserverInputBackpressure.With(prometheus.Labels{
"room_id": roomID,
}).Inc()
}
// Wait for all of the workers to return results about our tasks.
wg.Wait()
// If any of the tasks returned an error, we should probably report
// that back to the caller.
for _, task := range tasks {
if task.err != nil {
response.ErrMsg = task.err.Error()
_, rejected := task.err.(*gomatrixserverlib.NotAllowed)
response.NotAllowed = rejected
return
}
}
}

View file

@ -1,64 +0,0 @@
package input
import (
"sync"
)
type fifoQueue struct {
tasks []*inputTask
count int
mutex sync.Mutex
notifs chan struct{}
}
func newFIFOQueue() *fifoQueue {
q := &fifoQueue{
notifs: make(chan struct{}, 1),
}
return q
}
func (q *fifoQueue) push(frame *inputTask) {
q.mutex.Lock()
defer q.mutex.Unlock()
q.tasks = append(q.tasks, frame)
q.count++
select {
case q.notifs <- struct{}{}:
default:
}
}
// pop returns the first item of the queue, if there is one.
// The second return value will indicate if a task was returned.
// You must check this value, even after calling wait().
func (q *fifoQueue) pop() (*inputTask, bool) {
q.mutex.Lock()
defer q.mutex.Unlock()
if q.count == 0 {
return nil, false
}
frame := q.tasks[0]
q.tasks[0] = nil
q.tasks = q.tasks[1:]
q.count--
if q.count == 0 {
// Force a GC of the underlying array, since it might have
// grown significantly if the queue was hammered for some reason
q.tasks = nil
}
return frame, true
}
// wait returns a channel which can be used to detect when an
// item is waiting in the queue.
func (q *fifoQueue) wait() <-chan struct{} {
q.mutex.Lock()
defer q.mutex.Unlock()
if q.count > 0 && len(q.notifs) == 0 {
ch := make(chan struct{})
close(ch)
return ch
}
return q.notifs
}

View file

@ -41,7 +41,7 @@ func NewInternalAPI(
) api.RoomserverInternalAPI { ) api.RoomserverInternalAPI {
cfg := &base.Cfg.RoomServer cfg := &base.Cfg.RoomServer
_, producer := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream) js, _, producer := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream)
var perspectiveServerNames []gomatrixserverlib.ServerName var perspectiveServerNames []gomatrixserverlib.ServerName
for _, kp := range base.Cfg.SigningKeyServer.KeyPerspectives { for _, kp := range base.Cfg.SigningKeyServer.KeyPerspectives {
@ -54,7 +54,9 @@ func NewInternalAPI(
} }
return internal.NewRoomserverAPI( return internal.NewRoomserverAPI(
cfg, roomserverDB, producer, string(cfg.Matrix.JetStream.TopicFor(jetstream.OutputRoomEvent)), cfg, roomserverDB, js, producer,
cfg.Matrix.JetStream.TopicFor(jetstream.InputRoomEvent),
cfg.Matrix.JetStream.TopicFor(jetstream.OutputRoomEvent),
base.Caches, keyRing, perspectiveServerNames, base.Caches, keyRing, perspectiveServerNames,
) )
} }

View file

@ -181,7 +181,9 @@ func mustCreateRoomserverAPI(t *testing.T) (api.RoomserverInternalAPI, *dummyPro
logrus.WithError(err).Panicf("failed to connect to room server db") logrus.WithError(err).Panicf("failed to connect to room server db")
} }
return internal.NewRoomserverAPI( return internal.NewRoomserverAPI(
&cfg.RoomServer, roomserverDB, dp, string(cfg.Global.JetStream.TopicFor(jetstream.OutputRoomEvent)), &cfg.RoomServer, roomserverDB, dp,
cfg.Global.JetStream.TopicFor(jetstream.InputRoomEvent),
cfg.Global.JetStream.TopicFor(jetstream.OutputRoomEvent),
base.Caches, &test.NopJSONVerifier{}, nil, base.Caches, &test.NopJSONVerifier{}, nil,
), dp ), dp
} }

View file

@ -18,7 +18,7 @@ import (
var natsServer *natsserver.Server var natsServer *natsserver.Server
var natsServerMutex sync.Mutex var natsServerMutex sync.Mutex
func SetupConsumerProducer(cfg *config.JetStream) (sarama.Consumer, sarama.SyncProducer) { func SetupConsumerProducer(cfg *config.JetStream) (nats.JetStreamContext, sarama.Consumer, sarama.SyncProducer) {
// check if we need an in-process NATS Server // check if we need an in-process NATS Server
if len(cfg.Addresses) != 0 { if len(cfg.Addresses) != 0 {
return setupNATS(cfg, nil) return setupNATS(cfg, nil)
@ -51,20 +51,20 @@ func SetupConsumerProducer(cfg *config.JetStream) (sarama.Consumer, sarama.SyncP
return setupNATS(cfg, nc) return setupNATS(cfg, nc)
} }
func setupNATS(cfg *config.JetStream, nc *natsclient.Conn) (sarama.Consumer, sarama.SyncProducer) { func setupNATS(cfg *config.JetStream, nc *natsclient.Conn) (nats.JetStreamContext, sarama.Consumer, sarama.SyncProducer) {
if nc == nil { if nc == nil {
var err error var err error
nc, err = nats.Connect(strings.Join(cfg.Addresses, ",")) nc, err = nats.Connect(strings.Join(cfg.Addresses, ","))
if err != nil { if err != nil {
logrus.WithError(err).Panic("Unable to connect to NATS") logrus.WithError(err).Panic("Unable to connect to NATS")
return nil, nil return nil, nil, nil
} }
} }
s, err := nc.JetStream() s, err := nc.JetStream()
if err != nil { if err != nil {
logrus.WithError(err).Panic("Unable to get JetStream context") logrus.WithError(err).Panic("Unable to get JetStream context")
return nil, nil return nil, nil, nil
} }
for _, stream := range streams { for _, stream := range streams {
@ -89,5 +89,5 @@ func setupNATS(cfg *config.JetStream, nc *natsclient.Conn) (sarama.Consumer, sar
consumer := saramajs.NewJetStreamConsumer(nc, s, "") consumer := saramajs.NewJetStreamConsumer(nc, s, "")
producer := saramajs.NewJetStreamProducer(nc, s, "") producer := saramajs.NewJetStreamProducer(nc, s, "")
return consumer, producer return s, consumer, producer
} }

View file

@ -7,6 +7,7 @@ import (
) )
var ( var (
InputRoomEvent = "InputRoomEvent"
OutputRoomEvent = "OutputRoomEvent" OutputRoomEvent = "OutputRoomEvent"
OutputSendToDeviceEvent = "OutputSendToDeviceEvent" OutputSendToDeviceEvent = "OutputSendToDeviceEvent"
OutputKeyChangeEvent = "OutputKeyChangeEvent" OutputKeyChangeEvent = "OutputKeyChangeEvent"
@ -16,6 +17,11 @@ var (
) )
var streams = []*nats.StreamConfig{ var streams = []*nats.StreamConfig{
{
Name: InputRoomEvent,
Retention: nats.InterestPolicy,
Storage: nats.FileStorage,
},
{ {
Name: OutputRoomEvent, Name: OutputRoomEvent,
Retention: nats.InterestPolicy, Retention: nats.InterestPolicy,

View file

@ -48,7 +48,7 @@ func AddPublicRoutes(
federation *gomatrixserverlib.FederationClient, federation *gomatrixserverlib.FederationClient,
cfg *config.SyncAPI, cfg *config.SyncAPI,
) { ) {
consumer, _ := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream) _, consumer, _ := jetstream.SetupConsumerProducer(&cfg.Matrix.JetStream)
syncDB, err := storage.NewSyncServerDatasource(&cfg.Database) syncDB, err := storage.NewSyncServerDatasource(&cfg.Database)
if err != nil { if err != nil {