From 407f25008ca51e55d0946a45307f5377ebdf7880 Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Tue, 31 Oct 2023 11:47:44 +0100 Subject: [PATCH] Add another metric --- userapi/internal/device_list_update.go | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/userapi/internal/device_list_update.go b/userapi/internal/device_list_update.go index 6d537a4d7..a4d28188a 100644 --- a/userapi/internal/device_list_update.go +++ b/userapi/internal/device_list_update.go @@ -152,6 +152,15 @@ var deviceListUpdaterBackpressure = prometheus.NewGaugeVec( }, []string{"worker_id"}, ) +var deviceListUpdaterServersRetrying = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "dendrite", + Subsystem: "keyserver", + Name: "worker_servers_retrying", + Help: "How many servers are queued for retry", + }, + []string{"worker_id"}, +) // NewDeviceListUpdater creates a new updater which fetches fresh device lists when they go stale. func NewDeviceListUpdater( @@ -162,7 +171,7 @@ func NewDeviceListUpdater( enableMetrics bool, ) *DeviceListUpdater { if enableMetrics { - prometheus.MustRegister(deviceListUpdaterBackpressure) + prometheus.MustRegister(deviceListUpdaterBackpressure, deviceListUpdaterServersRetrying) } return &DeviceListUpdater{ process: process, @@ -188,7 +197,7 @@ func (u *DeviceListUpdater) Start() error { // to stop (in this transaction) until key requests can be made. ch := make(chan spec.ServerName, 10) u.workerChans[i] = ch - go u.worker(ch) + go u.worker(ch, i) } staleLists, err := u.db.StaleDeviceLists(u.process.Context(), []spec.ServerName{}) @@ -389,7 +398,7 @@ func (u *DeviceListUpdater) clearChannel(userID string) { } } -func (u *DeviceListUpdater) worker(ch chan spec.ServerName) { +func (u *DeviceListUpdater) worker(ch chan spec.ServerName, workerID int) { retries := make(map[spec.ServerName]time.Time) retriesMu := &sync.Mutex{} // restarter goroutine which will inject failed servers into ch when it is time @@ -408,9 +417,12 @@ func (u *DeviceListUpdater) worker(ch chan spec.ServerName) { for _, srv := range serversToRetry { delete(retries, srv) } + deviceListUpdaterServersRetrying.With(prometheus.Labels{"worker_id": strconv.Itoa(workerID)}).Set(float64(len(retries))) retriesMu.Unlock() for _, srv := range serversToRetry { + deviceListUpdaterBackpressure.With(prometheus.Labels{"worker_id": strconv.Itoa(workerID)}).Inc() ch <- srv + deviceListUpdaterBackpressure.With(prometheus.Labels{"worker_id": strconv.Itoa(workerID)}).Dec() } } }()