Track reasons why the process is in a degraded state

This commit is contained in:
Neil Alexander 2022-10-04 13:02:41 +01:00
parent a767102f8a
commit 3da182212e
No known key found for this signature in database
GPG key ID: A02A2019A2BB0944
3 changed files with 33 additions and 16 deletions

View file

@ -18,6 +18,7 @@ import (
"context"
"crypto/tls"
"database/sql"
"encoding/json"
"fmt"
"io"
"net"
@ -467,8 +468,13 @@ func (b *BaseDendrite) SetupAndServeHTTP(
w.WriteHeader(200)
})
b.DendriteAdminMux.HandleFunc("/monitor/health", func(w http.ResponseWriter, r *http.Request) {
if b.ProcessContext.IsDegraded() {
if isDegraded, reasons := b.ProcessContext.IsDegraded(); isDegraded {
w.WriteHeader(503)
_ = json.NewEncoder(w).Encode(struct {
Warnings []string `json:"warnings"`
}{
Warnings: reasons,
})
return
}
w.WriteHeader(200)

View file

@ -169,9 +169,9 @@ func setupNATS(process *process.ProcessContext, cfg *config.JetStream, nc *natsc
// We've managed to add the stream in memory. What's on the
// disk will be left alone, but our ability to recover from a
// future crash will be limited. Yell about it.
sentry.CaptureException(fmt.Errorf("Stream %q is running in-memory; this may be due to data corruption in the JetStream storage directory, investigate as soon as possible", namespaced.Name))
logrus.Warn("Stream is running in-memory; this may be due to data corruption in the JetStream storage directory, investigate as soon as possible")
process.Degraded()
err := fmt.Errorf("Stream %q is running in-memory; this may be due to data corruption in the JetStream storage directory", namespaced.Name)
sentry.CaptureException(err)
process.Degraded(err)
}
}
}

View file

@ -2,19 +2,18 @@ package process
import (
"context"
"fmt"
"sync"
"github.com/getsentry/sentry-go"
"github.com/sirupsen/logrus"
"go.uber.org/atomic"
)
type ProcessContext struct {
wg *sync.WaitGroup // used to wait for components to shutdown
ctx context.Context // cancelled when Stop is called
shutdown context.CancelFunc // shut down Dendrite
degraded atomic.Bool
mu sync.RWMutex
wg *sync.WaitGroup // used to wait for components to shutdown
ctx context.Context // cancelled when Stop is called
shutdown context.CancelFunc // shut down Dendrite
degraded map[string]struct{} // reasons why the process is degraded
}
func NewProcessContext() *ProcessContext {
@ -50,13 +49,25 @@ func (b *ProcessContext) WaitForComponentsToFinish() {
b.wg.Wait()
}
func (b *ProcessContext) Degraded() {
if b.degraded.CompareAndSwap(false, true) {
logrus.Warn("Dendrite is running in a degraded state")
sentry.CaptureException(fmt.Errorf("Process is running in a degraded state"))
func (b *ProcessContext) Degraded(err error) {
b.mu.Lock()
defer b.mu.Unlock()
if _, ok := b.degraded[err.Error()]; !ok {
logrus.WithError(err).Warn("Dendrite has entered a degraded state")
sentry.CaptureException(err)
b.degraded[err.Error()] = struct{}{}
}
}
func (b *ProcessContext) IsDegraded() bool {
return b.degraded.Load()
func (b *ProcessContext) IsDegraded() (bool, []string) {
b.mu.RLock()
defer b.mu.RUnlock()
if len(b.degraded) == 0 {
return false, nil
}
reasons := make([]string, 0, len(b.degraded))
for reason := range b.degraded {
reasons = append(reasons, reason)
}
return true, reasons
}