Add more logs (#2005)

* Add more logs

To help debug the migration issue in #1924, log the affected row along with instructions for a manual fix (which causes data loss).
Also log the origin server for processed transactions to help debug misbehaving origin servers.

* Fix query
This commit is contained in:
kegsay 2021-09-07 15:07:14 +01:00 committed by GitHub
parent 51b119107c
commit 7dc8fb1fe7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 39 additions and 7 deletions

View file

@ -345,7 +345,7 @@ func (t *txnReq) processTransaction(ctx context.Context) (*gomatrixserverlib.Res
} }
if c := len(results); c > 0 { if c := len(results); c > 0 {
util.GetLogger(ctx).Infof("Processed %d PDUs from transaction %q", c, t.TransactionID) util.GetLogger(ctx).Infof("Processed %d PDUs from %v in transaction %q", c, t.Origin, t.TransactionID)
} }
return &gomatrixserverlib.RespSend{PDUs: results}, nil return &gomatrixserverlib.RespSend{PDUs: results}, nil
} }

View file

@ -220,7 +220,6 @@ func UpStateBlocksRefactor(tx *sql.Tx) error {
index := stateSnapshotData{snapshot.StateSnapshotNID, snapshot.RoomNID} index := stateSnapshotData{snapshot.StateSnapshotNID, snapshot.RoomNID}
newsnapshots[index] = append(newsnapshots[index], blocknid) newsnapshots[index] = append(newsnapshots[index], blocknid)
} }
for snapshotdata, newblocks := range newsnapshots { for snapshotdata, newblocks := range newsnapshots {
var newblocksarray pq.Int64Array var newblocksarray pq.Int64Array
for _, b := range newblocks { for _, b := range newblocks {
@ -252,16 +251,49 @@ func UpStateBlocksRefactor(tx *sql.Tx) error {
// If we do, this is a problem if Dendrite tries to load the snapshot as it will not exist // If we do, this is a problem if Dendrite tries to load the snapshot as it will not exist
// in roomserver_state_snapshots // in roomserver_state_snapshots
var count int64 var count int64
if err = tx.QueryRow(`SELECT COUNT(*) FROM roomserver_events WHERE state_snapshot_nid < $1 AND state_snapshot_nid != 0`, maxsnapshotid).Scan(&count); err != nil { if err = tx.QueryRow(`SELECT COUNT(*) FROM roomserver_events WHERE state_snapshot_nid < $1 AND state_snapshot_nid != 0`, maxsnapshotid).Scan(&count); err != nil {
return fmt.Errorf("assertion query failed: %s", err) return fmt.Errorf("assertion query failed: %s", err)
} }
if count > 0 { if count > 0 {
var debugEventID, debugRoomID string
var debugEventTypeNID, debugStateKeyNID, debugSnapNID, debugDepth int64
err = tx.QueryRow(
`SELECT event_id, event_type_nid, event_state_key_nid, roomserver_events.state_snapshot_nid, depth, room_id FROM roomserver_events
JOIN roomserver_rooms ON roomserver_rooms.room_nid = roomserver_events.room_nid WHERE roomserver_events.state_snapshot_nid < $1 AND roomserver_events.state_snapshot_nid != 0`, maxsnapshotid,
).Scan(&debugEventID, &debugEventTypeNID, &debugStateKeyNID, &debugSnapNID, &debugDepth, &debugRoomID)
if err != nil {
logrus.Errorf("cannot extract debug info: %v", err)
} else {
logrus.Errorf(
"Affected row: event_id=%v room_id=%v type=%v state_key=%v snapshot=%v depth=%v",
debugEventID, debugRoomID, debugEventTypeNID, debugStateKeyNID, debugSnapNID, debugDepth,
)
logrus.Errorf("To fix this manually, run this query first then retry the migration: "+
"UPDATE roomserver_events SET state_snapshot_nid=0 WHERE event_id='%v'", debugEventID)
}
return fmt.Errorf("%d events exist in roomserver_events which have not been converted to a new state_snapshot_nid; this is a bug, please report", count) return fmt.Errorf("%d events exist in roomserver_events which have not been converted to a new state_snapshot_nid; this is a bug, please report", count)
} }
if err = tx.QueryRow(`SELECT COUNT(*) FROM roomserver_rooms WHERE state_snapshot_nid < $1 AND state_snapshot_nid != 0`, maxsnapshotid).Scan(&count); err != nil { if err = tx.QueryRow(`SELECT COUNT(*) FROM roomserver_rooms WHERE state_snapshot_nid < $1 AND state_snapshot_nid != 0`, maxsnapshotid).Scan(&count); err != nil {
return fmt.Errorf("assertion query failed: %s", err) return fmt.Errorf("assertion query failed: %s", err)
} }
if count > 0 { if count > 0 {
var debugRoomID string
var debugSnapNID, debugLastEventNID int64
err = tx.QueryRow(
`SELECT room_id, state_snapshot_nid, last_event_sent_nid FROM roomserver_rooms WHERE state_snapshot_nid < $1 AND state_snapshot_nid != 0`, maxsnapshotid,
).Scan(&debugRoomID, &debugSnapNID, &debugLastEventNID)
if err != nil {
logrus.Errorf("cannot extract debug info: %v", err)
} else {
logrus.Errorf(
"Affected row: room_id=%v snapshot=%v last_sent=%v",
debugRoomID, debugSnapNID, debugLastEventNID,
)
logrus.Errorf("To fix this manually, run this query first then retry the migration: "+
"UPDATE roomserver_rooms SET state_snapshot_nid=0 WHERE room_id='%v'", debugRoomID)
logrus.Errorf("Running this UPDATE will cause the room in question to become unavailable on this server. Leave and re-join the room afterwards.")
}
return fmt.Errorf("%d rooms exist in roomserver_rooms which have not been converted to a new state_snapshot_nid; this is a bug, please report", count) return fmt.Errorf("%d rooms exist in roomserver_rooms which have not been converted to a new state_snapshot_nid; this is a bug, please report", count)
} }