From 5536fec902948e5c184f02497d87115894e10c6a Mon Sep 17 00:00:00 2001 From: Robert Swain <rob@matrix.org> Date: Mon, 22 May 2017 10:24:03 +0200 Subject: [PATCH] mediaapi/writers: Add base64hash to media_repository table A SHA-256 hash sum in golang base64 URLEncoding format (contains only URL-safe characters) is now calculated and stored for every file transferred to this server. Uploads to the server use this hash as the MediaID. Downloads from remote servers retain their MediaID from the remote server, but can use the hash for local deduplication and integrity checking purposes. --- .../mediaapi/storage/media_repository_table.go | 10 +++++++--- .../matrix-org/dendrite/mediaapi/types/types.go | 4 ++++ .../matrix-org/dendrite/mediaapi/writers/download.go | 2 ++ .../matrix-org/dendrite/mediaapi/writers/upload.go | 2 ++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/github.com/matrix-org/dendrite/mediaapi/storage/media_repository_table.go b/src/github.com/matrix-org/dendrite/mediaapi/storage/media_repository_table.go index 31846cf48..c19a9d9b3 100644 --- a/src/github.com/matrix-org/dendrite/mediaapi/storage/media_repository_table.go +++ b/src/github.com/matrix-org/dendrite/mediaapi/storage/media_repository_table.go @@ -42,6 +42,8 @@ CREATE TABLE IF NOT EXISTS media_repository ( creation_ts BIGINT NOT NULL, -- The file name with which the media was uploaded. upload_name TEXT NOT NULL, + -- A golang base64 URLEncoding string representation of a SHA-256 hash sum of the file data. + base64hash TEXT NOT NULL, -- The user who uploaded the file. Should be a Matrix user ID. user_id TEXT NOT NULL ); @@ -49,12 +51,12 @@ CREATE UNIQUE INDEX IF NOT EXISTS media_repository_index ON media_repository (me ` const insertMediaSQL = ` -INSERT INTO media_repository (media_id, media_origin, content_type, content_disposition, content_length, creation_ts, upload_name, user_id) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8) +INSERT INTO media_repository (media_id, media_origin, content_type, content_disposition, content_length, creation_ts, upload_name, base64hash, user_id) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) ` const selectMediaSQL = ` -SELECT content_type, content_disposition, content_length, creation_ts, upload_name, user_id FROM media_repository WHERE media_id = $1 AND media_origin = $2 +SELECT content_type, content_disposition, content_length, creation_ts, upload_name, base64hash, user_id FROM media_repository WHERE media_id = $1 AND media_origin = $2 ` type mediaStatements struct { @@ -84,6 +86,7 @@ func (s *mediaStatements) insertMedia(mediaMetadata *types.MediaMetadata) error mediaMetadata.ContentLength, mediaMetadata.CreationTimestamp, mediaMetadata.UploadName, + mediaMetadata.Base64Hash, mediaMetadata.UserID, ) return err @@ -102,6 +105,7 @@ func (s *mediaStatements) selectMedia(mediaID types.MediaID, mediaOrigin gomatri &mediaMetadata.ContentLength, &mediaMetadata.CreationTimestamp, &mediaMetadata.UploadName, + &mediaMetadata.Base64Hash, &mediaMetadata.UserID, ) return &mediaMetadata, err diff --git a/src/github.com/matrix-org/dendrite/mediaapi/types/types.go b/src/github.com/matrix-org/dendrite/mediaapi/types/types.go index 0da5b1017..bef86adcc 100644 --- a/src/github.com/matrix-org/dendrite/mediaapi/types/types.go +++ b/src/github.com/matrix-org/dendrite/mediaapi/types/types.go @@ -32,6 +32,9 @@ type ContentType string // Filename is a string representing the name of a file type Filename string +// Base64Hash is a base64 URLEncoding string representation of a SHA-256 hash sum +type Base64Hash string + // Path is an absolute or relative UNIX filesystem path type Path string @@ -56,6 +59,7 @@ type MediaMetadata struct { ContentLength ContentLength CreationTimestamp UnixMs UploadName Filename + Base64Hash Base64Hash UserID MatrixUserID } diff --git a/src/github.com/matrix-org/dendrite/mediaapi/writers/download.go b/src/github.com/matrix-org/dendrite/mediaapi/writers/download.go index 58c9b3adc..4dec2452b 100644 --- a/src/github.com/matrix-org/dendrite/mediaapi/writers/download.go +++ b/src/github.com/matrix-org/dendrite/mediaapi/writers/download.go @@ -332,6 +332,7 @@ func (r *downloadRequest) commitFileAndMetadata(tmpDir types.Path, absBasePath t r.Logger.WithFields(log.Fields{ "MediaID": r.MediaMetadata.MediaID, "Origin": r.MediaMetadata.Origin, + "Base64Hash": r.MediaMetadata.Base64Hash, "UploadName": r.MediaMetadata.UploadName, "Content-Length": r.MediaMetadata.ContentLength, "Content-Type": r.MediaMetadata.ContentType, @@ -483,6 +484,7 @@ func (r *downloadRequest) respondFromRemoteFile(w http.ResponseWriter, absBasePa // request's response. bytesWritten is therefore used as it is what would be sent to clients when reading from the local // file. r.MediaMetadata.ContentLength = types.ContentLength(bytesWritten) + r.MediaMetadata.Base64Hash = hash r.MediaMetadata.UserID = types.MatrixUserID("@:" + string(r.MediaMetadata.Origin)) updateActiveRemoteRequests = r.commitFileAndMetadata(tmpDir, absBasePath, activeRemoteRequests, db, mxcURL) diff --git a/src/github.com/matrix-org/dendrite/mediaapi/writers/upload.go b/src/github.com/matrix-org/dendrite/mediaapi/writers/upload.go index dc886353f..92a80f2e2 100644 --- a/src/github.com/matrix-org/dendrite/mediaapi/writers/upload.go +++ b/src/github.com/matrix-org/dendrite/mediaapi/writers/upload.go @@ -220,11 +220,13 @@ func Upload(req *http.Request, cfg *config.MediaAPI, db *storage.Database) util. } r.MediaMetadata.ContentLength = bytesWritten + r.MediaMetadata.Base64Hash = hash r.MediaMetadata.MediaID = types.MediaID(hash) logger.WithFields(log.Fields{ "MediaID": r.MediaMetadata.MediaID, "Origin": r.MediaMetadata.Origin, + "Base64Hash": r.MediaMetadata.Base64Hash, "UploadName": r.MediaMetadata.UploadName, "Content-Length": r.MediaMetadata.ContentLength, "Content-Type": r.MediaMetadata.ContentType,