diff --git a/src/github.com/matrix-org/dendrite/mediaapi/writers/download.go b/src/github.com/matrix-org/dendrite/mediaapi/writers/download.go
index 259e6ad89..ab713fa5f 100644
--- a/src/github.com/matrix-org/dendrite/mediaapi/writers/download.go
+++ b/src/github.com/matrix-org/dendrite/mediaapi/writers/download.go
@@ -341,21 +341,14 @@ func (r *downloadRequest) commitFileAndMetadata(tmpDir types.Path, absBasePath t
 	}).Info("Storing file metadata to media repository database")
 
 	// The database is the source of truth so we need to have moved the file first
-	finalPath, err := getPathFromMediaMetadata(r.MediaMetadata, absBasePath)
+	finalPath, duplicate, err := moveFileWithHashCheck(tmpDir, r.MediaMetadata, absBasePath, r.Logger)
 	if err != nil {
-		r.Logger.WithError(err).Warn("Failed to get file path from metadata")
-		removeDir(tmpDir, r.Logger)
+		r.Logger.WithError(err).Error("Failed to move file.")
 		return updateActiveRemoteRequests
 	}
-
-	err = moveFile(
-		types.Path(path.Join(string(tmpDir), "content")),
-		types.Path(finalPath),
-	)
-	if err != nil {
-		r.Logger.WithError(err).WithField("dst", finalPath).Warn("Failed to move file to final destination")
-		removeDir(tmpDir, r.Logger)
-		return updateActiveRemoteRequests
+	if duplicate {
+		r.Logger.WithField("dst", finalPath).Info("File was stored previously - discarding duplicate")
+		// Continue on to store the metadata in the database
 	}
 
 	// Writing the metadata to the media repository database and removing the mxcURL from activeRemoteRequests needs to be atomic.
@@ -369,8 +362,13 @@
 	// if written to disk, add to db
 	err = db.StoreMediaMetadata(r.MediaMetadata)
 	if err != nil {
-		finalDir := path.Dir(finalPath)
-		removeDir(types.Path(finalDir), r.Logger)
+		// If the file is a duplicate (has the same hash as an existing file) then
+		// there is valid metadata in the database for that file. As such we only
+		// remove the file if it is not a duplicate.
+		if !duplicate {
+			finalDir := path.Dir(finalPath)
+			removeDir(types.Path(finalDir), r.Logger)
+		}
 		completeRemoteRequest(activeRemoteRequests, mxcURL)
 		return updateActiveRemoteRequests
 	}
diff --git a/src/github.com/matrix-org/dendrite/mediaapi/writers/fileutils.go b/src/github.com/matrix-org/dendrite/mediaapi/writers/fileutils.go
index 0517bf7ca..d99167545 100644
--- a/src/github.com/matrix-org/dendrite/mediaapi/writers/fileutils.go
+++ b/src/github.com/matrix-org/dendrite/mediaapi/writers/fileutils.go
@@ -250,3 +250,38 @@ func moveFile(src types.Path, dst types.Path) error {
 	}
 	return nil
 }
+
+// moveFileWithHashCheck attempts to move the file in tmpDir to its final path, which is
+// derived from the hash in the metadata, and checks for hash collisions. If the destination
+// already exists and the content length differs, there is a hash collision between two
+// different files. If it exists and the content length matches, it is assumed to be the
+// same file and the temporary file is simply discarded.
+func moveFileWithHashCheck(tmpDir types.Path, mediaMetadata *types.MediaMetadata, absBasePath types.Path, logger *log.Entry) (string, bool, error) {
+	duplicate := false
+	finalPath, err := getPathFromMediaMetadata(mediaMetadata, absBasePath)
+	if err != nil {
+		removeDir(tmpDir, logger)
+		return "", duplicate, fmt.Errorf("failed to get file path from metadata: %q", err)
+	}
+
+	var stat os.FileInfo
+	if stat, err = os.Stat(finalPath); err == nil {
+		duplicate = true
+		if stat.Size() == int64(mediaMetadata.ContentLength) {
+			removeDir(tmpDir, logger)
+			return finalPath, duplicate, nil
+		}
+		// Remove the tmpDir; we cannot cache the file on disk due to the hash collision
+		removeDir(tmpDir, logger)
+		return "", duplicate, fmt.Errorf("downloaded file with hash collision but different file size (%v)", finalPath)
+	}
+	err = moveFile(
+		types.Path(path.Join(string(tmpDir), "content")),
+		types.Path(finalPath),
+	)
+	if err != nil {
+		removeDir(tmpDir, logger)
+		return "", duplicate, fmt.Errorf("failed to move file to final destination (%v): %q", finalPath, err)
+	}
+	return finalPath, duplicate, nil
+}
diff --git a/src/github.com/matrix-org/dendrite/mediaapi/writers/upload.go b/src/github.com/matrix-org/dendrite/mediaapi/writers/upload.go
index debed48d3..947d7d94b 100644
--- a/src/github.com/matrix-org/dendrite/mediaapi/writers/upload.go
+++ b/src/github.com/matrix-org/dendrite/mediaapi/writers/upload.go
@@ -142,32 +142,26 @@ func parseAndValidateRequest(req *http.Request, cfg *config.MediaAPI) (*uploadRe
 // In case of any error, appropriate files and directories are cleaned up a
 // util.JSONResponse error is returned.
 func storeFileAndMetadata(tmpDir types.Path, absBasePath types.Path, mediaMetadata *types.MediaMetadata, db *storage.Database, logger *log.Entry) *util.JSONResponse {
-	finalPath, err := getPathFromMediaMetadata(mediaMetadata, absBasePath)
+	finalPath, duplicate, err := moveFileWithHashCheck(tmpDir, mediaMetadata, absBasePath, logger)
 	if err != nil {
-		logger.WithError(err).Warn("Failed to get file path from metadata")
-		removeDir(tmpDir, logger)
+		logger.WithError(err).Error("Failed to move file.")
 		return &util.JSONResponse{
 			Code: 400,
 			JSON: jsonerror.Unknown(fmt.Sprintf("Failed to upload")),
 		}
 	}
-
-	err = moveFile(
-		types.Path(path.Join(string(tmpDir), "content")),
-		types.Path(finalPath),
-	)
-	if err != nil {
-		logger.WithError(err).WithField("dst", finalPath).Warn("Failed to move file to final destination")
-		removeDir(tmpDir, logger)
-		return &util.JSONResponse{
-			Code: 400,
-			JSON: jsonerror.Unknown(fmt.Sprintf("Failed to upload")),
-		}
+	if duplicate {
+		logger.WithField("dst", finalPath).Info("File was stored previously - discarding duplicate")
 	}
 
 	if err = db.StoreMediaMetadata(mediaMetadata); err != nil {
 		logger.WithError(err).Warn("Failed to store metadata")
-		removeDir(types.Path(path.Dir(finalPath)), logger)
+		// If the file is a duplicate (has the same hash as an existing file) then
+		// there is valid metadata in the database for that file. As such we only
+		// remove the file if it is not a duplicate.
+		if !duplicate {
+			removeDir(types.Path(path.Dir(finalPath)), logger)
+		}
 		return &util.JSONResponse{
 			Code: 400,
 			JSON: jsonerror.Unknown(fmt.Sprintf("Failed to upload")),
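
A note on the existence check in moveFileWithHashCheck: os.Stat returns a nil error when the destination path already exists, so err == nil (equivalently !os.IsNotExist(err)) is what signals a previously stored file; os.IsExist(err) only matches errors that explicitly report "file exists" (for example from os.OpenFile with O_CREATE|O_EXCL) and is never true after a successful Stat. A minimal standalone sketch, outside the Dendrite tree, illustrating the difference:

package main

import (
	"fmt"
	"os"
)

func main() {
	// Create a file so the first Stat succeeds.
	f, err := os.CreateTemp("", "dedup-check")
	if err != nil {
		panic(err)
	}
	f.Close()
	defer os.Remove(f.Name())

	// Existing path: Stat returns a nil error, so err == nil (or
	// !os.IsNotExist(err)) signals "already stored", while os.IsExist(err)
	// stays false and would never detect the duplicate.
	_, err = os.Stat(f.Name())
	fmt.Println(err == nil, os.IsExist(err), !os.IsNotExist(err)) // true false true

	// Missing path: Stat returns an error satisfying os.IsNotExist.
	_, err = os.Stat(f.Name() + ".missing")
	fmt.Println(err == nil, os.IsExist(err), !os.IsNotExist(err)) // false false false
}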
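
The deduplication works because getPathFromMediaMetadata derives the final path from a hash of the file content, so identical uploads resolve to the same location on disk. The sketch below illustrates that pattern with a hypothetical helper and shard layout (contentAddressedPath, SHA-256 hex split into subdirectories); it shows the general technique only, not Dendrite's actual path scheme, which this diff does not include:

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"path/filepath"
)

// contentAddressedPath is a hypothetical helper (not Dendrite's
// getPathFromMediaMetadata): it derives a final path purely from the SHA-256
// of the file content, so identical content always maps to the same location.
func contentAddressedPath(basePath string, content []byte) string {
	sum := sha256.Sum256(content)
	hash := hex.EncodeToString(sum[:])
	// Shard the hash into subdirectories to avoid one huge flat directory.
	return filepath.Join(basePath, hash[:2], hash[2:4], hash[4:], "content")
}

func main() {
	a := contentAddressedPath("/var/dendrite/media", []byte("same bytes"))
	b := contentAddressedPath("/var/dendrite/media", []byte("same bytes"))
	fmt.Println(a == b) // true: duplicate content collides on the same final path
}

Because the path is a pure function of the content, a second upload of the same bytes lands on an existing file, which is exactly the case the size comparison in moveFileWithHashCheck uses to tell a benign duplicate from a genuine hash collision.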