From e5a59b75800016eefb69cbb277a2730ddc99f196 Mon Sep 17 00:00:00 2001 From: Rudraksh Pareek <54525605+DelusionalOptimist@users.noreply.github.com> Date: Fri, 29 Jan 2021 01:26:28 +0530 Subject: [PATCH] basic url previews Signed-off-by: Rudraksh Pareek <54525605+DelusionalOptimist@users.noreply.github.com> --- mediaapi/routing/preview_url.go | 400 ++++++++++++++++++++++++++++++++ mediaapi/routing/routing.go | 6 + 2 files changed, 406 insertions(+) create mode 100644 mediaapi/routing/preview_url.go diff --git a/mediaapi/routing/preview_url.go b/mediaapi/routing/preview_url.go new file mode 100644 index 000000000..94a39d9f6 --- /dev/null +++ b/mediaapi/routing/preview_url.go @@ -0,0 +1,400 @@ +// Copyright 2017 Vector Creations Ltd +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package routing + +import ( + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "path" + "strconv" + "strings" + "time" + + "golang.org/x/net/html" + + "github.com/matrix-org/dendrite/clientapi/jsonerror" + "github.com/matrix-org/dendrite/mediaapi/fileutils" + "github.com/matrix-org/dendrite/mediaapi/storage" + "github.com/matrix-org/dendrite/mediaapi/types" + "github.com/matrix-org/dendrite/setup/config" + "github.com/matrix-org/util" + log "github.com/sirupsen/logrus" +) + +// the parameters included in the incoming preview_url request +type PreviewUrlRequest struct { + url url.URL + ts types.UnixMs +} + +// metadata of the url (media/html) to be previewed +type mediaInfo struct { + MediaMetadata *types.MediaMetadata + Logger *log.Entry +} + +// PreviewUrlResponse defines the format of the JSON response +// https://matrix.org/docs/spec/client_server/latest#get-matrix-media-r0-preview-url +// TODO: add more fields to the response +type PreviewUrlResponse struct { + MatrixImageSize int64 `json:"matrix:image:size"` + OgImage string `json:"og:image"` + OgSiteName string `json:"og:site_name"` + OgType string `json:"og:type"` + OgTitle string `json:"og:title"` + OgUrl string `json:"og:url"` + OgDescription string `json:"og:description"` +} + +// PreviewUrl implements GET /preview_url. +// Current implementation gets the url, parses the meta tags to obtain OG +// data and returns a JSONResponse for the client to process. +// TODO: avoid this endpoint in encrypted rooms +func PreviewUrl( + req *http.Request, cfg *config.MediaAPI, db storage.Database) util.JSONResponse { + + // parsing the request + r, err := parseRequest(req) + if err != nil { + return *err + } + + // downloading metadata for the url + mediaInfo, err := r.downloadUrl(cfg, req.Context(), db) + if err != nil { + return *err + } + + // preparing the response + response, err := mediaInfo.prepareResponse(cfg) + if err != nil { + return *err + } + + return util.JSONResponse{ + Code: http.StatusOK, + JSON: response, + } +} + +// parseRequest parses the incoming preview request to extract the url and ts. +// Returns either a parsed PreviewUrlRequest or error formatted as +// util.JSONResponse +func parseRequest(req *http.Request) (*PreviewUrlRequest, *util.JSONResponse) { + + // get the url to be previewed from the request + urlToPreview := req.URL.Query().Get("url") + if len(urlToPreview) == 0 { + return nil, &util.JSONResponse{ + Code: http.StatusBadRequest, + JSON: jsonerror.MissingArgument("Missing URL"), + } + } + + // parse the url + parsedUrl, err := url.Parse(urlToPreview) + if err != nil { + return nil, &util.JSONResponse{ + Code: http.StatusBadRequest, + JSON: jsonerror.InvalidArgumentValue( + "Unable to parse url " + err.Error()), + } + } + + // get the ts if provided in the request + if tsStr := req.URL.Query().Get("ts"); len(tsStr) > 0 { + ts, err := strconv.ParseInt(tsStr, 10, 64) + if err != nil { + return nil, &util.JSONResponse{ + Code: http.StatusBadRequest, + JSON: jsonerror.InvalidArgumentValue( + "Couldn't parse 'ts' to a valid integer" + err.Error()), + } + } + + request := &PreviewUrlRequest{ + url: *parsedUrl, + // Convert timestamp to ms + ts: types.UnixMs(ts / 1000000), + } + + return request, nil + } + + // set ts to current time if none provided + ts := time.Now().UnixNano() + + request := &PreviewUrlRequest{ + url: *parsedUrl, + ts: types.UnixMs(ts / 1000000), + } + + return request, nil +} + +// downloadUrl downloads the url and saves the response body. +// Returns either mediaInfo (metadata about the file to preview) or error +// formatted as util.JSONResponse. +// Current implementation for saving response is heavily similar to /upload, +// need to work on this. +// TODO: better logging +func (previewReq *PreviewUrlRequest) downloadUrl(cfg *config.MediaAPI, ctx context.Context, db storage.Database) (*mediaInfo, *util.JSONResponse) { + + urlString := previewReq.url.String() + + // Get the URL + response, err := http.Get(urlString) + if err != nil { + return nil, &util.JSONResponse{ + Code: http.StatusBadRequest, + JSON: jsonerror.InvalidArgumentValue("Couldn't get the URL " + err.Error()), + } + } + + // TODO: check for other status codes too + if response.StatusCode == 404 { + return nil, &util.JSONResponse{ + Code: http.StatusBadRequest, + JSON: jsonerror.NotFound("Given url returned 404"), + } + } + + // Save the response body to the temporary dir + // TODO: should be a new directory like url_cache maybe and + hash, bytesWritten, tmpDir, err := fileutils.WriteTempFile(ctx, response.Body, cfg.AbsBasePath) + if err != nil { + return nil, &util.JSONResponse{ + Code: http.StatusBadRequest, + JSON: jsonerror.Unknown("Failed to store URL response " + err.Error()), + } + } + + // metadata of the media/webpage to preview + urlMetadata := &mediaInfo{ + MediaMetadata: &types.MediaMetadata{ + Origin: cfg.Matrix.ServerName, + FileSizeBytes: bytesWritten, + Base64Hash: hash, + ContentType: types.ContentType(response.Header["Content-Type"][0]), + }, + Logger: util.GetLogger(ctx).WithField("Origin", cfg.Matrix.ServerName), + } + + // :-\ + // Look up the media by the file hash. If we already have the file but under a + // different media ID then we won't upload the file again - instead we'll just + // add a new metadata entry that refers to the same file. + existingMetadata, err := db.GetMediaMetadataByHash( + ctx, hash, urlMetadata.MediaMetadata.Origin, + ) + if err != nil { + fileutils.RemoveDir(tmpDir, urlMetadata.Logger) + urlMetadata.Logger.WithError(err).Error("Error querying the database by hash.") + resErr := jsonerror.InternalServerError() + return nil, &resErr + } + if existingMetadata != nil { + // The file already exists, delete the uploaded temporary file. + defer fileutils.RemoveDir(tmpDir, urlMetadata.Logger) + // The file already exists. Make a new media ID up for it. + mediaID, merr := urlMetadata.generateMediaID(ctx, db) + if merr != nil { + urlMetadata.Logger.WithError(merr).Error("Failed to generate media ID for existing file") + resErr := jsonerror.InternalServerError() + return nil, &resErr + } + + // Then amend the upload metadata. + urlMetadata.MediaMetadata = &types.MediaMetadata{ + MediaID: mediaID, + Origin: urlMetadata.MediaMetadata.Origin, + ContentType: urlMetadata.MediaMetadata.ContentType, + FileSizeBytes: urlMetadata.MediaMetadata.FileSizeBytes, + CreationTimestamp: urlMetadata.MediaMetadata.CreationTimestamp, + UploadName: urlMetadata.MediaMetadata.UploadName, + Base64Hash: hash, + UserID: urlMetadata.MediaMetadata.UserID, + } + } else { + // The file doesn't exist. Update the request metadata. + urlMetadata.MediaMetadata.FileSizeBytes = bytesWritten + urlMetadata.MediaMetadata.Base64Hash = hash + urlMetadata.MediaMetadata.MediaID, err = urlMetadata.generateMediaID(ctx, db) + if err != nil { + fileutils.RemoveDir(tmpDir, urlMetadata.Logger) + urlMetadata.Logger.WithError(err).Error("Failed to generate media ID for new download") + resErr := jsonerror.InternalServerError() + return nil, &resErr + } + } + + urlMetadata.Logger = urlMetadata.Logger.WithField("media_id", urlMetadata.MediaMetadata.MediaID) + urlMetadata.Logger.WithFields(log.Fields{ + "Base64Hash": urlMetadata.MediaMetadata.Base64Hash, + "UploadName": urlMetadata.MediaMetadata.UploadName, + "FileSizeBytes": urlMetadata.MediaMetadata.FileSizeBytes, + "ContentType": urlMetadata.MediaMetadata.ContentType, + }).Info("File downloaded") + + if resErr := urlMetadata.storeFileAndMetadata(ctx, tmpDir, cfg.AbsBasePath, db); resErr != nil { + return nil, resErr + } + + return urlMetadata, nil +} + +func (m *mediaInfo) generateMediaID(ctx context.Context, db storage.Database) (types.MediaID, error) { + for { + // First try generating a media ID. We'll do this by + // generating some random bytes and then hex-encoding. + mediaIDBytes := make([]byte, 32) + _, err := rand.Read(mediaIDBytes) + if err != nil { + return "", fmt.Errorf("rand.Read: %w", err) + } + mediaID := types.MediaID(hex.EncodeToString(mediaIDBytes)) + // Then we will check if this media ID already exists in + // our database. If it does then we had best generate a + // new one. + existingMetadata, err := db.GetMediaMetadata(ctx, mediaID, m.MediaMetadata.Origin) + if err != nil { + return "", fmt.Errorf("db.GetMediaMetadata: %w", err) + } + if existingMetadata != nil { + // The media ID was already used - repeat the process + // and generate a new one instead. + continue + } + // The media ID was not already used - let's return that. + return mediaID, nil + } +} + +func (r *mediaInfo) storeFileAndMetadata( + ctx context.Context, + tmpDir types.Path, + absBasePath config.Path, + db storage.Database, +) *util.JSONResponse { + finalPath, duplicate, err := fileutils.MoveFileWithHashCheck(tmpDir, r.MediaMetadata, absBasePath, r.Logger) + if err != nil { + r.Logger.WithError(err).Error("Failed to move file.") + return &util.JSONResponse{ + Code: http.StatusBadRequest, + JSON: jsonerror.Unknown("Failed to upload"), + } + } + if duplicate { + r.Logger.WithField("dst", finalPath).Info("File was stored previously - discarding duplicate") + } + + if err = db.StoreMediaMetadata(ctx, r.MediaMetadata); err != nil { + r.Logger.WithError(err).Warn("Failed to store metadata") + // If the file is a duplicate (has the same hash as an existing file) then + // there is valid metadata in the database for that file. As such we only + // remove the file if it is not a duplicate. + if !duplicate { + fileutils.RemoveDir(types.Path(path.Dir(string(finalPath))), r.Logger) + } + return &util.JSONResponse{ + Code: http.StatusBadRequest, + JSON: jsonerror.Unknown("Failed to upload"), + } + } + + return nil +} + +// parseContent returns the data stored in content attributes +func parseContent(node *html.Node) string { + // iterating the attributes of the tag to get content + for _, attr := range node.Attr { + if attr.Key == "content" { + content := attr.Val + return content + } + } + return "" +} + +// prepareResponse prepares the response to be returned to the client +func (mediaInfo *mediaInfo) prepareResponse(cfg *config.MediaAPI) (*PreviewUrlResponse, *util.JSONResponse) { + + // Reading the file in which the response body was stored and parsing the html. + pathToFile, err := fileutils.GetPathFromBase64Hash(mediaInfo.MediaMetadata.Base64Hash, cfg.AbsBasePath) + if err != nil { + return nil, &util.JSONResponse{ + Code: http.StatusInternalServerError, + JSON: jsonerror.Unknown("Couldn't get the path to the stored response" + err.Error()), + } + } + + fileString, err := ioutil.ReadFile(pathToFile) + if err != nil { + return nil, &util.JSONResponse{ + Code: http.StatusInternalServerError, + JSON: jsonerror.Unknown("Couldn't read the stored response body" + err.Error()), + } + } + + file := strings.NewReader(string(fileString)) + + tree, err := html.Parse(file) + if err != nil { + return nil, &util.JSONResponse{ + Code: http.StatusInternalServerError, + JSON: jsonerror.Unknown("Couldn't get the path to the stored file " + err.Error()), + } + } + + // map for storing the content values + m := make(map[string]string) + + // Iterating the *html.Node, looking for og data + var f func(*html.Node) + f = func(n *html.Node) { + //check if meta tag + if n.Type == html.ElementNode && n.Data == "meta" { + //parse attributes of the tag + for _, a := range n.Attr { + if a.Key == "property" && strings.HasPrefix(a.Val, "og:") { + slice := strings.Split(a.Val, ":") + m[slice[1]] = parseContent(n) + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + + f(tree) + + response := &PreviewUrlResponse{ + OgImage: m["image"], + OgSiteName: m["site_name"], + OgType: m["type"], + OgTitle: m["title"], + OgUrl: m["url"], + OgDescription: m["description"], + } + + return response, nil +} diff --git a/mediaapi/routing/routing.go b/mediaapi/routing/routing.go index 6a88ea6e1..edcacb06a 100644 --- a/mediaapi/routing/routing.go +++ b/mediaapi/routing/routing.go @@ -80,6 +80,12 @@ func Setup( return Config(req, cfg) }), ).Methods(http.MethodGet, http.MethodOptions) + + r0mux.Handle("/preview_url", + httputil.MakeAuthAPI("preview_url", userAPI, func(req *http.Request, dev *userapi.Device) util.JSONResponse { + return PreviewUrl(req, cfg, db) + }), + ).Methods(http.MethodGet, http.MethodOptions) } func makeDownloadAPI(