From 5845244aa9d4caae724f87c567360a01e64be963 Mon Sep 17 00:00:00 2001
From: ad
Date: Sun, 29 Sep 2024 17:02:42 +0200
Subject: [PATCH] url preview test version

Signed-off-by: Aleksandr Dubovikov
---
 mediaapi/fileutils/fileutils.go          |   4 +
 mediaapi/routing/download.go             |   3 +-
 mediaapi/routing/routing.go              |  17 +
 mediaapi/routing/url_preview.go          | 492 +++++++++++++++++++++++
 mediaapi/thumbnailer/thumbnailer_nfnt.go |  36 ++
 mediaapi/types/types.go                  |  32 ++
 setup/config/config_mediaapi.go          |  24 ++
 7 files changed, 607 insertions(+), 1 deletion(-)
 create mode 100644 mediaapi/routing/url_preview.go

diff --git a/mediaapi/fileutils/fileutils.go b/mediaapi/fileutils/fileutils.go
index 2e719dc82..f3976e839 100644
--- a/mediaapi/fileutils/fileutils.go
+++ b/mediaapi/fileutils/fileutils.go
@@ -161,6 +161,10 @@ func moveFile(src types.Path, dst types.Path) error {
 	return nil
 }
 
+func MoveFile(src types.Path, dst types.Path) error {
+	return moveFile(src, dst)
+}
+
 func createTempFileWriter(absBasePath config.Path) (*bufio.Writer, *os.File, types.Path, error) {
 	tmpDir, err := createTempDir(absBasePath)
 	if err != nil {
diff --git a/mediaapi/routing/download.go b/mediaapi/routing/download.go
index c3ac3cdc7..4736d39e7 100644
--- a/mediaapi/routing/download.go
+++ b/mediaapi/routing/download.go
@@ -316,10 +316,11 @@ func (r *downloadRequest) respondFromLocalFile(
 		return nil, fmt.Errorf("fileutils.GetPathFromBase64Hash: %w", err)
 	}
 	file, err := os.Open(filePath)
-	defer file.Close() // nolint: errcheck, staticcheck, megacheck
 	if err != nil {
 		return nil, fmt.Errorf("os.Open: %w", err)
 	}
+	defer file.Close() // nolint: errcheck, staticcheck, megacheck
+
 	stat, err := file.Stat()
 	if err != nil {
 		return nil, fmt.Errorf("file.Stat: %w", err)
diff --git a/mediaapi/routing/routing.go b/mediaapi/routing/routing.go
index 2867df605..00a89a7ab 100644
--- a/mediaapi/routing/routing.go
+++ b/mediaapi/routing/routing.go
@@ -96,6 +96,8 @@ func Setup(
 		MXCToResult: map[string]*types.RemoteRequestResult{},
 	}
 
+	// v1 url_preview endpoint requiring auth
+
 	downloadHandler := makeDownloadAPI("download_unauthed", &cfg.MediaAPI, rateLimits, db, client, federationClient, activeRemoteRequests, activeThumbnailGeneration, false)
 	v3mux.Handle("/download/{serverName}/{mediaId}", downloadHandler).Methods(http.MethodGet, http.MethodOptions)
 	v3mux.Handle("/download/{serverName}/{mediaId}/{downloadName}", downloadHandler).Methods(http.MethodGet, http.MethodOptions)
@@ -110,6 +112,21 @@ func Setup(
 	v1mux.Handle("/download/{serverName}/{mediaId}", downloadHandlerAuthed).Methods(http.MethodGet, http.MethodOptions)
 	v1mux.Handle("/download/{serverName}/{mediaId}/{downloadName}", downloadHandlerAuthed).Methods(http.MethodGet, http.MethodOptions)
 
+	// urlPreviewHandler := httputil.MakeAuthAPI(
+	// 	"preview_url", userAPI,
+	// 	makeUrlPreviewHandler(&cfg.MediaAPI, rateLimits, db, client, activeThumbnailGeneration),
+	// )
+	f := makeUrlPreviewHandler(&cfg.MediaAPI, rateLimits, db, activeThumbnailGeneration)
+	urlPreviewHandler := httputil.MakeExternalAPI(
+		"preview_url",
+		func(req *http.Request) util.JSONResponse {
+			return f(req, nil)
+		},
+	)
+	v1mux.Handle("/preview_url", urlPreviewHandler).Methods(http.MethodGet, http.MethodOptions)
+	// That method is deprecated according to spec but still in use
+	v3mux.Handle("/preview_url", urlPreviewHandler).Methods(http.MethodGet, http.MethodOptions)
+
 	v1mux.Handle("/thumbnail/{serverName}/{mediaId}",
 		httputil.MakeHTTPAPI("thumbnail", userAPI, cfg.Global.Metrics.Enabled, makeDownloadAPI("thumbnail_authed_client", &cfg.MediaAPI, rateLimits, db, client, federationClient, activeRemoteRequests, activeThumbnailGeneration, false), httputil.WithAuth()),
 	).Methods(http.MethodGet, http.MethodOptions)
diff --git a/mediaapi/routing/url_preview.go b/mediaapi/routing/url_preview.go
new file mode 100644
index 000000000..500b18802
--- /dev/null
+++ b/mediaapi/routing/url_preview.go
@@ -0,0 +1,492 @@
+package routing
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/matrix-org/dendrite/internal/httputil"
+	"github.com/matrix-org/dendrite/mediaapi/fileutils"
+	"github.com/matrix-org/dendrite/mediaapi/storage"
+	"github.com/matrix-org/dendrite/mediaapi/thumbnailer"
+	"github.com/matrix-org/dendrite/mediaapi/types"
+	"github.com/matrix-org/dendrite/setup/config"
+	userapi "github.com/matrix-org/dendrite/userapi/api"
+	"github.com/matrix-org/gomatrixserverlib/spec"
+	"github.com/matrix-org/util"
+	"github.com/pkg/errors"
+	log "github.com/sirupsen/logrus"
+	"golang.org/x/net/html"
+)
+
+// Errors reported to the client by the URL preview endpoint.
+var (
+	ErrorMissingUrl             = errors.New("missing url")
+	ErrorUnsupportedContentType = errors.New("unsupported content type")
+	ErrorFileTooLarge           = errors.New("file too large")
+)
+
+// errUrlPreviewAborted is recorded when a fetch ends without a preview or an
+// error, which only happens if the fetching goroutine panics.
+var errUrlPreviewAborted = errors.New("url preview aborted")
+
+// urlPreviewCacheSweepInterval is how often expired cache records are evicted.
+const urlPreviewCacheSweepInterval = 16 * time.Second
+
+// makeUrlPreviewHandler returns the handler behind the /preview_url endpoints.
+//
+// Results (including failures) are cached per URL for cfg.UrlPreviewCacheTime
+// seconds, and concurrent requests for the same URL share a single fetch: the
+// first request does the work while later ones wait for its result.
+func makeUrlPreviewHandler(
+	cfg *config.MediaAPI,
+	rateLimits *httputil.RateLimits,
+	db storage.Database,
+	activeThumbnailGeneration *types.ActiveThumbnailGeneration,
+) func(req *http.Request, device *userapi.Device) util.JSONResponse {
+	activeUrlPreviewRequests := &types.ActiveUrlPreviewRequests{Url: map[string]*types.UrlPreviewResult{}}
+	urlPreviewCache := &types.UrlPreviewCache{Records: map[string]*types.UrlPreviewCacheRecord{}}
+
+	// NOTE(review): this sweeper runs for the lifetime of the process; wire it
+	// to a shutdown signal if the media API ever needs to be torn down cleanly.
+	go func() {
+		for {
+			evictExpiredUrlPreviews(urlPreviewCache, int64(cfg.UrlPreviewCacheTime))
+			time.Sleep(urlPreviewCacheSweepInterval)
+		}
+	}()
+
+	return func(req *http.Request, device *userapi.Device) util.JSONResponse {
+		req = util.RequestWithLogging(req)
+
+		pUrl := req.URL.Query().Get("url")
+		if pUrl == "" {
+			return util.ErrorResponse(ErrorMissingUrl)
+		}
+		logger := util.GetLogger(req.Context()).WithFields(log.Fields{
+			"url": pUrl,
+		})
+
+		// Check rate limits
+		if r := rateLimits.Limit(req, device); r != nil {
+			return *r
+		}
+
+		// Serve from the cache when possible. The map is shared with the
+		// sweeper and with other requests, so every access takes the lock.
+		urlPreviewCache.Lock.Lock()
+		cacheRecord, cached := urlPreviewCache.Records[pUrl]
+		urlPreviewCache.Lock.Unlock()
+		if cached {
+			return urlPreviewResponse(cacheRecord.Preview, cacheRecord.Error)
+		}
+
+		// Join a fetch that is already in flight for this URL, if any.
+		activeUrlPreviewRequests.Lock()
+		if inFlight, ok := activeUrlPreviewRequests.Url[pUrl]; ok {
+			// Take the condition's lock before letting go of the map lock. The
+			// fetcher removes its map entry and then needs this same lock to
+			// broadcast, so it cannot signal until we are parked in Wait.
+			inFlight.Cond.L.Lock()
+			activeUrlPreviewRequests.Unlock()
+			inFlight.Cond.Wait()
+			preview, err := inFlight.Preview, inFlight.Error
+			inFlight.Cond.L.Unlock()
+			return urlPreviewResponse(preview, err)
+		}
+
+		// Nobody is fetching this URL yet: register ourselves as the fetcher.
+		thisRequest := &types.UrlPreviewResult{Cond: sync.NewCond(&sync.Mutex{})}
+		activeUrlPreviewRequests.Url[pUrl] = thisRequest
+		activeUrlPreviewRequests.Unlock()
+
+		var preview *types.UrlPreview
+		var err error
+
+		// Cache the outcome and wake the waiters even if the fetch panics.
+		defer func() {
+			if preview == nil && err == nil {
+				err = errUrlPreviewAborted
+			}
+
+			urlPreviewCache.Lock.Lock()
+			urlPreviewCache.Records[pUrl] = &types.UrlPreviewCacheRecord{
+				Created: time.Now().Unix(),
+				Preview: preview,
+				Error:   err,
+			}
+			urlPreviewCache.Lock.Unlock()
+
+			activeUrlPreviewRequests.Lock()
+			delete(activeUrlPreviewRequests.Url, pUrl)
+			activeUrlPreviewRequests.Unlock()
+
+			thisRequest.Cond.L.Lock()
+			thisRequest.Preview, thisRequest.Error = preview, err
+			thisRequest.Cond.Broadcast()
+			thisRequest.Cond.L.Unlock()
+		}()
+
+		preview, err = fetchUrlPreview(req.Context(), pUrl, cfg, device, db, activeThumbnailGeneration, logger)
+		return urlPreviewResponse(preview, err)
+	}
+}
+
+// urlPreviewResponse converts the outcome of a preview fetch into the JSON
+// response sent to the client.
+func urlPreviewResponse(preview *types.UrlPreview, err error) util.JSONResponse {
+	if err != nil {
+		return util.ErrorResponse(err)
+	}
+	return util.JSONResponse{
+		Code: http.StatusOK,
+		JSON: preview,
+	}
+}
+
+// evictExpiredUrlPreviews drops every cache record older than maxAgeSeconds.
+func evictExpiredUrlPreviews(cache *types.UrlPreviewCache, maxAgeSeconds int64) {
+	oldestAllowed := time.Now().Unix() - maxAgeSeconds
+
+	cache.Lock.Lock()
+	defer cache.Lock.Unlock()
+	for k, record := range cache.Records {
+		if record.Created < oldestAllowed {
+			delete(cache.Records, k)
+		}
+	}
+}
+
+// fetchUrlPreview downloads pUrl and builds its preview. HTML pages are mined
+// for OpenGraph tags and images are stored in the media repository; any other
+// content type yields ErrorUnsupportedContentType.
+func fetchUrlPreview(
+	ctx context.Context,
+	pUrl string,
+	cfg *config.MediaAPI,
+	device *userapi.Device,
+	db storage.Database,
+	activeThumbnailGeneration *types.ActiveThumbnailGeneration,
+	logger *log.Entry,
+) (*types.UrlPreview, error) {
+	// url_preview_timeout is configured in milliseconds.
+	timeout := time.Duration(cfg.UrlPreviewTimeout) * time.Millisecond
+
+	resp, err := downloadUrl(pUrl, timeout)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close() // nolint: errcheck
+
+	contentType := resp.Header.Get("Content-Type")
+	switch {
+	case strings.HasPrefix(contentType, "text/html"):
+		preview, err := getPreviewFromHTML(resp, pUrl)
+		if err != nil {
+			return nil, err
+		}
+		if preview.ImageUrl != "" {
+			rehostPreviewImage(ctx, preview, timeout, cfg, device, db, activeThumbnailGeneration, logger)
+		}
+		return preview, nil
+
+	case strings.HasPrefix(contentType, "image/"):
+		mediaData, err := downloadAndStoreImage("somefile", ctx, resp, cfg, device, db, activeThumbnailGeneration, logger)
+		if err != nil {
+			return nil, err
+		}
+		return &types.UrlPreview{ImageUrl: fmt.Sprintf("mxc://%s/%s", mediaData.Origin, mediaData.MediaID)}, nil
+
+	default:
+		return nil, ErrorUnsupportedContentType
+	}
+}
+
+// rehostPreviewImage downloads the image referenced by preview.ImageUrl into
+// the media repository and rewrites the field to the resulting mxc:// URI.
+// This is best effort: on any failure the original URL is left untouched.
+func rehostPreviewImage(
+	ctx context.Context,
+	preview *types.UrlPreview,
+	timeout time.Duration,
+	cfg *config.MediaAPI,
+	device *userapi.Device,
+	db storage.Database,
+	activeThumbnailGeneration *types.ActiveThumbnailGeneration,
+	logger *log.Entry,
+) {
+	imgUrl, err := url.Parse(preview.ImageUrl)
+	if err != nil {
+		logger.WithError(err).Debug("unable to parse preview image url")
+		return
+	}
+	imgResp, err := downloadUrl(preview.ImageUrl, timeout)
+	if err != nil {
+		logger.WithError(err).Debug("unable to download preview image")
+		return
+	}
+	defer imgResp.Body.Close() // nolint: errcheck
+
+	mediaData, err := downloadAndStoreImage(imgUrl.Path, ctx, imgResp, cfg, device, db, activeThumbnailGeneration, logger)
+	if err != nil {
+		logger.WithError(err).Debug("unable to store preview image")
+		return
+	}
+	preview.ImageUrl = fmt.Sprintf("mxc://%s/%s", mediaData.Origin, mediaData.MediaID)
+}
+
+// downloadUrl issues a GET for url, giving up after t. Any status outside the
+// 2xx range is turned into an error. On success the caller owns resp.Body and
+// must close it.
+//
+// NOTE(review): the URL is supplied by the client and fetched as-is, so this
+// can reach hosts on the server's internal network. Consider an allow/deny
+// list before exposing the endpoint publicly.
+func downloadUrl(url string, t time.Duration) (*http.Response, error) {
+	client := http.Client{Timeout: t}
+	resp, err := client.Get(url)
+	if err != nil {
+		return nil, err
+	}
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		// The body is not handed to the caller on this path, so release it here.
+		resp.Body.Close() // nolint: errcheck
+		return nil, errors.New("HTTP status code: " + strconv.Itoa(resp.StatusCode))
+	}
+
+	return resp, nil
+}
+
+// getPreviewFromHTML builds a preview from the OpenGraph meta tags of an HTML
+// response. The page URL stands in for the title when og:title is missing.
+func getPreviewFromHTML(resp *http.Response, url string) (*types.UrlPreview, error) {
+	fields := getMetaFieldsFromHTML(resp)
+	preview := &types.UrlPreview{
+		Title:       fields["og:title"],
+		Description: fields["og:description"],
+	}
+	if preview.Title == "" {
+		preview.Title = url
+	}
+
+	// First non-empty image tag wins, in order of preference.
+	for _, key := range []string{"og:image", "og:image:url", "og:image:secure_url"} {
+		if fields[key] != "" {
+			preview.ImageUrl = fields[key]
+			break
+		}
+	}
+
+	// Dimensions are optional; unparseable values are ignored.
+	if width, err := strconv.Atoi(fields["og:image:width"]); err == nil {
+		preview.ImageWidth = width
+	}
+	if height, err := strconv.Atoi(fields["og:image:height"]); err == nil {
+		preview.ImageHeight = height
+	}
+
+	return preview, nil
+}
+
+// downloadAndStoreImage saves the body of req into the media repository and
+// returns its metadata. Images larger than cfg.UrlPreviewThumbnailSize are
+// stored as a thumbnail of that size; smaller ones are stored unmodified.
+// Content seen before (same hash) is reused, and content that is not an image
+// is rejected with ErrorUnsupportedContentType.
+func downloadAndStoreImage(
+	filename string,
+	ctx context.Context,
+	req *http.Response,
+	cfg *config.MediaAPI,
+	dev *userapi.Device,
+	db storage.Database,
+	activeThumbnailGeneration *types.ActiveThumbnailGeneration,
+	logger *log.Entry,
+) (*types.MediaMetadata, error) {
+	// NOTE(review): "user" is a placeholder owner for unauthenticated requests.
+	userid := types.MatrixUserID("user")
+	if dev != nil {
+		userid = types.MatrixUserID(dev.UserID)
+	}
+
+	// Read one byte past the limit so that oversized bodies can be detected.
+	var reqReader io.Reader = req.Body
+	if cfg.MaxFileSizeBytes > 0 {
+		reqReader = io.LimitReader(reqReader, int64(cfg.MaxFileSizeBytes)+1)
+	}
+	hash, bytesWritten, tmpDir, err := fileutils.WriteTempFile(ctx, reqReader, cfg.AbsBasePath)
+	if err != nil {
+		logger.WithError(err).WithFields(log.Fields{
+			"MaxFileSizeBytes": cfg.MaxFileSizeBytes,
+		}).Warn("Error while transferring file")
+		return nil, err
+	}
+	defer fileutils.RemoveDir(tmpDir, logger)
+
+	// Check if temp file size exceeds max file size configuration
+	if cfg.MaxFileSizeBytes > 0 && bytesWritten > types.FileSizeBytes(cfg.MaxFileSizeBytes) {
+		return nil, ErrorFileTooLarge
+	}
+
+	// Check if we already have this file
+	existingMetadata, err := db.GetMediaMetadataByHash(ctx, hash, cfg.Matrix.ServerName)
+	if err != nil {
+		logger.WithError(err).Error("unable to get media metadata by hash")
+		return nil, err
+	}
+	if existingMetadata != nil {
+		logger.WithField("mediaID", existingMetadata.MediaID).Debug("media already exists")
+		return existingMetadata, nil
+	}
+
+	// Only images can be previewed; sniff the type from the content itself
+	// rather than trusting the remote Content-Type header.
+	tmpFileName := filepath.Join(string(tmpDir), "content")
+	fileType, err := sniffContentType(tmpFileName)
+	if err != nil {
+		logger.WithError(err).Error("unable to read file")
+		return nil, err
+	}
+	if !strings.HasPrefix(fileType, "image") {
+		logger.WithField("contentType", fileType).Debug("uploaded file is not an image or can not be thumbnailed, not generating thumbnails")
+		return nil, ErrorUnsupportedContentType
+	}
+	logger.WithField("contentType", fileType).Debug("uploaded file is an image")
+
+	// Create a thumbnail from the image
+	thumbnailPath := tmpFileName + ".thumbnail"
+	err = thumbnailer.CreateThumbnailFromFile(types.Path(tmpFileName), types.Path(thumbnailPath), types.ThumbnailSize(cfg.UrlPreviewThumbnailSize), logger)
+	if err != nil {
+		if !errors.Is(err, thumbnailer.ErrThumbnailTooLarge) {
+			logger.WithError(err).Error("unable to create thumbnail")
+			return nil, err
+		}
+		// The image is already small enough: store it unmodified.
+		thumbnailPath = tmpFileName
+	}
+	logger.WithField("path", thumbnailPath).Debug("thumbnail created")
+	thumbnailFileInfo, err := os.Stat(thumbnailPath)
+	if err != nil {
+		logger.WithError(err).Error("unable to get thumbnail file info")
+		return nil, err
+	}
+
+	r := &uploadRequest{
+		MediaMetadata: &types.MediaMetadata{
+			Origin: cfg.Matrix.ServerName,
+		},
+		Logger: logger,
+	}
+	mediaID, err := r.generateMediaID(ctx, db)
+	if err != nil {
+		logger.WithError(err).Error("unable to generate media ID")
+		return nil, err
+	}
+
+	// The stored file is keyed by the hash of the original download so that
+	// repeat previews of the same image hit the existing-metadata check above.
+	// CreationTimestamp is in milliseconds, hence spec.AsTimestamp.
+	mediaMetaData := &types.MediaMetadata{
+		MediaID:           mediaID,
+		Origin:            cfg.Matrix.ServerName,
+		ContentType:       types.ContentType(fileType),
+		FileSizeBytes:     types.FileSizeBytes(thumbnailFileInfo.Size()),
+		UploadName:        types.Filename(filename),
+		CreationTimestamp: spec.AsTimestamp(time.Now()),
+		Base64Hash:        hash,
+		UserID:            userid,
+	}
+	finalPath, err := fileutils.GetPathFromBase64Hash(mediaMetaData.Base64Hash, cfg.AbsBasePath)
+	if err != nil {
+		logger.WithError(err).Error("unable to get path from base64 hash")
+		return nil, err
+	}
+	if err = fileutils.MoveFile(types.Path(thumbnailPath), types.Path(finalPath)); err != nil {
+		logger.WithError(err).Error("unable to move thumbnail file")
+		return nil, err
+	}
+	// Store the metadata in the database
+	if err = db.StoreMediaMetadata(ctx, mediaMetaData); err != nil {
+		logger.WithError(err).Error("unable to store media metadata")
+		return nil, err
+	}
+
+	return mediaMetaData, nil
+}
+
+// sniffContentType returns the MIME type detected from the leading bytes of
+// the file at path.
+func sniffContentType(path string) (string, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer file.Close() // nolint: errcheck
+
+	// http.DetectContentType considers at most the first 512 bytes.
+	buf := make([]byte, 512)
+	n, err := file.Read(buf)
+	if err != nil {
+		return "", err
+	}
+	return http.DetectContentType(buf[:n]), nil
+}
+
+// getMetaFieldsFromHTML scans the HTML in resp.Body for <meta> tags carrying
+// "property" and "content" attributes and returns the OpenGraph properties
+// used for previews. Every known property is present in the result; those not
+// found map to "". Scanning stops at </head> when the document has a <head>,
+// and at the end of the input otherwise.
+func getMetaFieldsFromHTML(resp *http.Response) map[string]string {
+	htmlTokens := html.NewTokenizer(resp.Body)
+	ogValues := map[string]string{
+		"og:title":            "",
+		"og:description":      "",
+		"og:image":            "",
+		"og:image:url":        "",
+		"og:image:secure_url": "",
+		"og:image:width":      "",
+		"og:image:height":     "",
+		"og:image:type":       "",
+	}
+
+	headTagOpened := false
+	for {
+		tokenType := htmlTokens.Next()
+		if tokenType == html.ErrorToken {
+			break
+		}
+		token := htmlTokens.Token()
+
+		switch {
+		case tokenType == html.StartTagToken && token.Data == "head":
+			headTagOpened = true
+		case tokenType == html.EndTagToken && token.Data == "head" && headTagOpened:
+			// Meta tags are only looked for inside <head> when there is one.
+			return ogValues
+		case (tokenType == html.SelfClosingTagToken || tokenType == html.StartTagToken) && token.Data == "meta":
+			var name, content string
+			for _, attr := range token.Attr {
+				switch attr.Key {
+				case "property":
+					name = attr.Val
+				case "content":
+					content = attr.Val
+				}
+			}
+			// Keep only non-empty values of the properties we know about.
+			if _, wanted := ogValues[name]; wanted && content != "" {
+				ogValues[name] = content
+			}
+		}
+	}
+	return ogValues
+}
diff --git a/mediaapi/thumbnailer/thumbnailer_nfnt.go b/mediaapi/thumbnailer/thumbnailer_nfnt.go
index beae88c5c..9df4d9a63 100644
--- a/mediaapi/thumbnailer/thumbnailer_nfnt.go
+++ b/mediaapi/thumbnailer/thumbnailer_nfnt.go
@@ -19,6 +19,7 @@ package thumbnailer
 
 import (
 	"context"
+	"errors"
 	"image"
 	"image/draw"
 
@@ -42,6 +43,8 @@ import (
 	log "github.com/sirupsen/logrus"
 )
 
+var ErrThumbnailTooLarge = errors.New("thumbnail is larger than original")
+
 // GenerateThumbnails generates the configured thumbnail sizes for the source file
 func GenerateThumbnails(
 	ctx context.Context,
@@ -274,3 +277,36 @@ func adjustSize(dst types.Path, img image.Image, w, h int, crop bool, logger *lo
 
 	return out.Bounds().Max.X, out.Bounds().Max.Y, nil
 }
+
+func CreateThumbnailFromFile(
+	src types.Path,
+	dst types.Path,
+	config types.ThumbnailSize,
+	logger *log.Entry,
+) (err error) {
+	img, err := readFile(string(src))
+	if err != nil {
+		logger.WithError(err).WithFields(log.Fields{
+			"src": src,
+		}).Error("Failed to read src file")
+		return err
+	}
+
+	// Check if request is larger than original
+	if config.Width >= img.Bounds().Dx() && config.Height >= img.Bounds().Dy() {
+		return ErrThumbnailTooLarge
+	}
+
+	start := time.Now()
+	width, height, err := adjustSize(dst, img, config.Width, config.Height, config.ResizeMethod == types.Crop, logger)
+	if err != nil {
+		return err
+	}
+	logger.WithFields(log.Fields{
+		"ActualWidth":  width,
+		"ActualHeight": height,
+		"processTime":  time.Since(start),
+	}).Info("Generated thumbnail")
+
+	return nil
+}
diff --git a/mediaapi/types/types.go b/mediaapi/types/types.go
index e1c29e0f6..c9380bf84 100644
--- a/mediaapi/types/types.go
+++ b/mediaapi/types/types.go
@@ -100,6 +100,38 @@ type ActiveThumbnailGeneration struct {
 	PathToResult map[string]*ThumbnailGenerationResult
 }
 
+type UrlPreviewCache struct {
+	Lock    sync.Mutex
+	Records map[string]*UrlPreviewCacheRecord
+}
+
+type UrlPreviewCacheRecord struct {
+	Created int64
+	Preview *UrlPreview
+	Error   error
+}
+
+type UrlPreview struct {
+	ImageSize   FileSizeBytes `json:"matrix:image:size"`
+	Description string        `json:"og:description"`
+	ImageUrl    string        `json:"og:image"`
+	ImageType   ContentType   `json:"og:image:type"`
+	ImageHeight int           `json:"og:image:height"`
+	ImageWidth  int           `json:"og:image:width"`
+	Title       string        `json:"og:title"`
+}
+
+type UrlPreviewResult struct {
+	Cond    *sync.Cond
+	Preview *UrlPreview
+	Error   error
+}
+
+type ActiveUrlPreviewRequests struct {
+	sync.Mutex
+	Url map[string]*UrlPreviewResult
+}
+
 // Crop indicates we should crop the thumbnail on resize
 const Crop = "crop"
 
diff --git a/setup/config/config_mediaapi.go b/setup/config/config_mediaapi.go
index 030bc3754..9a68add5b 100644
--- a/setup/config/config_mediaapi.go
+++ b/setup/config/config_mediaapi.go
@@ -30,6 +30,14 @@ type MediaAPI struct {
 
 	// A list of thumbnail sizes to be pre-generated for downloaded remote / uploaded content
 	ThumbnailSizes []ThumbnailSize `yaml:"thumbnail_sizes"`
+
+	// The time in seconds to cache URL previews for
+	UrlPreviewCacheTime int `yaml:"url_preview_cache_time"`
+
+	// The timeout in milliseconds for fetching URL previews
+	UrlPreviewTimeout int `yaml:"url_preview_timeout"`
+
+	UrlPreviewThumbnailSize ThumbnailSize `yaml:"url_preview_thumbnail_size"`
 }
 
 // DefaultMaxFileSizeBytes defines the default file size allowed in transfers
@@ -38,6 +46,9 @@ var DefaultMaxFileSizeBytes = FileSizeBytes(10485760)
 func (c *MediaAPI) Defaults(opts DefaultOpts) {
 	c.MaxFileSizeBytes = DefaultMaxFileSizeBytes
 	c.MaxThumbnailGenerators = 10
+	c.UrlPreviewCacheTime = 10
+	c.UrlPreviewTimeout = 10000
+
 	if opts.Generate {
 		c.ThumbnailSizes = []ThumbnailSize{
 			{
@@ -61,6 +72,12 @@ func (c *MediaAPI) Defaults(opts DefaultOpts) {
 		}
 		c.BasePath = "./media_store"
 	}
+
+	c.UrlPreviewThumbnailSize = ThumbnailSize{
+		Width:        200,
+		Height:       200,
+		ResizeMethod: "scale",
+	}
 }
 
 func (c *MediaAPI) Verify(configErrs *ConfigErrors) {
@@ -76,4 +93,11 @@ func (c *MediaAPI) Verify(configErrs *ConfigErrors) {
 	if c.Matrix.DatabaseOptions.ConnectionString == "" {
 		checkNotEmpty(configErrs, "media_api.database.connection_string", string(c.Database.ConnectionString))
 	}
+
+	// If MaxFileSizeBytes overflows int64, default to DefaultMaxFileSizeBytes
+	if c.MaxFileSizeBytes+1 <= 0 {
+		c.MaxFileSizeBytes = DefaultMaxFileSizeBytes
+		fmt.Printf("Configured MediaApi.MaxFileSizeBytes overflows int64, defaulting to %d bytes", DefaultMaxFileSizeBytes)
+	}
+
 }