basic url previews

Signed-off-by: Rudraksh Pareek <54525605+DelusionalOptimist@users.noreply.github.com>
This commit is contained in:
Rudraksh Pareek 2021-01-29 01:26:28 +05:30
parent 59e1b96114
commit e5a59b7580
No known key found for this signature in database
GPG key ID: D3A77129495A243E
2 changed files with 406 additions and 0 deletions

View file

@ -0,0 +1,400 @@
// Copyright 2017 Vector Creations Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package routing
import (
"context"
"crypto/rand"
"encoding/hex"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"path"
"strconv"
"strings"
"time"
"golang.org/x/net/html"
"github.com/matrix-org/dendrite/clientapi/jsonerror"
"github.com/matrix-org/dendrite/mediaapi/fileutils"
"github.com/matrix-org/dendrite/mediaapi/storage"
"github.com/matrix-org/dendrite/mediaapi/types"
"github.com/matrix-org/dendrite/setup/config"
"github.com/matrix-org/util"
log "github.com/sirupsen/logrus"
)
// the parameters included in the incoming preview_url request
type PreviewUrlRequest struct {
url url.URL
ts types.UnixMs
}
// metadata of the url (media/html) to be previewed
type mediaInfo struct {
MediaMetadata *types.MediaMetadata
Logger *log.Entry
}
// PreviewUrlResponse defines the format of the JSON response
// https://matrix.org/docs/spec/client_server/latest#get-matrix-media-r0-preview-url
// TODO: add more fields to the response
type PreviewUrlResponse struct {
MatrixImageSize int64 `json:"matrix:image:size"`
OgImage string `json:"og:image"`
OgSiteName string `json:"og:site_name"`
OgType string `json:"og:type"`
OgTitle string `json:"og:title"`
OgUrl string `json:"og:url"`
OgDescription string `json:"og:description"`
}
// PreviewUrl implements GET /preview_url.
// Current implementation gets the url, parses the meta tags to obtain OG
// data and returns a JSONResponse for the client to process.
// TODO: avoid this endpoint in encrypted rooms
func PreviewUrl(
req *http.Request, cfg *config.MediaAPI, db storage.Database) util.JSONResponse {
// parsing the request
r, err := parseRequest(req)
if err != nil {
return *err
}
// downloading metadata for the url
mediaInfo, err := r.downloadUrl(cfg, req.Context(), db)
if err != nil {
return *err
}
// preparing the response
response, err := mediaInfo.prepareResponse(cfg)
if err != nil {
return *err
}
return util.JSONResponse{
Code: http.StatusOK,
JSON: response,
}
}
// parseRequest parses the incoming preview request to extract the url and ts.
// Returns either a parsed PreviewUrlRequest or error formatted as
// util.JSONResponse
func parseRequest(req *http.Request) (*PreviewUrlRequest, *util.JSONResponse) {
// get the url to be previewed from the request
urlToPreview := req.URL.Query().Get("url")
if len(urlToPreview) == 0 {
return nil, &util.JSONResponse{
Code: http.StatusBadRequest,
JSON: jsonerror.MissingArgument("Missing URL"),
}
}
// parse the url
parsedUrl, err := url.Parse(urlToPreview)
if err != nil {
return nil, &util.JSONResponse{
Code: http.StatusBadRequest,
JSON: jsonerror.InvalidArgumentValue(
"Unable to parse url " + err.Error()),
}
}
// get the ts if provided in the request
if tsStr := req.URL.Query().Get("ts"); len(tsStr) > 0 {
ts, err := strconv.ParseInt(tsStr, 10, 64)
if err != nil {
return nil, &util.JSONResponse{
Code: http.StatusBadRequest,
JSON: jsonerror.InvalidArgumentValue(
"Couldn't parse 'ts' to a valid integer" + err.Error()),
}
}
request := &PreviewUrlRequest{
url: *parsedUrl,
// Convert timestamp to ms
ts: types.UnixMs(ts / 1000000),
}
return request, nil
}
// set ts to current time if none provided
ts := time.Now().UnixNano()
request := &PreviewUrlRequest{
url: *parsedUrl,
ts: types.UnixMs(ts / 1000000),
}
return request, nil
}
// downloadUrl downloads the url and saves the response body.
// Returns either mediaInfo (metadata about the file to preview) or error
// formatted as util.JSONResponse.
// Current implementation for saving response is heavily similar to /upload,
// need to work on this.
// TODO: better logging
func (previewReq *PreviewUrlRequest) downloadUrl(cfg *config.MediaAPI, ctx context.Context, db storage.Database) (*mediaInfo, *util.JSONResponse) {
urlString := previewReq.url.String()
// Get the URL
response, err := http.Get(urlString)
if err != nil {
return nil, &util.JSONResponse{
Code: http.StatusBadRequest,
JSON: jsonerror.InvalidArgumentValue("Couldn't get the URL " + err.Error()),
}
}
// TODO: check for other status codes too
if response.StatusCode == 404 {
return nil, &util.JSONResponse{
Code: http.StatusBadRequest,
JSON: jsonerror.NotFound("Given url returned 404"),
}
}
// Save the response body to the temporary dir
// TODO: should be a new directory like url_cache maybe and
hash, bytesWritten, tmpDir, err := fileutils.WriteTempFile(ctx, response.Body, cfg.AbsBasePath)
if err != nil {
return nil, &util.JSONResponse{
Code: http.StatusBadRequest,
JSON: jsonerror.Unknown("Failed to store URL response " + err.Error()),
}
}
// metadata of the media/webpage to preview
urlMetadata := &mediaInfo{
MediaMetadata: &types.MediaMetadata{
Origin: cfg.Matrix.ServerName,
FileSizeBytes: bytesWritten,
Base64Hash: hash,
ContentType: types.ContentType(response.Header["Content-Type"][0]),
},
Logger: util.GetLogger(ctx).WithField("Origin", cfg.Matrix.ServerName),
}
// :-\
// Look up the media by the file hash. If we already have the file but under a
// different media ID then we won't upload the file again - instead we'll just
// add a new metadata entry that refers to the same file.
existingMetadata, err := db.GetMediaMetadataByHash(
ctx, hash, urlMetadata.MediaMetadata.Origin,
)
if err != nil {
fileutils.RemoveDir(tmpDir, urlMetadata.Logger)
urlMetadata.Logger.WithError(err).Error("Error querying the database by hash.")
resErr := jsonerror.InternalServerError()
return nil, &resErr
}
if existingMetadata != nil {
// The file already exists, delete the uploaded temporary file.
defer fileutils.RemoveDir(tmpDir, urlMetadata.Logger)
// The file already exists. Make a new media ID up for it.
mediaID, merr := urlMetadata.generateMediaID(ctx, db)
if merr != nil {
urlMetadata.Logger.WithError(merr).Error("Failed to generate media ID for existing file")
resErr := jsonerror.InternalServerError()
return nil, &resErr
}
// Then amend the upload metadata.
urlMetadata.MediaMetadata = &types.MediaMetadata{
MediaID: mediaID,
Origin: urlMetadata.MediaMetadata.Origin,
ContentType: urlMetadata.MediaMetadata.ContentType,
FileSizeBytes: urlMetadata.MediaMetadata.FileSizeBytes,
CreationTimestamp: urlMetadata.MediaMetadata.CreationTimestamp,
UploadName: urlMetadata.MediaMetadata.UploadName,
Base64Hash: hash,
UserID: urlMetadata.MediaMetadata.UserID,
}
} else {
// The file doesn't exist. Update the request metadata.
urlMetadata.MediaMetadata.FileSizeBytes = bytesWritten
urlMetadata.MediaMetadata.Base64Hash = hash
urlMetadata.MediaMetadata.MediaID, err = urlMetadata.generateMediaID(ctx, db)
if err != nil {
fileutils.RemoveDir(tmpDir, urlMetadata.Logger)
urlMetadata.Logger.WithError(err).Error("Failed to generate media ID for new download")
resErr := jsonerror.InternalServerError()
return nil, &resErr
}
}
urlMetadata.Logger = urlMetadata.Logger.WithField("media_id", urlMetadata.MediaMetadata.MediaID)
urlMetadata.Logger.WithFields(log.Fields{
"Base64Hash": urlMetadata.MediaMetadata.Base64Hash,
"UploadName": urlMetadata.MediaMetadata.UploadName,
"FileSizeBytes": urlMetadata.MediaMetadata.FileSizeBytes,
"ContentType": urlMetadata.MediaMetadata.ContentType,
}).Info("File downloaded")
if resErr := urlMetadata.storeFileAndMetadata(ctx, tmpDir, cfg.AbsBasePath, db); resErr != nil {
return nil, resErr
}
return urlMetadata, nil
}
func (m *mediaInfo) generateMediaID(ctx context.Context, db storage.Database) (types.MediaID, error) {
for {
// First try generating a media ID. We'll do this by
// generating some random bytes and then hex-encoding.
mediaIDBytes := make([]byte, 32)
_, err := rand.Read(mediaIDBytes)
if err != nil {
return "", fmt.Errorf("rand.Read: %w", err)
}
mediaID := types.MediaID(hex.EncodeToString(mediaIDBytes))
// Then we will check if this media ID already exists in
// our database. If it does then we had best generate a
// new one.
existingMetadata, err := db.GetMediaMetadata(ctx, mediaID, m.MediaMetadata.Origin)
if err != nil {
return "", fmt.Errorf("db.GetMediaMetadata: %w", err)
}
if existingMetadata != nil {
// The media ID was already used - repeat the process
// and generate a new one instead.
continue
}
// The media ID was not already used - let's return that.
return mediaID, nil
}
}
func (r *mediaInfo) storeFileAndMetadata(
ctx context.Context,
tmpDir types.Path,
absBasePath config.Path,
db storage.Database,
) *util.JSONResponse {
finalPath, duplicate, err := fileutils.MoveFileWithHashCheck(tmpDir, r.MediaMetadata, absBasePath, r.Logger)
if err != nil {
r.Logger.WithError(err).Error("Failed to move file.")
return &util.JSONResponse{
Code: http.StatusBadRequest,
JSON: jsonerror.Unknown("Failed to upload"),
}
}
if duplicate {
r.Logger.WithField("dst", finalPath).Info("File was stored previously - discarding duplicate")
}
if err = db.StoreMediaMetadata(ctx, r.MediaMetadata); err != nil {
r.Logger.WithError(err).Warn("Failed to store metadata")
// If the file is a duplicate (has the same hash as an existing file) then
// there is valid metadata in the database for that file. As such we only
// remove the file if it is not a duplicate.
if !duplicate {
fileutils.RemoveDir(types.Path(path.Dir(string(finalPath))), r.Logger)
}
return &util.JSONResponse{
Code: http.StatusBadRequest,
JSON: jsonerror.Unknown("Failed to upload"),
}
}
return nil
}
// parseContent returns the data stored in content attributes
func parseContent(node *html.Node) string {
// iterating the attributes of the tag to get content
for _, attr := range node.Attr {
if attr.Key == "content" {
content := attr.Val
return content
}
}
return ""
}
// prepareResponse prepares the response to be returned to the client
func (mediaInfo *mediaInfo) prepareResponse(cfg *config.MediaAPI) (*PreviewUrlResponse, *util.JSONResponse) {
// Reading the file in which the response body was stored and parsing the html.
pathToFile, err := fileutils.GetPathFromBase64Hash(mediaInfo.MediaMetadata.Base64Hash, cfg.AbsBasePath)
if err != nil {
return nil, &util.JSONResponse{
Code: http.StatusInternalServerError,
JSON: jsonerror.Unknown("Couldn't get the path to the stored response" + err.Error()),
}
}
fileString, err := ioutil.ReadFile(pathToFile)
if err != nil {
return nil, &util.JSONResponse{
Code: http.StatusInternalServerError,
JSON: jsonerror.Unknown("Couldn't read the stored response body" + err.Error()),
}
}
file := strings.NewReader(string(fileString))
tree, err := html.Parse(file)
if err != nil {
return nil, &util.JSONResponse{
Code: http.StatusInternalServerError,
JSON: jsonerror.Unknown("Couldn't get the path to the stored file " + err.Error()),
}
}
// map for storing the content values
m := make(map[string]string)
// Iterating the *html.Node, looking for og data
var f func(*html.Node)
f = func(n *html.Node) {
//check if meta tag
if n.Type == html.ElementNode && n.Data == "meta" {
//parse attributes of the tag
for _, a := range n.Attr {
if a.Key == "property" && strings.HasPrefix(a.Val, "og:") {
slice := strings.Split(a.Val, ":")
m[slice[1]] = parseContent(n)
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(tree)
response := &PreviewUrlResponse{
OgImage: m["image"],
OgSiteName: m["site_name"],
OgType: m["type"],
OgTitle: m["title"],
OgUrl: m["url"],
OgDescription: m["description"],
}
return response, nil
}

View file

@ -80,6 +80,12 @@ func Setup(
return Config(req, cfg) return Config(req, cfg)
}), }),
).Methods(http.MethodGet, http.MethodOptions) ).Methods(http.MethodGet, http.MethodOptions)
r0mux.Handle("/preview_url",
httputil.MakeAuthAPI("preview_url", userAPI, func(req *http.Request, dev *userapi.Device) util.JSONResponse {
return PreviewUrl(req, cfg, db)
}),
).Methods(http.MethodGet, http.MethodOptions)
} }
func makeDownloadAPI( func makeDownloadAPI(