From 43b812e037a43a3295873971181f9cd004bea2c2 Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Sun, 3 Nov 2024 12:39:22 +0200 Subject: can also fetch first image for the preview --- internal/platforms/linkedin/preview.go | 114 ++++++++++++++++++++-------- internal/platforms/linkedin/preview_test.go | 26 +++++++ 2 files changed, 110 insertions(+), 30 deletions(-) create mode 100644 internal/platforms/linkedin/preview_test.go (limited to 'internal/platforms') diff --git a/internal/platforms/linkedin/preview.go b/internal/platforms/linkedin/preview.go index 1c0e5a7..03bf7e6 100644 --- a/internal/platforms/linkedin/preview.go +++ b/internal/platforms/linkedin/preview.go @@ -6,26 +6,30 @@ import ( "fmt" "log" "net/http" + "net/url" "golang.org/x/net/html" ) -var errNoTitleElementFound = errors.New("no title element found") +var ( + errNoTitleElementFound = errors.New("no title element found") + errNoImageElementFound = errors.New("no image element found") +) type preview struct { - title, url string + title, imageURL, url string } func NewPreview(ctx context.Context, urls []string) (preview, error) { if len(urls) == 0 { return preview{}, nil } - title, err := fetchHTMLTitle(ctx, urls[0]) + title, imageURL, err := fetchHTMLTitleAndFirstImage(ctx, urls[0]) if errors.Is(err, errNoTitleElementFound) || (err == nil && title == "") { log.Println("Setting title to", urls[0]) title = urls[0] } - return preview{title: title, url: urls[0]}, err + return preview{title: title, imageURL: imageURL, url: urls[0]}, err } func (p preview) String() string { @@ -36,47 +40,97 @@ func (p preview) Empty() bool { return p.url == "" } -func fetchHTMLTitle(ctx context.Context, url string) (string, error) { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) +func findTitle(n *html.Node) (string, error) { + var title string + var traverse func(*html.Node) + traverse = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "title" { + if n.FirstChild != nil { + title = n.FirstChild.Data + } + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + traverse(c) + } + } + traverse(n) + if title == "" { + return "", errNoTitleElementFound + } + return title, nil +} + +func findFirstImage(n *html.Node) (string, error) { + var imageURL string + var traverse func(*html.Node) bool + traverse = func(n *html.Node) bool { + if n.Type == html.ElementNode && n.Data == "img" { + for _, attr := range n.Attr { + if attr.Key == "src" { + imageURL = attr.Val + return true // Stop searching when the first image's URL is found + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if traverse(c) { + return true + } + } + return false + } + if !traverse(n) { + return "", errNoImageElementFound + } + return imageURL, nil +} + +func resolveURL(baseURL, rawURL string) (string, error) { + base, err := url.Parse(baseURL) + if err != nil { + return "", fmt.Errorf("failed to parse base URL: %w", err) + } + u, err := url.Parse(rawURL) if err != nil { - return "", fmt.Errorf("failed to create request: %w", err) + return "", fmt.Errorf("failed to parse raw URL: %w", err) } + return base.ResolveReference(u).String(), nil +} +func fetchHTMLTitleAndFirstImage(ctx context.Context, url string) (string, string, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return "", "", fmt.Errorf("failed to create request: %w", err) + } resp, err := http.DefaultClient.Do(req) if err != nil { - return "", fmt.Errorf("failed to get URL: %w", err) + return "", "", fmt.Errorf("failed to get URL: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return "", fmt.Errorf("failed to get a successful response: %v", resp.StatusCode) + return "", "", fmt.Errorf("failed to get a successful response: %v", resp.StatusCode) } - doc, err := html.Parse(resp.Body) if err != nil { - return "", fmt.Errorf("failed to parse HTML: %w", err) + return "", "", fmt.Errorf("failed to parse HTML: %w", err) } - // Traverse the document and find the tag - var title string - var f func(*html.Node) - f = func(n *html.Node) { - if n.Type == html.ElementNode && n.Data == "title" { - if n.FirstChild != nil { - title = n.FirstChild.Data - } - return - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c) - } + var errs error + title, err := findTitle(doc) + if err != nil { + errs = errors.Join(errs, err) } - // Call the function to search for the title - f(doc) - - if title == "" { - return "", errNoTitleElementFound + imageURL, err := findFirstImage(doc) + if err != nil { + errs = errors.Join(errs, err) + } else if imageURL != "" { + if imageURL, err = resolveURL(url, imageURL); err != nil { + errs = errors.Join(errs, err) + } } - return title, nil + + return title, imageURL, errs } diff --git a/internal/platforms/linkedin/preview_test.go b/internal/platforms/linkedin/preview_test.go new file mode 100644 index 0000000..ea47d08 --- /dev/null +++ b/internal/platforms/linkedin/preview_test.go @@ -0,0 +1,26 @@ +package linkedin + +import ( + "context" + "testing" +) + +// TODO: Mock the http request, and serve a local HTML page! +func TestFetchHTMLTitleAndFirstImage(t *testing.T) { + var ( + url = "https://foo.zone/about/" + expectedTitle = "About" + expectedImageURL = "https://foo.zone/about/paul.jpg" + ) + + title, imageURL, err := fetchHTMLTitleAndFirstImage(context.Background(), url) + if err != nil { + t.Error(err) + } + if title != expectedTitle { + t.Errorf("expected title '%s' but got '%s'", expectedTitle, title) + } + if imageURL != expectedImageURL { + t.Errorf("expected imageURL '%s' but got '%s'", expectedImageURL, imageURL) + } +} -- cgit v1.2.3