From 9a68ca0461bdf6d093213a218fbf255678018a16 Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Thu, 24 Oct 2024 10:25:00 +0300 Subject: move extract URLs to types.Entry --- internal/entry/entry.go | 13 +++++ internal/entry/entry_test.go | 76 ++++++++++++++++++++++++++++ internal/platforms/linkedin/escapes.go | 15 ------ internal/platforms/linkedin/escapes_test.go | 77 ----------------------------- 4 files changed, 89 insertions(+), 92 deletions(-) (limited to 'internal') diff --git a/internal/entry/entry.go b/internal/entry/entry.go index 99f7ee3..34ce5f0 100644 --- a/internal/entry/entry.go +++ b/internal/entry/entry.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "os" + "regexp" "strings" "time" @@ -126,3 +127,15 @@ func (e Entry) Edit() error { } return nil } + +// ExtractURLs finds all occurrences of URLs starting with "http://", "https://" or "ftp://" in the entry's content. +func (e Entry) ExtractURLs() []string { + content, _ := e.Content() + return extractURLs(content) +} + +func extractURLs(input string) []string { + urlPattern := `(http://|https://|ftp://)[^\s]+` + re := regexp.MustCompile(urlPattern) + return re.FindAllString(input, -1) +} diff --git a/internal/entry/entry_test.go b/internal/entry/entry_test.go index 0ec8166..43c2243 100644 --- a/internal/entry/entry_test.go +++ b/internal/entry/entry_test.go @@ -2,6 +2,9 @@ package entry import ( "fmt" + "regexp" + "slices" + "strings" "testing" "codeberg.org/snonux/gos/internal/timestamp" @@ -36,3 +39,76 @@ func TestEntry(t *testing.T) { } } } + +func TestExtractTwoURLs(t *testing.T) { + text := `Hello world https://foo.zone + Hello universe http://world.universe test 123` + + urls := extractURLs(text) + if len(urls) != 2 { + t.Errorf("expected 2 URLs, but got %d", len(urls)) + } + + if !slices.Contains(urls, "https://foo.zone") { + t.Errorf("expected 'https://foo.zone' in the URL list, but got %v", urls) + } + if !slices.Contains(urls, "http://world.universe") { + t.Errorf("expected 'http://world.universe' in the URL 
list, but got %v", urls) + } +} + +func TestExtractURLs(t *testing.T) { + urls := []string{ + "http://foo.zone", + "http://foo.zone/", + "http://foo.zone?foo=bar", + "http://foo.zone/?foo=bar", + "http://foo.zone/?foo=bar", + "http://foo.zone/hurs?foo=bar", + "http://foo.zone?foo=bar&baz=bay", + } + + for _, url := range urls { + text := fmt.Sprintf("Hello world %s Hello World", url) + found := extractURLs(text) + if len(found) != 1 { + t.Errorf("expected 1 URL, but got %d for text '%s'", len(found), text) + } + if found[0] != url { + t.Errorf("expected URL '%s', but got '%s' for text '%s'", url, found[0], text) + } + } +} + +func FuzzExtractURLs(f *testing.F) { + f.Add("/path?myjfa=lwsr4imj&dgqeg=m3uwwsak") + f.Add("/?amfbm=bwzqu46m&xheuh=nv588d98") + f.Add("?tuupm=reng2p1y&cbjot=0g5qvpty") + f.Add("/path?qmcok=f%20w4tfp7g&awsnq=sjizuore&owdix=8s2dmqsv") + f.Add("?zwilf=868o24x1&fiwmp=1d5aqbvo&irhhr=xar7qbq7&eetpy=scmi9s8i") + f.Add("/path?mwhbm=psinstn6&nsjic=pfu0wnk9&lbmrz=5bixkhdt") + f.Add("/path?owbwo=67mkjiz2") + f.Add("/path?ohvxi=esy5qvml&zlvzt=2yi4q4ef&cnich=sgc8sahs") + f.Add("/path?codsl=fpwfto6j") + f.Add("tvdus=fhlhlh1y") + f.Add("/foo.txt") + noWhitespace := regexp.MustCompile(`\s+`) + + f.Fuzz(func(t *testing.T, urlPath string) { + urlPath = noWhitespace.ReplaceAllString(strings.TrimSpace(urlPath), "%20") + baseURLs := []string{"https://foo.zone", "http://foo.zone", "ftp://foo.zone"} + for _, baseURL := range baseURLs { + fullURL := fmt.Sprintf("%s%s", baseURL, urlPath) + text := fmt.Sprintf("Hello world %s Hello World", fullURL) + found := extractURLs(text) + if len(found) != 1 { + t.Errorf("expected 1 URL '%s', but got %d for text '%s'", + fullURL, len(found), text) + } + if found[0] != fullURL { + t.Errorf("expected URL '%s', but got '%s' for text '%s'", + fullURL, found[0], text) + } + } + }) +} diff --git a/internal/platforms/linkedin/escapes.go b/internal/platforms/linkedin/escapes.go index 31110a0..5f803a5 100644 --- 
a/internal/platforms/linkedin/escapes.go +++ b/internal/platforms/linkedin/escapes.go @@ -1,7 +1,6 @@ package linkedin import ( - "regexp" "strings" ) @@ -38,17 +37,3 @@ func escapeLinkedInText(input string) string { return builder.String() } - -// extractURLs finds all occurrences of URLs starting with "http://" or "https://" in a given string. -func extractURLs(input string) []string { - // Regular expression pattern to match URLs starting with http:// or https:// - urlPattern := `(http://|https://|ftp://)[^\s]+` - - // Compile the regular expression - re := regexp.MustCompile(urlPattern) - - // Find all matches in the input string - urls := re.FindAllString(input, -1) - - return urls -} diff --git a/internal/platforms/linkedin/escapes_test.go b/internal/platforms/linkedin/escapes_test.go index f93b260..b1f9203 100644 --- a/internal/platforms/linkedin/escapes_test.go +++ b/internal/platforms/linkedin/escapes_test.go @@ -1,10 +1,6 @@ package linkedin import ( - "fmt" - "regexp" - "slices" - "strings" "testing" ) @@ -17,76 +13,3 @@ func TestLinkedInEscapes(t *testing.T) { t.Errorf("expected '%s' but got '%s'", expected, escaped) } } - -func TestLinkedInTwoURLsExtract(t *testing.T) { - text := `Hello world https://foo.zone - Hello universe http://world.universe test 123` - - urls := extractURLs(text) - if len(urls) != 2 { - t.Errorf("expected 2 URLs, but got %d", len(urls)) - } - - if !slices.Contains(urls, "https://foo.zone") { - t.Errorf("expected 'https://foo.zone' in the URL list, but got %v", urls) - } - if !slices.Contains(urls, "http://world.universe") { - t.Errorf("expected 'http://world.universe' in the URL list, but got %v", urls) - } -} - -func TestLinkedInURLExtract(t *testing.T) { - urls := []string{ - "http://foo.zone", - "http://foo.zone/", - "http://foo.zone?foo=bar", - "http://foo.zone/?foo=bar", - "http://foo.zone/?foo=bar", - "http://foo.zone/hurs?foo=bar", - "http://foo.zone?foo=bar&baz=bay", - } - - for _, url := range urls { - text := 
fmt.Sprintf("Hello world %s Hello World", url) - found := extractURLs(text) - if len(found) != 1 { - t.Errorf("expected 1 URL, but got %d for text '%s'", len(found), text) - } - if found[0] != url { - t.Errorf("expected URL '%s', but got '%s' for text '%s'", url, found[0], text) - } - } -} - -func FuzzLinkedInURLExtract(f *testing.F) { - f.Add("/path?myjfa=lwsr4imj&dgqeg=m3uwwsak") - f.Add("/?amfbm=bwzqu46m&xheuh=nv588d98") - f.Add("?tuupm=reng2p1y&cbjot=0g5qvpty") - f.Add("/path?qmcok=f%20w4tfp7g&awsnq=sjizuore&owdix=8s2dmqsv") - f.Add("?zwilf=868o24x1&fiwmp=1d5aqbvo&irhhr=xar7qbq7&eetpy=scmi9s8i") - f.Add("/path?mwhbm=psinstn6&nsjic=pfu0wnk9&lbmrz=5bixkhdt") - f.Add("/path?owbwo=67mkjiz2") - f.Add("/path?ohvxi=esy5qvml&zlvzt=2yi4q4ef&cnich=sgc8sahs") - f.Add("/path?codsl=fpwfto6j") - f.Add("tvdus=fhlhlh1y") - f.Add("/foo.txt") - noWhitespace := regexp.MustCompile(`\s+`) - - f.Fuzz(func(t *testing.T, urlPath string) { - urlPath = noWhitespace.ReplaceAllString(strings.TrimSpace(urlPath), "%20") - baseURLs := []string{"https://foo.zone", "http://foo.zone", "ftp://foo.zone"} - for _, baseURL := range baseURLs { - fullURL := fmt.Sprintf("%s%s", baseURL, urlPath) - text := fmt.Sprintf("Hello world %s Hello World", fullURL) - found := extractURLs(text) - if len(found) != 1 { - t.Errorf("expected 1 URL '%s', but got %d for text '%s'", - fullURL, len(found), text) - } - if found[0] != fullURL { - t.Errorf("expected URL '%s', but got '%s' for text '%s'", - fullURL, found[0], text) - } - } - }) -} -- cgit v1.2.3