Skip to content

Commit

Permalink
Merge pull request #7 from pppontusw/remove-unique-url-parameter
Browse files Browse the repository at this point in the history
Support removing query parameters in URL
  • Loading branch information
pppontusw authored May 9, 2024
2 parents e692435 + 2de0d26 commit 402a8fc
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 19 deletions.
1 change: 1 addition & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type ScraperConfig struct {
NextPageSelector string `yaml:"nextPageSelector"`
PriceFormat string `yaml:"priceFormat"`
RetryString string `yaml:"retryString"`
UniqueParameters []string `yaml:"uniqueParameters"`
}

type EmailConfig struct {
Expand Down
4 changes: 2 additions & 2 deletions pkg/scraper/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func (bs *BaseScraper) ParseHTML(htmlContent, fetchedUrl string) ([]models.Produ
}

itemLink, _ := s.Find(bs.Config.LinkSelector).Attr("href")
itemLink, err = utils.EnsureFullUrl(itemLink, fetchedUrl)
itemLink, err = utils.EnsureFullUrl(itemLink, fetchedUrl, bs.Config.UniqueParameters)
if err != nil {
log.Printf("Failed to get full URL %v", err)
}
Expand Down Expand Up @@ -139,7 +139,7 @@ func (bs *BaseScraper) ParseHTML(htmlContent, fetchedUrl string) ([]models.Produ
return
}
if href, exists := s.Attr("href"); exists {
nextURL, err = utils.EnsureFullUrl(href, fetchedUrl)
nextURL, err = utils.EnsureFullUrl(href, fetchedUrl, []string{})
if err != nil {
log.Printf("Failed to get full URL %v", err)
}
Expand Down
41 changes: 30 additions & 11 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@ import (
"strings"
)

func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) {
func EnsureFullUrl(newUrl, fetchedUrl string, uniqueParameters []string) (string, error) {
newUrl = strings.TrimSpace(newUrl)
if newUrl == "" {
return "", nil
}

// Handle relative URLs starting with "./"
var finalUrl *url.URL
var err error

if strings.HasPrefix(newUrl, "./") {
// Handle relative URLs starting with "./"
newUrl = strings.TrimPrefix(newUrl, "./")
baseURL, err := url.Parse(fetchedUrl)
if err != nil {
Expand All @@ -32,12 +35,12 @@ func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) {
}

// Construct the new full URL
newFullUrl := baseURL.Scheme + "://" + baseURL.Host + baseURL.Path + newUrl
return newFullUrl, nil
}

// Handle absolute URLs that do not start with "http://" or "https://"
if !strings.HasPrefix(newUrl, "http://") && !strings.HasPrefix(newUrl, "https://") {
finalUrl, err = url.Parse(baseURL.Scheme + "://" + baseURL.Host + baseURL.Path + newUrl)
if err != nil {
return "", fmt.Errorf("error constructing final URL: %w", err)
}
} else if !strings.HasPrefix(newUrl, "http://") && !strings.HasPrefix(newUrl, "https://") {
// Handle links without an "http://" or "https://" scheme by resolving them against the fetched URL's scheme and host
baseURL, err := url.Parse(fetchedUrl)
if err != nil {
return "", fmt.Errorf("error parsing URL: %w", err)
Expand All @@ -46,9 +49,25 @@ func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) {
if !strings.HasPrefix(newUrl, "/") {
newUrl = "/" + newUrl
}
return baseURL.Scheme + "://" + baseURL.Host + newUrl, nil
finalUrl, err = url.Parse(baseURL.Scheme + "://" + baseURL.Host + newUrl)
if err != nil {
return "", fmt.Errorf("error constructing final URL: %w", err)
}
} else {
// Parse complete URL
finalUrl, err = url.Parse(newUrl)
if err != nil {
return "", fmt.Errorf("error parsing URL: %w", err)
}
}

// Remove specified unique parameters from the URL
queryParams := finalUrl.Query()
for _, param := range uniqueParameters {
queryParams.Del(param)
}
finalUrl.RawQuery = queryParams.Encode()

// Return the new URL if it's already a complete URL
return newUrl, nil
// Return the modified URL
return finalUrl.String(), nil
}
37 changes: 31 additions & 6 deletions pkg/utils/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl := ""
fetchedUrl := "https://example.com"
expectedResult := ""
result, err := EnsureFullUrl(newUrl, fetchedUrl)
result, err := EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -21,7 +21,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "https://example.com/product1"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -33,7 +33,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -45,7 +45,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1?param=value"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1?param=value"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -57,7 +57,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1#section"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1#section"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -69,11 +69,36 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "./baz.php"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php"
result, err = EnsureFullUrl(newUrl, fetchedUrl)
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}

// Test case 7: newUrl contains only the sid query parameter, which is stripped
newUrl = "./baz.php?sid=12345"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}

// Test case 8: newUrl contains many query parameters, sid is stripped
newUrl = "./baz.php?page=10&sid=12345&sort=asc"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php?page=10&sort=asc"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"})
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}

}

0 comments on commit 402a8fc

Please sign in to comment.