diff --git a/pkg/config/config.go b/pkg/config/config.go index 887335e..33925e4 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -17,6 +17,7 @@ type ScraperConfig struct { NextPageSelector string `yaml:"nextPageSelector"` PriceFormat string `yaml:"priceFormat"` RetryString string `yaml:"retryString"` + UniqueParameters []string `yaml:"uniqueParameters"` } type EmailConfig struct { diff --git a/pkg/scraper/parser.go b/pkg/scraper/parser.go index aa1fa56..15468f3 100644 --- a/pkg/scraper/parser.go +++ b/pkg/scraper/parser.go @@ -98,7 +98,7 @@ func (bs *BaseScraper) ParseHTML(htmlContent, fetchedUrl string) ([]models.Produ } itemLink, _ := s.Find(bs.Config.LinkSelector).Attr("href") - itemLink, err = utils.EnsureFullUrl(itemLink, fetchedUrl) + itemLink, err = utils.EnsureFullUrl(itemLink, fetchedUrl, bs.Config.UniqueParameters) if err != nil { log.Printf("Failed to get full URL %v", err) } @@ -139,7 +139,7 @@ func (bs *BaseScraper) ParseHTML(htmlContent, fetchedUrl string) ([]models.Produ return } if href, exists := s.Attr("href"); exists { - nextURL, err = utils.EnsureFullUrl(href, fetchedUrl) + nextURL, err = utils.EnsureFullUrl(href, fetchedUrl, []string{}) if err != nil { log.Printf("Failed to get full URL %v", err) } diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index c04eb39..dbc81d0 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -6,14 +6,17 @@ import ( "strings" ) -func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) { +func EnsureFullUrl(newUrl, fetchedUrl string, uniqueParameters []string) (string, error) { newUrl = strings.TrimSpace(newUrl) if newUrl == "" { return "", nil } - // Handle relative URLs starting with "./" + var finalUrl *url.URL + var err error + if strings.HasPrefix(newUrl, "./") { + // Handle relative URLs starting with "./" newUrl = strings.TrimPrefix(newUrl, "./") baseURL, err := url.Parse(fetchedUrl) if err != nil { @@ -32,12 +35,12 @@ func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) 
{ } // Construct the new full URL - newFullUrl := baseURL.Scheme + "://" + baseURL.Host + baseURL.Path + newUrl - return newFullUrl, nil - } - - // Handle absolute URLs that do not start with "http://" or "https://" - if !strings.HasPrefix(newUrl, "http://") && !strings.HasPrefix(newUrl, "https://") { + finalUrl, err = url.Parse(baseURL.Scheme + "://" + baseURL.Host + baseURL.Path + newUrl) + if err != nil { + return "", fmt.Errorf("error constructing final URL: %w", err) + } + } else if !strings.HasPrefix(newUrl, "http://") && !strings.HasPrefix(newUrl, "https://") { + // Handle absolute URLs that do not start with "http://" or "https://" baseURL, err := url.Parse(fetchedUrl) if err != nil { return "", fmt.Errorf("error parsing URL: %w", err) @@ -46,9 +49,31 @@ func EnsureFullUrl(newUrl, fetchedUrl string) (string, error) { if !strings.HasPrefix(newUrl, "/") { newUrl = "/" + newUrl } - return baseURL.Scheme + "://" + baseURL.Host + newUrl, nil + finalUrl, err = url.Parse(baseURL.Scheme + "://" + baseURL.Host + newUrl) + if err != nil { + return "", fmt.Errorf("error constructing final URL: %w", err) + } + } else { + // Parse complete URL + finalUrl, err = url.Parse(newUrl) + if err != nil { + return "", fmt.Errorf("error parsing URL: %w", err) + } + } + + // Remove specified unique parameters; re-encode only if one was present, because Encode sorts keys and re-escapes the query + queryParams := finalUrl.Query() + removed := false + for _, param := range uniqueParameters { + if _, ok := queryParams[param]; ok { + queryParams.Del(param) + removed = true + } } + if removed { + finalUrl.RawQuery = queryParams.Encode() + } - // Return the new URL if it's already a complete URL - return newUrl, nil + // Return the modified URL + return finalUrl.String(), nil } diff --git a/pkg/utils/utils_test.go b/pkg/utils/utils_test.go index d3dd477..6581cf6 100644 --- a/pkg/utils/utils_test.go +++ b/pkg/utils/utils_test.go @@ -9,7 +9,7 @@ func TestEnsureFullUrl(t *testing.T) { newUrl := "" fetchedUrl := "https://example.com" expectedResult := "" - result, err := EnsureFullUrl(newUrl, fetchedUrl) + result, err := EnsureFullUrl(newUrl, fetchedUrl, 
[]string{}) if err != nil { t.Errorf("Unexpected error: %v", err) } @@ -21,7 +21,7 @@ func TestEnsureFullUrl(t *testing.T) { newUrl = "https://example.com/product1" fetchedUrl = "https://example.com" expectedResult = "https://example.com/product1" - result, err = EnsureFullUrl(newUrl, fetchedUrl) + result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}) if err != nil { t.Errorf("Unexpected error: %v", err) } @@ -33,7 +33,7 @@ func TestEnsureFullUrl(t *testing.T) { newUrl = "/product1" fetchedUrl = "https://example.com" expectedResult = "https://example.com/product1" - result, err = EnsureFullUrl(newUrl, fetchedUrl) + result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}) if err != nil { t.Errorf("Unexpected error: %v", err) } @@ -45,7 +45,7 @@ func TestEnsureFullUrl(t *testing.T) { newUrl = "/product1?param=value" fetchedUrl = "https://example.com" expectedResult = "https://example.com/product1?param=value" - result, err = EnsureFullUrl(newUrl, fetchedUrl) + result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}) if err != nil { t.Errorf("Unexpected error: %v", err) } @@ -57,7 +57,7 @@ func TestEnsureFullUrl(t *testing.T) { newUrl = "/product1#section" fetchedUrl = "https://example.com" expectedResult = "https://example.com/product1#section" - result, err = EnsureFullUrl(newUrl, fetchedUrl) + result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}) if err != nil { t.Errorf("Unexpected error: %v", err) } @@ -69,11 +69,36 @@ func TestEnsureFullUrl(t *testing.T) { newUrl = "./baz.php" fetchedUrl = "https://example.com/foo/bar.php" expectedResult = "https://example.com/foo/baz.php" - result, err = EnsureFullUrl(newUrl, fetchedUrl) + result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}) if err != nil { t.Errorf("Unexpected error: %v", err) } if result != expectedResult { t.Errorf("Expected %s, but got %s", expectedResult, result) } + + // Test case 7: newUrl contains &sid query parameter + newUrl = "./baz.php?sid=12345" + fetchedUrl = 
"https://example.com/foo/bar.php" + expectedResult = "https://example.com/foo/baz.php" + result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"}) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if result != expectedResult { + t.Errorf("Expected %s, but got %s", expectedResult, result) + } + + // Test case 8: newUrl contains many query parameters, sid is stripped + newUrl = "./baz.php?page=10&sid=12345&sort=asc" + fetchedUrl = "https://example.com/foo/bar.php" + expectedResult = "https://example.com/foo/baz.php?page=10&sort=asc" + result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"}) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if result != expectedResult { + t.Errorf("Expected %s, but got %s", expectedResult, result) + } + }