Skip to content

Commit

Permalink
Merge pull request #9 from pppontusw/remove-fragments
Browse files — browse the repository at this point in the history
Enable removing fragments from URLs
  • Loading branch information
pppontusw authored May 9, 2024
2 parents 417dd38 + 8dd7038 commit e7fc257
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 11 deletions.
1 change: 1 addition & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type ScraperConfig struct {
PriceFormat string `yaml:"priceFormat"`
RetryString string `yaml:"retryString"`
UniqueParameters []string `yaml:"uniqueParameters"`
RemoveFragment bool `yaml:"removeFragment"`
}

type EmailConfig struct {
Expand Down
4 changes: 2 additions & 2 deletions pkg/scraper/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func (bs *BaseScraper) ParseHTML(htmlContent, fetchedUrl string) ([]models.Produ
}

itemLink, _ := s.Find(bs.Config.LinkSelector).Attr("href")
itemLink, err = utils.EnsureFullUrl(itemLink, fetchedUrl, bs.Config.UniqueParameters)
itemLink, err = utils.EnsureFullUrl(itemLink, fetchedUrl, bs.Config.UniqueParameters, bs.Config.RemoveFragment)
if err != nil {
log.Printf("Failed to get full URL %v", err)
}
Expand Down Expand Up @@ -139,7 +139,7 @@ func (bs *BaseScraper) ParseHTML(htmlContent, fetchedUrl string) ([]models.Produ
return
}
if href, exists := s.Attr("href"); exists {
nextURL, err = utils.EnsureFullUrl(href, fetchedUrl, []string{})
nextURL, err = utils.EnsureFullUrl(href, fetchedUrl, []string{}, false)
if err != nil {
log.Printf("Failed to get full URL %v", err)
}
Expand Down
6 changes: 5 additions & 1 deletion pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"strings"
)

func EnsureFullUrl(newUrl, fetchedUrl string, uniqueParameters []string) (string, error) {
func EnsureFullUrl(newUrl, fetchedUrl string, uniqueParameters []string, removeFragment bool) (string, error) {
newUrl = strings.TrimSpace(newUrl)
if newUrl == "" {
return "", nil
Expand Down Expand Up @@ -68,6 +68,10 @@ func EnsureFullUrl(newUrl, fetchedUrl string, uniqueParameters []string) (string
}
finalUrl.RawQuery = queryParams.Encode()

if removeFragment {
finalUrl.Fragment = ""
}

// Return the modified URL
return finalUrl.String(), nil
}
39 changes: 31 additions & 8 deletions pkg/utils/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl := ""
fetchedUrl := "https://example.com"
expectedResult := ""
result, err := EnsureFullUrl(newUrl, fetchedUrl, []string{})
result, err := EnsureFullUrl(newUrl, fetchedUrl, []string{}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -21,7 +21,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "https://example.com/product1"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -33,7 +33,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -45,7 +45,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1?param=value"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1?param=value"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -57,7 +57,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "/product1#section"
fetchedUrl = "https://example.com"
expectedResult = "https://example.com/product1#section"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -69,7 +69,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "./baz.php"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{})
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -81,7 +81,7 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "./baz.php?sid=12345"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"})
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -93,12 +93,35 @@ func TestEnsureFullUrl(t *testing.T) {
newUrl = "./baz.php?page=10&sid=12345&sort=asc"
fetchedUrl = "https://example.com/foo/bar.php"
expectedResult = "https://example.com/foo/baz.php?page=10&sort=asc"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"})
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}

// Test case 9: newUrl contains a fragment that should be removed
newUrl = "/bar#list=2345"
fetchedUrl = "https://example.com/foo/bar"
expectedResult = "https://example.com/bar"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{"sid"}, true)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}

// Test case 10: newUrl contains a fragment that should not be removed
newUrl = "/bar#list=2345"
fetchedUrl = "https://example.com/foo/bar"
expectedResult = "https://example.com/bar#list=2345"
result, err = EnsureFullUrl(newUrl, fetchedUrl, []string{}, false)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != expectedResult {
t.Errorf("Expected %s, but got %s", expectedResult, result)
}
}

0 comments on commit e7fc257

Please sign in to comment.