From d1b381234d749586a51f7108ca2bd7925b96b56d Mon Sep 17 00:00:00 2001 From: bebiksik Date: Thu, 11 May 2023 19:20:55 +0200 Subject: [PATCH 1/4] Update hakrawler.go --- hakrawler.go | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/hakrawler.go b/hakrawler.go index 1da86ed..d164808 100644 --- a/hakrawler.go +++ b/hakrawler.go @@ -45,6 +45,7 @@ func main() { proxy := flag.String(("proxy"), "", "Proxy URL. E.g. -proxy http://127.0.0.1:8080") timeout := flag.Int("timeout", -1, "Maximum time to crawl each URL from stdin, in seconds.") disableRedirects := flag.Bool("dr", false, "Disable following HTTP redirects.") + match := flag.String("match", "", "Match a specific domain apex.") flag.Parse() @@ -127,19 +128,19 @@ func main() { abs_link := e.Request.AbsoluteURL(link) if strings.Contains(abs_link, url) || !*inside { - printResult(link, "href", *showSource, *showWhere, *showJson, results, e) + printResult(link, "href", *showSource, *showWhere, *showJson, results, e, *match) e.Request.Visit(link) } }) // find and print all the JavaScript files c.OnHTML("script[src]", func(e *colly.HTMLElement) { - printResult(e.Attr("src"), "script", *showSource, *showWhere, *showJson, results, e) + printResult(e.Attr("src"), "script", *showSource, *showWhere, *showJson, results, e, *match) }) // find and print all the form action URLs c.OnHTML("form[action]", func(e *colly.HTMLElement) { - printResult(e.Attr("action"), "form", *showSource, *showWhere, *showJson, results, e) + printResult(e.Attr("action"), "form", *showSource, *showWhere, *showJson, results, e, *match) }) // add the custom headers @@ -248,10 +249,19 @@ func extractHostname(urlString string) (string, error) { } // print result constructs output lines and sends them to the results chan -func printResult(link string, sourceName string, showSource bool, showWhere bool, showJson bool, results chan string, e *colly.HTMLElement) { +func printResult(link string, sourceName string, showSource 
bool, showWhere bool, showJson bool, results chan string, e *colly.HTMLElement, match string) { result := e.Request.AbsoluteURL(link) whereURL := e.Request.URL.String() if result != "" { + parsedUrl, err := url.Parse(result) + if err != nil { + log.Println("Error parsing URL:", err) + return + } + if match != "" && !strings.HasSuffix(parsedUrl.Hostname(), match) { + return + } + if showJson { where := "" if showWhere { From f9d1c55fd6bfa5a60fd450be397d77fde07e7f0a Mon Sep 17 00:00:00 2001 From: bebiksik Date: Thu, 11 May 2023 19:27:27 +0200 Subject: [PATCH 2/4] fix spacing --- hakrawler.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hakrawler.go b/hakrawler.go index d164808..4fbf0c0 100644 --- a/hakrawler.go +++ b/hakrawler.go @@ -254,13 +254,13 @@ func printResult(link string, sourceName string, showSource bool, showWhere bool whereURL := e.Request.URL.String() if result != "" { parsedUrl, err := url.Parse(result) - if err != nil { - log.Println("Error parsing URL:", err) - return - } - if match != "" && !strings.HasSuffix(parsedUrl.Hostname(), match) { - return - } + if err != nil { + log.Println("Error parsing URL:", err) + return + } + if match != "" && !strings.HasSuffix(parsedUrl.Hostname(), match) { + return + } if showJson { where := "" From b14e59161c0db39c89da8b22854565ddfd65e4cf Mon Sep 17 00:00:00 2001 From: bebiksik Date: Thu, 11 May 2023 19:37:07 +0200 Subject: [PATCH 3/4] Update hakrawler.go --- hakrawler.go | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/hakrawler.go b/hakrawler.go index 4fbf0c0..42fe02e 100644 --- a/hakrawler.go +++ b/hakrawler.go @@ -45,7 +45,7 @@ func main() { proxy := flag.String(("proxy"), "", "Proxy URL. E.g. 
-proxy http://127.0.0.1:8080") timeout := flag.Int("timeout", -1, "Maximum time to crawl each URL from stdin, in seconds.") disableRedirects := flag.Bool("dr", false, "Disable following HTTP redirects.") - match := flag.String("match", "", "Match a specific domain apex.") + matchApex := flag.Bool("match-apex", false, "Match domain apex.") flag.Parse() @@ -80,6 +80,17 @@ func main() { continue } + apexDomain, err := getApexDomain(hostname) + if err != nil { + log.Println("Error getting apex domain:", err) + continue + } + + match := apexDomain + if !*matchApex { + match = "" + } + allowed_domains := []string{hostname} // if "Host" header is set, append it to allowed domains if headers != nil { @@ -93,7 +104,7 @@ func main() { // default user agent header colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"), // set custom headers - colly.Headers(headers), + // colly.Headers(headers), for some reason this doesn't work :( // limit crawling to the domain of the specified URL colly.AllowedDomains(allowed_domains...), // set MaxDepth to the specified depth @@ -128,19 +139,19 @@ func main() { abs_link := e.Request.AbsoluteURL(link) if strings.Contains(abs_link, url) || !*inside { - printResult(link, "href", *showSource, *showWhere, *showJson, results, e, *match) + printResult(link, "href", *showSource, *showWhere, *showJson, results, e, match) e.Request.Visit(link) } }) // find and print all the JavaScript files c.OnHTML("script[src]", func(e *colly.HTMLElement) { - printResult(e.Attr("src"), "script", *showSource, *showWhere, *showJson, results, e, *match) + printResult(e.Attr("src"), "script", *showSource, *showWhere, *showJson, results, e, match) }) // find and print all the form action URLs c.OnHTML("form[action]", func(e *colly.HTMLElement) { - printResult(e.Attr("action"), "form", *showSource, *showWhere, *showJson, results, e, *match) + printResult(e.Attr("action"), "form", *showSource, *showWhere, *showJson, results, e, match) }) 
// add the custom headers @@ -254,13 +265,13 @@ func printResult(link string, sourceName string, showSource bool, showWhere bool whereURL := e.Request.URL.String() if result != "" { parsedUrl, err := url.Parse(result) - if err != nil { - log.Println("Error parsing URL:", err) - return - } - if match != "" && !strings.HasSuffix(parsedUrl.Hostname(), match) { - return - } + if err != nil { + log.Println("Error parsing URL:", err) + return + } + if match != "" && !strings.HasSuffix(parsedUrl.Hostname(), match) { + return + } if showJson { where := "" @@ -300,3 +311,13 @@ func isUnique(url string) bool { sm.Store(url, true) return true } + +// getApexDomain returns the apex domain of a hostname +func getApexDomain(hostname string) (string, error) { + parts := strings.Split(hostname, ".") + if len(parts) < 2 { + return "", errors.New("invalid hostname") + } + + return parts[len(parts)-2] + "." + parts[len(parts)-1], nil +} From 2474a7019ee1e4156de9594cf4d4415f51decce7 Mon Sep 17 00:00:00 2001 From: bebiksik Date: Thu, 11 May 2023 19:42:18 +0200 Subject: [PATCH 4/4] Update hakrawler.go --- hakrawler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hakrawler.go b/hakrawler.go index 42fe02e..1630997 100644 --- a/hakrawler.go +++ b/hakrawler.go @@ -104,7 +104,7 @@ func main() { // default user agent header colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"), // set custom headers - // colly.Headers(headers), for some reason this doesn't work :( + colly.Headers(headers), // limit crawling to the domain of the specified URL colly.AllowedDomains(allowed_domains...), // set MaxDepth to the specified depth