From cd73c583404f5311111b5a77a7d6ed9576eabae4 Mon Sep 17 00:00:00 2001
From: hakluke <13975395+hakluke@users.noreply.github.com>
Date: Tue, 5 Apr 2022 16:25:04 +1000
Subject: [PATCH 1/5] updated readme to include proxy example

---
 README.md | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 1c407cf..8e76a53 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,12 @@ Timeout for each line of stdin after 5 seconds:
 cat urls.txt | hakrawler -timeout 5
 ```
 
+Send all requests through a proxy:
+
+```
+cat urls.txt | hakrawler -proxy http://localhost:8080
+```
+
 Include subdomains:
 
 ```
@@ -76,25 +82,21 @@ echo https://www.google.com | docker run --rm -i hakluke/hakrawler -subs
 ## Command-line options
 
 ```
+Usage of ./hakrawler:
   -d int
     	Depth to crawl. (default 2)
   -h string
    	Custom headers separated by two semi-colons. E.g. -h "Cookie: foo=bar;;Referer: http://example.com/"
  -insecure
    	Disable TLS verification.
+  -proxy string
+    	Proxy URL. Example: -proxy http://127.0.0.1:8080
   -s	Show the source of URL based on where it was found (href, form, script, etc.)
   -subs
    	Include subdomains for crawling.
   -t int
    	Number of threads to utilise. (default 8)
+  -timeout int
+    	Maximum time to crawl each URL from stdin, in seconds (default -1)
   -u	Show only unique urls
 ```
-
-## Version 2 note
-
-From version 2, hakrawler has been completely rewritten and dramatically simplified to align more closely with the unix philosophy.
-
-- It is now much faster and less buggy.
-- Many features have been deprecated (robots.txt parsing, JS file parsing, sitemap parsing, waybackurls), instead, these features are written into separate tools that can be piped to from hakrawler.
-- No more terminal colours because they can cause annoying issues when piping to other tools.
-- Version 1 was my first ever Go project and the code was bad.
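The new -proxy flag hands a URL through to the underlying Colly collector. The crawler internals are not part of this patch, but as a rough sketch of how that wiring could look — the flag handling and collector setup below are illustrative, not hakrawler's actual source:

```go
package main

import (
	"flag"
	"log"

	"github.com/gocolly/colly/v2"
)

func main() {
	// Stand-in for hakrawler's -proxy flag: an empty value means
	// "connect directly", anything else is handed to the collector.
	proxy := flag.String("proxy", "", "Proxy URL. E.g. -proxy http://127.0.0.1:8080")
	flag.Parse()

	c := colly.NewCollector()
	if *proxy != "" {
		// SetProxy routes all of the collector's requests through the given proxy URL.
		if err := c.SetProxy(*proxy); err != nil {
			log.Fatalf("invalid proxy %q: %v", *proxy, err)
		}
	}

	// Print each absolute URL found in an href, mirroring basic crawler output.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		log.Println(e.Request.AbsoluteURL(e.Attr("href")))
	})

	if err := c.Visit("http://example.com/"); err != nil {
		log.Fatal(err)
	}
}
```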
From db2a242866c4e80574f2c93f0779ebeb74a0fb92 Mon Sep 17 00:00:00 2001
From: "Luke Stephens (hakluke)" <13975395+hakluke@users.noreply.github.com>
Date: Wed, 6 Apr 2022 16:19:40 +1000
Subject: [PATCH 2/5] Update hakrawler.go

---
 hakrawler.go | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/hakrawler.go b/hakrawler.go
index 02dd6ed..719a5bb 100644
--- a/hakrawler.go
+++ b/hakrawler.go
@@ -228,21 +228,15 @@ func extractHostname(urlString string) (string, error) {
 
 // print result constructs output lines and sends them to the results chan
 func printResult(link string, sourceName string, showSource bool, results chan string, e *colly.HTMLElement) {
-	// If timeout occurs before goroutines are finished, recover from panic that may occur when attempting writing to results to closed result channel
-	defer func() {
-		if r := recover(); r != nil {
-			return
-		}
-	}()
-
 	result := e.Request.AbsoluteURL(link)
 	if result != "" {
 		if showSource {
 			result = "[" + sourceName + "] " + result
 		}
+		// If timeout occurs before goroutines are finished, recover from panic that may occur when attempting writing to results to closed results channel
 		defer func() {
 			if err := recover(); err != nil {
-				// nop dont care
+				return
 			}
 		}()
 		results <- result

From 2c0541b52955c10724e87383d31adf857bd8bd77 Mon Sep 17 00:00:00 2001
From: "Luke Stephens (hakluke)" <13975395+hakluke@users.noreply.github.com>
Date: Wed, 6 Apr 2022 16:21:29 +1000
Subject: [PATCH 3/5] update readme

---
 hakrawler.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hakrawler.go b/hakrawler.go
index 719a5bb..1b7a643 100644
--- a/hakrawler.go
+++ b/hakrawler.go
@@ -29,11 +29,11 @@ func main() {
 	maxSize := flag.Int("size", -1, "Page size limit, in KB.")
 	insecure := flag.Bool("insecure", false, "Disable TLS verification.")
 	subsInScope := flag.Bool("subs", false, "Include subdomains for crawling.")
-	showSource := flag.Bool("s", false, "Show the source of URL based on where it was found (href, form, script, etc.)")
+	showSource := flag.Bool("s", false, "Show the source of URL based on where it was found. E.g. href, form, script, etc.")
 	rawHeaders := flag.String(("h"), "", "Custom headers separated by two semi-colons. E.g. -h \"Cookie: foo=bar;;Referer: http://example.com/\" ")
-	unique := flag.Bool(("u"), false, "Show only unique urls")
-	proxy := flag.String(("proxy"), "", "Proxy URL. Example: -proxy http://127.0.0.1:8080")
-	timeout := flag.Int("timeout", -1, "Maximum time to crawl each URL from stdin, in seconds")
+	unique := flag.Bool(("u"), false, "Show only unique urls.")
+	proxy := flag.String(("proxy"), "", "Proxy URL. E.g.: -proxy http://127.0.0.1:8080")
+	timeout := flag.Int("timeout", -1, "Maximum time to crawl each URL from stdin, in seconds.")
 
 	flag.Parse()
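For context on Patch 2/5: when the -timeout deadline fires, the results channel is closed while crawler goroutines may still be in flight, and a send on a closed channel panics. The patch narrows the deferred recover() so it sits directly in the path that performs the channel write. A minimal, self-contained sketch of that pattern (illustrative, not hakrawler's code):

```go
package main

import "fmt"

// trySend mirrors the pattern in printResult: a send on a closed channel
// panics, so the sender defers a recover() immediately before writing.
func trySend(results chan string, result string) {
	defer func() {
		if err := recover(); err != nil {
			// The channel was closed by the timeout path before this
			// sender finished; drop the result instead of crashing.
			return
		}
	}()
	results <- result
}

func main() {
	results := make(chan string, 1)
	trySend(results, "https://example.com/") // delivered normally
	fmt.Println(<-results)

	close(results) // e.g. the -timeout deadline fired
	trySend(results, "https://example.com/late") // panic recovered, result dropped
	fmt.Println("still running")
}
```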
From 640718eb32cd3943460fadff39c2c8981b16c230 Mon Sep 17 00:00:00 2001
From: "Luke Stephens (hakluke)" <13975395+hakluke@users.noreply.github.com>
Date: Wed, 6 Apr 2022 16:24:15 +1000
Subject: [PATCH 4/5] Update README.md

---
 README.md | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 8e76a53..94cfb4c 100644
--- a/README.md
+++ b/README.md
@@ -82,7 +82,7 @@ echo https://www.google.com | docker run --rm -i hakluke/hakrawler -subs
 ## Command-line options
 
 ```
-Usage of ./hakrawler:
+Usage of hakrawler:
   -d int
     	Depth to crawl. (default 2)
   -h string
@@ -90,13 +90,15 @@ Usage of ./hakrawler:
   -insecure
     	Disable TLS verification.
   -proxy string
-    	Proxy URL. Example: -proxy http://127.0.0.1:8080
-  -s	Show the source of URL based on where it was found (href, form, script, etc.)
+    	Proxy URL. E.g. -proxy http://127.0.0.1:8080
+  -s	Show the source of URL based on where it was found. E.g. href, form, script, etc.
+  -size int
+    	Page size limit, in KB. (default -1)
   -subs
    	Include subdomains for crawling.
   -t int
    	Number of threads to utilise. (default 8)
   -timeout int
-    	Maximum time to crawl each URL from stdin, in seconds (default -1)
-  -u	Show only unique urls
+    	Maximum time to crawl each URL from stdin, in seconds. (default -1)
+  -u	Show only unique urls.
 ```

From 3d73808cce12a7d5f5473c4dba867e1ca2d0f6c1 Mon Sep 17 00:00:00 2001
From: "Luke Stephens (hakluke)" <13975395+hakluke@users.noreply.github.com>
Date: Wed, 6 Apr 2022 16:25:09 +1000
Subject: [PATCH 5/5] minor update to help menu

---
 hakrawler.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hakrawler.go b/hakrawler.go
index 1b7a643..ff68871 100644
--- a/hakrawler.go
+++ b/hakrawler.go
@@ -32,7 +32,7 @@ func main() {
 	showSource := flag.Bool("s", false, "Show the source of URL based on where it was found. E.g. href, form, script, etc.")
 	rawHeaders := flag.String(("h"), "", "Custom headers separated by two semi-colons. E.g. -h \"Cookie: foo=bar;;Referer: http://example.com/\" ")
 	unique := flag.Bool(("u"), false, "Show only unique urls.")
-	proxy := flag.String(("proxy"), "", "Proxy URL. E.g.: -proxy http://127.0.0.1:8080")
+	proxy := flag.String(("proxy"), "", "Proxy URL. E.g. -proxy http://127.0.0.1:8080")
 	timeout := flag.Int("timeout", -1, "Maximum time to crawl each URL from stdin, in seconds.")
 
 	flag.Parse()
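The help strings polished in Patches 3–5 are printed verbatim by Go's flag package, so the README's "Command-line options" block tracks the Go changes line for line. A small sketch of the correspondence, using a subset of the flags for illustration:

```go
package main

import "flag"

func main() {
	// A few hakrawler-style flags; the help strings are the ones the
	// patch series settles on.
	flag.String("proxy", "", "Proxy URL. E.g. -proxy http://127.0.0.1:8080")
	flag.Int("timeout", -1, "Maximum time to crawl each URL from stdin, in seconds.")
	flag.Bool("u", false, "Show only unique urls.")

	// PrintDefaults emits the option listing quoted in the README, with
	// non-zero defaults rendered as "(default -1)" and one-letter bool
	// flags like -u shown on a single line. Running the binary with -h
	// additionally prints the "Usage of <name>:" header above it.
	flag.PrintDefaults()
}
```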