
Commit

Merge pull request #71 from epicfaace/www
Add www option for -scope, add tests and CI
hakluke authored Aug 3, 2020
2 parents 2e8b6b4 + c5e3156 commit e39a514
Showing 7 changed files with 217 additions and 40 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,24 @@
name: Test
on: [push, pull_request]
jobs:
  build:
    name: Build
    runs-on: ubuntu-latest
    steps:
    - name: Set up Go 1.14
      uses: actions/setup-go@v1
      with:
        go-version: 1.14

    - name: Check out source code
      uses: actions/checkout@v1

    - name: Build
      env:
        GOPROXY: "https://proxy.golang.org"
      run: go build .

    - name: Test
      env:
        GOPROXY: "https://proxy.golang.org"
      run: go test -v .
3 changes: 2 additions & 1 deletion README.md
@@ -67,7 +67,7 @@ go get github.com/hakluke/hakrawler
Note that if you need to do this, you probably want to add your Go bin directory to your $PATH to make things easier!

## Usage
Note: multiple domains can be crawled by piping them into hakrawler from stdin. If only a single domain is being crawled, it can be added by using the -domain flag.
Note: multiple domains can be crawled by piping them into hakrawler from stdin. If only a single domain is being crawled, it can be added by using the -url flag.
```
$ hakrawler -h
Usage of hakrawler:
@@ -94,6 +94,7 @@ Usage of hakrawler:
-scope string
Scope to include:
strict = specified domain only
www = specified domain and "www" subdomain
subs = specified domain and subdomains
yolo = everything (default "subs")
-sitemap
1 change: 1 addition & 0 deletions go.mod
@@ -18,4 +18,5 @@ require (
github.com/temoto/robotstxt v1.1.1 // indirect
golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553 // indirect
google.golang.org/appengine v1.6.5 // indirect
gopkg.in/h2non/gock.v1 v1.0.15
)
52 changes: 52 additions & 0 deletions go.sum
@@ -0,0 +1,52 @@
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/antchfx/htmlquery v1.2.1 h1:bSH+uvb5fh6gLAi2UXVwD4qGJVNJi9P+46gvPhZ+D/s=
github.com/antchfx/htmlquery v1.2.1/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
github.com/antchfx/xmlquery v1.2.2 h1:5FHCVxIjULz8pYI8n+MwbdblnLDmK6LQJicRy/aCtTI=
github.com/antchfx/xmlquery v1.2.2/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
github.com/antchfx/xpath v1.1.3 h1:daQFH0uBhQsuNLrO+YxaPUNrxM5xgTA1kGPtVE4hWpI=
github.com/antchfx/xpath v1.1.3/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7 h1:5ZkaAPbicIKTF2I64qf5Fh8Aa83Q/dnOafMYV0OMwjA=
github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY=
github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/h2non/parth v0.0.0-20190131123155-b4df798d6542 h1:2VTzZjLZBgl62/EtslCrtky5vbi9dd7HrQPQIx6wqiw=
github.com/h2non/parth v0.0.0-20190131123155-b4df798d6542/go.mod h1:Ow0tF8D4Kplbc8s8sSb3V2oUCygFHVp8gC3Dn6U4MNI=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/logrusorgru/aurora v0.0.0-20200102142835-e9ef32dff381 h1:bqDmpDG49ZRnB5PcgP0RXtQvnMSgIF14M7CBd2shtXs=
github.com/logrusorgru/aurora v0.0.0-20200102142835-e9ef32dff381/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4=
github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32/go.mod h1:9wM+0iRr9ahx58uYLpLIr5fm8diHn0JbqRycJi6w0Ms=
github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 h1:2vmb32OdDhjZf2ETGDlr9n8RYXx7c+jXPxMiPbwnA+8=
github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4/go.mod h1:2JQx4jDHmWrbABvpOayg/+OTU6ehN0IyK2EHzceXpJo=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553 h1:efeOvDhwQ29Dj3SdAV/MJf8oukgn+8D8WgaCaRMchF8=
golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
google.golang.org/appengine v1.6.5 h1:tycE03LOZYQNhDpS27tcQdAzLCVMaj7QT2SXxebnpCM=
google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
gopkg.in/h2non/gock.v1 v1.0.15 h1:SzLqcIlb/fDfg7UvukMpNcWsu7sI5tWwL+KCATZqks0=
gopkg.in/h2non/gock.v1 v1.0.15/go.mod h1:sX4zAkdYX1TRGJ2JY156cFspQn4yRWn6p9EMdODlynE=
81 changes: 42 additions & 39 deletions main.go
@@ -4,10 +4,11 @@ import (
"bufio"
"flag"
"fmt"
"io"
"io/ioutil"
"net/http"
"net/url"
"net/http/httputil"
"net/url"
"os"
"strings"
"sync"
@@ -19,57 +20,60 @@ import (
"github.com/logrusorgru/aurora"
)

var out io.Writer = os.Stdout

func banner(au aurora.Aurora) {
fmt.Print(au.BrightRed(`
fmt.Fprint(out, au.BrightRed(`
██╗ ██╗ █████╗ ██╗ ██╗██████╗ █████╗ ██╗ ██╗██╗ ███████╗██████╗
██║ ██║██╔══██╗██║ ██╔╝██╔══██╗██╔══██╗██║ ██║██║ ██╔════╝██╔══██╗
███████║███████║█████╔╝ ██████╔╝███████║██║ █╗ ██║██║ █████╗ ██████╔╝
██╔══██║██╔══██║██╔═██╗ ██╔══██╗██╔══██║██║███╗██║██║ ██╔══╝ ██╔══██╗
██║ ██║██║ ██║██║ ██╗██║ ██║██║ ██║╚███╔███╔╝███████╗███████╗██║ ██║
╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝╚══╝ ╚══════╝╚══════╝╚═╝ ╚═╝
`))
fmt.Println(aurora.BgBlue(au.BrightYellow(" Crafted with <3 by hakluke ")))
fmt.Fprintln(out, aurora.BgBlue(au.BrightYellow(" Crafted with <3 by hakluke ")))
}

func main() {
conf := config.NewConfig()
// define and parse command line flags
flag.StringVar(&conf.Url, "url", "", "The url that you wish to crawl, e.g. google.com or https://example.com. Schema defaults to http")
flag.IntVar(&conf.Depth, "depth", 1, "Maximum depth to crawl, the default is 1. Anything above 1 will include URLs from robots, sitemap, waybackurls and the initial crawler as a seed. Higher numbers take longer but yield more results.")
flag.StringVar(&conf.Outdir, "outdir", "", "Directory to save discovered raw HTTP requests")
flag.StringVar(&conf.Cookie, "cookie", "", "The value of this will be included as a Cookie header")
flag.StringVar(&conf.AuthHeader, "auth", "", "The value of this will be included as a Authorization header")
flag.StringVar(&conf.Headers, "headers", "", "Headers to add in all requests. Multiple should be separated by semi-colon, e.g. HeaderOne: ValueOne;HeaderTwo: ValueTwo")
flag.StringVar(&conf.Scope, "scope", "subs", "Scope to include:\nstrict = specified domain only\nsubs = specified domain and subdomains\nyolo = everything")
flag.BoolVar(&conf.Wayback, "usewayback", false, "Query wayback machine for URLs and add them as seeds for the crawler")
flag.BoolVar(&conf.Plain, "plain", false, "Don't use colours or print the banners to allow for easier parsing")
flag.BoolVar(&conf.Nocolor, "nocolor", false, "Print the banners but without ANSI color codes")
flag.BoolVar(&conf.Runlinkfinder, "linkfinder", false, "Run linkfinder on javascript files.")
commandLine := flag.NewFlagSet("", flag.ExitOnError)
commandLine.StringVar(&conf.Url, "url", "", "The url that you wish to crawl, e.g. google.com or https://example.com. Schema defaults to http")
commandLine.IntVar(&conf.Depth, "depth", 1, "Maximum depth to crawl, the default is 1. Anything above 1 will include URLs from robots, sitemap, waybackurls and the initial crawler as a seed. Higher numbers take longer but yield more results.")
commandLine.StringVar(&conf.Outdir, "outdir", "", "Directory to save discovered raw HTTP requests")
commandLine.StringVar(&conf.Cookie, "cookie", "", "The value of this will be included as a Cookie header")
commandLine.StringVar(&conf.AuthHeader, "auth", "", "The value of this will be included as a Authorization header")
commandLine.StringVar(&conf.Headers, "headers", "", "Headers to add in all requests. Multiple should be separated by semi-colon, e.g. HeaderOne: ValueOne;HeaderTwo: ValueTwo")
commandLine.StringVar(&conf.Scope, "scope", "subs", "Scope to include:\nstrict = specified domain only\nwww = specified domain and \"www\" subdomain\nsubs = specified domain and subdomains\nyolo = everything")
commandLine.BoolVar(&conf.Wayback, "usewayback", false, "Query wayback machine for URLs and add them as seeds for the crawler")
commandLine.BoolVar(&conf.Plain, "plain", false, "Don't use colours or print the banners to allow for easier parsing")
commandLine.BoolVar(&conf.Nocolor, "nocolor", false, "Print the banners but without ANSI color codes")
commandLine.BoolVar(&conf.Runlinkfinder, "linkfinder", false, "Run linkfinder on javascript files.")

// which data to include in output?
flag.BoolVar(&conf.DisplayVersion, "v", false, "Display version and exit")
flag.BoolVar(&conf.IncludeJS, "js", false, "Include links to utilised JavaScript files")
flag.BoolVar(&conf.IncludeSubs, "subs", false, "Include subdomains in output")
flag.BoolVar(&conf.IncludeURLs, "urls", false, "Include URLs in output")
flag.BoolVar(&conf.IncludeForms, "forms", false, "Include form actions in output")
flag.BoolVar(&conf.IncludeRobots, "robots", false, "Include robots.txt entries in output")
flag.BoolVar(&conf.IncludeSitemap, "sitemap", false, "Include sitemap.xml entries in output")
flag.BoolVar(&conf.IncludeWayback, "wayback", false, "Include wayback machine entries in output")
flag.BoolVar(&conf.IncludeAll, "all", true, "Include everything in output - this is the default, so this option is superfluous")
flag.BoolVar(&conf.Insecure, "insecure", false, "Ignore invalid HTTPS certificates")
flag.Parse()
commandLine.BoolVar(&conf.DisplayVersion, "v", false, "Display version and exit")
commandLine.BoolVar(&conf.IncludeJS, "js", false, "Include links to utilised JavaScript files")
commandLine.BoolVar(&conf.IncludeSubs, "subs", false, "Include subdomains in output")
commandLine.BoolVar(&conf.IncludeURLs, "urls", false, "Include URLs in output")
commandLine.BoolVar(&conf.IncludeForms, "forms", false, "Include form actions in output")
commandLine.BoolVar(&conf.IncludeRobots, "robots", false, "Include robots.txt entries in output")
commandLine.BoolVar(&conf.IncludeSitemap, "sitemap", false, "Include sitemap.xml entries in output")
commandLine.BoolVar(&conf.IncludeWayback, "wayback", false, "Include wayback machine entries in output")
commandLine.BoolVar(&conf.IncludeAll, "all", true, "Include everything in output - this is the default, so this option is superfluous")
commandLine.BoolVar(&conf.Insecure, "insecure", false, "Ignore invalid HTTPS certificates")
commandLine.Parse(os.Args[1:])

// Verify flags
err := config.VerifyFlags(&conf)
if err != nil {
fmt.Println(err)
flag.Usage()
fmt.Fprintln(out, err)
commandLine.Usage()
os.Exit(1)
}

// if -v is given, just display version number and exit
if conf.DisplayVersion {
fmt.Println(conf.Version)
fmt.Fprintln(out, conf.Version)
os.Exit(1)
}

@@ -85,10 +89,9 @@ func main() {
banner(au)
}

stdout := bufio.NewWriter(os.Stdout)

writer := bufio.NewWriter(out)

// c := collector.NewCollector(&conf, au, stdout)
// c := collector.NewCollector(&conf, au, writer)

urls := make(chan string, 1)
var reqsMade []*http.Request
@@ -109,14 +112,14 @@
}()
}

// flush to stdout periodically
// flush to writer periodically
t := time.NewTicker(time.Millisecond * 500)
defer t.Stop()
go func() {
for {
select {
case <-t.C:
stdout.Flush()
writer.Flush()
}
}
}()
@@ -130,29 +133,29 @@
}
parsedUrl, err := url.Parse(site)
if err != nil {
writeErrAndFlush(stdout, err.Error(), au)
writeErrAndFlush(writer, err.Error(), au)
return
}
c := collector.NewCollector(&conf, au, stdout, parsedUrl.Host)
c := collector.NewCollector(&conf, au, writer, parsedUrl.Host)
// url set but does not include schema
reqsMade, crawlErr = c.Crawl(site)

// Report errors and flush requests to files as we go
if crawlErr != nil {
writeErrAndFlush(stdout, crawlErr.Error(), au)
writeErrAndFlush(writer, crawlErr.Error(), au)
}
if conf.Outdir != "" {
_, err := os.Stat(conf.Outdir)
if os.IsNotExist(err) {
errDir := os.MkdirAll(conf.Outdir, 0755)
if errDir != nil {
writeErrAndFlush(stdout, errDir.Error(), au)
writeErrAndFlush(writer, errDir.Error(), au)
}
}

err = printRequestsToRandomFiles(reqsMade, conf.Outdir)
if err != nil {
writeErrAndFlush(stdout, err.Error(), au)
writeErrAndFlush(writer, err.Error(), au)
}
}

@@ -162,7 +165,7 @@ func main() {
wg.Wait()

// just in case anything is still in buffer
stdout.Flush()
writer.Flush()
}

func readStdin() <-chan string {
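The main.go changes above are largely a testability refactor: output goes through a package-level `out io.Writer` instead of `os.Stdout`, and flags are registered on a dedicated `flag.FlagSet` so that `main()` can be invoked repeatedly from a test without touching the global `flag.CommandLine`. A minimal sketch of that pattern (illustrative flag and names, not hakrawler's actual code):

```go
package main

import (
	"flag"
	"fmt"
	"io"
	"os"
)

// Tests can swap this for a bytes.Buffer to capture everything main prints.
var out io.Writer = os.Stdout

func main() {
	// A fresh FlagSet on every call avoids the "flag redefined" panic that
	// the package-level flag.CommandLine would raise on a second run.
	commandLine := flag.NewFlagSet("", flag.ExitOnError)
	name := commandLine.String("name", "world", "who to greet")
	commandLine.Parse(os.Args[1:])

	fmt.Fprintf(out, "hello, %s\n", *name)
}
```

A test then sets `os.Args`, points `out` at a buffer, calls `main()`, and asserts on the buffer's contents, which is exactly the shape of `Test_main` below.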
88 changes: 88 additions & 0 deletions main_test.go
@@ -0,0 +1,88 @@
package main

import (
"bytes"
"os"
"strings"
"testing"

"gopkg.in/h2non/gock.v1"
)

func Test_main(t *testing.T) {
defer gock.Off()

gock.New("http://example.com").
Get("/").
Persist().
Reply(200).
SetHeader("Content-Type", "text/html").
BodyString(`
<a href="http://example.com/link"></a>
<a href="http://www.example.com/link"></a>
<a href="http://sub.example.com/link"></a>
<a href="http://another-example.com/link"></a>
`)

tests := []struct {
name string
args []string
output []string
}{
{
name: "normal scope (subs)",
args: []string{"hakrawler", "-url", "http://example.com", "-plain"},
output: []string{
"http://example.com/link",
// "example.com", // TODO: this url should show up -- fix this bug.
"http://www.example.com/link",
"www.example.com",
"http://sub.example.com/link",
"sub.example.com",
},
},
{
name: "scope strict",
args: []string{"hakrawler", "-url", "http://example.com", "-plain", "-scope", "strict"},
output: []string{
"http://example.com/link",
"example.com",
},
},
{
name: "scope www",
args: []string{"hakrawler", "-url", "http://example.com", "-plain", "-scope", "www"},
output: []string{
"http://example.com/link",
"example.com",
"http://www.example.com/link",
"www.example.com",
},
},
{
name: "scope yolo",
args: []string{"hakrawler", "-url", "http://example.com", "-plain", "-scope", "yolo"},
output: []string{
"http://example.com/link",
"example.com",
"http://www.example.com/link",
"www.example.com",
"http://sub.example.com/link",
"sub.example.com",
"http://another-example.com/link",
"another-example.com",
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
os.Args = tt.args
out = bytes.NewBuffer(nil)
main()
output := strings.Join(tt.output[:], "\n")
if actual := strings.TrimRight(out.(*bytes.Buffer).String(), "\n"); actual != output {
t.Fatalf("expected <%s>, but got <%s>", output, actual)
}
})
}
}
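The tests rely on gock to stub out HTTP, so no real requests leave the machine: `gock.New(...)` intercepts `http.DefaultTransport` until `gock.Off()` restores it. A stripped-down sketch of that mechanism (a standalone example against the gock.v1 API, not part of this commit):

```go
package main

import (
	"io/ioutil"
	"net/http"
	"strings"
	"testing"

	"gopkg.in/h2non/gock.v1"
)

func TestGockIntercepts(t *testing.T) {
	defer gock.Off() // restore the real transport when the test finishes

	// Any GET to http://example.com/ now gets this canned HTML back.
	gock.New("http://example.com").
		Get("/").
		Reply(200).
		BodyString(`<a href="http://example.com/link"></a>`)

	resp, err := http.Get("http://example.com/")
	if err != nil {
		t.Fatal(err)
	}
	defer resp.Body.Close()

	body, _ := ioutil.ReadAll(resp.Body)
	if !strings.Contains(string(body), "/link") {
		t.Fatalf("expected mocked body, got %q", body)
	}
}
```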
8 changes: 8 additions & 0 deletions pkg/collector/collector.go
@@ -36,6 +36,12 @@ func NewCollector(config *config.Config, au aurora.Aurora, w io.Writer, url stri
c := colly.NewCollector()

switch config.Scope {
case "www":
c = colly.NewCollector(
colly.AllowedDomains(basehost, "www." + basehost),
colly.MaxDepth(config.Depth),
colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"),
)
case "strict":
c = colly.NewCollector(
colly.AllowedDomains(basehost),
@@ -270,6 +276,8 @@ func (c *Collector) recordIfInScope(tag aurora.Value, u string, msg string, reqs
var shouldPrint bool

switch c.conf.Scope {
case "www":
shouldPrint = msgHost == basehost || msgHost == "www." + basehost
case "strict":
shouldPrint = msgHost == basehost
case "subs":
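Both collector.go hunks implement the new scope the same way: the crawl (via `colly.AllowedDomains`) and the output filter (`recordIfInScope`) allow exactly two hosts, the bare domain and its `www.` subdomain. The check reduces to something like this (a simplified sketch, not the collector's full logic):

```go
package main

import "fmt"

// inWWWScope mirrors the "www" case: the base host itself or its www subdomain.
func inWWWScope(host, basehost string) bool {
	return host == basehost || host == "www."+basehost
}

func main() {
	for _, h := range []string{"example.com", "www.example.com", "sub.example.com", "another-example.com"} {
		fmt.Printf("%-22s in www scope: %v\n", h, inWWWScope(h, "example.com"))
	}
}
```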

1 comment on commit e39a514

@JeffreyShran

Could these changes be linked to this issue first raised around the same time? #78
