Skip to content

Commit

Permalink
update readme for installation
Browse files Browse the repository at this point in the history
  • Loading branch information
davemolk committed Sep 15, 2022
1 parent b44a792 commit 3dec4f9
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 17 deletions.
18 changes: 13 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Go Report Card](https://goreportcard.com/badge/github.com/davemolk/goGetJS)](https://goreportcard.com/report/github.com/davemolk/goGetJS)
[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/davemolk/goGetJS/issues)

goGetJS extracts, searches, and saves JavaScript files. Includes an optional chromium browser (via playwright) for dealing with JavaScript-heavy sites.
goGetJS extracts, searches, and saves JavaScript files. Includes an optional headless Chromium browser (via Playwright) for dealing with JavaScript-heavy sites.

![demo](demo.gif)

Expand All @@ -16,12 +16,12 @@ goGetJS extracts, searches, and saves JavaScript files. Includes an optional chr
* Use -term, -regex, and -terms, respectively, to scan each script for a specific word, with a regular expression, or with a list of words (input as a file).
* goGetJS does not follow redirects by default, but this can be toggled with -redirect=true.

## Example Usages
## Example Usage (use the browser and search each script for the list of terms in search.txt)
```
go run ./cmd/goGetJS -u=https://go.dev -b -terms=search.txt
go run ./cmd/goGetJS -u https://go.dev -b -terms search.txt
```
```
echo https://go.dev | goGetJS -b -terms=search.txt
echo https://go.dev | goGetJS -b -terms search.txt
```

## Command-line Options
Expand Down Expand Up @@ -51,9 +51,17 @@ Usage of goGetJS:
URL to extract JS files from.
```

## Installation
First, you'll need to [install Go](https://golang.org/doc/install).

Then run this command to download and compile goGetJS:
```
go install github.com/davemolk/goGetJS/cmd/goGetJS@latest
```

## Additional Notes
* goGetJS names JavaScript files with ```fName := regexp.MustCompile(`[\w-&]+(\.js)?$`)```. Most scripts play nice, but those that don't are still saved. Each saved script has the full URL prepended to the file.
* Occasionally, an src will link to an empty page. These are automatically retried and will sometimes get a script on that second attempt (which is searched and saved). Set a timeout for these retries with -rt. More often, these pages are legitimately blank, causing the number of saved files printed to the terminal to be fewer than the number of processed files.
* Occasionally, an src will link to an empty page. These requests are automatically retried (set a timeout for the retries with -rt). Typically, these pages are legitimately blank, so the number of saved files printed to the terminal can be lower than the number of processed files. Sometimes we're lucky, though, and a retry succeeds; that script is then searched and saved.

## Changelog
* **2022-08-26** : Add proxy, redirect, and rt flags. Refactor client creation. Improve error handling throughout.
Expand Down
4 changes: 2 additions & 2 deletions cmd/goGetJS/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ import (
"regexp"
)

// assertErrorToNilf is a simple helper function for error handling.
func (app *application) assertErrorToNilf(err error) {
// assertErrorToNil is a simple helper function for error handling.
func (app *application) assertErrorToNil(err error) {
if err != nil {
app.errorLog.Fatal(err)
}
Expand Down
18 changes: 9 additions & 9 deletions cmd/goGetJS/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,19 @@ func main() {

if app.config.url == "" {
err := app.getInput()
app.assertErrorToNilf(err)
app.assertErrorToNil(err)
}

baseURL, err := app.getBaseURL(cfg.url)
app.assertErrorToNilf(err)
app.assertErrorToNil(err)
app.baseURL = baseURL

err = os.Mkdir("data", 0755)
app.assertErrorToNilf(err)
app.assertErrorToNil(err)

if cfg.term != "" || cfg.terms != "" || cfg.regex != "" {
err := os.Mkdir("searchResults", 0755)
app.assertErrorToNilf(err)
app.assertErrorToNil(err)
}

app.client = app.makeClient(cfg.timeout, cfg.proxy, cfg.redirect)
Expand All @@ -96,21 +96,21 @@ func main() {
switch {
case cfg.useBrowser:
reader, err = app.browser(cfg.url, &cfg.browserTimeout, cfg.extraWait, app.client)
app.assertErrorToNilf(err)
app.assertErrorToNil(err)
default:
resp, err := app.makeRequest(cfg.url, app.client)
app.assertErrorToNilf(err)
app.assertErrorToNil(err)
defer resp.Body.Close()
reader = resp.Body
}

// parse for src, writing javascript files without src
srcs, anonCount, err := app.parseDoc(reader, cfg.url, app.query)
app.assertErrorToNilf(err)
app.assertErrorToNil(err)

// write src text file
err = app.writeFile(srcs, "scriptSRC.txt")
app.assertErrorToNilf(err)
app.assertErrorToNil(err)

// handling situations when src doesn't end with .js
fName := regexp.MustCompile(`[\w-&]+(\.js)?$`)
Expand Down Expand Up @@ -138,7 +138,7 @@ func main() {
// save search results (if applicable)
if cfg.term != "" || cfg.terms != "" || cfg.regex != "" {
err = app.writeSearchResults(app.searches.Searches)
app.assertErrorToNilf(err)
app.assertErrorToNil(err)
}

fmt.Println()
Expand Down
1 change: 0 additions & 1 deletion cmd/goGetJS/requests.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ func (app *application) quickRetry(url string, query interface{}, r *regexp.Rege
resp, err := app.makeRequest(url, app.retryClient)
if err != nil {
app.errorLog.Printf("retry request error for %v: %v\n", url, err)
resp.Body.Close()
return
}
defer resp.Body.Close()
Expand Down

0 comments on commit 3dec4f9

Please sign in to comment.