Skip to content

Commit

Permalink
Merge pull request #33 from alphagov/add-positive-url-rules
Browse files Browse the repository at this point in the history
Add positive URL_RULES
  • Loading branch information
richardTowers authored Dec 2, 2024
2 parents cb23965 + c344839 commit b62e975
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 2 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@ Configuration is handled through environment variables as listed below:
- Example: `HEADERS=Rate-Limit-Token:ABC123,X-Header:X-Value`
- CONCURRENCY: Controls the number of concurrent requests, useful for controlling request rate.
- Example: `CONCURRENCY=10`
- URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should crawl. If set, only URLs matching these patterns will be crawled; all other URLs will be skipped.
  - Example: `URL_RULES=https://www.gov.uk/.*`
- DISALLOWED_URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should avoid.
- Example: `DISALLOWED_URL_RULES=/search/.*,/government/.*\.atom`
6 changes: 6 additions & 0 deletions internal/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ func NewClient(c *colly.Collector, redirectHandler func(*http.Request, []*http.R
func isRequestAllowed(c *colly.Collector, parsedURL *url.URL) bool {
u := []byte(parsedURL.String())

for _, r := range c.URLFilters {
if !r.Match(u) {
return false
}
}

for _, r := range c.DisallowedURLFilters {
if r.Match(u) {
return false
Expand Down
8 changes: 8 additions & 0 deletions internal/client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ func TestNewClient(t *testing.T) {
func TestIsRequestAllowedTableDriven(t *testing.T) {
tests := []struct {
name string
allowedURLs []*regexp.Regexp
disallowedURLs []*regexp.Regexp
allowedDomains []string
url string
Expand All @@ -60,6 +61,12 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
url: "http://example.com",
expectedAllowed: false,
},
{
name: "URL filter",
allowedURLs: []*regexp.Regexp{regexp.MustCompile("https://www.gov.uk")},
url: "http://example.com",
expectedAllowed: false,
},
{
name: "allowed domain",
allowedDomains: []string{"example.com"},
Expand All @@ -78,6 +85,7 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
c := colly.NewCollector()
c.DisallowedURLFilters = tt.disallowedURLs
c.URLFilters = tt.allowedURLs
c.AllowedDomains = tt.allowedDomains
parsedURL, _ := url.Parse(tt.url)
assert.Equal(t, tt.expectedAllowed, isRequestAllowed(c, parsedURL))
Expand Down
1 change: 1 addition & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ type Config struct {
UserAgent string `env:"USER_AGENT" envDefault:"govukbot"`
Headers map[string]string `env:"HEADERS"`
Concurrency int `env:"CONCURRENCY" envDefault:"10"`
URLFilters []*regexp.Regexp `env:"URL_RULES" envSeparator:","`
DisallowedURLFilters []*regexp.Regexp `env:"DISALLOWED_URL_RULES" envSeparator:","`
}

Expand Down
9 changes: 7 additions & 2 deletions internal/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ func TestNewConfig(t *testing.T) {
"USER_AGENT": "custom-agent",
"HEADERS": "Test-Header:Test-Value",
"CONCURRENCY": "20",
"DISALLOWED_URL_RULES": "rule1,rule2",
"URL_RULES": "rule1,rule2",
"DISALLOWED_URL_RULES": "rule3,rule4",
},
expected: &Config{
Site: "example.com",
Expand All @@ -39,10 +40,14 @@ func TestNewConfig(t *testing.T) {
"Test-Header": "Test-Value",
},
Concurrency: 20,
DisallowedURLFilters: []*regexp.Regexp{
URLFilters: []*regexp.Regexp{
regexp.MustCompile("rule1"),
regexp.MustCompile("rule2"),
},
DisallowedURLFilters: []*regexp.Regexp{
regexp.MustCompile("rule3"),
regexp.MustCompile("rule4"),
},
},
},
}
Expand Down
1 change: 1 addition & 0 deletions internal/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ func newCollector(cfg *config.Config) (*colly.Collector, error) {
c := colly.NewCollector(
colly.UserAgent(cfg.UserAgent),
colly.AllowedDomains(cfg.AllowedDomains...),
colly.URLFilters(cfg.URLFilters...),
colly.DisallowedURLFilters(cfg.DisallowedURLFilters...),
colly.Async(true),
)
Expand Down
7 changes: 7 additions & 0 deletions internal/crawler/crawler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ func TestNewCrawler(t *testing.T) {
cfg := &config.Config{
UserAgent: "custom-agent",
AllowedDomains: []string{"example.com"},
URLFilters: []*regexp.Regexp{
regexp.MustCompile(".*"),
},
DisallowedURLFilters: []*regexp.Regexp{
regexp.MustCompile(".*disallowed.*"),
},
Expand All @@ -196,6 +199,7 @@ func TestNewCrawler(t *testing.T) {
assert.IsType(t, &colly.Collector{}, cr.collector)
assert.Equal(t, "custom-agent", cr.collector.UserAgent)
assert.Equal(t, []string{"example.com"}, cr.collector.AllowedDomains)
assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*")}, cr.collector.URLFilters)
assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*disallowed.*")}, cr.collector.DisallowedURLFilters)
assert.Equal(t, true, cr.collector.Async)
}
Expand Down Expand Up @@ -284,6 +288,9 @@ func TestRun(t *testing.T) {
cfg := &config.Config{
Site: ts.URL + "/sitemap.xml",
AllowedDomains: []string{hostname},
URLFilters: []*regexp.Regexp{
regexp.MustCompile(".*"),
},
DisallowedURLFilters: []*regexp.Regexp{
regexp.MustCompile("/disallowed"),
},
Expand Down

0 comments on commit b62e975

Please sign in to comment.