Skip to content

Commit

Permalink
Add a path exclusion list to the Tsunami web fingerprinter's crawler.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 651771629
Change-Id: I36111e903fd10ae5488fc62abbeb65e50eb1d321
  • Loading branch information
maoning authored and copybara-github committed Jul 12, 2024
1 parent 2917af2 commit 3fae889
Show file tree
Hide file tree
Showing 7 changed files with 156 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@ public final class WebServiceFingerprinterConfigs {
private static final ImmutableList<String> DEFAULT_FILE_EXTENSION_EXCLUSIONS =
ImmutableList.of("application/zip", "application/gzip");

private final WebServiceFingerprinterCliOptions cliOptions;
private final WebServiceFingerprinterConfigProperties configProperties;
private static final ImmutableList<String> DEFAULT_PATH_EXCLUSIONS = ImmutableList.of();

final WebServiceFingerprinterCliOptions cliOptions;
final WebServiceFingerprinterConfigProperties configProperties;

@Inject
WebServiceFingerprinterConfigs(
Expand Down Expand Up @@ -95,8 +97,19 @@ public List<String> getContentTypeExclusions() {
}
}

/**
 * Returns the regexes describing crawl targets that should be excluded from crawling.
 *
 * <p>Resolution order: the CLI option when it is set, otherwise the config property when it is
 * set, otherwise the built-in default list.
 *
 * @return the effective exclusion list, never {@code null}
 */
public List<String> getPathExclusions() {
  // The CLI flag has the highest priority.
  if (cliOptions.pathExclusions != null) {
    return cliOptions.pathExclusions;
  }
  // Next comes the value loaded from the config properties.
  if (configProperties.pathExclusions != null) {
    return configProperties.pathExclusions;
  }
  // Nothing configured anywhere: fall back to the default.
  return DEFAULT_PATH_EXCLUSIONS;
}

/** CLI options for {@link WebServiceFingerprinter}. */
@Parameters(separators = "=")
static final class WebServiceFingerprinterCliOptions implements CliOption {
public static final class WebServiceFingerprinterCliOptions implements CliOption {

@Parameter(
names = "--web-service-fingerprinter-enforce-crawling-scope-check",
Expand Down Expand Up @@ -148,12 +161,18 @@ static final class WebServiceFingerprinterCliOptions implements CliOption {
+ "purpose.")
List<String> contentTypeExclusions;

// Regexes identifying crawl targets to skip. Read by getPathExclusions(), where a non-null
// value here takes precedence over the config property of the same name.
// NOTE(review): CrawlConfigUtils.isCrawlTargetInBlockList appears to search each regex against
// the whole target URL (Matcher.find), not only its path component — confirm that the flag
// description below still describes the intended behavior.
@Parameter(
names = "--web-service-fingerprinter-crawl-path-exclusions",
description = "A comma separated list of path regexes to exclude during crawling.")
List<String> pathExclusions;

@Override
public void validate() {}
}

/** Config properties for {@link WebServiceFingerprinter}. */
@ConfigProperties("plugins.google.fingerprinter.web")
static final class WebServiceFingerprinterConfigProperties {
public static final class WebServiceFingerprinterConfigProperties {

/**
* Configuration options for the {@code
Expand Down Expand Up @@ -186,5 +205,11 @@ static final class WebServiceFingerprinterConfigProperties {
* CLI flag's description for more details.
*/
List<String> contentTypeExclusions;

/**
 * Configuration option for the {@code --web-service-fingerprinter-crawl-path-exclusions}
 * CLI flag. See the CLI flag's description for more details.
 */
List<String> pathExclusions;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import com.google.tsunami.proto.CrawlConfig;
import com.google.tsunami.proto.CrawlTarget;
import java.util.List;
import java.util.regex.Pattern;

/** Static utility methods pertaining to {@link CrawlConfig} proto buffer. */
final class CrawlConfigUtils {
Expand All @@ -41,4 +43,13 @@ static boolean isCrawlTargetInScope(CrawlConfig crawlConfig, CrawlTarget crawlTa
return !crawlConfig.getShouldEnforceScopeCheck() || crawlConfig.getScopesList().stream()
.anyMatch(scope -> ScopeUtils.isInScope(scope, crawlTarget.getUrl()));
}

/**
 * Reports whether the URL of {@code crawlTarget} is matched by any regex in {@code
 * pathExclusions}.
 *
 * <p>Each entry is compiled as a regular expression and searched for anywhere in the target's
 * full URL (a partial match via {@code find()}), so a pattern has to carry its own anchors when
 * a stricter match is wanted.
 *
 * @param crawlTarget the candidate target whose URL is checked
 * @param pathExclusions regular expressions describing URLs that must not be crawled
 * @return {@code true} if at least one regex is found in the URL; {@code false} otherwise,
 *     including when the list is empty
 */
static boolean isCrawlTargetInBlockList(CrawlTarget crawlTarget, List<String> pathExclusions) {
  return pathExclusions.stream()
      .anyMatch(exclusion -> Pattern.compile(exclusion).matcher(crawlTarget.getUrl()).find());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import com.google.tsunami.common.net.http.HttpMethod;
import com.google.tsunami.common.net.http.HttpRequest;
import com.google.tsunami.common.net.http.HttpResponse;
import com.google.tsunami.plugins.fingerprinters.web.WebServiceFingerprinterConfigs;
import com.google.tsunami.proto.CrawlConfig;
import com.google.tsunami.proto.CrawlResult;
import com.google.tsunami.proto.CrawlTarget;
Expand Down Expand Up @@ -62,18 +63,21 @@ final class SimpleCrawlAction extends RecursiveAction {
private final CrawlConfig crawlConfig;
private final CrawlTarget crawlTarget;
private final SimpleCrawlerResults crawlerResults;
private final WebServiceFingerprinterConfigs configs;

SimpleCrawlAction(
int currentDepth,
HttpClient httpClient,
CrawlConfig crawlConfig,
CrawlTarget crawlTarget,
SimpleCrawlerResults crawlerResults) {
SimpleCrawlerResults crawlerResults,
WebServiceFingerprinterConfigs configs) {
this.currentDepth = currentDepth;
this.httpClient = checkNotNull(httpClient);
this.crawlConfig = checkNotNull(crawlConfig);
this.crawlTarget = checkNotNull(crawlTarget);
this.crawlerResults = checkNotNull(crawlerResults);
this.configs = checkNotNull(configs);
}

String getTargetUrl() {
Expand Down Expand Up @@ -153,6 +157,10 @@ private void spawnNewCrawlActions(HttpResponse httpResponse) {
.map(crawlTarget -> normalizeHost(crawlConfig, crawlTarget))
// Ignore out-of-scope URLs.
.filter(crawlTarget -> CrawlConfigUtils.isCrawlTargetInScope(crawlConfig, crawlTarget))
.filter(
crawlTarget ->
!CrawlConfigUtils.isCrawlTargetInBlockList(
crawlTarget, configs.getPathExclusions()))
.map(this::newCrawlAction)
.collect(toImmutableSet());
invokeAll(newCrawlActions);
Expand All @@ -166,6 +174,6 @@ private static boolean isValidCrawlTarget(CrawlTarget crawlTarget) {

private SimpleCrawlAction newCrawlAction(CrawlTarget newCrawlTarget) {
return new SimpleCrawlAction(
currentDepth + 1, httpClient, crawlConfig, newCrawlTarget, crawlerResults);
currentDepth + 1, httpClient, crawlConfig, newCrawlTarget, crawlerResults, configs);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.tsunami.common.net.http.HttpClient;
import com.google.tsunami.common.net.http.HttpMethod;
import com.google.tsunami.plugins.fingerprinters.web.WebServiceFingerprinterConfigs;
import com.google.tsunami.proto.CrawlConfig;
import com.google.tsunami.proto.CrawlResult;
import com.google.tsunami.proto.CrawlTarget;
Expand All @@ -45,15 +46,18 @@ public final class SimpleCrawler implements Crawler {
private final ForkJoinPool forkJoinPool;
private final ListeningExecutorService schedulingPool;
private final HttpClient httpClient;
private final WebServiceFingerprinterConfigs configs;

@Inject
SimpleCrawler(
@SimpleCrawlerWorkerPool ForkJoinPool forkJoinPool,
@SimpleCrawlerSchedulingPool ListeningExecutorService schedulingPool,
HttpClient httpClient) {
HttpClient httpClient,
WebServiceFingerprinterConfigs configs) {
this.forkJoinPool = checkNotNull(forkJoinPool);
this.schedulingPool = checkNotNull(schedulingPool);
this.httpClient = checkNotNull(httpClient).modify().setFollowRedirects(false).build();
this.configs = checkNotNull(configs);
}

@Override
Expand All @@ -76,7 +80,7 @@ private SimpleCrawlAction buildCrawlAction(
CrawlConfig crawlConfig, String url, SimpleCrawlerResults crawlerResults) {
CrawlTarget crawlTarget =
CrawlTarget.newBuilder().setHttpMethod(HttpMethod.GET.toString()).setUrl(url).build();
return new SimpleCrawlAction(0, httpClient, crawlConfig, crawlTarget, crawlerResults);
return new SimpleCrawlAction(0, httpClient, crawlConfig, crawlTarget, crawlerResults, configs);
}

private ListenableFuture<Void> startCrawlAction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,36 @@ public void contentTypeExclusions_whenBothCliAndConfigAreNotSet_returnsDefaultVa
.containsExactly("application/zip", "application/gzip")
.inOrder();
}

@Test
public void pathExclusions_whenCliOptionSet_returnsCliOptionSetting() {
  // Only the CLI option is populated, so its value must be returned unchanged.
  ImmutableList<String> cliExclusions = ImmutableList.of(".*/logout$", ".*/dangerous$");
  cliOptions.pathExclusions = cliExclusions;

  assertThat(configs.getPathExclusions()).containsExactly(".*/logout$", ".*/dangerous$").inOrder();
}

@Test
public void pathExclusions_whenConfigPropertySet_returnsConfigPropertySetting() {
  // Only the config property is populated, so it is the value that must be returned.
  ImmutableList<String> configExclusions = ImmutableList.of(".*/logout$", ".*/dangerous$");
  configProperties.pathExclusions = configExclusions;

  assertThat(configs.getPathExclusions()).containsExactly(".*/logout$", ".*/dangerous$").inOrder();
}

@Test
public void pathExclusions_whenBothCliAndConfigAreSet_cliOptionTakesPrecedence() {
  // The two sources hold different values so the assertion can tell which one won.
  ImmutableList<String> cliExclusions = ImmutableList.of(".*/logout$", ".*/dangerous$");
  ImmutableList<String> configExclusions = ImmutableList.of(".*/login$", ".*/safe$");
  cliOptions.pathExclusions = cliExclusions;
  configProperties.pathExclusions = configExclusions;

  assertThat(configs.getPathExclusions()).containsExactly(".*/logout$", ".*/dangerous$").inOrder();
}

@Test
public void pathExclusions_whenBothCliAndConfigAreNotSet_returnsDefaultValue() {
  // Clear both sources explicitly so that only the built-in default can be returned.
  configProperties.pathExclusions = null;
  cliOptions.pathExclusions = null;

  assertThat(configs.getPathExclusions()).isEmpty();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.extensions.proto.ProtoTruth.assertThat;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import com.google.common.collect.ImmutableList;
import com.google.tsunami.proto.CrawlConfig;
import com.google.tsunami.proto.CrawlConfig.Scope;
import com.google.tsunami.proto.CrawlTarget;
Expand Down Expand Up @@ -71,9 +74,11 @@ public void isCrawlTargetInScope_whenScopeEnforcementDisabled_alwaysReturnsTrue(
CrawlTarget.newBuilder().setUrl("http://localhost:8080/in-scope/index.html").build()))
.isTrue();
assertThat(
CrawlConfigUtils.isCrawlTargetInScope(
crawlConfig,
CrawlTarget.newBuilder().setUrl("http://localhost:8080/not-in-scope/index.html").build()))
CrawlConfigUtils.isCrawlTargetInScope(
crawlConfig,
CrawlTarget.newBuilder()
.setUrl("http://localhost:8080/not-in-scope/index.html")
.build()))
.isTrue();
}

Expand Down Expand Up @@ -124,4 +129,20 @@ public void isCrawlTargetInScope_whenEnforcingScopeCheckAndTargetNotInScope_retu
.build()))
.isFalse();
}

@Test
public void isCrawlTargetInBlockList_inBlockList_returnsTrue() {
  // The target URL ends in "/logout", which the second regex matches.
  ImmutableList<String> blockList = ImmutableList.of(".*/quit$", ".*/logout$");
  CrawlTarget target = CrawlTarget.newBuilder().setUrl("http://127.0.0.1/logout").build();

  // Truth assertion, consistent with the other tests in this class.
  assertThat(CrawlConfigUtils.isCrawlTargetInBlockList(target, blockList)).isTrue();
}

@Test
public void isCrawlTargetInBlockList_notInBlockList_returnsFalse() {
  // The target URL ends in "/login", which neither regex matches.
  ImmutableList<String> blockList = ImmutableList.of(".*/quit$", ".*/logout$");
  CrawlTarget target = CrawlTarget.newBuilder().setUrl("http://127.0.0.1/login").build();

  // Truth assertion, consistent with the other tests in this class.
  assertThat(CrawlConfigUtils.isCrawlTargetInBlockList(target, blockList)).isFalse();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,17 @@
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.common.io.Resources;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.tsunami.common.net.http.HttpClient;
import com.google.tsunami.common.net.http.HttpClientModule;
import com.google.tsunami.plugins.fingerprinters.web.WebServiceFingerprinterConfigs;
import com.google.tsunami.plugins.fingerprinters.web.WebServiceFingerprinterConfigs.WebServiceFingerprinterCliOptions;
import com.google.tsunami.plugins.fingerprinters.web.WebServiceFingerprinterConfigs.WebServiceFingerprinterConfigProperties;
import com.google.tsunami.proto.CrawlTarget;
import java.io.IOException;
import java.util.concurrent.ForkJoinPool;
import javax.inject.Inject;
import okhttp3.mockwebserver.MockWebServer;
import org.junit.After;
import org.junit.Before;
Expand All @@ -41,11 +46,28 @@ public final class SimpleCrawlActionTest {
private SimpleCrawlerResults crawlerResults;
private MockWebServer mockWebServer;
private TestDataBuilder dataBuilder;
private WebServiceFingerprinterCliOptions cliOptions;
private WebServiceFingerprinterConfigProperties configProperties;

@Inject WebServiceFingerprinterConfigs configs;

@Before
public void setUp() {
cliOptions = new WebServiceFingerprinterCliOptions();
configProperties = new WebServiceFingerprinterConfigProperties();
Guice.createInjector(
new AbstractModule() {
@Override
protected void configure() {
bind(WebServiceFingerprinterCliOptions.class).toInstance(cliOptions);
bind(WebServiceFingerprinterConfigProperties.class)
.toInstance(configProperties);
}
}
).injectMembers(this);
httpClient =
Guice.createInjector(new HttpClientModule.Builder().build())
Guice.createInjector(
new HttpClientModule.Builder().build())
.getInstance(HttpClient.class)
.modify()
.setFollowRedirects(false)
Expand All @@ -64,11 +86,13 @@ public void tearDown() throws IOException {
public void getTargetUrl_always_returnsUrlFromCrawlTarget() {
assertThat(
new SimpleCrawlAction(
0,
httpClient,
dataBuilder.buildCrawlConfig(),
dataBuilder.buildCrawlTargetForSeedPath("/path"),
crawlerResults).getTargetUrl())
0,
httpClient,
dataBuilder.buildCrawlConfig(),
dataBuilder.buildCrawlTargetForSeedPath("/path"),
crawlerResults,
configs)
.getTargetUrl())
.isEqualTo(mockWebServer.url("/path").toString());
}

Expand All @@ -83,7 +107,8 @@ public void compute_whenUrlAlreadyVisited_doesNotCrawlSameTarget() {
httpClient,
dataBuilder.buildCrawlConfig(),
CrawlTarget.getDefaultInstance(),
crawlerResults));
crawlerResults,
configs));

assertThat(mockWebServer.getRequestCount()).isEqualTo(0);
}
Expand All @@ -99,7 +124,8 @@ public void compute_whenTargetUrlIsInvalid_ignoresCrawlTarget() {
dataBuilder.buildCrawlTargetForSeedPath("").toBuilder()
.setUrl("invalid-url")
.build(),
crawlerResults));
crawlerResults,
configs));

assertThat(mockWebServer.getRequestCount()).isEqualTo(0);
assertThat(crawlerResults.getFinalResults()).isEmpty();
Expand All @@ -114,7 +140,8 @@ public void compute_whenHttpRequestError_ignoresCrawlTarget() {
httpClient,
dataBuilder.buildCrawlConfig(),
dataBuilder.buildCrawlTargetForSeedPath("/timeout").toBuilder().build(),
crawlerResults));
crawlerResults,
configs));

assertThat(mockWebServer.getRequestCount()).isEqualTo(1);
assertThat(crawlerResults.getFinalResults()).isEmpty();
Expand All @@ -134,7 +161,8 @@ public void compute_whenSeedingUrlRedirects_followsRedirect() throws IOException
httpClient,
dataBuilder.buildCrawlConfig(),
dataBuilder.buildCrawlTargetForSeedPath("/redirect"),
crawlerResults));
crawlerResults,
configs));

assertThat(crawlerResults.getFinalResults())
.containsExactly(
Expand Down Expand Up @@ -163,7 +191,8 @@ public void compute_whenExceedsMaxDepth_stopsCrawlingAtMaxDepth() throws IOExcep
httpClient,
dataBuilder.buildCrawlConfig().toBuilder().setMaxDepth(1).build(),
dataBuilder.buildCrawlTargetForSeedPath("/redirect"),
crawlerResults));
crawlerResults,
configs));

assertThat(crawlerResults.getFinalResults())
.containsExactly(
Expand All @@ -186,7 +215,8 @@ public void compute_whenHtmlPageContainsOutOfScopeLink_ignoresOutOfScopeLink()
httpClient,
dataBuilder.buildCrawlConfig(),
dataBuilder.buildCrawlTargetForSeedPath("/"),
crawlerResults));
crawlerResults,
configs));

assertThat(crawlerResults.getFinalResults())
.containsExactly(dataBuilder.buildCrawlResult(0, "/", body));
Expand All @@ -209,7 +239,8 @@ private void assetCrawlResults(String testdataResourceName) throws Exception {
httpClient,
dataBuilder.buildCrawlConfig(),
dataBuilder.buildCrawlTargetForSeedPath("/"),
crawlerResults));
crawlerResults,
configs));

assertThat(crawlerResults.getFinalResults())
.containsExactly(
Expand Down

0 comments on commit 3fae889

Please sign in to comment.