Skip to content

Latest commit

 

History

History
63 lines (48 loc) · 1.49 KB

1236.Web-Crawler.md

File metadata and controls

63 lines (48 loc) · 1.49 KB

1236. Web Crawler


题目地址

https://leetcode.com/problems/web-crawler/

题目描述

Given a url startUrl and an interface HtmlParser, implement a web crawler to crawl all links that are under the same hostname as startUrl. 

Return all urls obtained by your web crawler in any order.

Your crawler should:

Start from the page: startUrl
Call HtmlParser.getUrls(url) to get all urls from the webpage of the given url.
Do not crawl the same link twice.
Explore only the links that are under the same hostname as startUrl.

代码

Approach 1: BFS

/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * interface HtmlParser {
 *     public List<String> getUrls(String url) {}
 * }
 */
/**
 * Breadth-first web crawler restricted to the hostname of the start URL.
 */
class Solution {
  /**
   * Crawls every page reachable from {@code startUrl} whose hostname is exactly
   * the hostname of {@code startUrl}.
   *
   * @param startUrl the page the crawl starts from; it is always part of the result
   * @param htmlParser parser used to obtain the links found on a page
   * @return all visited URLs (including {@code startUrl}), in no particular order
   */
  public List<String> crawl(String startUrl, HtmlParser htmlParser) {
    Set<String> visited = new HashSet<>();
    Queue<String> queue = new LinkedList<>();
    String hostname = getHostname(startUrl);

    queue.offer(startUrl);
    visited.add(startUrl);

    while (!queue.isEmpty()) {
      String currentUrl = queue.poll();
      for (String url : htmlParser.getUrls(currentUrl)) {
        // Compare hostnames for equality. A substring test such as
        // url.contains(hostname) would also accept foreign hosts like
        // "http://news.yahoo.com.evil.org/" or "http://x.org/?r=news.yahoo.com".
        // Set.add returns false for an already-seen URL, so each link is queued once.
        if (hostname.equals(getHostname(url)) && visited.add(url)) {
          queue.offer(url);
        }
      }
    }

    return new ArrayList<>(visited);
  }

  /**
   * Extracts the hostname part of a URL: the text after {@code "://"} (or from the
   * start of the string when no scheme separator is present) up to the next
   * {@code '/'} or the end of the string.
   *
   * <p>Unlike {@code url.split("/")[2]}, this never throws for URLs that contain
   * fewer than two slashes, which matters because it is applied to every
   * discovered link and not only to the start URL.
   *
   * @param url the URL to inspect
   * @return the hostname portion of {@code url}
   */
  private String getHostname(String url) {
    int schemeEnd = url.indexOf("://");
    int hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
    int hostEnd = url.indexOf('/', hostStart);
    return hostEnd < 0 ? url.substring(hostStart) : url.substring(hostStart, hostEnd);
  }
}