From 9e8dc92a41d8793f3d7904808646d4c9cf3e4b85 Mon Sep 17 00:00:00 2001 From: Mikkel Denker Date: Tue, 3 Dec 2024 14:57:54 +0100 Subject: [PATCH] Improve architecture documentation (#243) * cleanup assets * update crawler docs * update search index docs * update webgraph docs --- .../add_browser/chrome_search_1.png | Bin .../add_browser/chrome_search_2.png | Bin .../add_browser/chrome_search_3.png | Bin .../add_browser/chrome_search_4.png | Bin .../add_browser/chrome_search_5.png | Bin .../add_browser/edge_search_1.png | Bin .../add_browser/edge_search_2.png | Bin .../add_browser/edge_search_3.png | Bin .../add_browser/edge_search_4.png | Bin .../add_browser/firefox_search_1.png | Bin .../add_browser/firefox_search_3.png | Bin .../add_browser/safari_search_1.png | Bin .../add_browser/safari_search_2.png | Bin .../add_browser/safari_search_3.png | Bin docs/{src => }/add_to_browser.md | 29 +++---- docs/api/README.md | 42 +--------- docs/architecture/crawler.md | 22 +++++ docs/{src => architecture}/optics.md | 0 docs/{src => architecture}/overview.md | 10 +-- docs/{src => architecture}/search_index.md | 2 +- docs/{src => architecture}/webgraph.md | 9 +- docs/mkdocs.yml | 37 --------- docs/src/assets/images/biglogo.svg | 10 --- docs/src/assets/images/crawler_overview.svg | 78 ------------------ docs/src/assets/images/favicon.ico | Bin 4286 -> 0 bytes docs/src/assets/js/katex.js | 12 --- docs/src/assets/styles/main.css | 18 ---- docs/src/crawler.md | 29 ------- docs/src/index.md | 12 --- 29 files changed, 46 insertions(+), 264 deletions(-) rename {docs/src/assets/images => assets}/add_browser/chrome_search_1.png (100%) rename {docs/src/assets/images => assets}/add_browser/chrome_search_2.png (100%) rename {docs/src/assets/images => assets}/add_browser/chrome_search_3.png (100%) rename {docs/src/assets/images => assets}/add_browser/chrome_search_4.png (100%) rename {docs/src/assets/images => assets}/add_browser/chrome_search_5.png (100%) rename {docs/src/assets/images => assets}/add_browser/edge_search_1.png (100%) rename {docs/src/assets/images => assets}/add_browser/edge_search_2.png (100%) rename {docs/src/assets/images => assets}/add_browser/edge_search_3.png (100%) rename {docs/src/assets/images => assets}/add_browser/edge_search_4.png (100%) rename {docs/src/assets/images => assets}/add_browser/firefox_search_1.png (100%) rename {docs/src/assets/images => assets}/add_browser/firefox_search_3.png (100%) rename {docs/src/assets/images => assets}/add_browser/safari_search_1.png (100%) rename {docs/src/assets/images => assets}/add_browser/safari_search_2.png (100%) rename {docs/src/assets/images => assets}/add_browser/safari_search_3.png (100%) rename docs/{src => }/add_to_browser.md (58%) create mode 100644 docs/architecture/crawler.md rename docs/{src => architecture}/optics.md (100%) rename docs/{src => architecture}/overview.md (56%) rename docs/{src => architecture}/search_index.md (96%) rename docs/{src => architecture}/webgraph.md (69%) delete mode 100644 docs/mkdocs.yml delete mode 100644 docs/src/assets/images/biglogo.svg delete mode 100644 docs/src/assets/images/crawler_overview.svg delete mode 100644 docs/src/assets/images/favicon.ico delete mode 100644 docs/src/assets/js/katex.js delete mode 100644 docs/src/assets/styles/main.css delete mode 100644 docs/src/crawler.md delete mode 100644 docs/src/index.md diff --git a/docs/src/assets/images/add_browser/chrome_search_1.png b/assets/add_browser/chrome_search_1.png similarity index 100% rename from 
docs/src/assets/images/add_browser/chrome_search_1.png rename to assets/add_browser/chrome_search_1.png diff --git a/docs/src/assets/images/add_browser/chrome_search_2.png b/assets/add_browser/chrome_search_2.png similarity index 100% rename from docs/src/assets/images/add_browser/chrome_search_2.png rename to assets/add_browser/chrome_search_2.png diff --git a/docs/src/assets/images/add_browser/chrome_search_3.png b/assets/add_browser/chrome_search_3.png similarity index 100% rename from docs/src/assets/images/add_browser/chrome_search_3.png rename to assets/add_browser/chrome_search_3.png diff --git a/docs/src/assets/images/add_browser/chrome_search_4.png b/assets/add_browser/chrome_search_4.png similarity index 100% rename from docs/src/assets/images/add_browser/chrome_search_4.png rename to assets/add_browser/chrome_search_4.png diff --git a/docs/src/assets/images/add_browser/chrome_search_5.png b/assets/add_browser/chrome_search_5.png similarity index 100% rename from docs/src/assets/images/add_browser/chrome_search_5.png rename to assets/add_browser/chrome_search_5.png diff --git a/docs/src/assets/images/add_browser/edge_search_1.png b/assets/add_browser/edge_search_1.png similarity index 100% rename from docs/src/assets/images/add_browser/edge_search_1.png rename to assets/add_browser/edge_search_1.png diff --git a/docs/src/assets/images/add_browser/edge_search_2.png b/assets/add_browser/edge_search_2.png similarity index 100% rename from docs/src/assets/images/add_browser/edge_search_2.png rename to assets/add_browser/edge_search_2.png diff --git a/docs/src/assets/images/add_browser/edge_search_3.png b/assets/add_browser/edge_search_3.png similarity index 100% rename from docs/src/assets/images/add_browser/edge_search_3.png rename to assets/add_browser/edge_search_3.png diff --git a/docs/src/assets/images/add_browser/edge_search_4.png b/assets/add_browser/edge_search_4.png similarity index 100% rename from docs/src/assets/images/add_browser/edge_search_4.png rename to assets/add_browser/edge_search_4.png diff --git a/docs/src/assets/images/add_browser/firefox_search_1.png b/assets/add_browser/firefox_search_1.png similarity index 100% rename from docs/src/assets/images/add_browser/firefox_search_1.png rename to assets/add_browser/firefox_search_1.png diff --git a/docs/src/assets/images/add_browser/firefox_search_3.png b/assets/add_browser/firefox_search_3.png similarity index 100% rename from docs/src/assets/images/add_browser/firefox_search_3.png rename to assets/add_browser/firefox_search_3.png diff --git a/docs/src/assets/images/add_browser/safari_search_1.png b/assets/add_browser/safari_search_1.png similarity index 100% rename from docs/src/assets/images/add_browser/safari_search_1.png rename to assets/add_browser/safari_search_1.png diff --git a/docs/src/assets/images/add_browser/safari_search_2.png b/assets/add_browser/safari_search_2.png similarity index 100% rename from docs/src/assets/images/add_browser/safari_search_2.png rename to assets/add_browser/safari_search_2.png diff --git a/docs/src/assets/images/add_browser/safari_search_3.png b/assets/add_browser/safari_search_3.png similarity index 100% rename from docs/src/assets/images/add_browser/safari_search_3.png rename to assets/add_browser/safari_search_3.png diff --git a/docs/src/add_to_browser.md b/docs/add_to_browser.md similarity index 58% rename from docs/src/add_to_browser.md rename to docs/add_to_browser.md index 4f12fd6b..ecc8e9e2 100644 --- a/docs/src/add_to_browser.md +++ b/docs/add_to_browser.md @@ -7,16 
+7,16 @@ to your browser by following the instructions listed for your respective browser 1. Navigate to [stract.com](https://stract.com) 2. Navigate to settings -![chrome settings button](assets/images/add_browser/chrome_search_1.png) +![chrome settings button](../assets/add_browser/chrome_search_1.png) 3. Select `Search engine` -![chrome search engine button](assets/images/add_browser/chrome_search_2.png) +![chrome search engine button](../assets/add_browser/chrome_search_2.png) 4. Click `Manage search engines and site search` -![chrome manage search engines](assets/images/add_browser/chrome_search_3.png) +![chrome manage search engines](../assets/add_browser/chrome_search_3.png) 5. Scroll down to `Inactive shortcuts` -![chrome inactive shortcuts](assets/images/add_browser/chrome_search_4.png) +![chrome inactive shortcuts](../assets/add_browser/chrome_search_4.png) 6. Select the peapod menu 7. Select `Make Default` -![chrome make default](assets/images/add_browser/chrome_search_5.png) +![chrome make default](../assets/add_browser/chrome_search_5.png) ## Firefox @@ -24,24 +24,24 @@ to your browser by following the instructions listed for your respective browser 1. Navigate to [stract.com](https://stract.com) 2. Right click the bar. 3. Select `Add "Stract Search"` -![Add Stract to Search button](assets/images/add_browser/firefox_search_1.png) +![Add Stract to Search button](../assets/add_browser/firefox_search_1.png) 4. Navigate to Settings 5. Select Search 6. Use the `Default Search Engine` dropdown to select Stract -![Make Stract default search](assets/images/add_browser/firefox_search_3.png) +![Make Stract default search](../assets/add_browser/firefox_search_3.png) ## Microsoft Edge 1. Navigate to [stract.com](https://stract.com) 2. Navigate to `Settings` -![edge settings button](assets/images/add_browser/edge_search_1.png) +![edge settings button](../assets/add_browser/edge_search_1.png) 3. Select `Privacy, search, and services` -![edge privacy, search, and services button](assets/images/add_browser/edge_search_2.png) +![edge privacy, search, and services button](../assets/add_browser/edge_search_2.png) 4. Scroll down to `Services` -![edge Services section](assets/images/add_browser/edge_search_3.png) +![edge Services section](../assets/add_browser/edge_search_3.png) 5. Select `Address Search Bar` 6. Select `Manage search engines` 7. Click the menu next to Stract and select `Make default` -![edge make default button](assets/images/add_browser/edge_search_4.png) +![edge make default button](../assets/add_browser/edge_search_4.png) ## Safari @@ -52,12 +52,13 @@ as a site search option. What follows describes that process. 1. Navigate to [stract.com](https://stract.com) 2. Open Preferences 3. Navigate to the `Search` panel -![safari search settings](assets/images/add_browser/safari_search_1.png) +![safari search settings](../assets/add_browser/safari_search_1.png) 4. Select `Manage Websites...` 5. Select stract.com from the options -![safari website options](assets/images/add_browser/safari_search_2.png) +![safari website options](../assets/add_browser/safari_search_2.png) From here stract.com should appear in the search bar and you can arrow down to it and begin typing. 
-![safari stract searching](assets/images/add_browser/safari_search_3.png) \ No newline at end of file +![safari stract searching](../assets/add_browser/safari_search_3.png) + \ No newline at end of file diff --git a/docs/api/README.md b/docs/api/README.md index 0c6c2c27..392de04a 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -1,41 +1,3 @@ -# Website +# API Docs -This website is built using [Docusaurus](https://docusaurus.io/), a modern static website generator. - -### Installation - -``` -$ yarn -``` - -### Local Development - -``` -$ yarn start -``` - -This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server. - -### Build - -``` -$ yarn build -``` - -This command generates static content into the `build` directory and can be served using any static contents hosting service. - -### Deployment - -Using SSH: - -``` -$ USE_SSH=true yarn deploy -``` - -Not using SSH: - -``` -$ GIT_USER= yarn deploy -``` - -If you are using GitHub pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch. +This website is built using [Docusaurus](https://docusaurus.io/), a modern static website generator. \ No newline at end of file diff --git a/docs/architecture/crawler.md b/docs/architecture/crawler.md new file mode 100644 index 00000000..bc8cdcd2 --- /dev/null +++ b/docs/architecture/crawler.md @@ -0,0 +1,22 @@ +# Crawler +[Information for webmasters here](https://stract.com/webmasters) + +The crawler is a distributed system that scours the web. It has a coordinator process that determines which URLs to crawl and a set of worker processes that fetch the content of those URLs. Each worker receives a batch of crawl jobs to process, stores the fetched contents in an S3 bucket and then retrieves a new batch of jobs. This continues until the coordinator has determined that the crawl is complete. + +Each crawl job contains a site, a crawl budget and a list of known high-authority URLs for that site. The crawl budget determines how many pages to fetch from the site. Each site is only crawled by a single worker at a time so that we don't overload the website. + +## Coordinator +The coordinator is responsible for planning and orchestrating the crawl process. It analyzes data from previous crawls to determine an appropriate crawl budget for each website. This budget helps ensure fair resource allocation and prevents overloading any single site. + +Based on this analysis, the coordinator creates a crawl plan in the form of a queue of jobs to be processed. This allows for efficient distribution to worker nodes while ensuring the coordinator does not become a bottleneck. + +### Respectfulness +It is of utmost importance that we are respectful of the websites we crawl. We do not want to overload a website with requests, and we do not want to crawl pages that the website owner does not want us to crawl. + +To ensure this, jobs are organized by site so that each site is only included in a single job. When a site is scheduled to a worker, it is that worker's responsibility to respect the `robots.txt` file of the domain and to not overload the domain with requests. For more details, see the [webmasters](https://stract.com/webmasters) documentation. + +## Worker +The worker is responsible for crawling the sites scheduled by the coordinator. It is completely stateless and stores the fetched data directly in an S3 bucket. It recursively discovers new URLs on the assigned site and crawls them until the crawl budget is exhausted. + +When a worker is tasked with crawling a new site, it first checks the site's `robots.txt` file to see which URLs (if any) it is allowed to crawl. +If the worker receives a `429 Too Many Requests` response from the site, it backs off for a while before trying again. The specific backoff time depends on how fast the server responds.
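To make the worker loop in the new `crawler.md` concrete, here is a minimal sketch in Rust. The `CrawlJob`, `Fetcher` and `FetchOutcome` names are hypothetical stand-ins inferred from the description above, not the actual types in the Stract codebase:

```rust
use std::collections::VecDeque;
use std::time::Duration;

/// One unit of work handed to a worker (hypothetical shape).
struct CrawlJob {
    site: String,           // e.g. "example.com"
    budget: usize,          // maximum number of pages to fetch for this site
    seed_urls: Vec<String>, // known high-authority URLs for the site
}

/// Minimal view of the outcome of fetching one URL.
enum FetchOutcome {
    Page { body: String, discovered_urls: Vec<String> },
    TooManyRequests, // HTTP 429: back off and retry later
    Skip,            // fetch failed or was otherwise not useful
}

/// Placeholder for the HTTP client, robots.txt cache and S3 uploader.
trait Fetcher {
    fn allowed_by_robots(&self, url: &str) -> bool;
    fn fetch(&mut self, url: &str) -> FetchOutcome;
    fn store_in_s3(&mut self, url: &str, body: &str);
    fn backoff(&mut self, wait: Duration);
}

/// Crawl a single site until its budget is exhausted or no URLs remain.
fn crawl_site<F: Fetcher>(fetcher: &mut F, job: CrawlJob) {
    let mut frontier: VecDeque<String> = job.seed_urls.into();
    let mut fetched = 0;

    while fetched < job.budget {
        let Some(url) = frontier.pop_front() else { break };

        // Respect robots.txt before every request.
        if !fetcher.allowed_by_robots(&url) {
            continue;
        }

        match fetcher.fetch(&url) {
            FetchOutcome::Page { body, discovered_urls } => {
                fetcher.store_in_s3(&url, &body);
                fetched += 1;
                // Only follow links that stay on the assigned site.
                frontier.extend(
                    discovered_urls
                        .into_iter()
                        .filter(|u| u.contains(job.site.as_str())),
                );
            }
            FetchOutcome::TooManyRequests => {
                // Back off and put the URL back at the end of the queue.
                fetcher.backoff(Duration::from_secs(30));
                frontier.push_back(url);
            }
            FetchOutcome::Skip => {}
        }
    }
}
```

The properties that matter are the ones stated in the hunk above: one site per job, a hard page budget, a `robots.txt` check before every fetch, and a backoff whenever the site answers `429 Too Many Requests`.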
diff --git a/docs/src/optics.md b/docs/architecture/optics.md similarity index 100% rename from docs/src/optics.md rename to docs/architecture/optics.md diff --git a/docs/src/overview.md b/docs/architecture/overview.md similarity index 56% rename from docs/src/overview.md rename to docs/architecture/overview.md index b9390e2e..e7e5ab0f 100644 --- a/docs/src/overview.md +++ b/docs/architecture/overview.md @@ -1,11 +1,11 @@ # Overview -Stract (and most other web search engines) is composed of three main components: the crawler, the webgraph and the search index. +Stract (and most other web search engines) is composed of three main components: the crawler, the web graph and the search index. ## Crawler -The crawler, often also referred to as a spider or bot, is the component responsible for collecting and scanning websites across the internet. It begins with a seed list of URLs, which it visits to fetch web pages. The crawler then parses these pages to extract additional URLs, which are then added to the list of URLs to be crawled in the future. This process repeats in a cycle, allowing the crawler to discover new web pages or updates to existing pages continuously. The content fetched by the crawler is passed on to the next components of the search engine: the webgraph and the search index. +The crawler, often referred to as a spider or bot, is the component responsible for collecting and scanning websites across the internet. It begins with a seed list of URLs, which it visits to fetch web pages. The crawler then parses these pages to extract additional URLs, which are added to the list of URLs to be crawled in the future. This process repeats in a cycle, allowing the crawler to continuously discover new web pages and updates to existing pages. The content fetched by the crawler is passed on to the next components of the search engine: the web graph and the search index. -## Webgraph -The webgraph is a data structure that represents the relationships between different web pages. Each node in the webgraph represents a unique web page, and each edge represents a hyperlink from one page to another. The webgraph helps the search engine understand the structure of the web and the authority of different web pages. Authority is determined by factors such as the number of other pages linking to a given page (also known as "backlinks"), which is an important factor in ranking search results. This concept is often referred to as "link analysis." +## Web graph +The web graph is a data structure that represents the relationships between different web pages. Each node in the web graph represents a unique web page, and each edge represents a hyperlink from one page to another. The web graph helps the search engine understand the structure of the web and the authority of different web pages. Stract uses [harmonic centrality](webgraph.md#harmonic-centrality) to determine the authority of a web page.
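For reference, harmonic centrality has a compact standard definition, shown below; whether Stract applies any normalization, or computes it on the page-level or host-level graph, is not specified in this hunk:

```latex
% Harmonic centrality of a node v in a graph G = (V, E).
% d(u, v) is the shortest-path distance from u to v; a term counts as 0 when v is unreachable from u.
C_H(v) = \sum_{u \neq v} \frac{1}{d(u, v)}
```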
## Search Index -The search index is the component that facilitates fast and accurate search results. It is akin to the index at the back of a book, providing a direct mapping from words or phrases to the web pages in which they appear. This data structure is often referred to as an "inverted index". The search index is designed to handle complex search queries and return relevant results in a fraction of a second. The index uses the information gathered by the crawler and the structure of the webgraph to rank search results according to their relevance. +The search index is the component that facilitates fast and accurate search results. It is akin to the index at the back of a book, providing a direct mapping from words or phrases to the web pages in which they appear. This data structure is often referred to as an "inverted index". The search index is designed to handle complex search queries and return relevant results in a fraction of a second. The index uses the information gathered by the crawler and the structure of the web graph to rank search results according to their relevance. diff --git a/docs/src/search_index.md b/docs/architecture/search_index.md similarity index 96% rename from docs/src/search_index.md rename to docs/architecture/search_index.md index 94d39a37..63029eff 100644 --- a/docs/src/search_index.md +++ b/docs/architecture/search_index.md @@ -40,4 +40,4 @@ The ranking happens in multiple stages. Some of these stages occur at the shard 3. If a lambdamart model has been defined, the best results from the linear regression stage gets passed into the lambdamart model. - Combining results from all shards 1. Results from each shard are re-ranked using both the linear regression and lambdamart models. This ensures the scores can be properly compared and ordered. - 2. The best 20 results, corresponding to the first page, gets scored with a cross encoder and again ranked using the linear regression followed by the lambdamart model. + 2. Multiple ranking stages are applied in the ranking [pipeline](https://github.com/StractOrg/stract/tree/main/crates/core/src/ranking/pipeline) until the top 20 results, corresponding to the first page, are found. \ No newline at end of file
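The staged ranking described in this hunk can be pictured as a chain of increasingly expensive scorers, each re-ranking a shrinking candidate set until only the final page of results remains. The sketch below is illustrative only: `Candidate`, `Stage` and the placeholder scorers are hypothetical and do not correspond to the actual types in `crates/core/src/ranking/pipeline`.

```rust
/// A candidate search result with its current score (hypothetical type).
struct Candidate {
    doc_id: u64,
    score: f64,
}

/// One stage of the pipeline: re-scores all candidates, then keeps the best `keep_top`.
/// Later, more expensive stages therefore see far fewer candidates.
struct Stage {
    name: &'static str,
    keep_top: usize,
    scorer: fn(&Candidate) -> f64,
}

/// Run the candidates through every stage in order and return the survivors.
fn run_pipeline(mut candidates: Vec<Candidate>, stages: &[Stage]) -> Vec<Candidate> {
    for stage in stages {
        for c in candidates.iter_mut() {
            c.score = (stage.scorer)(c);
        }
        // Highest score first, then truncate to what the next stage should see.
        candidates.sort_by(|a, b| b.score.total_cmp(&a.score));
        candidates.truncate(stage.keep_top);
        println!("after {}: {} candidates left", stage.name, candidates.len());
    }
    candidates
}

fn main() {
    let candidates: Vec<Candidate> = (0..10_000u64)
        .map(|doc_id| Candidate { doc_id, score: 0.0 })
        .collect();

    // Placeholder scorers standing in for e.g. a cheap recall stage, a linear model
    // and a lambdamart model; the real scorers are learned models, not formulas.
    let stages = [
        Stage { name: "recall", keep_top: 1_000, scorer: |c: &Candidate| 1.0 / (c.doc_id as f64 + 1.0) },
        Stage { name: "linear", keep_top: 100, scorer: |c: &Candidate| (c.doc_id % 97) as f64 },
        Stage { name: "lambdamart", keep_top: 20, scorer: |c: &Candidate| (c.doc_id % 13) as f64 },
    ];

    let top = run_pipeline(candidates, &stages);
    assert_eq!(top.len(), 20); // the top 20 results correspond to the first page
}
```

As the surrounding hunk notes, the real pipeline also merges and re-ranks candidates coming from all shards before the final stages, so that scores remain comparable across shards.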
diff --git a/docs/src/webgraph.md b/docs/architecture/webgraph.md similarity index 69% rename from docs/src/webgraph.md rename to docs/architecture/webgraph.md index 44218dc0..ca4e38b8 100644 --- a/docs/src/webgraph.md +++ b/docs/architecture/webgraph.md @@ -1,14 +1,7 @@ # Webgraph The webgraph, often conceptualized as the "internet's map," provides a structured view of the interconnectedness of pages across the World Wide Web. With billions of pages linked together, the webgraph is a crucial tool for understanding the structure, pattern, and dynamics of the internet. -There are two primary ways of constructing the webgraph: - -- **Page-Level Webgraph**: This method involves constructing the graph by analyzing individual pages and their outbound links. The nodes in this graph represent individual web pages, while the edges represent hyperlinks between them. This detailed view is especially helpful for understanding specific page connections. - -- **Host-Level Webgraph**: Instead of examining individual pages, this approach consolidates all the links associated with a particular host, effectively simplifying the webgraph. In this representation, nodes represent entire websites or hosts, and edges represent connections between them. This broader perspective is suitable for understanding the authority and influence of entire websites. - -## Segments -Given the extreme size of the internet, managing the webgraph as a single monolithic structure in memory is neither efficient nor practical. Thus, it's segmented into smaller parts called segments. Each segment is essentially a portion of the overall webgraph stored in a [RocksDB](https://rocksdb.org/) database on disk. This allows us to create webgraphs that are much larger than what we would otherwise be able to fit in memory. +The webgraph is stored in a tantivy index on disk, where each document represents an edge (hyperlink) between two web pages. Each document contains metadata about the link, such as the source URL, destination URL and anchor text. ## Webgraph Uses The structure of the web can provide highly valuable information when detemining the relevance of a page to a user's search query. PageRank, which is a centrality meassure developed by Larry Page and Sergey Brin, was one of the primary reasons why Google provided much better search results than their competitors in the early days. diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml deleted file mode 100644 index bc14d704..00000000 --- a/docs/mkdocs.yml +++ /dev/null @@ -1,37 +0,0 @@ -site_name: Overview Docs -docs_dir: src - -nav: - - Introduction: index.md - - Overview: overview.md - # - Crawler: crawler.md - - Webgraph: webgraph.md - - Search Index: search_index.md - - Optics: optics.md - - Add To Browser: add_to_browser.md - -theme: - name: material - logo: assets/images/biglogo.svg - favicon: assets/images/favicon.ico - features: - - navigation.expand - palette: - primary: white - accent: light blue - -extra_javascript: - - assets/js/katex.js - - https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.7/katex.min.js - - https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.7/contrib/auto-render.min.js - -extra_css: - - assets/styles/main.css - - https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.7/katex.min.css - -extra: - expand: true - generator: false - homepage: https://stract.com -plugins: - - search diff --git a/docs/src/assets/images/biglogo.svg b/docs/src/assets/images/biglogo.svg deleted file mode 100644 index f99a3558..00000000 --- a/docs/src/assets/images/biglogo.svg +++ /dev/null @@ -1,10 +0,0 @@ [markup of the 10 deleted SVG lines omitted] diff --git a/docs/src/assets/images/crawler_overview.svg b/docs/src/assets/images/crawler_overview.svg deleted file mode 100644 index 066a8f08..00000000 --- a/docs/src/assets/images/crawler_overview.svg +++ /dev/null @@ -1,78 +0,0 @@ [markup of the 78 deleted SVG lines for the crawler architecture diagram omitted] diff --git a/docs/src/assets/images/favicon.ico b/docs/src/assets/images/favicon.ico deleted file mode 100644 index f6c98157264038e65006b4fbfe1dc8c05b2228d4..0000000000000000000000000000000000000000 GIT binary patch [binary favicon data omitted]
diff --git a/docs/src/assets/js/katex.js b/docs/src/assets/js/katex.js deleted file mode 100644 --- a/docs/src/assets/js/katex.js +++ /dev/null @@ -1,12 +0,0 @@ -document$.subscribe(({ body }) => { - - - renderMathInElement(body, { - delimiters: [ - { left: "$$", right: "$$", display: true }, - { left: "$", right: "$", display: false }, - { left: "\\(", right: "\\)", display: false }, - { left: "\\[", right: "\\]", display: true } - ], - }) -}) diff --git a/docs/src/assets/styles/main.css b/docs/src/assets/styles/main.css deleted file mode 100644 index b5de4861..00000000 --- a/docs/src/assets/styles/main.css +++ /dev/null @@ -1,18 +0,0 @@ -.md-header__title { - font-size: 0.8rem; -} - -.md-logo>img { - width: 100% !important; - height: 100% !important; -} - -.md-logo { - width: 5rem; - height: auto; -} - -.md-footer, -.md-footer-meta { - background-color: white !important; -} \ No newline at end of file diff --git a/docs/src/crawler.md b/docs/src/crawler.md deleted file mode 100644 index 73fe4120..00000000 --- a/docs/src/crawler.md +++ /dev/null @@ -1,29 +0,0 @@ -# Crawler -[Information for webmasters here](https://stract.com/webmasters) - -![Overview of Crawler Architecture](assets/images/crawler_overview.svg) -The crawler is a distributed system that scours the web. It has a coordinator process that determines which URLs to crawl and a set of worker processes that fetch the content of those URLs. Each worker receives a batch of crawl jobs to process, stores the fetched contents in an S3 bucket and sends newly discovered URLs back to the coordinator. This continues until the coordinator has determined that the crawl is complete. - -## Coordinator -This is the brains of the crawl operation. The coordinator is responsible for determining which URLs to crawl and distributing them to the workers. - -### URL Frontier -The coordinator starts with a list of seed urls, schedules these to the available workers and receives a list of newly discovered urls from each worker. These newly discovered urls are added to the url frontier, which is a list of urls to crawl. - -You can imagine that the url frontier can grow very large, very quickly. This begs the question: How does the coordinator determine which urls to crawl next? 
We could just crawl the urls in the order they were discovered, but this might not lead to the most interesting results. - -Instead, the coordinator assigns a score to each url and performs a weighted random selection of the next url to crawl. -Each domain starts with a score of 1.0 and is summed with the weight of all the ingoing links. Whenever a domain is sampled, it spreads its score to all urls that are linked to from the domain. Thus if a domain has many outgoing links, the weight for each of those links will be relatively low compared to if the domain only has a few outgoing links. - -This prioritizes urls and domains that has many high-valued incoming links and are therefore more likely to be interesting for the user. The sampled urls are then scheduled to the available workers and the process repeats. - -### Respectfullness -It is of utmost importance that we are respectful of the websites we crawl. We do not want to overload a website with requests and we do not want to crawl pages from the website that the website owner does not want us to crawl. - -When a domain has been sampled it is therefore marked as `CrawlInProgress` until the worker sends results back to the coordinator for the job it was assigned. This ensures that each domain is only scheduled to a single worker at a time. It is then the responsibility of the worker to respect the `robots.txt` file of the domain and to not overload the domain with requests. - -## Worker -The worker is quite simple and is responsible for fetching data from urls scheduled by the coordinator. It is completely stateless and stores the fetched data directly to an S3 bucket while sending newly discovered urls back to the coordinator. - -When a worker is tasked to crawl a new site, it first checks the `robots.txt` file for the site to see which urls (if any) it is allowed to crawl. -If the worker receives a `429 Too Many Requests` response from the site, it backs off for a while before trying again. The specific backoff time depends on how fast the server responds. Further details can be found [here](https://stract.com/webmasters). diff --git a/docs/src/index.md b/docs/src/index.md deleted file mode 100644 index b92e226f..00000000 --- a/docs/src/index.md +++ /dev/null @@ -1,12 +0,0 @@ -# Introduction -Stract is an open source web search engine written in Rust. It is designed to be fast, customizable, easy to use and scalable. - -This documentation is intended to be at a high level of abstraction to give an overview of the project and its components. For more detailed information, please refer to the documentation directly in the source code. - -These docs are very incomplete and work-in-progress. - -# Contributing -We welcome and greatly appreciate contributions of all kinds. Please refer to [CONTRIBUTING.md](https://github.com/StractOrg/stract/blob/main/CONTRIBUTING.md) for more information on how you can contribute to the project. - -# License -Stract is offered under the terms defined under the [LICENSE.md](https://github.com/StractOrg/stract/blob/main/LICENSE.md) file. \ No newline at end of file