-
Notifications
You must be signed in to change notification settings - Fork 1
/
default_scraper.xml
288 lines (288 loc) · 13 KB
/
default_scraper.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
<?xml version="1.0" encoding="utf-8"?>
<Sinequa>
<!-- Collection configuration template. Per the Description below, this is the default crawler used to create a URL candidate list. -->
<Description>Default crawler to create a URL candidate list</Description>
<!-- Identity, indexers, index and domain are all left empty in this template. -->
<visibility></visibility>
<Connector>crawler2</Connector>
<Identity></Identity>
<Indexers></Indexers>
<Index></Index>
<Domain></Domain>
<!-- NOTE(review): "fake treeroot" reads like a placeholder value; confirm whether it must be replaced per collection. -->
<TreeRoot>fake treeroot</TreeRoot>
<ForceReindexation>false</ForceReindexation>
<!-- Plugin referenced by path. -->
<Plugin>SMD_Plugins/Sinequa.Plugin.ListCandidateUrls</Plugin>
<!-- All six include/exclude lists (extensions, filenames, folders) are empty. -->
<IncludedExtensions></IncludedExtensions>
<ExcludedExtensions></ExcludedExtensions>
<IncludedFilenames></IncludedFilenames>
<ExcludedFilenames></ExcludedFilenames>
<IncludedFolders></IncludedFolders>
<ExcludedFolders></ExcludedFolders>
<Indexation>
  <!-- Indexation section: no mappings listed here, every Simulate* flag is false, thumbnail sizes and timeouts are unset. -->
  <Mappings></Mappings>
  <SimulateLemma>false</SimulateLemma>
  <SimulateEngine>false</SimulateEngine>
  <SimulateCache>false</SimulateCache>
  <SimulateLemmaMin></SimulateLemmaMin>
  <SimulateLemmaMax></SimulateLemmaMax>
  <EngineMetaEnabled>true</EngineMetaEnabled>
  <!-- Thumbnail settings: all empty, synchronous generation off. -->
  <ThumbnailHeight></ThumbnailHeight>
  <ThumbnailWidth></ThumbnailWidth>
  <ThumbnailSmallTimeout></ThumbnailSmallTimeout>
  <ThumbnailMediumTimeout></ThumbnailMediumTimeout>
  <ThumbnailLargeTimeout></ThumbnailLargeTimeout>
  <SynchThumbnailGen>false</SynchThumbnailGen>
  <StoreInCollectionCache>false</StoreInCollectionCache>
  <GetFilePropertiesFromConverter>false</GetFilePropertiesFromConverter>
</Indexation>
<!-- System section is present but carries no child settings. -->
<System>
</System>
<DisplayLongProperties>false</DisplayLongProperties>
<LongPropertyLimit></LongPropertyLimit>
<!-- Performance metrics are enabled; periodic logging of them is not. -->
<UsePerformanceMetrics>true</UsePerformanceMetrics>
<LogPerformanceMetricsPeriodically>false</LogPerformanceMetricsPeriodically>
<!-- PasswordRepository is empty in this template. -->
<PasswordRepository></PasswordRepository>
<StoreDocumentCache></StoreDocumentCache>
<AuditEnabled>false</AuditEnabled>
<SaveDeniedDocs>false</SaveDeniedDocs>
<SavePropertiesToRegistry>false</SavePropertiesToRegistry>
<!-- Of the three *Native switches only HtmlNavigatorNative is true. -->
<CollectionStateNative>false</CollectionStateNative>
<HtmlNavigatorNative>true</HtmlNavigatorNative>
<XPathNavigatorNative>false</XPathNavigatorNative>
<StatusMaxOk></StatusMaxOk>
<!-- NOTE(review): DelApiSecret is empty here; if a value is ever needed, keep it out of version control. -->
<DelApiSecret></DelApiSecret>
<IndexerClient>
  <!-- Indexer client section: every boolean is false and every numeric or timing field is left empty. -->
  <Simulate>false</Simulate>
  <SimulateGetCollectionState>false</SimulateGetCollectionState>
  <QueueMaxCount></QueueMaxCount>
  <DirectFileAccess>false</DirectFileAccess>
  <UseCompression>false</UseCompression>
  <SessionIsFinishedWait>false</SessionIsFinishedWait>
  <!-- Timeouts and connect retries: unset. -->
  <SendTimeout></SendTimeout>
  <ReceiveTimeout></ReceiveTimeout>
  <RetryConnectCount></RetryConnectCount>
  <RetryConnectDelay></RetryConnectDelay>
  <!-- Sleep* settings: unset. -->
  <SleepQueueFull></SleepQueueFull>
  <SleepQueueFullCount></SleepQueueFullCount>
  <SleepQueueFullQuick></SleepQueueFullQuick>
  <SleepQueueFullQuickCount></SleepQueueFullQuickCount>
  <SleepCheckOpen></SleepCheckOpen>
  <SleepCheckOpenCount></SleepCheckOpenCount>
  <SleepCheckOpenQuick></SleepCheckOpenQuick>
  <SleepCheckOpenQuickCount></SleepCheckOpenQuickCount>
  <DeactivationTimeout></DeactivationTimeout>
  <BackToSendingQueueCount></BackToSendingQueueCount>
</IndexerClient>
<ForceBlobSend>false</ForceBlobSend>
<ContinueOnError>true</ContinueOnError>
<!-- Delete-related settings: DoDelete is true, the two *OnError deletes and AcceptDeleteAll are false, and the three thresholds are unset. -->
<DoDelete>true</DoDelete>
<DeleteOnError>false</DeleteOnError>
<DeleteOnEnumerationError>false</DeleteOnEnumerationError>
<AcceptDeleteAll>false</AcceptDeleteAll>
<DeleteMaxPercentThreshold></DeleteMaxPercentThreshold>
<DeleteMaxThreshold></DeleteMaxThreshold>
<DeleteMinRemainingThreshold></DeleteMinRemainingThreshold>
<!-- State settings: only RealTimeIncrementalState is true. -->
<SaveCollectionState>false</SaveCollectionState>
<IncrementalState>false</IncrementalState>
<RealTimeIncrementalState>true</RealTimeIncrementalState>
<RealTimeInfoOnError>false</RealTimeInfoOnError>
<!-- Conversion: no proxies listed; the plan is referenced by the name _Advanced. -->
<ConversionProxies></ConversionProxies>
<ConversionPlan>_Advanced</ConversionPlan>
<AddBaseHref>true</AddBaseHref>
<AddMetaContentType>false</AddMetaContentType>
<Throttle></Throttle>
<DocumentClass></DocumentClass>
<ConnectorLanguage></ConnectorLanguage>
<ClearHttpRequestCanonicalizeAsFilePath>true</ClearHttpRequestCanonicalizeAsFilePath>
<PdfGen>
  <!-- PdfGen section: converter type and the three timeouts are all left empty. -->
  <ConverterType></ConverterType>
  <TimeoutSmall></TimeoutSmall>
  <TimeoutMedium></TimeoutMedium>
  <TimeoutLarge></TimeoutLarge>
</PdfGen>
<!-- Index* switches for containers, mail and attachments: all false except IndexPstMsg, IndexPstMsgAttachments and IndexPstDocument. -->
<IndexZipContent>false</IndexZipContent>
<IndexPdfAttachments>false</IndexPdfAttachments>
<IndexOleAttachments>false</IndexOleAttachments>
<IndexMsgContent>false</IndexMsgContent>
<IndexMsgAttachments>false</IndexMsgAttachments>
<IndexOftContent>false</IndexOftContent>
<IndexOftAttachments>false</IndexOftAttachments>
<IndexEmlContent>false</IndexEmlContent>
<IndexEmlAttachments>false</IndexEmlAttachments>
<!-- NOTE(review): IndexPstContent is false while three IndexPst* sub-switches below are true; presumably they have no effect in that state. Confirm. -->
<IndexPstContent>false</IndexPstContent>
<IndexOstContent>false</IndexOstContent>
<IndexPstMsg>true</IndexPstMsg>
<IndexPstMsgAttachments>true</IndexPstMsgAttachments>
<IndexPstContact>false</IndexPstContact>
<IndexPstCalendar>false</IndexPstCalendar>
<IndexPstNote>false</IndexPstNote>
<IndexPstTask>false</IndexPstTask>
<IndexPstDocument>true</IndexPstDocument>
<PstUseSafeId>false</PstUseSafeId>
<IndexArchivesExtensions></IndexArchivesExtensions>
<ArchiveItemsUseArchiveVersion>false</ArchiveItemsUseArchiveVersion>
<UseShortAttachmentId>false</UseShortAttachmentId>
<UseExtendedExtensionGuesser>false</UseExtendedExtensionGuesser>
<!-- Extension lists: all empty. -->
<XmpExtensions></XmpExtensions>
<MediaExtensions></MediaExtensions>
<ExiftoolExtensions></ExiftoolExtensions>
<!-- Selection queries: all empty. -->
<EarlySelectionQuery></EarlySelectionQuery>
<SelectionQuery></SelectionQuery>
<AttachmentSelectionQuery></AttachmentSelectionQuery>
<ArchiveItemSelectionQuery></ArchiveItemSelectionQuery>
<EngineConnectionWait></EngineConnectionWait>
<!-- Graph boost: CalculateGraphBoost is false and its parameters are unset. -->
<CalculateGraphBoost>false</CalculateGraphBoost>
<GraphBoostColumn></GraphBoostColumn>
<GraphBoostEMColumn></GraphBoostEMColumn>
<GraphBoostIterations></GraphBoostIterations>
<GraphBoostPower></GraphBoostPower>
<GraphBoostAdd></GraphBoostAdd>
<UseFieldPermissions>false</UseFieldPermissions>
<!-- Sharding and curation fields: all empty. -->
<ShardIndexes></ShardIndexes>
<ShardingStrategy></ShardingStrategy>
<ShardSelections></ShardSelections>
<CurationType></CurationType>
<CurationIdPattern></CurationIdPattern>
<RunIndexMiningInIndexer>false</RunIndexMiningInIndexer>
<Namespace></Namespace>
<!-- Crawl settings. WorkerCount is 8; MaxWorkerPerHost is unset. -->
<WorkerCount>8</WorkerCount>
<MaxWorkerPerHost></MaxWorkerPerHost>
<!-- NOTE(review): the Url value is placeholder text, not a URL; it has to be replaced before this configuration is usable. -->
<Url>enter your url here</Url>
<UrlList></UrlList>
<DynamicUrlList></DynamicUrlList>
<UrlStayInside>true</UrlStayInside>
<UrlRefererStayInside>true</UrlRefererStayInside>
<FollowLinks>true</FollowLinks>
<!-- Limits: MaxLevel 100, MaxToIndex and MaxToCrawl 100000, MaxRedirection 10. -->
<MaxLevel>100</MaxLevel>
<MaxToIndex>100000</MaxToIndex>
<MaxToCrawl>100000</MaxToCrawl>
<MaxRedirection>10</MaxRedirection>
<!-- NOTE(review): -1 presumably means "no limit" for size and timeout; confirm against the connector documentation. -->
<CrawlMaxSize>-1</CrawlMaxSize>
<CrawlTimeout>-1</CrawlTimeout>
<NormalizeUrls>true</NormalizeUrls>
<CorrectDomainCookies>false</CorrectDomainCookies>
<IgnoreSessionCookies>false</IgnoreSessionCookies>
<!-- Download* switches: images, media and CSS are off; FTP and file are on. -->
<DownloadImages>false</DownloadImages>
<DownloadMedia>false</DownloadMedia>
<DownloadCss>false</DownloadCss>
<DownloadFtp>true</DownloadFtp>
<DownloadFile>true</DownloadFile>
<IndexJs>false</IndexJs>
<FollowJs>true</FollowJs>
<CrawlFlash>true</CrawlFlash>
<IndexEmptyPages>true</IndexEmptyPages>
<CrawlWebsphereSeedlist>true</CrawlWebsphereSeedlist>
<KeepHashFragmentInUrl>false</KeepHashFragmentInUrl>
<RetryCount>1</RetryCount>
<!-- The RetryPause value carries its unit ("ms") inside the text; keep that exact format unless the consumer is known to accept another. -->
<RetryPause>0 ms</RetryPause>
<UseIfModifiedSince>true</UseIfModifiedSince>
<!-- NOTE(review): "no" differs from the true/false used by the neighbouring flags; confirm whether this setting is an enumeration or should read false. -->
<UseIfNoneMatch>no</UseIfNoneMatch>
<AcceptWeakETag>false</AcceptWeakETag>
<ForcedEncoding></ForcedEncoding>
<!-- This UseCompression is a direct child of the root; an element of the same name also appears inside IndexerClient. -->
<UseCompression>false</UseCompression>
<UseUnsafeHeaderParsing>false</UseUnsafeHeaderParsing>
<NormalizeSecureSchemesWhenTestingVisited>false</NormalizeSecureSchemesWhenTestingVisited>
<ExactDeduplication>false</ExactDeduplication>
<NearDeduplication>false</NearDeduplication>
<CrawlPauseDelay></CrawlPauseDelay>
<CrawlPauseCount></CrawlPauseCount>
<UseRuntimeAutoRedirect>false</UseRuntimeAutoRedirect>
<!-- Remember*Failure switches: DNS, connect and trust are true; proxy name resolution is false. -->
<RememberDnsFailure>true</RememberDnsFailure>
<RememberConnectFailure>true</RememberConnectFailure>
<RememberTrustFailure>true</RememberTrustFailure>
<RememberProxyNameResolutionFailure>false</RememberProxyNameResolutionFailure>
<!-- Robots settings: UseRobotsNoIndex and UseRobotsNoFollow are true, UseRobotsTxt is false. NOTE(review): confirm that leaving UseRobotsTxt off is intended for the target sites. -->
<UseRobotsNoIndex>true</UseRobotsNoIndex>
<UseRobotsNoFollow>true</UseRobotsNoFollow>
<UseRobotsTxt>false</UseRobotsTxt>
<RobotsTxtCaseSensitive>false</RobotsTxtCaseSensitive>
<LoadRobotsTxtSitemapUrls>false</LoadRobotsTxtSitemapUrls>
<CheckSitemapUrlLastmodInRealtimeMode>false</CheckSitemapUrlLastmodInRealtimeMode>
<AddRobotsTxtAllowUrlsToSeedList>false</AddRobotsTxtAllowUrlsToSeedList>
<UseCanonicalLinks>false</UseCanonicalLinks>
<UseRelNoFollow>false</UseRelNoFollow>
<!-- Comma-separated list of wildcard patterns. -->
<UrlIndexExcluded>*.rtf,*.jy,*.xml,*.ico,*.gz,*.act</UrlIndexExcluded>
<!-- Download, follow and index selection queries: all empty. -->
<DownloadSelectionQuery></DownloadSelectionQuery>
<FollowSelectionQuery></FollowSelectionQuery>
<IndexSelectionQuery></IndexSelectionQuery>
<LoadDefaultTags>true</LoadDefaultTags>
<LoadDefaultJsTransforms>true</LoadDefaultJsTransforms>
<UrlAccess>
  <!-- URL access section. Every user, password, domain, key and certificate field in it is empty in this template. -->
  <!-- Credentials: UseDefaultCredentials is true; UseDefaultNetworkCredentials is false. -->
  <UseDefaultCredentials>true</UseDefaultCredentials>
  <UseDefaultNetworkCredentials>false</UseDefaultNetworkCredentials>
  <User></User>
  <Password></Password>
  <Domain></Domain>
  <!-- Connection options. -->
  <UseRfc1945>false</UseRfc1945>
  <Timeout></Timeout>
  <ChangeConnectionGroupNameOnTimeout>false</ChangeConnectionGroupNameOnTimeout>
  <AllowAuthenticatedConnectionSharing>true</AllowAuthenticatedConnectionSharing>
  <PreAuthenticate>false</PreAuthenticate>
  <HttpVersion></HttpVersion>
  <KeepAlive>true</KeepAlive>
  <SecurityProtocol></SecurityProtocol>
  <UserAgent></UserAgent>
  <!-- Client certificate fields: unset. -->
  <ClientCertificateFile></ClientCertificateFile>
  <ClientCertificatePassword></ClientCertificatePassword>
  <ClientCertificateStorage></ClientCertificateStorage>
  <AllowXPathCookies>false</AllowXPathCookies>
  <!-- UseBrowserForWebRequests is true and UseHttpClientForWebRequests is false; the BrowserForWebRequests* tuning values are unset. -->
  <UseHttpClientForWebRequests>false</UseHttpClientForWebRequests>
  <UseBrowserForWebRequests>true</UseBrowserForWebRequests>
  <BrowserForWebRequestsReadinessThreshold></BrowserForWebRequestsReadinessThreshold>
  <BrowserForWebRequestsInitialDelay></BrowserForWebRequestsInitialDelay>
  <BrowserForWebRequestsMaxTotalDelay></BrowserForWebRequestsMaxTotalDelay>
  <BrowserForWebRequestsMaxResourcesDelay></BrowserForWebRequestsMaxResourcesDelay>
  <BrowserForWebRequestsLogLevel></BrowserForWebRequestsLogLevel>
  <BrowserForWebRequestsViewportWidth></BrowserForWebRequestsViewportWidth>
  <BrowserForWebRequestsViewportHeight></BrowserForWebRequestsViewportHeight>
  <WebConnectionPluginName></WebConnectionPluginName>
  <!-- Post-login and re-login fields: no URL, data or detection pattern is set. -->
  <PostLoginUrl></PostLoginUrl>
  <PostLoginData></PostLoginData>
  <GetBeforePostLogin>false</GetBeforePostLogin>
  <PostLoginAutoRedirect>true</PostLoginAutoRedirect>
  <ReLoginCount></ReLoginCount>
  <ReLoginDelay></ReLoginDelay>
  <DetectHtmlLoginPattern></DetectHtmlLoginPattern>
  <!-- BrowserLogin: Activate is false and the remaining fields are empty. -->
  <BrowserLogin>
    <Activate>false</Activate>
    <RemoteDebuggingPort></RemoteDebuggingPort>
    <BrowserLogLevel></BrowserLogLevel>
    <SuccessCondition></SuccessCondition>
    <CookieFilter></CookieFilter>
  </BrowserLogin>
  <!-- FTP fields: binary and passive are true, SSL is false, credentials and timeouts are unset. -->
  <FtpUser></FtpUser>
  <FtpPassword></FtpPassword>
  <FtpDomain></FtpDomain>
  <FtpUseBinary>true</FtpUseBinary>
  <FtpUsePassive>true</FtpUsePassive>
  <FtpReadWriteTimeout></FtpReadWriteTimeout>
  <FtpTimeout></FtpTimeout>
  <FtpEnableSsl>false</FtpEnableSsl>
  <!-- File* fields: all unset. -->
  <FileUser></FileUser>
  <FilePassword></FilePassword>
  <FileDomain></FileDomain>
  <FileTimeout></FileTimeout>
  <!-- AmazonS3: keys are empty; only RegionEndpoint has a value. -->
  <AmazonS3>
    <AccessKey></AccessKey>
    <SecretKey></SecretKey>
    <RegionEndpoint>eu-west-1</RegionEndpoint>
  </AmazonS3>
  <!-- Proxy fields: ProxyAutoDetect is true, no address or server is given, ProxyPort is 80. -->
  <ProxyAutoDetect>true</ProxyAutoDetect>
  <ProxyAddress></ProxyAddress>
  <ProxyBypassOnLocal>true</ProxyBypassOnLocal>
  <ProxyServer></ProxyServer>
  <ProxyPort>80</ProxyPort>
  <ProxyUseDefaultCredentials>true</ProxyUseDefaultCredentials>
  <ProxyUseDefaultNetworkCredentials>false</ProxyUseDefaultNetworkCredentials>
  <ProxyUser></ProxyUser>
  <ProxyPassword></ProxyPassword>
  <ProxyDomain></ProxyDomain>
</UrlAccess>
<DeleteOnNetworkOrServerError>false</DeleteOnNetworkOrServerError>
<!-- Neural indexing is disabled and its selection query is empty. -->
<EnableNeuralIndexing>false</EnableNeuralIndexing>
<NeuralSearchSelectionQuery></NeuralSearchSelectionQuery>
<Mapping>
  <!-- Single mapping entry: Name "id" with Value "doc.url1"; description, selection and default value are empty. -->
  <!-- NOTE(review): presumably this sets the document id from the URL; confirm what doc.url1 resolves to. -->
  <Name>id</Name>
  <Value>doc.url1</Value>
  <Description></Description>
  <Selection></Selection>
  <DefaultValue></DefaultValue>
</Mapping>
</Sinequa>