site_spider.py
import urllib2 # For downloading resources from the Internet
import urlparse # For operations with URL strings
import socket # For using socket.error exception
import httplib # For using httplib.InvalidURL, httplib.HTTPException
# and httplib.NotConnected exceptions
import exceptions # For using ValueError and IOError exceptions
import robotparser # For parsing robots.txt files
import time # For using time.sleep() function
import random # For using random numbers generator
import logging # For logging
from sitemap_tree import SitemapTreeElement, HeadlineElement, \
TextReferenceElement
from site_page_parser import SitePageParser, SitePageParseError
__doc__ = """
Contains site spider and auxiliary classes.
Uses site_page_parser module to parse website pages and
sitemap_tree module to build sitemap tree.
"""
__all__ = ["SiteSpider"]
class ReferenceCrawlingInfo(object):
    """Base class for crawling information about a reference. A reference may
    be more than just a text reference; it could, for example, also be an
    image reference. At the current time we only deal with text references.
"""
    def __init__(self, reference, depth, parent = None):
        """Initializes reference crawling info base properties.
        reference -- corresponding URL
        depth -- crawling depth of reference - the number of clicks we need
                 to make in order to reach the resource corresponding to this
                 reference from the website homepage
parent -- sitemap element which is intended to be a parent of a
sitemap element corresponding to this reference
(by default is None)
"""
        # Check the type of the 'reference' parameter
if not isinstance(reference, basestring):
raise TypeError('string type expected')
self._reference = reference
        # Check the type of the 'depth' parameter
if not isinstance(depth, (int, long)):
raise TypeError('int or long type expected')
self._depth = depth
        # Check the type of the 'parent' parameter
if parent and not isinstance(parent, SitemapTreeElement):
raise TypeError('SitemapTreeElement type expected')
self._parent = parent
@property
def reference(self):
"""Returns reference value."""
return self._reference
@property
def depth(self):
"""Returns reference crawling depth."""
return self._depth
@property
def parent(self):
"""Returns reference sitemap parent element."""
return self._parent
@reference.setter
def reference(self, new_reference):
"""Sets reference value."""
        # Check the type of the 'new_reference' parameter
if not isinstance(new_reference, basestring):
raise TypeError('string type expected')
self._reference = new_reference
@depth.setter
def depth(self, new_depth):
"""Sets reference crawling depth."""
        # Check the type of the 'new_depth' parameter
if not isinstance(new_depth, (int, long)):
raise TypeError('int or long type expected')
self._depth = new_depth
@parent.setter
def parent(self, new_parent):
"""Sets reference sitemap parent element."""
        # Check the type of the 'new_parent' parameter
        if new_parent and not isinstance(new_parent, SitemapTreeElement):
raise TypeError('SitemapTreeElement type expected')
self._parent = new_parent
class TextReferenceCrawlingInfo(ReferenceCrawlingInfo):
    """Class for crawling information about a text reference.
    At the current time we only deal with text references."""
    def __init__(self, reference, depth, parent = None, title = ''):
        """Initializes text reference crawling info properties.
        reference -- corresponding URL
        depth -- crawling depth of reference - the number of clicks we need
                 to make in order to reach the resource corresponding to this
                 reference from the website homepage
parent -- sitemap element which is intended to be a parent of a
sitemap element corresponding to this reference
(by default is None)
title -- title of text reference (by default is '')
"""
ReferenceCrawlingInfo.__init__(self, reference, depth, parent)
        # Check the type of the 'title' parameter
if not isinstance(title, basestring):
raise TypeError('string type expected')
self._title = title
@property
def title(self):
"""Returns text reference title."""
return self._title
@title.setter
def title(self, new_title):
"""Sets text reference title."""
        # Check the type of the 'new_title' parameter
if not isinstance(new_title, basestring):
raise TypeError('string type expected')
self._title = new_title
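# A minimal illustrative sketch of how the crawling-info classes above are
# meant to be used; the URL and title are made-up placeholders, and the
# snippet is kept commented out so that nothing executes on import:
#
#   info = TextReferenceCrawlingInfo('http://example.com/about/', 1,
#                                    title = 'About us')
#   info.depth, info.title   # -> (1, 'About us')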
class SiteSpiderError(Exception):
"""Base class for site spider errors."""
pass
class ConnectionError(SiteSpiderError):
    """Class for Internet connection errors. Exceptions of this class are
    raised if the site spider cannot connect to a given URL. Contains the URL
    to which the connection failed.
"""
def __init__(self, url):
self._url = url
def __str__(self):
return 'Unable to connect to %s' % self._url
class InvalidURLError(SiteSpiderError):
    """Class for errors related to URL validity. Contains the invalid URL
    that caused the exception.
"""
def __init__(self, url):
self._url = url
def __str__(self):
return 'Invalid URL: %s' % self._url
class ResourseRetrieveError(SiteSpiderError):
    """Class for client or server errors due to which the site spider is
    unable to retrieve the resource corresponding to a given URL. Contains
    the URL that caused the exception and an 'exception' attribute with
    further error information, which is an instance of urllib2.URLError
    or its subclass.
"""
def __init__(self, url, exception):
self._url = url
self._exception = exception
def __str__(self):
        return 'Unable to retrieve resource from %s: ' % self._url + \
str(self._exception)
class SiteSpider(object):
    """Site spider class. The site spider is intended to crawl a given website
    with given preferences and to build a sitemap tree while crawling.
"""
    # Crawling statuses of the site spider. They are used to recognize whether
    # crawling was finished successfully or not.
CRAWLING_STATUS_SUCCESS = 1
CRAWLING_STATUS_ERROR = 0
def __init__(self, site_homepage_address, depth_limit = 0,
download_delay = 0, connection_attempts_number = 1,
connection_attempt_timeout = 0, robotstxt_obey = True):
"""Initializes site spider preferences.
        site_homepage_address -- URL of the website which is intended
                                 to be crawled
        depth_limit -- crawling depth limit. References whose depth is
                       greater than this parameter won't be crawled and
                       won't be added to the sitemap tree. '0' means no
                       depth limitation. (by default is 0)
        download_delay -- average interval between site resource downloads.
                          If nonzero, the site spider will use a random
                          download delay from
                          [0.5 * download_delay, 1.5 * download_delay].
                          It is highly recommended to set this parameter in
                          order not to overload the server of the website
                          you are going to crawl with requests.
                          (by default is 0)
        connection_attempts_number -- maximum number of connection attempts
                                      the site spider will make in case of
                                      connection errors before dumping the
                                      crawling process. (by default is 1)
        connection_attempt_timeout -- interval between connection attempts
                                      in case of connection errors.
                                      (by default is 0)
        robotstxt_obey -- boolean parameter which states whether the site
                          spider will respect robots.txt policies or not
                          (by default is True)
"""
# Set site spider name. It is used to form user-agent request header
self._name = 'SiteSpider/1.0'
        # Check the type of the 'site_homepage_address' parameter
if not isinstance(site_homepage_address, basestring):
raise TypeError('string type expected')
        # Set the allowed domain of the spider. The spider is allowed to crawl
        # only within the limits of this domain.
self._allowed_domain = self._normalize_homepage_address(
site_homepage_address)
        # Check the type of the 'depth_limit' parameter
if not isinstance(depth_limit, (int, long)):
raise TypeError('int or long type expected')
self._depth_limit = depth_limit
        # Check the type of the 'download_delay' parameter
if not isinstance(download_delay, (int, long)):
raise TypeError('int or long type expected')
self._download_delay = download_delay
        # Set download delay bounds. The site spider will use a random
        # download delay between these bounds.
self._download_delay_upper_bound = 1.5 * self._download_delay
self._download_delay_lower_bound = 0.5 * self._download_delay
        # Check the type of the 'connection_attempts_number' parameter
if not isinstance(connection_attempts_number, (int, long)):
raise TypeError('int or long type expected')
self._connection_attempts_number = connection_attempts_number
        # Check the type of the 'connection_attempt_timeout' parameter
if not isinstance(connection_attempt_timeout, (int, long)):
raise TypeError('int or long type expected')
self._connection_attempt_timeout = connection_attempt_timeout
self._robotstxt_obey = robotstxt_obey
        # Set a parser for robots.txt files, which is provided by the
        # Python built-in robotparser library
self._robotstxt_file_parser = robotparser.RobotFileParser()
# Set request headers
user_agent_header = self._name
accept_header = \
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
self._request_headers = {
'User-Agent': user_agent_header,
'Accept': accept_header
}
# Set parser of web pages
self._site_page_parser = SitePageParser()
# self._references_crawling_info_schedule contains instances of
# ReferenceCrawlingInfo class corresponding to references that
        # the site spider has to process. Initialize it with an empty list
self._references_crawling_info_schedule = []
# self._viewed_references is a set of references that were
# already processed. Initialize it with an empty set
self._viewed_references = set()
# self._sitemap_tree contains sitemap tree (root element of this tree)
# built by site spider. Initialize it with None
self._sitemap_tree = None
# self._crawling_status contains current crawling status of a
        # site spider. Crawling has not started, so initialize it with None
self._crawling_status = None
        # Initialize the random number generator, which is used
        # to randomize the download delay
random.seed()
    def _normalize_homepage_address(self, site_homepage_address):
        """Normalizes the given site homepage address - if the address has no
        URL scheme, defines it as 'http', cuts the path and query parts of
        the URL and adds a trailing slash if needed.
        site_homepage_address -- corresponding URL
Returns normalized URL.
"""
# Split URL into parts
address_parts = urlparse.urlsplit(site_homepage_address,
allow_fragments = False)
scheme = address_parts.scheme
netloc = address_parts.netloc
path = address_parts.path
query = address_parts.query
        # If originally there was no URL scheme, urlparse.urlsplit will
        # recognize the netloc part as the path part of the URL. We need to
        # fix this. Also we set 'http' as the default URL scheme in this case
if not scheme:
scheme = 'http'
path_slash_position = path.find('/')
if path_slash_position != -1:
netloc = path[ : path_slash_position]
path = path[path_slash_position : ]
else:
netloc = path
path = ''
# Cut the query part if needed
if query:
query = ''
        # Cut the path part and add a trailing '/' if needed
path_slash_position = path.find('/')
if path_slash_position != -1:
path = path[ : path_slash_position + 1]
else:
path += '/'
# Build normalized URL
normalized_address_parts = (scheme, netloc, path, query,
address_parts.fragment)
normalized_address = urlparse.urlunsplit(normalized_address_parts)
return normalized_address
    def _download_site_resourse(self, reference):
        """Downloads a site resource by the given reference.
        reference -- corresponding URL
        Returns a file-like object containing the requested resource.
        Raises an InvalidURLError exception in case of an invalid URL,
        a ConnectionError exception in case of connection errors
        and a ResourseRetrieveError if the resource cannot be retrieved.
"""
# Form the request
request = urllib2.Request(reference, headers = self._request_headers)
        # Resource is not downloaded yet and it's the first attempt to do it
resourse_is_recieved = False
connection_attempt_number = 1
resourse = None
        # Try to download the resource
while not resourse_is_recieved:
try:
resourse = urllib2.urlopen(request)
except (httplib.InvalidURL, exceptions.ValueError):
# Given URL is invalid
raise InvalidURLError(reference)
except httplib.NotConnected:
# Connection was not established
# Check number of a connection attempt
if connection_attempt_number < self._connection_attempts_number:
logging.error('Connection to ' + reference +
' failed, trying to connect again')
connection_attempt_number += 1
time.sleep(self._connection_attempt_timeout)
else:
raise ConnectionError(reference)
except (urllib2.URLError, httplib.HTTPException,
exceptions.IOError), error:
# Check the nature of a raised exception
if hasattr(error, 'reason') and \
isinstance(error.reason, socket.error):
# Connection also was not established (but another
# exception is raised by urllib2)
# Check number of a connection attempt
if connection_attempt_number < self._connection_attempts_number:
logging.error('Connection to ' + reference +
' failed, trying to connect again')
connection_attempt_number += 1
time.sleep(self._connection_attempt_timeout)
else:
raise ConnectionError(reference)
else:
                    # Another error happened due to which the site spider is
                    # unable to download the requested resource
raise ResourseRetrieveError(reference, error)
else:
                # Resource was downloaded successfully
resourse_is_recieved = True
return resourse
    def _parse_robotstxt_file(self, robotstxt_file):
        """Feeds a robots.txt file to the site spider's robots.txt file parser.
robotstxt_file -- file-like object containing robots.txt file contents
"""
        # Get the lines of the robots.txt file and feed them to the parser
robotstxt_lines = [line.strip() for line in robotstxt_file]
self._robotstxt_file_parser.parse(robotstxt_lines)
    def _normalize_reference(self, reference):
        """Normalizes a reference - makes it absolute and
        adds a trailing slash if needed.
reference -- corresponding URL
Returns normalized URL
"""
# Split URL to parts
reference_parts = urlparse.urlsplit(reference, allow_fragments = False)
scheme = reference_parts.scheme
path = reference_parts.path
normalized_reference = reference
        # Add a trailing '/' if there is no query part of the URL and no file
        # extension in the path part of the URL
if not reference_parts.query and '.' not in path:
if not path.endswith('/'):
path += '/'
reference_parts = (scheme, reference_parts.netloc, path,
reference_parts.query, reference_parts.fragment)
normalized_reference = urlparse.urlunsplit(reference_parts)
# Make URL absolute if it is relative
if not scheme:
normalized_reference = urlparse.urljoin(self._allowed_domain,
normalized_reference)
return normalized_reference
def _normalize_reference_parsing_info(self, reference_parsing_info):
"""Normalizes given parsing information about reference.
Actually, normalizes reference in this parsing information.
reference_parsing_info -- ReferenceParsingInfo class or its subclass
instance containing reference
parsing information
Returns ReferenceParsingInfo class or its subclass instance with
normalized reference.
"""
reference = reference_parsing_info.reference
normalized_reference = self._normalize_reference(reference)
reference_parsing_info.reference = normalized_reference
return reference_parsing_info
    def _filter_reference(self, reference):
        """Filters the given reference. Discards the reference if it leads
        outside the crawled website, or the resource it leads to is not a
        website page, or it was already processed by the site spider, or it
        is already put in the schedule to be processed later.
        reference -- corresponding URL
        Returns the given URL if the reference was not discarded, or
        None otherwise.
"""
if self._allowed_domain not in reference:
logging.info('Filtered offsite reference: %s' % reference)
return None
reference_parts = urlparse.urlsplit(reference)
if '.' in reference_parts.path:
if not reference_parts.path.endswith('.html') and \
not reference_parts.path.endswith('.htm') and \
not reference_parts.path.endswith('.xhtml') and \
not reference_parts.path.endswith('.xht') and \
not reference_parts.path.endswith('.xml') and \
not reference_parts.path.endswith('.php'):
logging.info('Filtered reference (unsuitable file format): %s' %
reference)
return None
if reference in self._viewed_references:
return None
for reference_crawling_info in self._references_crawling_info_schedule:
if reference_crawling_info.reference == reference:
return None
return reference
def _filter_reference_parsing_info(self, reference_parsing_info):
"""Filters given parsing information of reference by its value.
Discards parsing information if reference was discarded by
_filter_reference method.
reference_parsing_info -- ReferenceParsingInfo class or its subclass
instance containing reference
parsing information
        Returns the given instance if the parsing information was not
        discarded, or None otherwise.
"""
reference = reference_parsing_info.reference
if not self._filter_reference(reference):
return None
return reference_parsing_info
def _add_sitemap_tree_element(self, element_class, element_depth,
element_parent, **kwargs):
"""Adds new sitemap element to sitemap tree.
element_class -- class of added element (HeadlineElement or
TextReferenceElement)
element_depth -- crawling depth of added element
element_parent -- sitemap tree element which is intended to be a parent
of added element
        **kwargs -- dictionary with other element parameters specified for
its class
Returns added sitemap element.
"""
# Create new sitemap element corresponding to given element class
if element_class is HeadlineElement:
element_headline = kwargs['headline']
new_element = HeadlineElement(element_headline, element_depth,
element_parent)
elif element_class is TextReferenceElement:
element_reference = kwargs['reference']
element_title = kwargs['title']
new_element = TextReferenceElement(element_reference, element_title,
element_depth, element_parent)
# Check whether added element is root of sitemap tree or not
if element_parent:
element_parent.append_child(new_element)
else:
self._sitemap_tree = new_element
return new_element
    def _delay(self):
        """Delays the site spider. The delay interval is a random number
        between the bounds that were set during initialization.
"""
        download_delay = random.uniform(self._download_delay_lower_bound,
                                        self._download_delay_upper_bound)
time.sleep(download_delay)
def crawl(self):
"""Main site spider method. Manages the process of crawling and
building sitemap tree.
"""
# Start crawling process
logging.info('Crawling started (bot %s)' % self._name)
self._crawling_status = self.CRAWLING_STATUS_SUCCESS
# Test connection to given website
# If it is failed, there is no point to continue crawling
try:
self._download_site_resourse(self._allowed_domain)
except ConnectionError, error:
logging.critical(str(error))
logging.critical('Crawling dumped (bot %s)' % self._name)
            # Inform the user about the failure
print error
# Dump crawling
self._crawling_status = self.CRAWLING_STATUS_ERROR
return
except SiteSpiderError, error:
logging.critical(str(error))
logging.critical('Crawling dumped (bot %s)' % self._name)
            # Inform the user about the failure
print error
# Dump crawling
self._crawling_status = self.CRAWLING_STATUS_ERROR
return
        # Delay the spider if the corresponding parameter is set
if self._download_delay:
self._delay()
        # Try to retrieve and parse the robots.txt file if the corresponding
        # parameter is set
if self._robotstxt_obey:
            # Define the robots.txt file location
robotstxt_file_url = urlparse.urljoin(self._allowed_domain,
'/robots.txt')
try:
robotstxt_file = self._download_site_resourse(
robotstxt_file_url)
except SiteSpiderError, error:
# robots.txt file was not downloaded
logging.warning('Unable to retrieve robots.txt file')
self._robotstxt_obey = False
else:
# Parse robots.txt file
self._parse_robotstxt_file(robotstxt_file)
logging.info('File robots.txt retrieved and parsed')
                # Delay the spider if the corresponding parameter is set
if self._download_delay:
self._delay()
# Set start reference crawling information
start_reference = self._allowed_domain
start_reference_depth = 0
start_reference_crawling_info = TextReferenceCrawlingInfo(start_reference,
start_reference_depth)
self._references_crawling_info_schedule.append(start_reference_crawling_info)
# Main crawling loop
while self._references_crawling_info_schedule:
# Get next reference crawling information
reference_crawling_info = self._references_crawling_info_schedule.pop()
reference = reference_crawling_info.reference
reference_depth = reference_crawling_info.depth
reference_title = reference_crawling_info.title
reference_parent = reference_crawling_info.parent
# Check depth of reference
if self._depth_limit and reference_depth > self._depth_limit:
logging.info('Ignoring link (depth > %d): %s' %
(self._depth_limit, reference))
continue
# Check whether reference was already processed
if reference in self._viewed_references:
continue
            # We only deal with text references. But if a reference is not a
            # text reference it has no title. In this case we'll suppose that
            # the title of the reference is the title of the page it leads to.
            # So, if the reference has a title, the depth limit is set and the
            # reference depth is equal to the depth limit, there is no need to
            # download the page that the reference leads to.
            # Otherwise, we have to download it
            if not reference_title or not self._depth_limit or \
                    reference_depth != self._depth_limit:
# Before downloading we have to check if it is allowed by
# robots.txt file
if self._robotstxt_obey and \
not self._robotstxt_file_parser.can_fetch(self._name,
reference):
logging.info(
'Filtered reference (forbidden by robots.txt): %s' %
reference)
continue
# Try to download the page
try:
page = self._download_site_resourse(reference)
except ConnectionError, error:
                    # Problems with the connection, the spider is unable to
                    # continue crawling
logging.critical(str(error))
logging.critical('Crawling dumped (bot %s)' % self._name)
                    # Inform the user about the failure
print error
# Dump crawling
self._crawling_status = self.CRAWLING_STATUS_ERROR
return
except SiteSpiderError, error:
                    # Other problems with downloading. It may be a single error
                    # caused by the current reference. So the spider has to
                    # continue crawling.
logging.error(str(error))
continue
# Page downloaded successfully
logging.info('Crawled: %s' % reference)
                # Delay the spider if the corresponding parameter is set
if self._download_delay:
self._delay()
# Try to parse downloaded page
try:
page_parsing_info = self._site_page_parser.parse_site_page(
page.read())
except SitePageParseError, parse_error:
                    # Parse error. The spider has to continue crawling.
logging.error(str(parse_error))
continue
                # Page parsed successfully
logging.info('Parsed: %s' % reference)
# Try to get page title if current reference has no title
if not reference_title:
reference_title = page_parsing_info.title
            # But it may happen that the downloaded page has no title.
# Because at the current time we support only text references we
# can't add this reference to the sitemap
if not reference_title:
logging.warning(
                    'Reference has no title, cannot add to sitemap: %s' %
reference)
continue
# Otherwise, add text reference element to the sitemap
new_text_reference_element = self._add_sitemap_tree_element(
TextReferenceElement,
reference_depth, reference_parent,
reference = reference,
title = reference_title)
self._viewed_references.add(reference)
logging.info('Added sitemap text reference element: %s, %s' %
(reference_title, reference))
            # If the depth limit is set and we have reached it, there is no
            # need to process the reference groups and their titles that were
            # retrieved from the parsed page
if not self._depth_limit or reference_depth != self._depth_limit:
                # Set headline element parameters
healine_elements_parent = new_text_reference_element
headline_elements_depth = reference_depth + 1
# Process groups of references retrieved from parsed page
page_references_groups = page_parsing_info.references_groups
for references_group in page_references_groups:
# Get references parsing info of reference group
references_parsing_info = references_group.references
# Normalize this parsing info
normalized_references_parsing_info = map(
self._normalize_reference_parsing_info,
references_parsing_info)
# Filter this parsing info
filtered_references_parsing_info = filter(
self._filter_reference_parsing_info,
normalized_references_parsing_info)
                    # If there are still references in the references group
                    # after filtering, we have to process them
if filtered_references_parsing_info:
# Check if current group of references has a headline
refences_group_headline = references_group.headline
if refences_group_headline:
                            # Add a headline element to the sitemap
new_headline_element = \
self._add_sitemap_tree_element(
HeadlineElement,
headline_elements_depth,
healine_elements_parent,
headline = refences_group_headline)
logging.info('Added sitemap headline element: %s' %
refences_group_headline)
# Set this element as a parent of references in
# current group
text_reference_elements_parent = \
new_headline_element
else:
# Otherwise, set added text reference element as
# a parent of references in current group
text_reference_elements_parent = \
healine_elements_parent
                        # Depth of new text reference elements is the depth of
                        # the last added text reference element + 1
text_reference_elements_depth = reference_depth + 1
                        # Reverse the list with references parsing information
                        # in order to process references in the same order
                        # as they follow each other on the site page
filtered_references_parsing_info.reverse()
# Process references group
for reference_parsing_info in \
filtered_references_parsing_info:
# Create text reference crawling information
reference = reference_parsing_info.reference
reference_title = reference_parsing_info.title
reference_crawling_info = \
TextReferenceCrawlingInfo(reference,
text_reference_elements_depth,
text_reference_elements_parent,
reference_title)
                            # And append it to the schedule in order to
                            # process it later
self._references_crawling_info_schedule.append(
reference_crawling_info)
        # Finish crawling process
logging.info('Crawling finished (bot %s)' % self._name)
@property
    def sitemap_tree(self):
        """Returns the sitemap tree created by the site spider while crawling."""
return self._sitemap_tree
@property
    def crawling_status(self):
        """Returns the current crawling status of the site spider."""
return self._crawling_status
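# A minimal usage sketch (illustrative only, not part of the spider itself).
# It assumes Python 2 with the site_page_parser and sitemap_tree modules
# importable; 'http://example.com' is a placeholder URL and the preference
# values below are arbitrary examples rather than recommended settings.
if __name__ == '__main__':
    logging.basicConfig(level = logging.INFO)
    spider = SiteSpider('http://example.com',
                        depth_limit = 2,
                        download_delay = 1,
                        connection_attempts_number = 3,
                        connection_attempt_timeout = 5,
                        robotstxt_obey = True)
    spider.crawl()
    if spider.crawling_status == SiteSpider.CRAWLING_STATUS_SUCCESS:
        # The root of the resulting sitemap tree is exposed through the
        # sitemap_tree property; how it is traversed or printed depends on
        # the sitemap_tree module, which lives outside this file.
        print 'Sitemap tree root:', spider.sitemap_tree
    else:
        print 'Crawling failed, see the log for details'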