Skip to content

Commit

Permalink
WP Draft
Browse files Browse the repository at this point in the history
  • Loading branch information
Simon Hardy committed Jan 31, 2018
1 parent e903bed commit 99b50df
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions spiders/programme.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,17 @@
class ProgrammeSpider(scrapy.Spider):
    """Crawl the CORDIS projects home page and record the pages it links to.

    The spider loads the projects home page, follows every anchor found in
    the first ``<dd>`` entry under the element with id ``cattree``, and emits
    one ``ProgrammeItem`` per followed page containing that page's URL.
    """

    name = 'programme'
    allowed_domains = ['cordis.europa.eu']
    start_urls = ['http://cordis.europa.eu/projects/home_en.html']

    def parse(self, response):
        """Yield a request for each link in the first ``cattree`` entry.

        Args:
            response: The downloaded start page.

        Yields:
            scrapy.Request: One request per extracted ``href``, handled by
            :meth:`parse_attr`.
        """
        # Select the href attribute itself; extracting the bare <a> nodes
        # would return HTML markup, which is not a usable URL.
        hrefs = response.xpath(
            '//*[@id="cattree"]/div[2]/div[1]/dl/dd[1]/a/@href'
        ).extract()
        for href in hrefs:
            # urljoin resolves relative hrefs against the page URL and leaves
            # absolute ones untouched, so no hard-coded base URL is needed.
            absolute_url = response.urljoin(href)
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        """Emit an item recording the URL of a followed page.

        Args:
            response: The downloaded page reached from :meth:`parse`.

        Yields:
            ProgrammeItem: Item whose ``link`` field is ``response.url``.
        """
        # NOTE(review): ProgrammeItem is assumed to be imported above the
        # visible part of the file - confirm.
        item = ProgrammeItem()
        item['link'] = response.url
        yield item
Binary file added spiders/programme.pyc
Binary file not shown.

0 comments on commit 99b50df

Please sign in to comment.