diff --git a/spiders/programme.py b/spiders/programme.py
index 93445a6..aabef12 100644
--- a/spiders/programme.py
+++ b/spiders/programme.py
@@ -5,7 +5,25 @@ class ProgrammeSpider(scrapy.Spider):
     name = 'programme'
     allowed_domains = ['cordis.europa.eu']
 
-    start_urls = ['http://cordis.europa.eu/projects/result_en?q=(relatedProgramme/programme/code%3D%27H2020-EU.1.1.*%27%20OR%20relatedSubProgramme/programme/code%3D%27H2020-EU.1.1.*%27)%20AND%20contenttype%3D%27project%27']
+    start_urls = ['http://cordis.europa.eu/projects/home_en.html']
 
     def parse(self, response):
-        pass
+        """Request every link matched under the element with id "cattree".
+
+        Each resulting response is handled by ``parse_attr``.
+        """
+        # Select the href attribute rather than the <a> element itself:
+        # extract() on the element yields its HTML markup, not a URL.
+        hrefs = response.xpath(
+            '//*[@id="cattree"]/div[2]/div[1]/dl/dd[1]/a/@href').extract()
+        for href in hrefs:
+            # urljoin() resolves a relative href against the page URL and
+            # leaves an absolute one unchanged.
+            yield scrapy.Request(response.urljoin(href),
+                                 callback=self.parse_attr)
+
+    def parse_attr(self, response):
+        """Yield a ProgrammeItem whose 'link' field is the response URL."""
+        item = ProgrammeItem()
+        item['link'] = response.url
+        yield item
diff --git a/spiders/programme.pyc b/spiders/programme.pyc
new file mode 100644
index 0000000..c3f779c
Binary files /dev/null and b/spiders/programme.pyc differ