Skip to content

Commit

Permalink
Grabbing urls from file.txt
Browse files Browse the repository at this point in the history
  • Loading branch information
Simon Hardy committed Feb 2, 2018
1 parent b44134b commit 7313a60
Show file tree
Hide file tree
Showing 10 changed files with 315,919 additions and 10 deletions.
5,000 changes: 5,000 additions & 0 deletions spiders/H2020ES.csv

Large diffs are not rendered by default.

5,002 changes: 5,002 additions & 0 deletions spiders/H2020_Excellent_Science.json

Large diffs are not rendered by default.

300,876 changes: 300,876 additions & 0 deletions spiders/H2020_Excellent_Science.xml

Large diffs are not rendered by default.

34 changes: 34 additions & 0 deletions spiders/back-up.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader
from CORDIS.items import CordisItem

class CordisSpider(scrapy.Spider):
    """Scrape EU CORDIS project pages into CordisItem records.

    Each start URL is a single project page of the form
    http://cordis.europa.eu/project/rcn/<rcn>_en.html; ``parse`` extracts
    the project metadata fields from the page and yields one item per page.
    """
    name = 'cordis'
    allowed_domains = ['cordis.europa.eu']
    # NOTE: the range currently covers a single rcn (210216).  The maximum
    # rcn on CORDIS at the time of writing was 213445.
    start_urls = ['http://cordis.europa.eu/project/rcn/%d_en.html' % (n) for n in range(210216, 210217)]

    def parse(self, response):
        """Extract one CordisItem from a single project page.

        :param response: the Scrapy HTTP response for a project page.
        :yields: a populated ``CordisItem``.
        """
        item = CordisItem()
        item['Meta'] = response.xpath('/html/head/meta[23]').extract()
        item['Project_ACR'] = response.xpath('//*[@id="dynamiccontent"]/div[1]/h1/text()').extract()
        item['Project_Title'] = response.xpath('//*[@id="dynamiccontent"]/h2/text()').extract()
        item['Total_Cost'] = response.xpath('//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[1]/text()').extract()
        item['EU_Contribution'] = response.xpath('//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[2]/text()').extract()
        item['Coordinated_in'] = response.xpath('//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[3]/text()').extract()
        item['Topic_s'] = response.xpath('//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[1]/a/text()').extract()
        item['Call_for_Proposal'] = response.xpath('//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[2]/text()').extract()
        item['Funding_scheme'] = response.xpath('//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[3]/text()').extract()
        # FIX: the original used map(unicode.strip, ...), which raises
        # NameError on Python 3 (no `unicode` builtin) and would yield a lazy
        # map object rather than a list.  A list comprehension with .strip()
        # works on both Python 2 and 3 and always produces a list.
        item['Project_ID'] = [s.strip() for s in response.xpath('.//*[@id="dynamiccontent"]/div[1]/text()').re('[.0-9]+')]
        item['To'] = response.xpath('normalize-space(.//*[@id="dynamiccontent"]/div[2]/text()[3])').extract()
        item['From'] = response.xpath('normalize-space(.//*[@id="dynamiccontent"]/div[2]/text()[2])').extract()
        item['Partners'] = response.css('.name').xpath('text()').extract()
        item['Country'] = response.css('.country').xpath('text()').extract()
        item['Activity'] = response.css('.contact').xpath('text()').extract()
        # Only the first matching paragraph is kept as the description.
        item['Technology_Description'] = response.css('p::text').extract_first()
        yield item
9 changes: 6 additions & 3 deletions spiders/cordis_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
import scrapy
from scrapy.loader import ItemLoader
from CORDIS.items import CordisItem
from scrapy.spider import BaseSpider

class CordisSpider(scrapy.Spider):
name = 'cordis'
allowed_domains = ['cordis.europa.eu']
start_urls = ['http://cordis.europa.eu/project/rcn/%d_en.html' %(n) for n in range(210216, 210217)]
# Max EU CORDIS 213445
f = open("urls.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
# allowed_domains = ['cordis.europa.eu']
# start_urls = ['http://cordis.europa.eu/project/rcn/%d_en.html' %(n) for n in range(210216, 210217)]

def parse(self, response):
# Misconfiguration to check - eu in response.xpath not needed
Expand Down
Binary file modified spiders/cordis_spider.pyc
Binary file not shown.
4 changes: 0 additions & 4 deletions spiders/test.xml

This file was deleted.

Loading

0 comments on commit 7313a60

Please sign in to comment.