From ff2d91496abdafe7c9c749c9f7aac7ea11ab1698 Mon Sep 17 00:00:00 2001 From: Simon Hardy Date: Tue, 30 Jan 2018 15:55:23 +0100 Subject: [PATCH] l.load_item - draft --- spiders/cordis_spider.py | 21 --------------------- spiders/cordis_spider.pyc | Bin 2525 -> 2525 bytes spiders/draft_loader.txt | 21 +++++++++++++++++++++ 3 files changed, 21 insertions(+), 21 deletions(-) create mode 100644 spiders/draft_loader.txt diff --git a/spiders/cordis_spider.py b/spiders/cordis_spider.py index ab9d73e..2b91423 100644 --- a/spiders/cordis_spider.py +++ b/spiders/cordis_spider.py @@ -31,24 +31,3 @@ def parse(self, response): #for eu in response.css('div.objective'): item['Technology_Description'] = response.css('p::text').extract_first() yield item -""" - def parse(self, response): - l = ItemLoader(item=CordisItem(), response=response) - l.add_xpath('Project_ACR', '//*[@id="dynamiccontent"]/div[1]/h1/text()') - l.add_xpath('Project_Title', '//*[@id="dynamiccontent"]/h2/text()') - l.add_xpath('Total_Cost', '//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[1]/text()') - l.add_xpath('EU_Contribution', '//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[2]/text()') - l.add_xpath('Coordinated_in', '//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[3]/text()') - l.add_xpath('Topic_s', '//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[1]/a/text()') - l.add_xpath('Call_for_Proposal', '//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[2]/text()') - l.add_xpath('Funding_scheme', '//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[3]/text()') - #l.add_xpath('Project_ID', '//*[@id="dynamiccontent"]/div[1]/text()'.re('[.0-9]+')') map(unicode.strip, response.xpath('.//*[@id="dynamiccontent"]/div[1]/text()').re('[.0-9]+')) - l.add_xpath('To', 'normalize-space(.//*[@id="dynamiccontent"]/div[2]/text()[3])') - l.add_xpath('From', 'normalize-space(.//*[@id="dynamiccontent"]/div[2]/text()[2])') - #l.add_xpath('Partners', 'response.css('.name').xpath('text()')') - #l.add_xpath('Country', 'response.css('.country').xpath('text()')') - #l.add_xpath('Activity', 'response.css('.contact').xpath('text()')') - l.add_css('Technology_Description', 'p::text') # error extract all p - - return l.load_item() -""" diff --git a/spiders/cordis_spider.pyc b/spiders/cordis_spider.pyc index 90cc2642865e368bb8a119aefc6c70759ba1eeef..fdd931c2bbc1130fd6cbf8f7da75a4c0fa88d962 100644 GIT binary patch delta 46 zcmcaBd{>yA`7?|CN+>?1YzA&;)X5>6A$i%?FP|N`&co+p31(+la03GKE AKL7v# delta 46 zcmcaBd{>yA`7^@Gjg65WMp7qDCPhXJd6U20!)$y00+ki Ar2qf` diff --git a/spiders/draft_loader.txt b/spiders/draft_loader.txt new file mode 100644 index 0000000..2417ff8 --- /dev/null +++ b/spiders/draft_loader.txt @@ -0,0 +1,21 @@ +""" + def parse(self, response): + l = ItemLoader(item=CordisItem(), response=response) + l.add_xpath('Project_ACR', '//*[@id="dynamiccontent"]/div[1]/h1/text()') + l.add_xpath('Project_Title', '//*[@id="dynamiccontent"]/h2/text()') + l.add_xpath('Total_Cost', '//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[1]/text()') + l.add_xpath('EU_Contribution', '//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[2]/text()') + l.add_xpath('Coordinated_in', '//*[@id="dynamiccontent"]/div[3]/div/div[1]/div[3]/text()') + l.add_xpath('Topic_s', '//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[1]/a/text()') + l.add_xpath('Call_for_Proposal', '//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[2]/text()') + l.add_xpath('Funding_scheme', '//*[@id="dynamiccontent"]/div[3]/div/div[2]/div[3]/text()') + #l.add_xpath('Project_ID', '//*[@id="dynamiccontent"]/div[1]/text()'.re('[.0-9]+')') map(unicode.strip, response.xpath('.//*[@id="dynamiccontent"]/div[1]/text()').re('[.0-9]+')) + l.add_xpath('To', 'normalize-space(.//*[@id="dynamiccontent"]/div[2]/text()[3])') + l.add_xpath('From', 'normalize-space(.//*[@id="dynamiccontent"]/div[2]/text()[2])') + #l.add_xpath('Partners', 'response.css('.name').xpath('text()')') + #l.add_xpath('Country', 'response.css('.country').xpath('text()')') + #l.add_xpath('Activity', 'response.css('.contact').xpath('text()')') + l.add_css('Technology_Description', 'p::text') # error extract all p + + return l.load_item() +"""