From b2e4e409a37610ee60c02c777c1a1f0affcb6beb Mon Sep 17 00:00:00 2001 From: Simon Hardy Date: Fri, 2 Feb 2018 14:37:58 +0100 Subject: [PATCH] Scrapy only whne keyword found --- keyword.xml | 4 ++++ spiders/back-up.txt | 7 ++++++- spiders/cordis_spider.py | 15 +++++++++------ spiders/cordis_spider.pyc | Bin 2545 -> 2652 bytes spiders/keyword-draft.txt | 3 +++ 5 files changed, 22 insertions(+), 7 deletions(-) create mode 100644 keyword.xml create mode 100644 spiders/keyword-draft.txt diff --git a/keyword.xml b/keyword.xml new file mode 100644 index 0000000..9cdf568 --- /dev/null +++ b/keyword.xml @@ -0,0 +1,4 @@ + + +STIFTELSEN SINTEFIWW RHEINISCH WESTFALISCHES INSTITUT FUR WASSERFORSCHUNG GEMEINNUTZIGE GMBHCETAQUA, CENTRO TECNOLOGICO DEL AGUA, FUNDACION PRIVADAKWR WATER B.V.FUNDACIO EURECATTECHNION - ISRAEL INSTITUTE OF TECHNOLOGYATOS SPAIN SAMEKOROT WATER COMPANY LIMITEDAIGUES DE BARCELONA, EMPRESA METROPOLITANA DE GESTIO DEL CICLE INTEGRAL DE L'AIGUA SAHESSENWASSER GMBH & CO. KGOSLO KOMMUNEINSTITUTE OF COMMUNICATION AND COMPUTER SYSTEMSBERGEN KOMMUNEBERLINER WASSERBETRIEBEEUROPEAN WATER SUPPLY AND SANITATION TECHNOLOGY PLATFORMPNO INNOVATIONBEIT TOCHNA APLICATZIA LTDEMPRESA MUNICIPAL DE ABASTECIMIENTO Y SANEAMIENTO DE GRANADA SAWORLDSENSING SLRISA SICHERHEITSANALYSEN GMBHMNEMONIC ASVLAAMSE MAATSCHAPPIJ VOORWATERVOORZIENING CVBAEUR 8 255 319,50Strategic, Tactical, Operational Protection of water Infrastructure against cyber-physical ThreatsEUR 9 616 525,18NorwayGermanySpainNetherlandsSpainIsraelSpainIsraelSpainGermanyNorwayGreeceNorwayGermanyBelgiumBelgiumIsraelSpainSpainGermanyNorwayBelgiumResearch OrganisationsResearch OrganisationsResearch OrganisationsPrivate for-profit entities (excluding Higher or Secondary Education Establishments)Research OrganisationsHigher or Secondary Education EstablishmentsPrivate for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Public bodies (excluding Research Organisations and Secondary or Higher Education Establishments)Research OrganisationsPublic bodies (excluding Research Organisations and Secondary or Higher Education Establishments)Public bodies (excluding Research Organisations and Secondary or Higher Education Establishments)OtherPrivate for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Private for-profit entities (excluding Higher or Secondary Education Establishments)Water critical infrastructures (CIs) are essential for human society, life and health and they can be endangered by physical/cyber threats with severe societal consequences. To address this, STOP-IT assembles a team of major Water Utilities, industrial technology developers, high tech SMEs and top EU R&D providers. It organizes communities of practice for water systems protection to identify current and future risk landscapes and to co-develop an all-hazards risk management framework for the physical and cyber protection of water CIs. Prevention, Detection, Response and Mitigation of relevant risks at strategic, tactical and operational levels of planning will be taken into account to generate modular solutions (technologies, tools and guidelines) and an integrated software platform. STOP-IT solutions are based on: a) mature technologies improved via their combination and embedment (incl. public warning systems, smart locks) and b) novel technologies whose TRL will be increased (incl. cyber threat incident services, secure wireless sensor communications modules, context-aware anomaly detection technologies; fault-tolerant control strategies for SCADA integrated sensors, high-volume real-time sensor data protection via blockchain schemes; authorization engines; irregular human detection using new computer vision methods and WiFi and efficient water contamination detection algorithms). STOP-IT solutions are demonstrated through a front-runner/follower approach where 4 advanced utilities, Aigües de Barcelona (ES), Berliner Wasserbetriebe (DE), MEKOROT (IL) and Oslo VAV (NO) are twinned with 4 less advanced, but ambitious ones, to stimulate mutual learning, transfer and uptake. Building on this solid basis STOP-IT delivers high impact through the creation of hands-on training, best practice guidelines, support for certification and standardization as well as by fostering market opportunities, also leveraging the EU water technology platform's multi-stakeholder network.2021-05-31, ongoing project2017-06-01<meta name="WT.cg_s" content="H2020-EU.3.7.4., H2020-EU.3.7.2.">CIP-01-2016-2017 - Prevention, detection, response and mitigation of the combination of physical and cyber threats to the critical infrastructure of Europe.IA - Innovation actionNorwaySTOP-ITCIP-2016-2017-1740610 + \ No newline at end of file diff --git a/spiders/back-up.txt b/spiders/back-up.txt index c3ce376..7a4bde7 100644 --- a/spiders/back-up.txt +++ b/spiders/back-up.txt @@ -2,13 +2,18 @@ import scrapy from scrapy.loader import ItemLoader from CORDIS.items import CordisItem +from scrapy.spider import BaseSpider class CordisSpider(scrapy.Spider): name = 'cordis' + # f = open("urls.txt") + # start_urls = [url.strip() for url in f.readlines()] + # f.close() allowed_domains = ['cordis.europa.eu'] start_urls = ['http://cordis.europa.eu/project/rcn/%d_en.html' %(n) for n in range(210216, 210217)] - # Max EU CORDIS 213445 + # def parse_keywordpage(self, response): + # if water in response.xpath('//*[@id="ica:content"]'): def parse(self, response): # Misconfiguration to check - eu in response.xpath not needed #for eu in response.xpath('//*[@id="container-pack"]'): diff --git a/spiders/cordis_spider.py b/spiders/cordis_spider.py index 92235d9..20832b8 100644 --- a/spiders/cordis_spider.py +++ b/spiders/cordis_spider.py @@ -6,15 +6,18 @@ class CordisSpider(scrapy.Spider): name = 'cordis' - f = open("urls.txt") - start_urls = [url.strip() for url in f.readlines()] - f.close() - # allowed_domains = ['cordis.europa.eu'] - # start_urls = ['http://cordis.europa.eu/project/rcn/%d_en.html' %(n) for n in range(210216, 210217)] + # f = open("urls.txt") + # start_urls = [url.strip() for url in f.readlines()] + # f.close() + allowed_domains = ['cordis.europa.eu'] + start_urls = ['http://cordis.europa.eu/project/rcn/%d_en.html' %(n) for n in range(210216, 210217)] + # def parse_keywordpage(self, response): + # if water in response.xpath('//*[@id="ica:content"]'): def parse(self, response): # Misconfiguration to check - eu in response.xpath not needed #for eu in response.xpath('//*[@id="container-pack"]'): + if response.xpath('//*[@id="ica:content"][contains(.,"water")]'): item = CordisItem() item['Meta'] = response.xpath('/html/head/meta[23]').extract() item['Project_ACR'] = response.xpath('//*[@id="dynamiccontent"]/div[1]/h1/text()').extract() @@ -34,4 +37,4 @@ def parse(self, response): #for eu in response.css('div.objective'): item['Technology_Description'] = response.css('p::text').extract_first() - yield item + yield item diff --git a/spiders/cordis_spider.pyc b/spiders/cordis_spider.pyc index 2b14a42b08834e9a8bfb0b192493ad846ff5fb72..f32e56ba0502f6d646600559f4f8be0f5a0149c5 100644 GIT binary patch delta 878 zcmZ{i&2G~`6ot?DFHTBGN=x}G&>B)XfMN-WO%)`B5K9)V#3EIQERU5qN^N6LRThOT z$^(G>D7*<$Ss_-u0OAF>V=IF!#A1%G?sx8;JD&Nq@qN?Mf45s7qR-aXr2*hMt^6fP z2#gpwsRqt4Wf_1Suzg;;blSW_4p)=2QzWJ2sA3u zY02>eTKNwKoH5KH2R~=XE|{PIc(1MUn#|8h$yuIec*d4xg;6Iah?KypgN6cj9W)ha z)xnwqm+GKhfhEm*xlTH&q+18;3S6m!4Ge`2VPI~q&bnDT=vuDIfUZrRcuO_7wrW88 zOLAS28!J+Qn-yq)-zou(yIptOR*pOWaokmooqrtk*5v-X%Hot3`n?snhhbbsuNhen z-TQRcukweap7-GBS;U_@QQ+?fNi4!xIO8K3`B9v^gGbJ(FT%{}kG~A{Z+b_VRP`?P z#q5CI3X0vvMSGose2)ZrhZW4;F}9g&7d^ZGxI;;Qu}DrsKH{)im{CFDD(*k1A dBF%-LiOET}$cs;Qx@J?#VwTZldqunX^AF4Zs=ojL delta 793 zcmZ{iF-{vn7=_>LF5~qY$AAgo5E2qVa++8QEfu1sFcQTgf-LU_V;Sst2FV493+DpS zwp3gp2jBvTB2v1Pxj;mX#G3^MZ$YtdXXbnJ^v^%@clLi-x;MpOI{Ybr4_yG8{8XJ1 z1ED}fbr8FV1V#d(heIL{?Fo#4t^FB<60t-(XXc+Bd|gB6r;fZ?5s`x*%m`55shWH( zDa~#_I#SB#@U-7iAN4>t9Si~E@Qi_#k6EmIoCJ1DAqyrgC}zP63rblqWx;e7%wR*a|2pmL!;#9VMIxFt zgAsec6Th?xbJ>J>3l_4Vau0HdUZsFtT+BLN-#do1EIng+V?*9P<5;$icd5gFyrcI7 z4t52f(oS`S7rU*>)LKg)cWSM;5!H@k-PrrI{_EPkbW3&KI`jl(&}-;ca-REBC~>A% zDR7hT{<2pQ)k1RNf7mo{rQ3^7wAu7U=c#=rc8*SBrCrYWyy_MHbwE+N(bos&#ijYp XPFKart=}seD1{W0Voe0eAph?UAft|f diff --git a/spiders/keyword-draft.txt b/spiders/keyword-draft.txt new file mode 100644 index 0000000..d602a46 --- /dev/null +++ b/spiders/keyword-draft.txt @@ -0,0 +1,3 @@ +def parse_keywordpage(self, response): + if keyword in response.body: + #do something