from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.norecipes_spider import NorecipesMixin


class NorecipesSpider(BaseSpider, NorecipesMixin):
    """
    Feed spider for norecipes.com.

    This parses the RSS feed for norecipes.com, grabs the original
    links to each entry, and scrapes just those pages. This should be used
    to keep up to date after we have backfilled the existing recipes by
    crawling the whole site.
    """
    # Distinct spider name so the site-crawl spider and this feed spider
    # can be run independently.
    name = "norecipes.feed"
    # The feed is served through FeedBurner, and entry links bounce
    # through feedproxy.google.com before landing on norecipes.com.
    allowed_domains = [
        "norecipes.com",
        "feeds.feedburner.com",
        "feedproxy.google.com",
    ]
    start_urls = [
        "http://feeds.feedburner.com/NoRecipes",
    ]

    def parse(self, response):
        """
        Parse the RSS feed and follow each entry's original link.

        We define a custom parser here because we need to get the link from
        the feed item and then follow it to get the recipe data.

        Getting the data from the feed's encoded content (the
        ``content:encoded`` element) seems overly complex, as we would have
        to decode all the encoded characters and then build a DOM from that.

        :param response: the Response for the feed URL
        :return: a list of Requests, one per feed entry, each handled by
            ``self.parse_item``
        """
        xxs = XmlXPathSelector(response)
        # FeedBurner stores the real article URL in feedburner:origLink;
        # local-name() lets us match it without declaring the namespace.
        links = xxs.select("//item/*[local-name()='origLink']/text()").extract()

        # self.parse_item comes from NorecipesMixin
        return [Request(x, callback=self.parse_item) for x in links]