Skip to content
This repository has been archived by the owner on Feb 8, 2018. It is now read-only.

added feed spider for noRecipes #190

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions scrapy_proj/openrecipes/spiders/norecipes_feedspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.norecipes_spider import NorecipesMixin


class NorecipesSpider(BaseSpider, NorecipesMixin):
    """
    This parses the RSS feed for norecipes.com, grabs the original
    links to each entry, and scrapes just those pages. This should be used
    to keep up to date after we have backfilled the existing recipes by
    crawling the whole site
    """
    # Spider identifier used by `scrapy crawl norecipes.feed`.
    name = "norecipes.feed"
    # The feed is served through FeedBurner, so requests may land on the
    # feedburner/feedproxy domains as well as the site itself.
    allowed_domains = [
        "norecipes.com",
        "feeds.feedburner.com",
        "feedproxy.google.com"
    ]
    start_urls = [
        "http://feeds.feedburner.com/NoRecipes",
    ]

    def parse(self, response):
        """
        We define a custom parser here because we need to get the link from
        the feed item and then follow it to get the recipe data.

        Getting the data from <content:encoded> seems overly complex, as we
        would have to decode all the encoded characters and then build a DOM
        from that.

        Returns a list of Requests, one per feed item, whose responses are
        handled by self.parse_item.
        """
        xxs = XmlXPathSelector(response)
        # FeedBurner rewrites item links; the <feedburner:origLink> element
        # carries the original article URL. local-name() matches it without
        # needing to register the feedburner XML namespace.
        links = xxs.select("//item/*[local-name()='origLink']/text()").extract()

        # self.parse_item comes from NorecipesMixin
        return [Request(x, callback=self.parse_item) for x in links]