What Exactly Can You Do With Scrapy? Here Are Scrapy’s 5 Main Applications

Scrapy in Action

Write sophisticated spiders

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MySpider(CrawlSpider):
    name = 'Wikipedia'
    allowed_domains = ['en.wikipedia.org', 'upload.wikimedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Lists_of_animals']

    # This rule follows anything with .jpg in it; clearing the
    # deny_extensions default is necessary because LinkExtractor
    # skips image extensions out of the box.
    rules = (
        Rule(LinkExtractor(allow=(r'\.jpg',), deny_extensions=set(),
                           tags=('img',), attrs=('src',),
                           canonicalize=True, unique=True),
             follow=False, callback='parse_item'),
    )

    image_count = 0  # counter used to number the saved files

    def parse_item(self, response):
        self.image_count += 1
        self.logger.info('Found image - %s', response.url)
        flname = 'image' + str(self.image_count) + '.jpg'
        with open(flname, 'wb') as image_file:
            image_file.write(response.body)
        self.logger.info('Saved image as - %s', flname)
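Spiders can also stay compact. The spider below scrapes quotes from quotes.toscrape.com and follows the pagination links until they run out: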
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.xpath('span/small/text()').get(),
            }

        next_page = response.css('li.next a::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
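Save the spider in a file (say, quotes_spider.py; the filename here is just an assumption) and Scrapy can run it without a full project:

$ scrapy runspider quotes_spider.py -o quotes.jsonl

When the run finishes, quotes.jsonl contains one JSON-encoded quote per line.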

Use selectors to extract content
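Selectors work over any HTML or XML text, inside a spider callback or on their own. Here is a minimal sketch, using a made-up HTML snippet, that mixes CSS and XPath on the same document:

from scrapy.selector import Selector

# The HTML below is invented purely for illustration.
html = '<html><body><span class="price">$9.99</span><a href="/next">More</a></body></html>'
sel = Selector(text=html)

print(sel.css('span.price::text').get())   # '$9.99'
print(sel.xpath('//a/@href').get())        # '/next'
print(sel.css('a::attr(href)').getall())   # ['/next']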

Do interactive testing in the Scrapy shell

$ scrapy shell 'http://example.com'
>>> response.xpath('//title/text()')
[<Selector xpath='//title/text()' data='Example Domain'>]
>>> response.css('h1::text').get()
'Example Domain'
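The shell keeps the last response in response; the built-in fetch() helper swaps in a new page without leaving the session, which makes it easy to iterate on selectors across pages:

>>> fetch('http://quotes.toscrape.com')
>>> response.css('title::text').get()
'Quotes to Scrape'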

Export data in many ways and store it in different systems
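The built-in feed exports cover the common cases: pass -o on the command line and the file extension picks the serialization format, or declare feeds once in settings.py through the FEEDS setting (available in recent Scrapy releases). The spider name and output paths below are assumptions for the sketch:

$ scrapy crawl quotes -o quotes.jsonl

# settings.py
FEEDS = {
    'exports/quotes.json': {'format': 'json', 'overwrite': True},
    'exports/quotes.csv': {'format': 'csv'},
}

For anything beyond flat files, such as writing items into a database or a message queue, an item pipeline is the usual extension point.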

Use the Signals API to get notified when certain events occur

from scrapy import signals
from scrapy import Spider


class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Let the base class build the spider, then subscribe its
        # spider_closed method to the matching signal.
        spider = super(DmozSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # Runs automatically once the crawl has finished.
        spider.logger.info('Spider closed: %s', spider.name)

    def parse(self, response):
        pass
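spider_closed is just one of the built-in signals: spider_opened, item_scraped, item_dropped, engine_started, and engine_stopped can all be connected the same way, which makes signals a convenient hook for logging, metrics, or cleanup without touching the crawl logic itself.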