Scraping Multiple Pages with Scrapy

pip install scrapy
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from bs4 import BeautifulSoup
import urllib
class SimpleNextPage(CrawlSpider):
    """Fetch the Copyblogger blog index and request the page behind its "next" link."""

    # NOTE(review): no crawl rules are defined, so a plain scrapy.Spider would
    # probably suffice; the base class is kept to leave the interface unchanged.
    name = 'SimpleNextPage'
    allowed_domains = ['copyblogger.com']
    start_urls = [
        'https://copyblogger.com/blog/',
    ]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
    }

    def parse(self, response):
        """Yield a request for the next listing page, if the page links to one.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``scrapy.Request`` for the next page, handled by
            ``parse_next_page``. Nothing is yielded when the page has no
            ``.pagination-next`` link.
        """
        # extract_first() returns None when nothing matches, so a page without a
        # "next" link simply ends the crawl instead of raising IndexError.
        next_href = response.css('.pagination-next a::attr(href)').extract_first()
        if next_href is None:
            return
        # urljoin keeps the request valid even if the href is relative.
        yield scrapy.Request(
            response.urljoin(next_href),
            callback=self.parse_next_page,
        )

    def parse_next_page(self, response):
        """Report the URL of the page reached through the "next" link.

        Defined here so that the callback named in ``parse`` actually exists.
        """
        print('Fetched next page ' + response.url)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from bs4 import BeautifulSoup
import urllib
class SimpleNextPage(CrawlSpider):
    """Fetch the Copyblogger blog index, then fetch the single page after it."""

    # NOTE(review): no crawl rules are defined, so a plain scrapy.Spider would
    # probably suffice; the base class is kept to leave the interface unchanged.
    name = 'SimpleNextPage'
    allowed_domains = ['copyblogger.com']
    start_urls = [
        'https://copyblogger.com/blog/',
    ]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
    }

    def parse(self, response):
        """Print the current URL and request the next listing page, if any.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``scrapy.Request`` for the next page, handled by
            ``parse_next_page``. Nothing is yielded when the page has no
            ``.pagination-next`` link.
        """
        print('Current page ' + response.url)
        # extract_first() returns None when nothing matches, so a page without a
        # "next" link simply ends the crawl instead of raising IndexError.
        next_href = response.css('.pagination-next a::attr(href)').extract_first()
        if next_href is None:
            return
        # urljoin keeps the request valid even if the href is relative.
        yield scrapy.Request(
            response.urljoin(next_href),
            callback=self.parse_next_page,
        )

    def parse_next_page(self, response):
        """Print the URL of the page reached through the "next" link."""
        print('Fetched next page ' + response.url)
scrapy runspider SimpleNextPage.py -s USER_AGENT="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36" -s ROBOTSTXT_OBEY=False
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from bs4 import BeautifulSoup
import urllib
class SimpleNextPage(CrawlSpider):
    """Walk every Copyblogger blog listing page by following "next" links."""

    # NOTE(review): no crawl rules are defined, so a plain scrapy.Spider would
    # probably suffice; the base class is kept to leave the interface unchanged.
    name = 'SimpleNextPage'
    allowed_domains = ['copyblogger.com']
    start_urls = [
        'https://copyblogger.com/blog/',
    ]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
    }

    def parse(self, response):
        """Print the current URL and re-enter ``parse`` for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``scrapy.Request`` for the next page whose callback is this same
            method, so the spider keeps paginating. Nothing is yielded when
            the page has no ``.pagination-next`` link, which ends the crawl.
        """
        print('Current page ' + response.url)
        # extract_first() returns None when nothing matches; without this guard
        # the final listing page would raise IndexError on every run.
        next_href = response.css('.pagination-next a::attr(href)').extract_first()
        if next_href is None:
            return
        # urljoin keeps the request valid even if the href is relative.
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_next_page(self, response):
        """Print the URL of a fetched page.

        Not used as a callback by ``parse`` in this class; kept so the class
        exposes the same methods as before.
        """
        print('Fetched next page ' + response.url)
scrapy runspider SimpleNextPage.py \
-s USER_AGENT="Mozilla/5.0 (Windows NT 6.1; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36" \
-s ROBOTSTXT_OBEY=False
  • With millions of high-speed rotating proxies located all over the world,
  • With our automatic IP rotation,
  • With our automatic User-Agent string rotation (which simulates requests from different, valid web browsers and web browser versions),
  • With our automatic CAPTCHA-solving technology.
curl "http://api.proxiesapi.com/?key=API_KEY&url=https://example.com"

--

--

--

Founder @ ProxiesAPI.com

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

How To Write Generic Helper Functions With Go 1.18 — Part 2

Gopher read a book

REPL Development in Go with Gore

Introduction to GOLANG and its installation guide.

How To Launch Your HTML5 Game In The ARK Desktop Wallet

Android Clean Architecture — Kotlin -Boilerplate

Simple and Custom List View Step By Step Breakdown

Wire for web, 2020–04–29

To NFT or Not to NFT? That is the Question.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Mohan Ganesan

Mohan Ganesan

Founder @ ProxiesAPI.com

More from Medium

Why Is a Full Automation Workflow Necessary for Effective RPA?

Appending data to Google Sheets using Python and BigQuery

Install PyLucene on macOS

Creating Weather Chat-bot