How it works
- All machines connect to the same Redis instance to fetch and store URLs (a connectivity sketch follows this list).
- Scrapy and Redis are connected through scrapy-redis, installed with `pip install scrapy-redis`; the official documentation is at http://scrapy-redis.readthedocs.io/en/stable/readme.html
- The crawled content is saved to a MongoDB database on one machine.
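Every crawling machine therefore needs to reach the same Redis server. A minimal connectivity check with the redis-py package, using the placeholder host, port and password that are configured in settings.py below:

```python
import redis

# The one Redis instance shared by every machine in the cluster
# (host, port and password are the placeholder values from settings.py)
r = redis.Redis(host='127.0.0.0', port=6379, password='your_password')

# True if this machine can reach Redis and the password is accepted
print(r.ping())
```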
Prerequisites
- Familiarity with the Scrapy framework
- Familiarity with Redis and MongoDB

Implementation: crawling Jianshu (www.jianshu.com) as an example
I. Configuring settings.py
1. Enable scrapy-redis
```python
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_HOST = '127.0.0.0'                       # IP of the remote Redis server
REDIS_PORT = 6379                              # Redis port
REDIS_PARAMS = {'password': 'your_password'}   # Redis password
```
2. Enable the downloader middleware that sets the User-Agent
```python
DOWNLOADER_MIDDLEWARES = {
    'jianshu.middlewares.UserAgentmiddleware': 543,
}
```
3. Configure MongoDB for saving the data
```python
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    'jianshu.pipelines.MongoPipeline': 800,
}

# MongoDB
MONGO_URI = '127.0.0.1:27017'   # MongoDB host and port
MONGO_DATABASE = 'test'         # database to use
MONGO_USER = 'admin'            # username
MONGO_PASSWORD = 'admin'        # password
```
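Before starting a crawl it is worth confirming that these credentials actually work. A minimal check with pymongo, assuming the placeholder values above and that the account authenticates against the admin database (the same connection-string format the pipeline in section IV builds):

```python
import pymongo

# Same URI format as the pipeline: mongodb://<user>:<password>@<host:port>/admin
client = pymongo.MongoClient('mongodb://admin:admin@127.0.0.1:27017/admin')

# Raises an exception if the server is unreachable or authentication fails
print(client.server_info()['version'])
```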
II. Create middlewares.py to rotate the User-Agent
```python
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# Pool of User-Agent strings to rotate through
USER_AGENTS = [
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
    'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
    'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]


class UserAgentmiddleware(UserAgentMiddleware):

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        request.headers["User-Agent"] = random.choice(USER_AGENTS)
```
III. items.py; in this example only the URL and the page source are stored
```python
import scrapy


class JianshuItem(scrapy.Item):
    # Only the page URL and its raw HTML are kept
    html = scrapy.Field()
    url = scrapy.Field()
```
IV. The pipeline that saves items to MongoDB (pipelines.py)
```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db, mongo_user, mongo_password):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongo_user = mongo_user
        self.mongo_password = mongo_password

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined in settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),
            mongo_user=crawler.settings.get('MONGO_USER'),
            mongo_password=crawler.settings.get('MONGO_PASSWORD'),
        )

    def open_spider(self, spider):
        # Authenticate against the admin database, then select the target database
        self.client = pymongo.MongoClient(
            'mongodb://%s:%s@%s/admin' % (self.mongo_user, self.mongo_password, self.mongo_uri))
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.db[self.collection_name].insert_one(dict(item))
        return item
```
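Once items start coming in, the stored documents can be inspected from any machine that can reach MongoDB. A small sketch using the database and collection names hard-coded above:

```python
import pymongo

client = pymongo.MongoClient('mongodb://admin:admin@127.0.0.1:27017/admin')
db = client['test']  # MONGO_DATABASE from the settings

# How many pages have been saved so far, plus a spot check of one stored URL
print(db['scrapy_items'].count_documents({}))
print(db['scrapy_items'].find_one({}, {'url': 1, '_id': 0}))
```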
V. The spider, spider1.py; the start_urls list holds the entry URLs of the crawl
```python
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from jianshu.items import JianshuItem


class Spider1Spider(CrawlSpider):
    name = "spider1"
    allowed_domains = ["jianshu.com"]
    start_urls = (
        'http://www.jianshu.com/',
    )

    rules = (
        # Follow collection (/c/) and user (/u/) links; with no callback, follow defaults to True
        Rule(LinkExtractor(allow=('/c/', '/u/'))),
        # Parse article (/p/) pages with parse_item and keep following the links found on them
        Rule(LinkExtractor(allow=('/p/',)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = JianshuItem()
        item['url'] = response.url
        item['html'] = response.body
        return item
```
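The two Rule patterns can be sanity-checked without running a full crawl. A sketch with a hand-built response; the two anchor paths are invented for illustration:

```python
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# A fake Jianshu page with one collection link and one article link (made-up paths)
body = b'<a href="/c/abcd1234">collection</a> <a href="/p/0123456789ab">article</a>'
response = HtmlResponse(url='http://www.jianshu.com/', body=body, encoding='utf-8')

print(LinkExtractor(allow=('/c/', '/u/')).extract_links(response))  # links that are only followed
print(LinkExtractor(allow=('/p/',)).extract_links(response))        # links handed to parse_item
```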
VI. Launch
Once the configuration is done, start the crawl on one machine (for example with `scrapy crawl spider1`). If it runs without problems, copy the same project to the other machines and start it there as well; the more machines crawl, the faster the job finishes. Because scrapy-redis deduplicates requests in the shared Redis database, the same URL will not be crawled twice.
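While the crawl is running on several machines, progress can be watched from the shared Redis instance. A sketch assuming the default scrapy-redis key names for a spider called spider1 (a sorted set of pending requests and a set of request fingerprints); host, port and password are the placeholder values from the settings:

```python
import redis

r = redis.Redis(host='127.0.0.0', port=6379, password='your_password')

pending = r.zcard('spider1:requests')   # requests still queued, shared by every machine
seen = r.scard('spider1:dupefilter')    # request fingerprints already scheduled or crawled
print('pending: %d, seen: %d' % (pending, seen))
```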