python 3.6
scrapy 1.5.1
pymysql 0.9.2
IDE:pycharm
命令行运行:
scrapy startproject douban
如下图所示:
产生的目录和文件如下图所示
进到项目目录下,运行:
scrapy genspider douban_spider movie.douban.com
在应用目录spiders下产生一个名为douban_spider的文件
对items.py进行设置
# Fields of DoubanItem (items.py body).
# serial_number, movie_name, introduce, star, evaluate, describe must match
# the column names of the database table they are written into.

# ranking number on the Top250 chart
serial_number = scrapy.Field()
# movie title
movie_name = scrapy.Field()
# short introduction line
introduce = scrapy.Field()
# rating score
star = scrapy.Field()
# number of user reviews
evaluate = scrapy.Field()
# one-line quote describing the movie
describe = scrapy.Field()
命令行方式启动爬虫
scrapy crawl douban_spider
设置默认user-agent头
# Default User-Agent header Scrapy sends with every request (settings.py).
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
为方便测试,把启动爬虫的命令写到python文件main.py里
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convenience launcher (main.py): start the spider from the IDE."""
from scrapy import cmdline

# Same effect as typing `scrapy crawl douban_spider` on the command line.
cmdline.execute('scrapy crawl douban_spider'.split())
对爬取到的内容进行分析处理
# -*- coding: utf-8 -*-
import scrapy

from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    """Crawl the Douban movie Top250 chart, yielding one DoubanItem per movie."""

    # Spider name used by `scrapy crawl douban_spider`.
    name = 'douban_spider'
    # Off-site requests are filtered out.
    allowed_domains = ['movie.douban.com']
    # Entry point of the crawl.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Default callback: parse one chart page and follow the next page.

        Yields populated DoubanItem objects; when a "next" link exists,
        also yields a Request that re-enters this callback.
        """
        movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            douban_item = DoubanItem()
            # Ranking number shown inside the <em> tag of each list entry.
            douban_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = i_item.xpath(
                ".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
            # The intro is split across several text nodes. Strip all internal
            # whitespace from each node and keep every line.
            # BUGFIX: the original assigned inside the loop, so only the LAST
            # line survived; earlier lines were silently overwritten.
            content = i_item.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
            douban_item['introduce'] = ";".join(
                "".join(line.split()) for line in content)
            douban_item['star'] = i_item.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            # BUGFIX: the original predicate div['star'] is a constant-true
            # string literal (matches every div), not a class test.
            douban_item['evaluate'] = i_item.xpath(
                ".//div[@class='star']/span[4]/text()").extract_first()
            douban_item['describe'] = i_item.xpath(
                ".//p[@class='quote']/span/text()").extract_first()
            # Hand the populated item to the item pipelines.
            yield douban_item
        # Follow the "next page" link, if present, with this same callback.
        next_link = response.xpath("//span[@class='next']/link/@href").extract_first()
        if next_link:
            yield scrapy.Request(self.start_urls[0] + next_link, callback=self.parse)
存储为json
存储为csv
存储为mysql
mysql参数配置,在settings.py文件设置
# MySQL connection settings, read by DoubanPipeline in pipelines.py.
MYSQL_DATABASE = {
    'hostname': 'localhost',  # server address
    # NOTE: key name kept as 'hostpost' (sic) because pipelines.py reads
    # exactly this key; rename both together if you fix the typo.
    'hostpost': 3306,         # port
    'username': 'root',       # user name
    'password': 'admin123',   # password
    'database': 'douban',     # database name
    'charset': 'utf8',        # connection character set
}
数据入库操作
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql.cursors

from douban.settings import MYSQL_DATABASE


class DoubanPipeline(object):
    """Persist each DoubanItem into the `douban_movie` MySQL table."""

    def __init__(self):
        # One connection for the lifetime of the pipeline.
        # NOTE: the settings key really is spelled 'hostpost' (see settings.py).
        self.connect = pymysql.Connect(
            host=MYSQL_DATABASE['hostname'],
            port=MYSQL_DATABASE['hostpost'],
            user=MYSQL_DATABASE['username'],
            password=MYSQL_DATABASE['password'],
            db=MYSQL_DATABASE['database'],
            charset=MYSQL_DATABASE['charset'],
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert one item and return it so later pipelines still receive it."""
        data = dict(item)
        serial_number = '%d' % int(data['serial_number'])
        movie_name = data['movie_name']
        introduce = data['introduce']
        star = '%.1f' % float(data['star'])
        evaluate = data['evaluate']
        describe = data['describe']
        # Parameterized query: pymysql escapes the values (no SQL injection).
        insert_sql = ("insert into `douban_movie`(`serial_number`,`movie_name`,"
                      "`introduce`,`star`,`evaluate`,`describe`) "
                      "values(%s,%s,%s,%s,%s,%s)")
        self.cursor.execute(
            insert_sql,
            (serial_number, movie_name, introduce, star, evaluate, describe))
        # BUGFIX: pymysql does NOT autocommit by default, so without this
        # commit every inserted row was discarded when the process exited.
        self.connect.commit()
        return item

    def close_spider(self, spider):
        # Scrapy calls this hook when the spider finishes: release resources.
        self.cursor.close()
        self.connect.close()
开启ITEM_PIPELINES,在settings.py文件里找到如下选项,如不设置此项,则数据不会添加到数据库里
# Register the pipeline; the number (0-1000) orders pipelines, lower runs first.
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}
伪装的目的是为防止目标阻止爬取,伪装就是为了提高爬取成功率
两种伪装方式:代理和随机USER-AGENT
1.代理
具体代码如下:
# Proxy downloader middleware (middlewares.py).
class MyProxy(object):
    """Route every outgoing request through the Mogu proxy gateway."""

    def process_request(self, request, spider):
        # Free trial gateway from www.moguproxy.com
        # NOTE(review): Scrapy usually expects a scheme ('http://host:port')
        # in meta['proxy'] — confirm this bare value works with this version.
        request.meta['proxy'] = 'transfer.mogumiao.com:9001'
        # Auth option 1: pre-encoded appkey sent as a Basic credential.
        token = 'd0ZBT2d5RlRZcG94Q2haMDpqajdZMXJqdEhCbnU0ZVFF'
        request.headers['Authorization'] = "Basic " + token
        # Auth option 2: raw user:password, base64-encoded at runtime.
        # proxy_name_pass = b'wFAOgyFTYpoxChZ0:jj7Y1rjtHBnu4eQE'
        # encode_pass_name = base64.b64encode(proxy_name_pass)  # needs `import base64`
        # request.headers['Authorization'] = "Basic " + encode_pass_name.decode()
2.随机USER-AGENT
具体代码如下:
# Random User-Agent downloader middleware (middlewares.py).
class MyUserAgent(object):
    """Attach a randomly chosen User-Agent to every outgoing request."""

    # Hoisted to a class constant so the list is built once, not per request.
    USER_AGENT_LIST = [
        'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
        'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
        'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
        'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
        'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
        'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
        'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
        'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
    ]

    def process_request(self, request, spider):
        # `random` must be imported at the top of middlewares.py.
        agent = random.choice(self.USER_AGENT_LIST)
        # BUGFIX: the HTTP header name is 'User-Agent' (hyphen). The original
        # key 'User_Agent' set an unrecognized header, so the real (default)
        # User-Agent was still sent and the randomization never took effect.
        request.headers['User-Agent'] = agent
设置配置,开启代理和user-agent,在settings.py里设置
# Enable both custom downloader middlewares; the lower number runs first.
DOWNLOADER_MIDDLEWARES = {
    # 'douban.middlewares.DoubanDownloaderMiddleware': 543,
    'douban.middlewares.MyProxy': 543,
    'douban.middlewares.MyUserAgent': 544,
}
联系客服