Scrapy framework + Selenium: crawling the Douban movie Top 250......
Published: 2019-06-09


No more talk, straight to the code.....

Project directory structure
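
The original post showed the layout as a screenshot; the listing below is reconstructed from the file names used later in this post, so the outer folder name and the __init__.py files are assumptions:

doubanCrawler/
    scrapy.cfg
    doubanCrawler/
        __init__.py
        items.py
        pipelines.py
        settings.py
        middlewares/
            __init__.py
            proxy.py
            useragent.py
        spiders/
            __init__.py
            movie_spider.py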

items.py

import scrapy


class DoubanCrawlerItem(scrapy.Item):
    # movie title
    movieName = scrapy.Field()
    # movie id
    movieId = scrapy.Field()
    # poster URL
    img = scrapy.Field()
    # detail page URL (stored as item['website'] by the spider and pipeline)
    website = scrapy.Field()
    # rating
    data_score = scrapy.Field()
    # running time
    data_duration = scrapy.Field()
    # release date
    data_release = scrapy.Field()
    # director
    data_director = scrapy.Field()
    # lead actors
    data_actors = scrapy.Field()
    # country/region of production
    data_region = scrapy.Field()
    # screenwriter
    data_attrs = scrapy.Field()
    # number of raters
    data_number = scrapy.Field()
    # synopsis
    introduction = scrapy.Field()
    # genre
    movie_type = scrapy.Field()
    # language
    movie_language = scrapy.Field()
    # also known as
    also_called = scrapy.Field()
    # ranking
    movie_ranking = scrapy.Field()
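
For reference, a scrapy.Item behaves like a dict keyed by these fields. A minimal usage sketch (the values are placeholders, not real crawl data):

from doubanCrawler.items import DoubanCrawlerItem

item = DoubanCrawlerItem()
item['movieName'] = 'Example Movie'   # placeholder title
item['movieId'] = '12345'             # placeholder id
print(dict(item))                     # scrapy items convert cleanly to plain dicts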

# This is the key part

spiders

movie_spider.py

import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from scrapy import log

from ..items import DoubanCrawlerItem

# cookies and headers attached to every request (fill in your own values)
cookies = {}
headers = {}


class DoubanspiderSpider(CrawlSpider):
    # spider name
    name = 'douban'
    # crawl rule: follow movie detail page URLs found in the list pages
    rules = (
        Rule(LinkExtractor(allow=r'^https://movie.douban.com/subject/\d+/$'),
             callback='parse_item', follow=True),
    )

    # build the initial requests for the ten Top 250 list pages
    def start_requests(self):
        for i in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(i)
            request = scrapy.Request(url, cookies=cookies, headers=headers)
            yield request

    # parse a movie detail page and assemble the item
    def parse_item(self, response):
        info_url = response.url
        item = self.selenium_js(info_url)
        item['website'] = info_url
        # the movie id is the last non-empty segment of the URL
        url_list = [part for part in info_url.split('/') if part != '']
        item['movieId'] = url_list[-1]
        movie_introduction = response.xpath('//*[@id="link-report"]/span[1]/text()').extract()
        introduction = ''
        for i in movie_introduction:
            introduction += i.strip().replace('\n', '').replace('\t', '')
        item['introduction'] = introduction
        # extract_first() returns a single string instead of a list
        item['movie_ranking'] = response.xpath('//*[@id="content"]/div[1]/span[1]/text()').extract_first()
        item['movieName'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract_first()
        img_list = response.xpath('//*[@id="mainpic"]/a/img/@src').extract()
        for img in img_list:
            item['img'] = img.replace('.webp', '.jpg')
        log.msg(item)
        yield item

    # use selenium to grab the fields that are rendered dynamically
    def selenium_js(self, info_url):
        item = DoubanCrawlerItem()
        driver = webdriver.Chrome()
        driver.get(info_url)
        driver.maximize_window()
        driver.implicitly_wait(10)
        # the info block is plain text; split it into "label: value" lines
        data = driver.find_element_by_xpath('//div[@class="subject clearfix"]/div[2]').text
        data_list = data.split('\n')
        for d in data_list:
            if d != '':
                j = d.split(':', 1)
                if '导演' in j[0]:
                    item['data_director'] = j[1]
                elif '编剧' in j:
                    item['data_attrs'] = j[1]
                elif '主演' in j:
                    item['data_actors'] = j[1]
                elif '类型' in j:
                    item['movie_type'] = j[1]
                elif '制片国家/地区' in j:
                    item['data_region'] = j[1]
                elif '语言' in j:
                    item['movie_language'] = j[1]
                elif '上映日期' in j:
                    item['data_release'] = j[1]
                elif '片长' in j:
                    item['data_duration'] = j[1]
                elif '又名' in j:
                    item['also_called'] = j[1]
                else:
                    pass
        item['data_number'] = driver.find_element_by_xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span').text
        driver.close()
        return item
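
Note that selenium_js() launches a brand-new Chrome window for every detail page, which is slow. A minimal sketch of reusing a single headless driver for the whole spider (assuming a local ChromeDriver; this is a variation, not the original code):

from selenium import webdriver

class DoubanspiderSpider(CrawlSpider):
    # ... name, rules and start_requests stay the same as above ...

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')        # no visible browser window
        self.driver = webdriver.Chrome(options=options)
        self.driver.implicitly_wait(10)

    def closed(self, reason):
        # called once when the spider finishes; release the browser
        self.driver.quit()

    def selenium_js(self, info_url):
        item = DoubanCrawlerItem()
        self.driver.get(info_url)                 # reuse the same driver for every page
        # ... the same field extraction as in the original selenium_js ...
        return item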

pipelines.py

import pymysql
import time
from scrapy.exceptions import DropItem
from scrapy import log


class DoubancrawlerPipeline:
    def process_item(self, item, spider):
        # drop the item if the poster URL is missing
        if not item.get('img'):
            raise DropItem('missing field: {}'.format('img'))
        # drop the item if the movie title is missing
        if not item.get('movieName'):
            raise DropItem('missing field: {}'.format('movieName'))
        if not item.get('also_called'):
            item['also_called'] = ''
        return item


class MysqlPipeline(object):
    def __init__(self):
        """Initialize placeholders for the MySQL connection and cursor."""
        self.conn = None
        self.cur = None
        # self.movies = False
        # self.commentary = False

    def open_spider(self, spider):
        """Open the MySQL connection."""
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='xxxxx',
            db='douban',
            charset='utf8mb4',
        )
        # create the cursor object
        self.cur = self.conn.cursor()
        # optionally reset the table first
        # self.delete_data()

    def delete_data(self):
        """Empty the table (recursively) before saving newly crawled data."""
        sql = 'select `movieName` from `top250`'
        self.cur.execute(sql)
        if self.cur.fetchone():
            sql = 'delete from `top250`'
            self.cur.execute(sql)
            self.conn.commit()
            time.sleep(1)
            self.delete_data()
        else:
            log.msg('database reset complete!')

    def check_data(self, table):
        """Return the set of movie ids already stored in the given table."""
        sql = 'select `movieId` from {}'.format(table)
        self.cur.execute(sql)
        self.conn.commit()
        s = self.cur.fetchall()
        id_list = []
        for i in range(len(s)):
            for j in s[i]:
                id_list.append(j)
        return set(id_list)

    def process_item(self, item, spider):
        """Save the crawled data."""
        if spider.name == 'douban':
            id_list = self.check_data('top250')
            if int(item['movieId']) not in id_list:
                sql = 'insert into `top250`(`movieName`, `movieId`, `data_number`,`data_release`,`img`, ' \
                      '`introduction`, `data_duration`, `data_region`, `data_director`, `data_actors`, `data_attrs`, ' \
                      '`website`, `movie_ranking`, `movie_type`, `movie_language`, `also_called`) values ' \
                      '(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                self.cur.execute(sql, (item['movieName'], item['movieId'], item['data_number'],
                                       item['data_release'], item['img'], item['introduction'],
                                       item['data_duration'], item['data_region'], item['data_director'],
                                       item['data_actors'], item['data_attrs'], item['website'],
                                       item['movie_ranking'], item['movie_type'], item['movie_language'],
                                       item['also_called']))
                self.conn.commit()
                log.msg('{} saved successfully!'.format(item['movieName']))
        return item

    def close_spider(self, spider):
        """Close the MySQL connection and cursor."""
        self.cur.close()
        self.conn.close()
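
MysqlPipeline assumes a douban database with a top250 table already exists. A minimal one-off sketch for creating it with pymysql; the column names follow the INSERT above, while the column types and sizes are my assumptions (movieId is declared as an integer so the duplicate check int(item['movieId']) not in id_list compares like with like):

import pymysql

# one-off script: create the table that MysqlPipeline writes into
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='xxxxx', db='douban', charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS `top250` (
            `movieName`      VARCHAR(255),
            `movieId`        BIGINT,
            `data_number`    VARCHAR(64),
            `data_release`   VARCHAR(255),
            `img`            VARCHAR(512),
            `introduction`   TEXT,
            `data_duration`  VARCHAR(64),
            `data_region`    VARCHAR(255),
            `data_director`  VARCHAR(255),
            `data_actors`    TEXT,
            `data_attrs`     VARCHAR(255),
            `website`        VARCHAR(512),
            `movie_ranking`  VARCHAR(32),
            `movie_type`     VARCHAR(255),
            `movie_language` VARCHAR(255),
            `also_called`    TEXT
        ) CHARACTER SET utf8mb4
    """)
conn.commit()
conn.close()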
middlewares

proxy.py

import random
from urllib.request import _parse_proxy

import requests
from scrapy.exceptions import NotConfigured
from scrapy import log


def reform_url(url):
    # rebuild the url without the username/password part
    proxy_type, *_, hostport = _parse_proxy(url)
    return '{}://{}'.format(proxy_type, hostport)


class RandomProxyMiddleware:
    # a proxy that fails more than this many times is removed from the pool
    max_failed = 3

    def __init__(self, settings):
        # load the proxy pool
        # self.proxies = settings.getlist('PROXIES')
        self.proxies = self.choice_proxies()
        if self.proxies:
            # start every proxy with a failure count of 0
            self.stats = {}.fromkeys(map(reform_url, self.proxies), 0)

    def choice_proxies(self):
        self.proxies = []
        # api that returns 1 ip
        url = 'url of an api that returns proxy ips'
        # api that returns 30 ips
        # url = 'xxxx'
        r = requests.get(url)
        # eval() evaluates the response text as a Python expression
        ip_dict = eval(r.text)
        if ip_dict['code'] == '0':
            for i in ip_dict['msg']:
                # build a usable proxy url
                ip = 'http://' + i['ip'] + ':' + i['port']
                self.proxies.append(ip)
            log.msg(self.proxies)
            return self.proxies
        else:
            log.msg('proxy api returned an unexpected code... {}'.format(ip_dict['code']))
            return '-1'

    # only enable the middleware if proxying is enabled in the settings
    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # if the request has no proxy yet, pick a random one from the pool
        if 'proxy' not in request.meta:
            request.meta['proxy'] = random.choice(self.proxies)

    def process_response(self, request, response, spider):
        # proxy used for this request
        cur_proxy = request.meta['proxy']
        # treat http codes >= 400 as a failure of this proxy
        if response.status >= 400:
            # add one to this proxy's failure count
            self.stats[cur_proxy] += 1
            # if the proxy has failed too many times, drop it from the pool
            if self.stats[cur_proxy] >= self.max_failed:
                log.msg('{} got a {} response'.format(cur_proxy, response.status))
                # remove the proxy from the pool
                # if cur_proxy in self.proxies:
                #     self.proxies.remove(cur_proxy)
                for proxy in self.proxies:
                    if reform_url(proxy) == cur_proxy:
                        self.proxies.remove(proxy)
                        break
                log.msg('{} exceeded the failure limit, removed from the proxy list'.format(cur_proxy))
                # give this request a fresh proxy and reschedule it
                if not self.proxies:
                    self.proxies = self.choice_proxies()
                    log.msg('failure limit exceeded and the pool is empty... requesting the api again')
                    # return
                request.meta['proxy'] = random.choice(self.proxies)
                return request
        return response

    def process_exception(self, request, exception, spider):
        cur_proxy = request.meta['proxy']
        # on timeouts or refused connections, drop the proxy
        if cur_proxy in self.proxies:
            self.proxies.remove(cur_proxy)
            log.msg('{} proxy error, removed from the proxy list'.format(cur_proxy))
        # give this request a new proxy and reschedule it
        if not self.proxies:
            self.choice_proxies()
            log.msg('proxy error and the pool is empty... requesting the api again')
            # return
        request.meta['proxy'] = random.choice(self.proxies)
        return request
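
choice_proxies() expects the proxy api to return data shaped roughly like {'code': '0', 'msg': [{'ip': ..., 'port': ...}, ...]} — that shape is inferred from the code, not documented here. A hedged sketch of the same parsing using r.json() instead of eval(), which avoids executing arbitrary response text:

import requests

def fetch_proxies(api_url):
    """Return proxies like ['http://1.2.3.4:8080', ...] from the (placeholder) api."""
    r = requests.get(api_url)
    data = r.json()                      # parse JSON instead of eval()-ing raw text
    proxies = []
    if data.get('code') == '0':
        for entry in data.get('msg', []):
            proxies.append('http://{}:{}'.format(entry['ip'], entry['port']))
    return proxies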
useragent.py

import faker


class RandomUserAgentMiddleware(object):
    """Assigns a random User-Agent to every request."""

    def __init__(self, settings):
        self.faker = faker.Faker()

    @classmethod
    def from_crawler(cls, crawler):
        # create and return an instance of the middleware
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # set the User-Agent header on the outgoing request
        request.headers['User-Agent'] = self.faker.user_agent()

    def process_response(self, request, response, spider):
        # print(request.headers['User-Agent'])
        return response
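
For a quick check of what the middleware will send, faker (the third-party Faker package, pip install Faker) can be exercised on its own:

import faker

f = faker.Faker()
for _ in range(3):
    # each call returns a different randomly generated User-Agent string
    print(f.user_agent())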
settings.py

import time
# enable the proxy middleware
HTTPPROXY_ENABLED = True
# enable logging
LOG_ENABLED = True
# log file encoding
LOG_ENCODING = 'utf-8'
# where the log file is written
today = time.strftime('%Y-%m-%d')
LOG_FILE = "./log/{}.log".format(today)
# raise the log level
LOG_LEVEL = 'INFO'
# disable redirects
REDIRECT_ENABLED = False
# delay between requests (seconds)
DOWNLOAD_DELAY = 2
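
One caveat with the log settings above: the ./log directory is not created automatically, so the log file cannot be opened if it is missing. A small optional guard near the top of settings.py (assuming the crawl is started from the project root):

import os

# make sure the log directory exists before Scrapy opens the log file
os.makedirs('./log', exist_ok=True)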
DOWNLOADER_MIDDLEWARES = {
    'doubanCrawler.middlewares.useragent.RandomUserAgentMiddleware': 543,
    'doubanCrawler.middlewares.proxy.RandomProxyMiddleware': 749,
}
ITEM_PIPELINES = {
    'doubanCrawler.pipelines.DoubancrawlerPipeline': 300,
    # 'doubanCrawler.pipelines.RedisPipeline': 301,
    'doubanCrawler.pipelines.MysqlPipeline': 302,
}
Run: scrapy crawl douban
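
Besides the command line, the spider can also be launched from a small Python script — a hedged sketch using Scrapy's CrawlerProcess (the import of the project settings assumes the layout shown above):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load settings.py from the project and run the 'douban' spider; blocks until the crawl finishes
process = CrawlerProcess(get_project_settings())
process.crawl('douban')
process.start()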


So, comments and pointers are very welcome...

Reposted from: https://www.cnblogs.com/changqing8023/p/8535034.html
