打开APP
userphoto
未登录

开通VIP,畅享免费电子书等14项超值服务

开通VIP
Python爬虫系列,采集好看视频网站视频

源代码地址

https://github.com/kylin1020/haokan

运行环境

  • Python 3.6+
  • windows/Linux

源代码

# coding: utf-8

import re

import os

import logging

from threading import Thread

import http.cookiejar

from json import loads

from urllib import request, parse, error

from bs4 import BeautifulSoup

from time import time

class Spider():
    """Scraper for short videos on haokan (sv.baidu.com).

    Downloads the video list for a channel page by page via the site's
    tab endpoint, then saves each .mp4 into ./video next to this script.
    Cookies are persisted in ./log/cookie.log between runs, and progress
    is logged to ./log/spider.log as well as the console.
    """

    def __init__(self):
        # Base site and the JSONP tab endpoint that serves paged video lists.
        self.main_url = 'http://sv.baidu.com'
        self.tab_url = 'http://sv.baidu.com/videoui/list/tab'
        # Browser-like headers; sent with every request the opener makes.
        self.header = {
            'Referer': 'http://sv.baidu.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36'
        }
        # Working directories live next to this script.
        self.log_dir = self.mkdir(os.path.join(os.path.dirname(__file__), 'log'))
        self.video_dir = self.mkdir(os.path.join(os.path.dirname(__file__), 'video'))
        self.opener = self.build_opener()
        self.logger = self.__build_logger()

    # Create a directory if needed.
    def mkdir(self, dir):
        """Ensure *dir* (and any missing parents) exists; return *dir*."""
        # makedirs with exist_ok also covers the missing-parent case that
        # plain os.mkdir would raise on.
        os.makedirs(dir, exist_ok=True)
        return dir

    # Build the logging helper.
    def __build_logger(self):
        """Return an INFO logger writing to log/spider.log and the console."""
        filename = os.path.join(self.log_dir, 'spider.log')
        logger = logging.getLogger(__name__)
        logger.setLevel(level=logging.INFO)
        # Guard against attaching duplicate handlers when Spider is
        # instantiated more than once in the same process.
        if not logger.handlers:
            handler = logging.FileHandler(filename)
            handler.setLevel(level=logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
            console = logging.StreamHandler()
            console.setLevel(level=logging.INFO)
            logger.addHandler(console)
        return logger

    # Build a cookie-aware url opener.
    def build_opener(self):
        """Return a urllib opener backed by a Mozilla-format cookie file.

        Reuses log/cookie.log when present; otherwise visits the main page
        once to obtain fresh cookies and saves them for future runs.
        """
        cookie_name = os.path.join(self.log_dir, 'cookie.log')
        cookie = http.cookiejar.MozillaCookieJar(cookie_name)
        if os.path.isfile(cookie_name):
            cookie.load(cookie_name, ignore_discard=True, ignore_expires=True)
            handler = request.HTTPCookieProcessor(cookie)
            opener = request.build_opener(handler)
        else:
            handler = request.HTTPCookieProcessor(cookie)
            opener = request.build_opener(handler)
            req = request.Request(self.main_url, headers=self.header)
            try:
                opener.open(req)
            except Exception as e:
                # The logger is not built yet at this point in __init__,
                # so fall back to print; a failed bootstrap request is
                # deliberately non-fatal (broad catch kept on purpose).
                print(e)
            cookie.save(filename=cookie_name, ignore_discard=True, ignore_expires=True)
        return opener

    # Fetch the content of a url.
    def urlopen(self, url, data=None, is_redecode=False, is_format=True):
        """Open *url* through the cookie-aware opener.

        data: optional dict; form-encoded and POSTed when given.
        is_format: parse the response with BeautifulSoup('lxml') when True.
        is_redecode: additionally turn literal \\uXXXX escapes into
            characters and reparse (used for the JSONP tab endpoint).
        """
        if data is not None:
            data = parse.urlencode(data).encode(encoding='UTF-8')
        req = request.Request(url, data=data, headers=self.header)
        response = self.opener.open(req)
        if is_format:
            response = BeautifulSoup(response, 'lxml')
        # Convert unicode escape sequences back into characters.
        if is_redecode:
            response = BeautifulSoup(self.redecode(response), 'lxml')
        return response

    # Scrape the channel tabs from the front page.
    def get_index(self):
        """Return a {channel name: url} dict parsed from the homepage."""
        html = self.urlopen(self.main_url)
        data_list = html.find_all(name='li', attrs={'tid': True})
        url_list = {}
        for data in data_list:
            link = data.find('a')
            url_list[link.string] = link['href']
        return url_list

    # Convert \uXXXX escapes to characters.
    def redecode(self, content):
        """Decode literal unicode escape sequences in *content*."""
        content = content.encode('utf-8').decode('unicode_escape')
        return content

    # Fetch one page of video metadata.
    def expand(self, item, page=1):
        """Fetch page *page* of channel *item* from the tab endpoint.

        Returns a list of {'title': ..., 'url': ...} dicts.
        """
        data = parse.urlencode({
            'source': 'wise-channel',
            'pd': '',
            'subTab': item,
            'direction': 'down',
            'refreshType': 1,
            'ua': 'Mozilla%2F5.0%20(Windows%20NT%2010.0%3B%20Win64%3B%20x64)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F72.0.3626.81%20Safari%2F537.36',
            'bt': '1549533482',
            'caller': 'bdwise',
            '_': int(time()),
            'cb': 'jsonp%d' % (page),
        })
        url = self.tab_url + '?' + data
        response = self.urlopen(url=url, is_redecode=True)
        video_list = response.find_all('div', attrs={'data-authorid': True})
        items = []
        # Strip stray backslashes and single quotes left behind by the
        # unicode-unescaped JSONP payload.  (Double-quoted raw string:
        # the pattern itself contains a single quote.)
        pattern = r"(\\)|(')"
        for video in video_list:
            url = re.sub(pattern, '', video['data-vsrc'])
            title = re.sub(pattern, '', video['data-title'])
            items.append({'title': title, 'url': url})
        return items

    # Download videos.
    def save_video(self, page=1, channel='qiongying'):
        """Download every video on *page* of *channel* into self.video_dir.

        channel defaults to 'qiongying', preserving the original behavior.
        Files that already exist are skipped without re-downloading.
        """
        items = self.expand(channel, page)
        for item in items:
            filename = os.path.join(self.video_dir, item['title'].strip() + '.mp4')
            # Check before fetching so an existing file costs no bandwidth.
            if not os.path.isfile(filename):
                response = self.urlopen(item['url'], is_format=False)
                with open(filename, 'wb') as f:
                    f.write(response.read())
            if os.path.isfile(filename):
                self.logger.info(filename)

    # Run the collection loop.
    def run(self, total_page=1):
        """Collect videos from page 1 through *total_page* inclusive."""
        for i in range(1, total_page + 1):
            self.save_video(i)

# 视频采集示例

if __name__ == '__main__':
    # Example run: collect the first 10 pages of the default channel.
    Spider().run(total_page=10)

Python抓取好看视频简单示例

代码简单易懂,暂未加入多线程/多进程采集,也未设置代理。

本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请点击举报
打开APP,阅读全文并永久保存 查看更多类似文章
猜你喜欢
类似文章
【热】打开小程序,算一算2024你的财运
python实现的ftp服务器
哈撒给,英雄联盟全皮肤下载器来了...
爬虫学习(十二)
迅雷不给力,我DIY了个下载器
自动化测试项目实战08:日志
python笔记46-史上最强大最好用的python日志模块nb_log
更多类似文章 >>
生活服务
热点新闻
分享 收藏 导长图 关注 下载文章
绑定账号成功
后续可登录账号畅享VIP特权!
如果VIP功能使用有故障,
可点击这里联系客服!

联系客服