打开APP
userphoto
未登录

开通VIP,畅享免费电子书等14项超值服

开通VIP
Python 3 多线程下载百度图片搜索结果 | 岁月如歌
http://lovenight.github.io/2015/11/15/Python-3-多线程下载百度图片搜索结果/


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: loveNight
# @Date: 2015-10-28 19:59:24
# @Last Modified by: loveNight
# @Last Modified time: 2015-11-15 19:24:57


import urllib
import requests
import os
import re
import sys
import time
import threading
from datetime import datetime as dt
from multiprocessing.dummy import Pool
from multiprocessing import Queue


class BaiduImgDownloader(object):

"""百度图片下载工具,目前只支持单个关键词"""

# 解码网址用的映射表
str_table = {
'_z2C$q': ':',
'_z&e3B': '.',
'AzdH3F': '/'
}

char_table = {
'w': 'a',
'k': 'b',
'v': 'c',
'1': 'd',
'j': 'e',
'u': 'f',
'2': 'g',
'i': 'h',
't': 'i',
'3': 'j',
'h': 'k',
's': 'l',
'4': 'm',
'g': 'n',
'5': 'o',
'r': 'p',
'q': 'q',
'6': 'r',
'f': 's',
'p': 't',
'7': 'u',
'e': 'v',
'o': 'w',
'8': '1',
'd': '2',
'n': '3',
'9': '4',
'c': '5',
'm': '6',
'0': '7',
'b': '8',
'l': '9',
'a': '0'
}

re_objURL = re.compile(r'"objURL":"(.*?)".*?"type":"(.*?)"')
re_downNum = re.compile(r"已下载\s(\d+)\s张图片")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36",
"Accept-Encoding": "gzip, deflate, sdch",
}

def __init__(self, word, dirpath=None, processNum=30):
if " " in word:
raise AttributeError("本脚本仅支持单个关键字")
self.word = word
self.char_table = {ord(key): ord(value)
for key, value in BaiduImgDownloader.char_table.items()}
if not dirpath:
dirpath = os.path.join(sys.path[0], 'results')
self.dirpath = dirpath
self.jsonUrlFile = os.path.join(sys.path[0], 'jsonUrl.txt')
self.logFile = os.path.join(sys.path[0], 'logInfo.txt')
self.errorFile = os.path.join(sys.path[0], 'errorUrl.txt')
if os.path.exists(self.errorFile):
os.remove(self.errorFile)
if not os.path.exists(self.dirpath):
os.mkdir(self.dirpath)
self.pool = Pool(30)
self.session = requests.Session()
self.session.headers = BaiduImgDownloader.headers
self.queue = Queue()
self.messageQueue = Queue()
self.index = 0 # 图片起始编号,牵涉到计数,不要更改
self.promptNum = 10 # 下载几张图片提示一次
self.lock = threading.Lock()
self.delay = 1.5 # 网络请求太频繁会被封
self.QUIT = "QUIT" # Queue中表示任务结束
self.printPrefix = "**" # 用于指定在控制台输出

def start(self):
# 控制台输出线程
t = threading.Thread(target=self.__log)
t.setDaemon(True)
t.start()
self.messageQueue.put(self.printPrefix + "脚本开始执行")
start_time = dt.now()
urls = self.__buildUrls()
self.messageQueue.put(self.printPrefix + "已获取 %s 个Json请求网址" % len(urls))
# 解析出所有图片网址,该方法会阻塞直到任务完成
self.pool.map(self.__resolveImgUrl, urls)
while self.queue.qsize():
imgs = self.queue.get()
self.pool.map_async(self.__downImg, imgs)
self.pool.close()
self.pool.join()
self.messageQueue.put(self.printPrefix + "下载完成!已下载 %s 张图片,总用时 %s" %
(self.index, dt.now() - start_time))
self.messageQueue.put(self.printPrefix + "请到 %s 查看结果!" % self.dirpath)
self.messageQueue.put(self.printPrefix + "错误信息保存在 %s" % self.errorFile)
self.messageQueue.put(self.QUIT)


def __log(self):
"""控制台输出,加锁以免被多线程打乱"""
with open(self.logFile, "w", encoding = "utf-8") as f:
while True:
message = self.messageQueue.get()
if message == self.QUIT:
break
message = str(dt.now()) + " " + message
if self.printPrefix in message:
print(message)
elif "已下载" in message:
# 下载N张图片提示一次
downNum = self.re_downNum.findall(message)
if downNum and int(downNum[0]) % self.promptNum == 0:
print(message)
f.write(message + '\n')
f.flush()

def __getIndex(self):
"""获取文件编号"""
self.lock.acquire()
try:
return self.index
finally:
self.index += 1
self.lock.release()

def decode(self, url):
"""解码图片URL
解码前:
ippr_z2C$qAzdH3FAzdH3Ffl_z&e3Bftgwt42_z&e3BvgAzdH3F4omlaAzdH3Faa8W3ZyEpymRmx3Y1p7bb&mla
解码后:
http://s9.sinaimg.cn/mw690/001WjZyEty6R6xjYdtu88&690
"""

# 先替换字符串
for key, value in self.str_table.items():
url = url.replace(key, value)
# 再替换剩下的字符
return url.translate(self.char_table)

def __buildUrls(self):
"""json请求网址生成器"""
word = urllib.parse.quote(self.word)
url = r"http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2nc=1&pn={pn}&rn=60"
time.sleep(self.delay)
html = self.session.get(url.format(word=word, pn=0), timeout = 15).content.decode('utf-8')
results = re.findall(r'"displayNum":(\d+),', html)
maxNum = int(results[0]) if results else 0
urls = [url.format(word=word, pn=x)
for x in range(0, maxNum + 1, 60)]
with open(self.jsonUrlFile, "w", encoding="utf-8") as f:
for url in urls:
f.write(url + "\n")
return urls

def __resolveImgUrl(self, url):
"""从指定网页中解析出图片URL"""
time.sleep(self.delay)
html = self.session.get(url, timeout = 15).content.decode('utf-8')
datas = self.re_objURL.findall(html)
imgs = [Image(self.decode(x[0]), x[1]) for x in datas]
self.messageQueue.put(self.printPrefix + "已解析出 %s 个图片网址" % len(imgs))
self.queue.put(imgs)

def __downImg(self, img):
"""下载单张图片,传入的是Image对象"""
imgUrl = img.url
# self.messageQueue.put("线程 %s 正在下载 %s " %
# (threading.current_thread().name, imgUrl))
try:
time.sleep(self.delay)
res = self.session.get(imgUrl, timeout = 15)
message = None
if str(res.status_code)[0] == "4":
message = "\n%s: %s" % (res.status_code, imgUrl)
elif "text/html" in res.headers["Content-Type"]:
message = "\n无法打开图片: %s" % imgUrl
except Exception as e:
message = "\n抛出异常: %s\n%s" % (imgUrl, str(e))
finally:
if message:
self.messageQueue.put(message)
self.__saveError(message)
return
index = self.__getIndex()
# index从0开始
self.messageQueue.put("已下载 %s 张图片:%s" % (index + 1, imgUrl))
filename = os.path.join(self.dirpath, str(index) + "." + img.type)
with open(filename, "wb") as f:
f.write(res.content)

def __saveError(self, message):
self.lock.acquire()
try:
with open(self.errorFile, "a", encoding="utf-8") as f:
f.write(message)
finally:
self.lock.release()


class Image(object):

"""图片类,保存图片信息"""

def __init__(self, url, type):
super(Image, self).__init__()
self.url = url
self.type = type


if __name__ == '__main__':
print("欢迎使用百度图片下载脚本!\n目前仅支持单个关键词。")
print("下载结果保存在脚本目录下的results文件夹中。")
print("=" * 50)
word = input("请输入你要下载的图片关键词:\n")
down = BaiduImgDownloader(word)
down.start()
本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请点击举报
打开APP,阅读全文并永久保存 查看更多类似文章
猜你喜欢
类似文章
【热】打开小程序,算一算2024你的财运
如何用Python实现目录遍历
python实现文件搜索工具(简易版)
多线程爬取电影天堂最新电影
TF Boys (TensorFlow Boys ) 养成记(二): TensorFlow 数据读取
​LeetCode刷题实战126:单词接龙 II
WCF分布式开发必备知识(1):MSMQ消息队列
更多类似文章 >>
生活服务
热点新闻
分享 收藏 导长图 关注 下载文章
绑定账号成功
后续可登录账号畅享VIP特权!
如果VIP功能使用有故障,
可点击这里联系客服!

联系客服