Scraping proxy IPs with a crawler
import urllib.request
from bs4 import BeautifulSoup
import time
import random


# -------------------- Common utilities -----------------------------
class CommonClass:
    def __init__(self):
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                                     'AppleWebKit/537.36 (KHTML, like Gecko) '
                                     'Chrome/50.0.2661.94 Safari/537.36'}
        # URL used to check whether a proxy works
        self.testurl = "http://www.baidu.com"

    def getresponse(self, url):
        # Fetch a URL with a browser-like User-Agent and return the raw body
        req = urllib.request.Request(url, headers=self.header)
        resp = urllib.request.urlopen(req, timeout=5)
        return resp.read()

    def _is_alive(self, proxy):
        # Try the test URL through the proxy up to 3 times;
        # report alive on the first HTTP 200 response
        for i in range(3):
            try:
                proxy_support = urllib.request.ProxyHandler({"http": proxy})
                opener = urllib.request.build_opener(proxy_support)
                req = urllib.request.Request(self.testurl, headers=self.header)
                resp = opener.open(req, timeout=5)
                if resp.getcode() == 200:
                    return True
            except Exception:
                continue
        return False


# ----------- Proxy pool -----------------------
class ProxyPool:
    def __init__(self, proxy_finder):
        self.pool = []
        self.proxy_finder = proxy_finder
        self.cominstan = CommonClass()

    def get_proxies(self):
        # Collect candidate proxies, then keep only the ones that respond;
        # filter in one pass rather than removing items from the list
        # while iterating over it
        self.pool = [p for p in self.proxy_finder.find()
                     if self.cominstan._is_alive(p)]

    def get_one_proxy(self):
        return random.choice(self.pool)

    def writeToTxt(self, file_path):
        # Dump the validated pool to a text file, one proxy per line
        try:
            with open(file_path, "w+") as fp:
                for item in self.pool:
                    fp.write(str(item) + "\n")
        except IOError:
            print("fail to open file")


# ---------------------- Proxy finders ---------------------
# Define a base class
class IProxyFinder:
    def __init__(self):
        self.pool = []

    def find(self):
        return []


# Scrape proxies from Xicidaili
class XiciProxyFinder(IProxyFinder):
    def __init__(self, url):
        super(XiciProxyFinder, self).__init__()
        self.url = url
        self.cominstan = CommonClass()

    def find(self):
        for i in range(1, 10):  # pages 1-9 of the listing
            content = self.cominstan.getresponse(self.url + str(i))
            soup = BeautifulSoup(content, 'lxml')
            ips = soup.find_all('tr')
            for x in range(2, len(ips)):
                tds = ips[x].find_all("td")
                if not tds:
                    continue
                # Column 1 holds the IP, column 2 the port
                ip_temp = tds[1].contents[0] + ":" + tds[2].contents[0]
                self.pool.append(ip_temp)
            time.sleep(1)  # be polite between page requests
        return self.pool


# --------------------- Test --------------------------------
if __name__ == '__main__':
    finder = XiciProxyFinder("http://www.xicidaili.com/wn/")
    ppool_instance = ProxyPool(finder)
    ppool_instance.get_proxies()
    ppool_instance.writeToTxt("proxy2.txt")
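The test block above only writes the pool to disk and never exercises get_one_proxy(). As a minimal usage sketch (not part of the original post), here is one way to route a request through a proxy drawn from the pool. It assumes the classes above are in scope and that get_proxies() left at least one live proxy; the target URL is just a placeholder.

import urllib.request

finder = XiciProxyFinder("http://www.xicidaili.com/wn/")
pool = ProxyPool(finder)
pool.get_proxies()

proxy = pool.get_one_proxy()  # a random "ip:port" string from the validated pool
handler = urllib.request.ProxyHandler({"http": proxy})
opener = urllib.request.build_opener(handler)

# Using the opener directly (instead of install_opener) keeps the proxy
# local to this request rather than changing process-wide state.
resp = opener.open("http://www.baidu.com", timeout=5)
print(resp.getcode(), len(resp.read()))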