Python爬虫的5种方式之requests、re、parsel、bs4

# Five ways to pull the download links out of a single page, combining
# requests with re, parsel (CSS selectors / XPath) and BeautifulSoup.
# Every approach fetches the same page again and appends what it finds to its
# own text file, one entry per line.
import re

import parsel
import requests
from bs4 import BeautifulSoup

url = 'https://www.dygod.net/html/tv/hytv/20220603/116997.html'

# ---------------------------------------------------------------------------
# Approach 1: requests + re (regular-expression matching)
# ---------------------------------------------------------------------------
response = requests.get(url=url)
# Let requests guess the encoding from the body so the text decodes correctly.
response.encoding = response.apparent_encoding
response = response.text
# The pattern assumes double-quoted attributes in the page HTML; the capture
# group is the href of each link whose text starts with "magnet:".
data = re.findall('bgcolor="#fdfddf"><a href="(.*?)">magnet:', response)
print(len(data), data)
with open(r'C:\\Users\\Administrator\\Desktop\\梦华录.txt', 'a+', encoding='utf-8') as f:
    for i in data:
        f.write(i)
        f.write('\n')

# ---------------------------------------------------------------------------
# Approach 2: requests + parsel, reading the links from the tags' text
# ---------------------------------------------------------------------------
# parsel is a third-party library that bundles CSS selectors, XPath and re.
# Whichever of the three is used, a parsel.Selector object is created first;
# after that, xpath and css queries can be mixed freely.
response = requests.get(url=url)
response.encoding = response.apparent_encoding
selector = parsel.Selector(response.text)
# '#downlist a::text' selects the text of every <a> below #downlist.
content = selector.css('#downlist a::text').getall()
print(content)
with open(r'C:\\Users\\Administrator\\Desktop\\梦华录02.txt', 'a+', encoding='utf-8') as f:
    for i in content:
        print(i)
        f.write(i)
        f.write('\n')

# ---------------------------------------------------------------------------
# Approach 3: requests + parsel, reading the links from the tags' attributes
# (nearly identical to approach 2; only the selector differs)
# ---------------------------------------------------------------------------
response = requests.get(url=url)
response.encoding = response.apparent_encoding
selector = parsel.Selector(response.text)
# '#downlist a::attr(href)' selects the href attribute of each <a>, i.e. the
# download link itself.
content = selector.css('#downlist a::attr(href)').getall()
print(content)
with open(r'C:\\Users\\Administrator\\Desktop\\梦华录03.txt', 'a+', encoding='utf-8') as f:
    for i in content:
        print(i)
        f.write(i)
        f.write('\n')

# ---------------------------------------------------------------------------
# Approach 4: requests + parsel + re, locating the anchors through XPath
# ---------------------------------------------------------------------------
response = requests.get(url=url)
response.encoding = response.apparent_encoding
selector = parsel.Selector(response.text)
# getall() returns each matched <a> element serialised as an HTML string.
# NOTE(review): the path goes through tbody; confirm the HTML served by the
# site really contains that element (browsers add it, raw markup may not).
content = selector.xpath('//div[@id="downlist"]/table/tbody/tr/td/a').getall()
with open(r'C:\\Users\\Administrator\\Desktop\\梦华录04.txt', 'a+', encoding='utf-8') as f:
    for i in content:
        found = re.findall('.mp4">(.*?)</a>', i)
        if not found:
            # Skip anchors without the expected shape instead of failing with
            # IndexError on found[0].
            continue
        i = found[0]
        print(i)
        f.write(i)
        f.write('\n')

# ---------------------------------------------------------------------------
# Approach 5: requests + BeautifulSoup + re
# ---------------------------------------------------------------------------
# Reference usage:
#   parse = BeautifulSoup(res.text, 'html.parser')        # treat res.text as HTML
#   detail_div = parse.find('div', class_='pMain')        # first div with class 'pMain'
#   detail_like = detail_div.find_all('a', class_='img')  # every a with class 'img' inside it
response = requests.get(url=url)
response.encoding = response.apparent_encoding
# 'html' names the markup type; BeautifulSoup picks an installed HTML parser.
parse = BeautifulSoup(response.text, 'html')
player_list = parse.find('div', class_='player_list')
# find() returns None when no such div exists; fall back to an empty list so
# the chained find_all() cannot raise AttributeError.
detail_div = player_list.find_all('a') if player_list is not None else []
with open(r'C:\\Users\\Administrator\\Desktop\\梦华录05.txt', 'a+', encoding='utf-8') as f:
    for i in detail_div:
        # str(i) is the tag's HTML; capture the href of anchors whose text
        # starts with "第".
        found = re.findall('<a href="(.*?)">第', str(i))
        if not found:
            # Skip anchors that do not match instead of failing on found[0].
            continue
        i = found[0]
        f.write(i)
        f.write('\n')

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。