Python爬取网站数据后,数据的保存方式是大家比较关心的一件事情,也是接下来能否更简便地处理数据的关键步骤。下面就Python爬取网页数据后的保存格式进行简单介绍,三种保存格式为txt格式、CSV格式和数据库格式。
首先,保存为txt格式。话不多说,直接上代码!
# -*- coding: utf-8 -*-
"""Scrape PPmoney product listings and append them to a tab-separated txt file."""
import json
import random

# Browser-like User-Agent so the endpoint does not reject the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}


def get_html1(i):
    """Fetch page *i* of the listing API and return the raw response body (bytes)."""
    # Local import keeps this module importable when requests is not installed.
    import requests
    url = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    # The random query parameter is a cache-buster imitating a browser timestamp.
    resp = requests.get(url.format(i, random.randint(1501050773102, 1501051774102)),
                        headers=headers)
    return resp.content


def get_data1(html, path='d:PPmonenyshengxinbao9.6.txt'):
    """Parse the JSON payload *html* and append one name/profit/investedMoney
    row per product to *path* (tab-separated, one product per line).

    ``path`` defaults to the original hard-coded output file; pass a
    different location to write elsewhere.
    """
    payload = json.loads(html)
    products = payload['PackageList']['Data']
    # Open the file once for the whole batch instead of re-opening per record,
    # and let the context manager close it even on error.
    with open(path, 'a') as f:
        for item in products:
            # Product name, rate, invested amount.
            print(item['name'], item['profit'], item['investedMoney'], sep='\t')
            # NOTE: the original was missing the closing parenthesis here.
            f.write(item['name'] + '\t' + str(item['profit']) + '\t'
                    + str(item['investedMoney']) + '\n')


if __name__ == '__main__':
    # Pages 1-9, matching the original script.
    for page in range(1, 10):
        get_data1(get_html1(page))
执行代码后,生成文件打开后显示如下:
2.保存为CSV格式。
# -*- coding: utf-8 -*-
"""Scrape PPmoney product listings and save them to a CSV file via pandas."""
import json
import random

import pandas as pd

# Browser-like User-Agent so the endpoint does not reject the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}


def get_html1(i):
    """Fetch page *i* of the listing API and return the decoded list of
    product records (``PackageList.Data`` from the JSON payload)."""
    # Local import keeps this module importable when requests is not installed.
    import requests
    url = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    resp = requests.get(url.format(i, random.randint(1501050773102, 1501051774102)),
                        headers=headers)
    data = json.loads(resp.content)
    return data['PackageList']['Data']


def build_frame(pages):
    """Combine per-page record lists into a single DataFrame.

    Each page's records become rows tagged with a ``page_id`` column
    holding that page's position in *pages*. Returns an empty DataFrame
    when *pages* is empty.
    """
    frames = []
    for page_id, records in enumerate(pages):
        frame = pd.DataFrame(records)
        frame['page_id'] = page_id
        frames.append(frame)
    if not frames:
        return pd.DataFrame([])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported API.
    return pd.concat(frames)


if __name__ == '__main__':
    pages = [get_html1(i) for i in range(100)]
    data_ceshi = build_frame(pages)
    print(data_ceshi)
    # gbk encoding so Chinese product names display correctly in Excel on Windows.
    data_ceshi.to_csv('e:/data.csv', encoding='gbk')
保存后,结果如下:
3.保存到数据库。
# -*- coding: utf-8 -*-
"""Scrape PPmoney product listings and insert them into a MySQL table."""
import json
import random

# Browser-like User-Agent so the endpoint does not reject the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}


def get_html1(i):
    """Fetch page *i* of the listing API and return the decoded list of
    product records (``PackageList.Data`` from the JSON payload)."""
    # Local import keeps this module importable when requests is not installed.
    import requests
    url = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    resp = requests.get(url.format(i, random.randint(1501050773102, 1501051774102)),
                        headers=headers)
    return json.loads(resp.content)['PackageList']['Data']


def iter_rows(pages):
    """Yield one ``(beginTime, endTime, investedMoney, name)`` tuple per
    product record across all per-page record lists in *pages*."""
    for records in pages:
        for rec in records:
            yield (rec['beginTime'], rec['endTime'],
                   rec['investedMoney'], rec['name'])


def save_to_mysql(pages):
    """(Re)create the SHENGXB table and insert one row per scraped product.

    Commits after every insert, as the original script did.
    """
    # Local import keeps this module importable when MySQLdb is not installed.
    import MySQLdb
    # NOTE(review): credentials are hard-coded, carried over from the article.
    db = MySQLdb.connect(host='localhost', port=3306, user='zhouliye',
                         passwd='123456zz', db='abbc', charset='utf8')
    print('连接上数据库了!')
    try:
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS shengxb")
        sql = """CREAtE TABLE SHENGXB(
        beginTime DATETIME,
        endTime DATETIME,
        investedMoney float,
        name CHAR(50))"""
        cursor.execute(sql)
        insert_SHENGXB = ("INSERT INTO SHENGXB "
                          "(beginTime,endTime,investedMoney,name) VALUES(%s,%s,%s,%s)")
        for a, b, c, d in iter_rows(pages):
            print('开始时间: ' + str(a) + '结束时间: ' + str(b)
                  + '投资金额: ' + str(c) + '项目名称' + str(d))
            # Parameterized query: the driver escapes the values.
            cursor.execute(insert_SHENGXB, (a, b, c, d))
            db.commit()
            print('******完成此条插入!')
    finally:
        # The original never closed the connection; release it even on error.
        db.close()
    print('爬取数据并插入mysql数据库完成...')


if __name__ == '__main__':
    pages = [get_html1(i) for i in range(10)]
    save_to_mysql(pages)
保存后显示如下:
联系客服