一个DataFrame就是一张表格,Series可以理解成一维数据,DataFrame就是一个二维数据,DataFrame可以由多个Series组成(DataFrame可以理解成Series的一个集合)
import pandas as pd
population={'city':['Beijing','Shanghai','Guangzhou','Shenzhen','Hangzhou','Chongqing'], 'year':[2016,2017,2016,2017,2016,2016], 'population':[2100,2300,1000,700,500,500] }population=pd.DataFrame(population) ###print(population)
city population year0 Beijing 2100 20161 Shanghai 2300 20172 Guangzhou 1000 20163 Shenzhen 700 20174 Hangzhou 500 20165 Chongqing 500 2016
pdc=pd.DataFrame(population,columns=['year','city','population']) #columns参数改变列名print(pdc)
year city population0 2016 Beijing 21001 2017 Shanghai 23002 2016 Guangzhou 10003 2017 Shenzhen 7004 2016 Hangzhou 5005 2016 Chongqing 500
tmp={'city':['Beijing','Shanghai','Guangzhou','Shenzhen','Hangzhou','Chongqing'], 'year':[2016,2017,2016,2017,2016,2016], 'population':[2100,2300,1000,700,500,500] }pdci=pd.DataFrame(tmp,columns=['year','city','population'], index=['one','two','three','four','five','six']) #改变行index索引和列名columnsprint(pdci)
year city populationone 2016 Beijing 2100two 2017 Shanghai 2300three 2016 Guangzhou 1000four 2017 Shenzhen 700five 2016 Hangzhou 500six 2016 Chongqing 500
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}apts=pd.Series(cities,name='income')apts['shenzhen']=70000less_than_50000=(apts<50000)apts[less_than_50000]=40000apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})# print(apts2)apts=apts+apts2apts[apts.isnull()]=apts.mean()# print(apts)df=pd.DataFrame({'apts':apts,'apts2':apts2}) ###print(df)
apts apts2Beijing 65000.0 10000.0Chongqing 64000.0 30000.0Guangzhou 47000.0 7000.0Hangzhou 64000.0 NaNShanghai 68000.0 8000.0Suzhou 64000.0 NaNTianjin 64000.0 40000.0shenzhen 76000.0 6000.0
data=[{'JackMa':99999999999,'Han':5000,'David':10000}, {'JackMa':99999999998,'Han':4000,'David':11000}]pdl=pd.DataFrame(data,index=['salary1','salary2'])print(pdl)
David Han JackMasalary1 10000 5000 99999999999salary2 11000 4000 99999999998
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}apts=pd.Series(cities,name='income')apts['shenzhen']=70000less_than_50000=(apts<50000)apts[less_than_50000]=40000apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})apts=apts+apts2apts[apts.isnull()]=apts.mean()df=pd.DataFrame({'apts':apts,'apts2':apts2})#print(df)df['bonus']=2000 #增加一个新列bonus,并且都赋值2000print(df)
apts apts2 bonusBeijing 65000.0 10000.0 2000Chongqing 64000.0 30000.0 2000Guangzhou 47000.0 7000.0 2000Hangzhou 64000.0 NaN 2000Shanghai 68000.0 8000.0 2000Suzhou 64000.0 NaN 2000Tianjin 64000.0 40000.0 2000shenzhen 76000.0 6000.0 2000
df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']print(df)
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 147000.0Chongqing 64000.0 30000.0 2000 175000.0Guangzhou 47000.0 7000.0 2000 106500.0Hangzhou 64000.0 NaN 2000 NaNShanghai 68000.0 8000.0 2000 150000.0Suzhou 64000.0 NaN 2000 NaNTianjin 64000.0 40000.0 2000 190000.0shenzhen 76000.0 6000.0 2000 163000.0
print(df.index)
Index(['Beijing', 'Chongqing', 'Guangzhou', 'Hangzhou', 'Shanghai', 'Suzhou', 'Tianjin', 'shenzhen'], dtype='object')
import pandas as pdcities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}apts=pd.Series(cities,name='income')apts['shenzhen']=70000less_than_50000=(apts<50000)apts[less_than_50000]=40000apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})apts=apts+apts2apts[apts.isnull()]=apts.mean()df=pd.DataFrame({'apts':apts,'apts2':apts2})df['bonus']=2000 #增加一个新列bonus,并且都赋值2000df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']#print(df)# apts apts2 bonus income#Beijing 65000.0 10000.0 2000 147000.0#Chongqing 64000.0 30000.0 2000 175000.0#Guangzhou 47000.0 7000.0 2000 106500.0#Hangzhou 64000.0 NaN 2000 NaN#Shanghai 68000.0 8000.0 2000 150000.0#Suzhou 64000.0 NaN 2000 NaN#Tianjin 64000.0 40000.0 2000 190000.0#shenzhen 76000.0 6000.0 2000 163000.0print(df.apts==64000)print(df['apts']==64000) #boolean条件
Beijing FalseChongqing TrueGuangzhou FalseHangzhou TrueShanghai FalseSuzhou TrueTianjin Trueshenzhen FalseName: apts, dtype: bool
print(df[df['apts']==64000]) #对行做选择,就是把apts列等于64000的行取出来
apts apts2 bonus incomeChongqing 64000.0 30000.0 2000 175000.0Hangzhou 64000.0 NaN 2000 NaNSuzhou 64000.0 NaN 2000 NaNTianjin 64000.0 40000.0 2000 190000.0
df[df.apts==64000]['income']=200000 #报错,在复制片段上赋值,原来的df没被改变
print(df.loc['Hangzhou']) #定位选某一行
apts 64000.0apts2 NaNbonus 2000.0income NaNName: Hangzhou, dtype: float64
print(df.loc[['Hangzhou','Shanghai']])
apts apts2 bonus incomeHangzhou 64000.0 NaN 2000 NaNShanghai 68000.0 8000.0 2000 150000.0
print(df.loc[df['apts']==64000,['apts2','apts','bonus']])#前面的部分是对行做选择,后面的部分是对列做选择
apts2 apts bonusChongqing 30000.0 64000.0 2000Hangzhou NaN 64000.0 2000Suzhou NaN 64000.0 2000Tianjin 40000.0 64000.0 2000
print(df.iloc[0:5])
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 147000.0Chongqing 64000.0 30000.0 2000 175000.0Guangzhou 47000.0 7000.0 2000 106500.0Hangzhou 64000.0 NaN 2000 NaNShanghai 68000.0 8000.0 2000 150000.0
print(df.ix[1:4,1:3]) #用行号和列号做数据选择
apts2 bonusChongqing 30000.0 2000Guangzhou 7000.0 2000Hangzhou NaN 2000
df.loc[:,'income']=5000print(df)
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 5000Chongqing 64000.0 30000.0 2000 5000Guangzhou 47000.0 7000.0 2000 5000Hangzhou 64000.0 NaN 2000 5000Shanghai 68000.0 8000.0 2000 5000Suzhou 64000.0 NaN 2000 5000Tianjin 64000.0 40000.0 2000 5000shenzhen 76000.0 6000.0 2000 5000
print(df.info())
<class 'pandas.core.frame.DataFrame'>Index: 8 entries, Beijing to shenzhenData columns (total 4 columns):apts 8 non-null float64apts2 6 non-null float64bonus 8 non-null int64income 8 non-null int64dtypes: float64(2), int64(2)memory usage: 320.0+ bytesNone
print(df.describe())
apts apts2 bonus incomecount 8.000000 6.000000 8.0 8.0mean 64000.000000 16833.333333 2000.0 5000.0std 8017.837257 14483.323744 0.0 0.0min 47000.000000 6000.000000 2000.0 5000.025% 64000.000000 7250.000000 2000.0 5000.050% 64000.000000 9000.000000 2000.0 5000.075% 65750.000000 25000.000000 2000.0 5000.0max 76000.000000 40000.000000 2000.0 5000.0
print(df.head(2))
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 5000Chongqing 64000.0 30000.0 2000 5000
print(df.tail(2))
apts apts2 bonus incomeTianjin 64000.0 40000.0 2000 5000shenzhen 76000.0 6000.0 2000 5000
#df2.loc[((df2['dow']==0)|(df2['dow']==2)|(df2['dow']==4)),:]#df2.loc[ df2['dow'].isin([0,2,4]) , : ] #可以是一个列表,numpy array,Series#~(df2['dow'].isin([0,2,4]))
import pandas as pdcities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}apts=pd.Series(cities,name='income')apts['shenzhen']=70000less_than_50000=(apts<50000)apts[less_than_50000]=40000apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})apts=apts+apts2apts[apts.isnull()]=apts.mean()df=pd.DataFrame({'apts':apts,'apts2':apts2})df['bonus']=2000 df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']#print(df)# apts apts2 bonus income#Beijing 65000.0 10000.0 2000 147000.0#Chongqing 64000.0 30000.0 2000 175000.0#Guangzhou 47000.0 7000.0 2000 106500.0#Hangzhou 64000.0 NaN 2000 NaN#Shanghai 68000.0 8000.0 2000 150000.0#Suzhou 64000.0 NaN 2000 NaN#Tianjin 64000.0 40000.0 2000 190000.0#shenzhen 76000.0 6000.0 2000 163000.0
dff=df.fillna(value=0) #df没变print(dff)
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 147000.0Chongqing 64000.0 30000.0 2000 175000.0Guangzhou 47000.0 7000.0 2000 106500.0Hangzhou 64000.0 0.0 2000 0.0Shanghai 68000.0 8000.0 2000 150000.0Suzhou 64000.0 0.0 2000 0.0Tianjin 64000.0 40000.0 2000 190000.0shenzhen 76000.0 6000.0 2000 163000.0
dff=df.fillna(value=0, inplace=True)print(df);print(dff) #inplace参数True,df改变,没有新的dff拷贝
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 147000.0Chongqing 64000.0 30000.0 2000 175000.0Guangzhou 47000.0 7000.0 2000 106500.0Hangzhou 64000.0 0.0 2000 0.0Shanghai 68000.0 8000.0 2000 150000.0Suzhou 64000.0 0.0 2000 0.0Tianjin 64000.0 40000.0 2000 190000.0shenzhen 76000.0 6000.0 2000 163000.0None
dffr=df.fillna(method='ffill') #新生成的补NaN前向拷贝,df没变print(dffr)
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 147000.0Chongqing 64000.0 30000.0 2000 175000.0Guangzhou 47000.0 7000.0 2000 106500.0Hangzhou 64000.0 7000.0 2000 106500.0Shanghai 68000.0 8000.0 2000 150000.0Suzhou 64000.0 8000.0 2000 150000.0Tianjin 64000.0 40000.0 2000 190000.0shenzhen 76000.0 6000.0 2000 163000.0
dfba=df.fillna(method='bfill') #新生成的补NaN后向拷贝,df没变print(dfba)
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 147000.0Chongqing 64000.0 30000.0 2000 175000.0Guangzhou 47000.0 7000.0 2000 106500.0Hangzhou 64000.0 8000.0 2000 150000.0Shanghai 68000.0 8000.0 2000 150000.0Suzhou 64000.0 40000.0 2000 190000.0Tianjin 64000.0 40000.0 2000 190000.0shenzhen 76000.0 6000.0 2000 163000.0
import pandas as pdimport numpy as npdata=pd.Series(np.random.randn(10),index=[['a','a','a','b','b','c','c','d','d','d'], [1,2,3,1,2,1,2,1,2,3]])print(data)print(type(data))
a 1 0.346467 2 -0.043077 3 0.043878b 1 0.107763 2 -0.175726c 1 -1.833683 2 0.033884d 1 -1.807021 2 0.819740 3 0.294679dtype: float64<class 'pandas.core.series.Series'>
print(data.index)
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 0, 1, 0, 1, 2]])
print(data['b':'c'])
b 1 0.353241 2 0.379744c 1 -0.860706 2 -0.795483dtype: float64
print(data[:2])
a 1 0.763116 2 0.058009dtype: float64
unstack=data.unstack() #将层级数据横向拉开,不够长的补NaNprint(unstack)print(type(unstack))
1 2 3a -0.637935 -0.104897 -1.536381b 2.448302 1.679833 NaNc -0.845155 0.829459 NaNd 0.597535 -0.464255 -0.898994<class 'pandas.core.frame.DataFrame'> #对比data的类型
import pandas as pdcities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}apts=pd.Series(cities,name='income')apts['shenzhen']=70000less_than_50000=(apts<50000)apts[less_than_50000]=40000apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})apts=apts+apts2apts[apts.isnull()]=apts.mean()df=pd.DataFrame({'apts':apts,'apts2':apts2})df['bonus']=2000 df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']#print(df)df.to_csv('df.csv')df.to_csv('df2.csv',index=False) #去掉第一列,行索引列import osdf2_site = r"D:\PYTHON35\idle\df2.csv"pwd = os.getcwd() #获取当前工作目录os.chdir(os.path.dirname(df2_site))tmp_df = pd.read_csv(os.path.basename(df2_site)) ###print(tmp_df)
apts apts2 bonus income0 65000.0 10000.0 2000 147000.01 64000.0 30000.0 2000 175000.02 47000.0 7000.0 2000 106500.03 64000.0 NaN 2000 NaN4 68000.0 8000.0 2000 150000.05 64000.0 NaN 2000 NaN6 64000.0 40000.0 2000 190000.07 76000.0 6000.0 2000 163000.0
tmp_df_index = pd.Index(['Beijing','Shanghai',"Suzhou",'Hangzhou','Tianjin','Chongqing','Nanjing','Shenzhen'])tmp_df.index=tmp_df_index #修改索引print(tmp_df)
apts apts2 bonus incomeBeijing 65000.0 10000.0 2000 147000.0Shanghai 64000.0 30000.0 2000 175000.0Suzhou 47000.0 7000.0 2000 106500.0Hangzhou 64000.0 NaN 2000 NaNTianjin 68000.0 8000.0 2000 150000.0Chongqing 64000.0 NaN 2000 NaNNanjing 64000.0 40000.0 2000 190000.0Shenzhen 76000.0 6000.0 2000 163000.0
df.to_csv('df3.csv',sep='\t')
联系客服