데이터 처리 패키지 판다스 일반 문
21156 단어 파이썬 프로그래밍
# coding=gbk
'''
Created on 2016-06-01
@author: bryan
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 1. Create a Series by passing a list; pandas builds a default integer index:
# s=pd.Series([1,3,5,np.nan,6,8])
# print(s)
#
# # 2. Create a DataFrame by passing a numpy array, with a datetime index and column labels:
# dates=pd.date_range('20130101',periods=6)
# print(dates)
# df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
# print(df)
#
# # 3. Create a DataFrame by passing a dict of objects that can be converted to series-like structures:
# df2=pd.DataFrame({'A':1,'B':pd.Timestamp('20130102'),
# 'C':pd.Series(1,index=list(range(4)),dtype='float32'),
# 'D':np.array([3]*4,dtype='int32'),
# 'E':pd.Categorical(['test','train','test','train']),
# 'F':'fool'
# })
# print(df2)
# # 4. Show the dtype of each column:
# print(df2.dtypes)
#
#
#
#
# # 1. View the top and bottom rows of the frame:
# print(df.head(1))
# print(df.tail(1))
#
# # 2. Show the index, the columns and the underlying numpy data:
# print(df.index)
# print(df.columns)
# print(df.values)
#
# # 3. describe() gives a quick statistical summary of the data:
# print(df.describe())
#
# # 4. Transpose the data:
# print(df.T)
#
# # 5. Sort by an axis
# print(df.sort_index(axis=1,ascending=False))
#
# # 6. Sort by values (NOTE: DataFrame.sort is gone from current pandas; use df.sort_values(by='B'))
# print(df.sort(columns='B'))
#
#
#
#
#
# # * Getting
# #
# # 1. Select a single column, which yields a Series; equivalent to df.A:
# print(df['A'])
#
# # 2. Select with [], which slices the rows
# print(df[:3]) # starts at row 0; same as print(df[0:3])
#
# # * Selection by label
# #
# # 1. Get a cross section using a label
# print(df.loc[dates[0]])
#
# # 2. Select on multiple axes by label
# print(df.loc[:,['A','B']])
#
# # 3. Label slicing (both endpoints are included)
# print(df.loc['20130102':'20130104',['A','B']])
#
# # 4. Reduce the dimensions of the returned object
# print(df.loc['20130101',['A','B']])
#
# # 5. Get a scalar value
# print(df.loc[dates[0],'A'])
#
# # 6. Fast access to a scalar (equivalent to the previous method)
# print(df.at[dates[0],'A'])
#
# # * Selection by position
# #
# # 1. Select by passing an integer position (this selects a row)
# print(df.iloc[3])
#
# # 2. Slice by integer positions, similar to numpy/python
# print(df.iloc[3:5,0:2])
#
# # 3. Select by lists of integer positions, similar to numpy/python
# print(df.iloc[[1,2,4],[0,2]])
#
# # 4. Slice rows
# print(df.iloc[1:3,:])
#
# # 5. Slice columns
# print(df.iloc[:,1:3])
#
# # 6. Get a specific value
# print(df.iloc[1,1])
#
# # * Boolean indexing
# #
# # 1. Select data using the values of a single column:
# print(df[df.A>0])
#
# # 2. Select data with a where operation:
# print(df[df>0])
#
# # 3. Filter with the isin() method (e.g. df2[df2['E'].isin(['two'])]):
# df2=df.copy()
# df2['E']=['one','one','one','one','one','two']
# print(df2)
#
# # * Setting
# #
# # 1. Set a new column (values are aligned on the index):
# s1=pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101', periods=6))
#
# print(s1)
# df['F']=s1
# print(df)
#
# # 2. Set a value by label:
# df.at[dates[0],'A']=0
# print(df)
#
# # 3. Set a value by position:
# df.iat[0,1]=0
# print(df)
#
# # 4. Set a group of values with a numpy array:
# df.loc[:,'D']=np.array([5]*len(df))
# print(df)
#
# # 5. Set values with a where operation:
# df2=df.copy()
# df2[df2>0]=-df2
# print(df2)
#
#
#
#
#
#
#
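# # IV. Missing data
# # pandas uses np.nan to represent missing data; by default it is excluded from computations. See: Missing Data Section.
# #
# # 1. reindex() lets you change/add/delete the index on a given axis; it returns a copy of the data: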
#
# df1=df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
# print(df1)
#
# # 2. Drop rows that contain missing values:
# # df1.dropna(how='any',inplace=True)
# # print(df1)
#
# # 3. Fill in missing values:
# # df1=df1.fillna(value=5)
# # print(df1)
#
# # 4. Get a boolean mask of where values are missing:
# # print(pd.isnull(df1))
#
#
#
#
# # V. Operations
# # See: Basic Section On Binary Ops
# #
# # * Stats (operations generally exclude missing data)
# #
# # 1. Compute a descriptive statistic:
# print(df.mean())
#
# # 2. The same operation on the other axis:
# print(df.mean(1))
#
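# # 3. Operate on objects of different dimensionality that need alignment. Pandas broadcasts automatically along the specified dimension: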
# s=pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
# print(s)
#
#
# # * Apply
# #
# # 1. Apply functions to the data:
# print(df.apply(np.cumsum))
# print(df.apply(lambda x:x.max()-x.min()))
#
# # * Histogramming
#
# # See: Histogramming and Discretization
#
# s=pd.Series(np.random.randint(0,7,size=10))
# print(s)
#
# print(s.value_counts())
#
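# # * String methods
# #
# # Series has a set of string processing methods in its str attribute that operate on each element of the array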
# s=pd.Series(['A','B','C','Bcaa',np.nan,'CBA','dog','cat'])
# print(s.str.lower())
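# VI. Merge
# Pandas provides many facilities for easily combining Series, DataFrame and Panel objects with various kinds of set logic (Panel has since been removed from pandas). See: Merging section
#
# * Concat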
# df=pd.DataFrame(np.random.randn(10,4))
# # print(df)
#
# pieces=[df[:3],df[3:7],df[7:]]
# print(pd.concat(pieces))
# * Join: SQL-style merges
# left=pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
# right=pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})
#
# print(left)
# print(right)
#
# mid=pd.merge(left,right,on='key')
# print(mid)
# * Append: add a row to a DataFrame (NOTE: DataFrame.append is gone from current pandas; use pd.concat instead)
# df=pd.DataFrame(np.random.randn(8,4),columns=['A','B','C','D'])
# print(df)
# s=df.iloc[3]
# print(s)
# df=df.append(s,ignore_index=True)
# print(df)
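# VII. Grouping
# By "group by" we mean a process involving one or more of the following steps:
#
# * (Splitting) split the data into groups based on some criteria;
#
# * (Applying) apply a function to each group independently;
#
# * (Combining) combine the results into a data structure;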
# df=pd.DataFrame({'A':['foo','bar','foo','bar','foo','bar','foo','bar']
# ,'B':['one','two','two','one','one','two','one','two']
# ,'C':np.random.randn(8),'D':np.random.randn(8)})
# print(df)
#
# # 1. Group, then apply the sum function to each group:
# print(df.groupby('A').sum())
#
# # 2. Group by multiple columns to form a hierarchical index, then apply the function:
# print(df.groupby(['A','B']).sum())
# VIII. Reshaping
# * Stack
# tuples=list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux']
# ,['one','two','one','two','one','two','one','two']]))
#
# index=pd.MultiIndex.from_tuples(tuples, names=['first','second'])
# df=pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
# df2=df[:4]
# # print(df2)
# print(df2.stack().unstack(1))
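# IX. Time series
# Pandas has simple, powerful and efficient functionality for resampling during frequency conversion (e.g. converting secondly data into 5-minutely data)
# NOTE: the how= argument of resample() is gone from current pandas; use ts.resample('5Min').sum()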
# rng=pd.date_range('1/1/2012',periods=100,freq='S')
# print(rng)
# ts=pd.Series(np.random.randint(0,500,len(rng)),index=rng)
# print(ts)
# print(ts.resample('5Min',how='sum'))
# 1. Time zone representation:
# rng=pd.date_range('3/6/2012 00:00',periods=5,freq='D')
# print(rng)
# ts=pd.Series(np.random.randn(len(rng)),index=rng)
# print(ts)
# ts_utc=ts.tz_localize('UTC')
# print(ts_utc)
#
# # 2. Convert to another time zone:
# print(ts_utc.tz_convert('US/Eastern'))
#
# # 3. Convert between time span representations:
# rng=pd.date_range('1/1/2012',periods=5,freq='M')
# print(rng)
# ts=pd.Series(np.random.randn(len(rng)),index=rng)
# print(ts)
# ps=ts.to_period()
# print(ps)
# print(ps.to_timestamp())
# 4. Converting between period and timestamp enables some convenient arithmetic functions.
# prng=pd.period_range('1990Q1','2000Q4',freq='Q-NOV')
# print(prng)
# ts=pd.Series(np.random.randn(len(prng)),index=prng)
# print(ts)
# ts.index=(prng.asfreq('M','e')+1).asfreq('H', 's')+8
# print(ts.head())
# X. Categorical
# Since version 0.15, pandas can include Categorical data in a DataFrame
# df=pd.DataFrame({'id':[1,2,3,4,5,6],'raw_grade':['a','b','b','a','a','e']})
# print(df)
#
# # 1. Convert the raw grades to a Categorical data type:
# df['grade']=df['raw_grade'].astype('category')
# print(df)
#
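# # 2. Rename the categories to more meaningful names (NOTE: assigning to .cat.categories is gone from current pandas; use .cat.rename_categories(...)):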
# df['grade'].cat.categories=['very good','good','very bad']
# print(df)
#
# # 3. Reorder the categories and add the missing ones:
# df['grade']=df['grade'].cat.set_categories(['very bad','bad','medium','good','very good'])
# print(df['grade'])
#
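# # 4. Sorting follows the order of the categories, not lexical order (NOTE: DataFrame.sort is gone from current pandas; use df.sort_values(by='grade')):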
# print(df.sort('grade'))
#
# # 5. Group by a Categorical column (empty categories can show up in the result too):
# print(df.groupby('grade').size())
# XI. Plotting
# ts=pd.Series(np.random.randn(1000),index=pd.date_range('1/1/2012',periods=1000,freq='D'))
# ts=ts.cumsum()
# ts.plot()
#
# df=pd.DataFrame(np.random.randn(1000,4),index=ts.index,columns=['A','B','C','D'])
# df=df.cumsum()
# plt.figure();df.plot();plt.legend(loc='best')
#
#
# # XII. Getting data in/out
# # 1. Write to a csv file:
# df.to_csv('foo.csv',index=False)
#
# # 2. Read from a csv file:
# pd.read_csv('foo.csv')
#
# # 1. Write to an excel file:
# df.to_excel('foo.xlsx',sheet_name='Sheet1')
#
# # 2. Read from an excel file:
# pd.read_excel('foo.xlsx','Sheet1',index_col=None,na_values=['NA'])