pandas 상세 설명

9146 단어 데이터 분석
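A Series is a one-dimensional array-like object made up of a sequence of values (of any NumPy data type) and an associated array of data labels (its index).
The simplest Series is built from just an array of data: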

In [4]: obj = Series([4, 7, -5, 3])

In [5]: obj
Out[5]:
0    4
1    7
2   -5
3    3

Rows: axis=0; columns: axis=1. Chained element access is written [column][row].
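A DataFrame is a tabular data structure containing an ordered collection of columns, each of which can hold a different value type (numeric, string, boolean, etc.).
A DataFrame has both a row index and a column index; it can be thought of as a dict of Series (all sharing the same index).
Compared with other similar data structures (such as R's data.frame), row-oriented and column-oriented operations on a DataFrame are treated roughly symmetrically.
Under the hood, the data in a DataFrame is stored as one or more two-dimensional blocks (rather than a list, dict, or other collection of one-dimensional arrays).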
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)
In [38]: frame
Out[38]:
   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002



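3. Inspecting a DataFrame
(1)df.index # the row index
(2)df.columns # the column labels
(3)df.shape # the shape of the DataFrame as (rows, columns)
(4)df.head(5) # the first 5 rows
(5)df.tail(5) # the last 5 rows
(6)df.dtypes # the data type of each column
(7)df.info() # summary of the frame: index range, column names, non-null counts, dtypes and memory usage
(8)df.values # the underlying data as an array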


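isin     compute a boolean array indicating whether each value is contained in the passed collection
delete   delete the element at position i and return a new Index
drop     delete the passed values and return a new Index
            data.drop(['Colorado', 'Ohio'])                # drop the rows with these labels
            frame.drop(['year'], axis=1)                   # drop the 'year' column; returns a new object
            frame.drop([0, 1], axis=0)                     # drop the rows labelled 0 and 1
            frame = frame.drop(['year'], axis=1)           # reassign to keep the result
insert   insert an element at position i and return a new Index
				frame.insert(1, 'year', [2012, 2013, 2015, 2017, 2014])  # insert as the 2nd column (positions start at 0)
unique   compute the array of unique values in the Index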
rename	rename row/column labels; signature: DataFrame.rename(mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False, level=None)
			frame.rename(columns={'built':'bui'}, inplace=True)  # inplace=True modifies frame itself
			Note: in that case do not write frame = frame.rename(columns={'built':'bui'}, inplace=True) -- with inplace=True, rename returns None
			frame.rename(index={0:'bu'}, inplace=False)
			frame.rename(index={0:'bu'}, columns={'year': 'day'}, inplace=False)  # rename row and column labels together

reindex    obj3.reindex(range(6), method='ffill')
method:
ffill or pad        fill (carry) values forward
bfill or backfill   fill (carry) values backward
Deleting columns:
The del statement
In [5]:
del odata['date']
.pop removes the column from the frame and returns the removed column

In [6]:
spring = odata.pop('spring')

The .drop() method
drop returns a new object and leaves the original unchanged unless inplace=True is passed

In [8]:
withoutSummer = odata.drop(['summer'],axis=1)


  :
In [112]: data = DataFrame(np.arange(16).reshape((4, 4)),
     ...:                  index=['Ohio', 'Colorado', 'Utah', 'New York'],
     ...:                  columns=['one', 'two', 'three', 'four'])

In [113]: data
Out[113]:
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15

In [114]: data['two']       In [115]: data[['three', 'one']]
Out[114]:                   Out[115]:
Ohio         1                        three  one
Colorado     5              Ohio          2    0
Utah         9              Colorado      6    4
New York    13              Utah         10    8
Name: two                   New York     14   12
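Indexing like this has a few special cases. The first is selecting rows by slicing or with a boolean array: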

In [116]: data[:2]                   In [117]: data[data['three'] > 5]
Out[116]:                            Out[117]:
          one  two  three  four                 one  two  three  four
Ohio        0    1      2     3      Colorado    4    5      6     7
Colorado    4    5      6     7      Utah        8    9     10    11
                                     New York   12   13     14    15



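DataFrame indexing options
(Note: .ix was deprecated in pandas 0.20 and removed in 1.0; use .loc for label-based and .iloc for position-based selection.)

obj[val]               Select a single column or a set of columns from the DataFrame. Convenient special cases: boolean array (filter rows), slice (slice rows), boolean DataFrame (set values by condition). Example of selecting rows and columns together: f.ix[:2, :5]
obj.ix[val]            Select a single row or a subset of rows from the DataFrame
obj.ix[:, val]         Select a single column or a subset of columns, e.g. f.ix[:, 3]
obj.ix[val1, val2]     Select both rows and columns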


In[62]:f['drink']               
Out[62]:
0    0
1    1
2    2
Name: drink, dtype: int64

       loc    iloc                               column           value
df.loc['one'].T
Out[67]:
(column)                (one)   
id                         1001
date        2013-01-02 00:00:00
city                   Beijing
age                          23
category                  100-A
price                      1200
name                          0
Name: one, dtype: object

df.ix[:3]       

  
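Sorting a data set by some criterion is another important built-in operation.
To sort by the row or column index (lexicographically), use the sort_index method, which returns a new, sorted object: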


Series
In [169]: obj = Series(range(4), index=['d', 'a', 'b', 'c'])

In [170]: obj.sort_index()
Out[170]:
a   1
b   2
c   3
d   0

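To sort a Series by its values, use its order method (renamed sort_values in pandas 0.17; order was removed in 0.20):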

In [175]: obj = Series([4, 7, -3, 2])

In [176]: obj.order()
Out[176]:
2   -3
3    2
0    4
1    7



With a DataFrame, you can sort by the index on either axis:

In [171]: frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
     ...:                   columns=['d', 'a', 'b', 'c'])

In [172]: frame.sort_index()         In [173]: frame.sort_index(axis=1)
Out[172]:                            Out[173]:
       d  a  b  c                           a  b  c  d
one    4  5  6  7                    three  1  2  3  0
three  0  1  2  3                    one    5  6  7  4




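Table 5-8 lists the available tie-breaking method options. A DataFrame can compute ranks over the rows or over the columns: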

In [187]: frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
     ...:                    'c': [-2, 5, 8, -2.5]})

In [188]: frame              In [189]: frame.rank(axis=1, method='average')
Out[188]:                    Out[189]:
   a    b    c                   a  b  c
0  0  4.3 -2.0                0  2  3  1
1  1  7.0  5.0                1  1  3  2
2  0 -3.0  8.0                2  2  1  3
3  1  2.0 -2.5                3  2  3  1


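average     Default: assign the average rank to each entry in a group of equal values
min         Use the minimum rank for the whole group
max         Use the maximum rank for the whole group
first       Assign ranks in the order the values appear in the data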


axis            Dataframe   0,  1



       
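count             Number of non-NA values
describe          Compute a set of summary statistics for a Series or for each DataFrame column
min,max           Compute the minimum and maximum values
argmin,argmax     Compute the index positions (integers) at which the minimum or maximum value is found
idxmin,idxmax     Compute the index labels at which the minimum or maximum value is found
quantile          Compute the sample quantile (from 0 to 1)
sum               Sum of the values
mean              Mean of the values
median            Arithmetic median (50% quantile) of the values
mad               Mean absolute deviation from the mean
var               Sample variance of the values
std               Sample standard deviation of the values
diff              Compute the first arithmetic difference (useful for time series)
cumsum            Cumulative sum of the values
cummin cummax     Cumulative minimum and cumulative maximum of the values
cumprod           Cumulative product of the values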


df.describe()
Out[45]:
                id       age        price      name
count     6.000000   6.00000     4.000000  6.000000
mean   1003.500000  36.50000  3299.500000  2.500000
std       1.870829  10.87658  1966.638503  1.870829
min    1001.000000  23.00000  1200.000000  0.000000
25%    1002.250000  32.00000  1899.750000  1.250000
50%    1003.500000  33.00000  3282.500000  2.500000
75%    1004.750000  41.50000  4682.250000  3.750000
max    1006.000000  54.00000  5433.000000  5.000000



   
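isin              Compute a boolean array indicating whether each Series value is contained in the passed sequence of values
unique            Compute the array of unique values in a Series, returned in the order observed
value_counts      Return a Series whose index is the unique values and whose values are their frequencies, sorted by count in descending order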

In [220]: obj.value_counts()
Out[220]:
c    3
a    3
b    2
d    1


        

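dropna      Filter axis labels based on whether the values for each label have missing data, with a threshold for how much missing data to tolerate
fillna      Fill in missing data with a given value or with an interpolation method (such as ffill or bfill)
isnull      Return an object of boolean values indicating which values are missing/NA; the result has the same type as the source object
notnull     Negation of isnull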
In [235]: data = Series([1, NA, 3.5, NA, 7])

In [236]: data.dropna()
Out[236]:
0    1.0
2    3.5
4    7.0




    
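read_csv          Load delimited data from a file, URL, or file-like object. The default delimiter is a comma
read_table        Load delimited data from a file, URL, or file-like object. The default delimiter is a tab ('\t')
read_fwf          Read data in fixed-width column format (that is, with no delimiters)
read_clipboard    Read data from the clipboard; a clipboard version of read_table. Useful for converting tables from web pages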


  
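path              String giving a filesystem location, URL, or file-like object
sep / delimiter   Character sequence or regular expression used to split the fields in each row
header            Row number to use as the column names. Defaults to 0 (the first row); should be None if there is no header row
skiprows          Number of rows to ignore (counted from the start of the file), or a list of row numbers to skip (starting from 0)
index_col         Column number(s) or name(s) to use as the row index. Can be a single name/number or a list of names/numbers (hierarchical index)
names             List of column names for the result; combine with header=None
na_values         Sequence of values to replace with NA
comment           Character(s) used to split comments off the end of lines
dayfirst          When parsing ambiguous dates, treat them as international (day-first) format; defaults to False
nrows             Number of rows to read (counted from the start of the file)
skip_footer       Number of rows to ignore (counted from the end of the file); spelled skipfooter in current pandas
encoding          Text encoding for Unicode; for example, 'utf-8' denotes UTF-8 encoded text
chunksize         Size of the file chunks (for iteration)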

chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot = tot.order(ascending=False)
       :
In [877]: tot[:10]
Out[877]:
E 368
X 364
L 346
O 343
Q 340
M 338
J 337
F 335
K 334
H 330


좋은 웹페이지 즐겨찾기