[빅데이터시스템] pandas

Pandas

  • 데이터 분석을 위한 데이터 처리 라이브러리
  • 대용량 데이터를 처리하기 위한 함수, 메소드들이 제공

Series (one-dimensional array)

: index와 data로 이루어진 pandas의 데이터구조

DataFrame (two-dimensional array)

: 각각의 인덱스가 series 로 이루어져 있는 pandas의 데이터구조

Pandas Tutorial

import pandas as pd
import numpy as np
import matplotlib as plt
#check pandas version
pd.__version__
'0.24.1'
#creating a series from a list
arr = [0,1,2,3,4]
print(type(arr))
s1 = pd.Series(arr)
s1
<class 'list'>





0    0
1    1
2    2
3    3
4    4
dtype: int64
s1.index
RangeIndex(start=0, stop=5, step=1)
npArr = np.arange(5)
print(type(npArr))
s1_1=pd.Series(npArr)
s1_1
<class 'numpy.ndarray'>





0    0
1    1
2    2
3    3
4    4
dtype: int64
np.array([0,1,2,3,4])
array([0, 1, 2, 3, 4])
#change index
arr = [0,1,2,3,4]
s1 = pd.Series(arr)
s1
0    0
1    1
2    2
3    3
4    4
dtype: int64
order = [1,2,3,4,5]
s2 = pd.Series(arr, index = order) # index 값을 할당해서 인덱스 변경
s2
1    0
2    1
3    2
4    3
5    4
dtype: int64
n = np.random.randn(5)
n
array([ 0.86042598, -0.96561393, -0.34827275, -0.41878316,  0.24770315])
pd.Series(n)
0    0.860426
1   -0.965614
2   -0.348273
3   -0.418783
4    0.247703
dtype: float64
index = ['a','b','c','d','e']
s2 = pd.Series(n, index = index)
s2
a    0.860426
b   -0.965614
c   -0.348273
d   -0.418783
e    0.247703
dtype: float64
# create a series from dictionary
d = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
s3 = pd.Series(d)
s3
a    1
b    2
c    3
d    4
e    5
dtype: int64
# you can modify the index of series
print(s1)
print(s1.index)
0    0
1    1
2    2
3    3
4    4
dtype: int64
RangeIndex(start=0, stop=5, step=1)
s1.index = ['A','B','C','D','E']
s1
A    0
B    1
C    2
D    3
E    4
dtype: int64
print(s1.index)
Index(['A', 'B', 'C', 'D', 'E'], dtype='object')
# slicing
a = s1[:3]
a
A    0
B    1
C    2
dtype: int64
s1[:-1]
A    0
B    1
C    2
D    3
dtype: int64
s1[-2:]
D    3
E    4
dtype: int64
print(s1)
print(s3)
s4 = s1.append(s3)
print(s4)
A    0
B    1
C    2
D    3
E    4
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64
A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64
# drop
s4
A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64
s4.drop('e') # e를 제외하고 출력
A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
dtype: int64
s4 # s4 자체는 변하지 않음
A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
e    5
dtype: int64
s4 = s4.drop('e') # s4를 변화
s4
A    0
B    1
C    2
D    3
E    4
a    1
b    2
c    3
d    4
dtype: int64

Series operation

arr1 = [0,1,2,3,4,5,7]
arr2 = [6,7,8,9,5]
s5 = pd.Series(arr2)
s5
0    6
1    7
2    8
3    9
4    5
dtype: int64
s6 = pd.Series(arr1)
s6
0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64
s5.add(s6) # 같은 인덱스까리 더해주고, 없는 것들은 NaN
0     6.0
1     8.0
2    10.0
3    12.0
4     9.0
5     NaN
6     NaN
dtype: float64
s5.sub(s6)
0    6.0
1    6.0
2    6.0
3    6.0
4    1.0
5    NaN
6    NaN
dtype: float64
s5.mul(s6)
0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64
s5.div(s6) #  inf
0     inf
1    7.00
2    4.00
3    3.00
4    1.25
5     NaN
6     NaN
dtype: float64
s6
0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64
print("median", s6.median())
print("max", s6.max())
print("min", s6.min())
median 3.0
max 7
min 0
s7 = s5.mul(s6)
s7
0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64
# NaN은 제외하고 계산
print("median", s5.median())
print("max", s5.max())
print("min", s5.min())
median 7.0
max 9
min 5

Create Dataframe

dates = pd.date_range('today', periods=6)
dates
DatetimeIndex(['2020-09-29 19:59:15.088382', '2020-09-30 19:59:15.088382',
               '2020-10-01 19:59:15.088382', '2020-10-02 19:59:15.088382',
               '2020-10-03 19:59:15.088382', '2020-10-04 19:59:15.088382'],
              dtype='datetime64[ns]', freq='D')
num_arr = np.random.randn(6,4)
num_arr
array([[ 0.38406967, -1.02847711,  0.83911484,  0.79294576],
       [-0.23224823, -0.78362413, -3.17592451,  0.61708198],
       [-1.59406003,  1.646996  ,  0.45997278,  0.28394742],
       [ 0.0244321 , -0.15328843, -1.70151417,  0.64868737],
       [-0.56298435, -0.83298394,  1.26453586, -1.59056282],
       [ 0.01777079,  0.50983863,  2.29359854,  2.04781485]])
columns = ['A','B','C','D']
df1 = pd.DataFrame(num_arr, index = dates, columns = columns)
df1
A B C D
2020-09-29 19:59:15.088382 0.384070 -1.028477 0.839115 0.792946
2020-09-30 19:59:15.088382 -0.232248 -0.783624 -3.175925 0.617082
2020-10-01 19:59:15.088382 -1.594060 1.646996 0.459973 0.283947
2020-10-02 19:59:15.088382 0.024432 -0.153288 -1.701514 0.648687
2020-10-03 19:59:15.088382 -0.562984 -0.832984 1.264536 -1.590563
2020-10-04 19:59:15.088382 0.017771 0.509839 2.293599 2.047815
pd.DataFrame(num_arr)
0 1 2 3
0 0.384070 -1.028477 0.839115 0.792946
1 -0.232248 -0.783624 -3.175925 0.617082
2 -1.594060 1.646996 0.459973 0.283947
3 0.024432 -0.153288 -1.701514 0.648687
4 -0.562984 -0.832984 1.264536 -1.590563
5 0.017771 0.509839 2.293599 2.047815
df1 = pd.DataFrame(num_arr, index = [1,2,3,4,5,6], columns = columns)
df1
A B C D
1 0.384070 -1.028477 0.839115 0.792946
2 -0.232248 -0.783624 -3.175925 0.617082
3 -1.594060 1.646996 0.459973 0.283947
4 0.024432 -0.153288 -1.701514 0.648687
5 -0.562984 -0.832984 1.264536 -1.590563
6 0.017771 0.509839 2.293599 2.047815
# create dataframe with dictionary array

data = {'animal':['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog','dog'],
       'age':[2.5, 3, 0.5, np.nan, 5, 2, 4.5 , np.nan, 7, 3],
       'visits':[1,3,2,3,2,3,1,1,2,1],
       'priority':['yes', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes', 'no']}
df2 = pd.DataFrame(data)
df2
animal age visits priority
0 cat 2.5 1 yes
1 cat 3.0 3 yes
2 snake 0.5 2 no
3 dog NaN 3 no
4 dog 5.0 2 yes
5 cat 2.0 3 yes
6 snake 4.5 1 no
7 cat NaN 1 no
8 dog 7.0 2 yes
9 dog 3.0 1 no
data = {'animal':['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog','dog'],
       'age':[2.5, 3, 0.5, np.nan, 5, 2, 4.5 , np.nan, 7, 3],
       'visits':[1,3,2,3,2,3,1,1,2,1],
       'priority':['yes', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df2 = pd.DataFrame(data, index = labels)
df2
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 no
e dog 5.0 2 yes
f cat 2.0 3 yes
g snake 4.5 1 no
h cat NaN 1 no
i dog 7.0 2 yes
j dog 3.0 1 no
df2.dtypes
animal       object
age         float64
visits        int64
priority     object
dtype: object
df2.head() # 데이터프레임의 앞 5개만 나옴
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 no
e dog 5.0 2 yes
df2.head(3)
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
df3 = df2.head(3)
df3
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
df2.tail() # tail은 뒤에 5개만 나옴
animal age visits priority
f cat 2.0 3 yes
g snake 4.5 1 no
h cat NaN 1 no
i dog 7.0 2 yes
j dog 3.0 1 no
df2.index
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
df2.columns
Index(['animal', 'age', 'visits', 'priority'], dtype='object')
df2.values
array([['cat', 2.5, 1, 'yes'],
       ['cat', 3.0, 3, 'yes'],
       ['snake', 0.5, 2, 'no'],
       ['dog', nan, 3, 'no'],
       ['dog', 5.0, 2, 'yes'],
       ['cat', 2.0, 3, 'yes'],
       ['snake', 4.5, 1, 'no'],
       ['cat', nan, 1, 'no'],
       ['dog', 7.0, 2, 'yes'],
       ['dog', 3.0, 1, 'no']], dtype=object)
type(df2.values)
numpy.ndarray
df2
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 no
e dog 5.0 2 yes
f cat 2.0 3 yes
g snake 4.5 1 no
h cat NaN 1 no
i dog 7.0 2 yes
j dog 3.0 1 no
df2.describe() # 데이터프레임을 바탕을 여러가지 통계값을 출력한다.
age visits
count 8.000000 10.000000
mean 3.437500 1.900000
std 2.007797 0.875595
min 0.500000 1.000000
25% 2.375000 1.000000
50% 3.000000 2.000000
75% 4.625000 2.750000
max 7.000000 3.000000
df2.T # transpose
a b c d e f g h i j
animal cat cat snake dog dog cat snake cat dog dog
age 2.5 3 0.5 NaN 5 2 4.5 NaN 7 3
visits 1 3 2 3 2 3 1 1 2 1
priority yes yes no no yes yes no no yes no
df2.sort_values(by='age') # age에 따라서 인덱스들이 정렬된다.
animal age visits priority
c snake 0.5 2 no
f cat 2.0 3 yes
a cat 2.5 1 yes
b cat 3.0 3 yes
j dog 3.0 1 no
g snake 4.5 1 no
e dog 5.0 2 yes
i dog 7.0 2 yes
d dog NaN 3 no
h cat NaN 1 no
#slicing dataframe
df2[1:3]
animal age visits priority
b cat 3.0 3 yes
c snake 0.5 2 no
df2.sort_values(by='age')[1:3]
animal age visits priority
f cat 2.0 3 yes
a cat 2.5 1 yes
# query dataframe by tag
df2
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 no
e dog 5.0 2 yes
f cat 2.0 3 yes
g snake 4.5 1 no
h cat NaN 1 no
i dog 7.0 2 yes
j dog 3.0 1 no
# query dataframe by tag
df2[['age', 'visits']] # [] 대괄호 두개가 필요
age visits
a 2.5 1
b 3.0 3
c 0.5 2
d NaN 3
e 5.0 2
f 2.0 3
g 4.5 1
h NaN 1
i 7.0 2
j 3.0 1
type(df2[['age','visits']])
pandas.core.frame.DataFrame
df2
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 no
e dog 5.0 2 yes
f cat 2.0 3 yes
g snake 4.5 1 no
h cat NaN 1 no
i dog 7.0 2 yes
j dog 3.0 1 no
#iloc  slice와 같음
df2.iloc[1:3]
animal age visits priority
b cat 3.0 3 yes
c snake 0.5 2 no
df2[1:3]
animal age visits priority
b cat 3.0 3 yes
c snake 0.5 2 no
print(type(df2.iloc[1:3]))
print(type(df2[1:3]))
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
df3 = df2.copy()
df3
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 no
e dog 5.0 2 yes
f cat 2.0 3 yes
g snake 4.5 1 no
h cat NaN 1 no
i dog 7.0 2 yes
j dog 3.0 1 no
df3.isnull() # Nan에 True임을 확인할 수 있다
animal age visits priority
a False False False False
b False False False False
c False False False False
d False True False False
e False False False False
f False False False False
g False False False False
h False True False False
i False False False False
j False False False False
df3
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 no
e dog 5.0 2 yes
f cat 2.0 3 yes
g snake 4.5 1 no
h cat NaN 1 no
i dog 7.0 2 yes
j dog 3.0 1 no
#change value
df3.loc['f','age']
2.0
df3.loc['f','age'] = 1.5
df3 # f의 age가 1.5로 변경
animal age visits priority
a cat 2.5 1.0 yes
b cat 3.0 3.0 yes
c snake 0.5 2.0 no
d dog NaN 3.0 no
e dog 5.0 2.0 yes
f cat 1.5 3.0 yes
g snake 4.5 1.0 no
h cat NaN 1.0 no
i dog 7.0 2.0 yes
j dog 3.0 1.0 no
fage 1.5 1.5 1.5 1.5
# mean 숫자가 들어있는 row의 mean을 구함
df3.mean()
age       3.166667
visits    1.863636
dtype: float64
df3[['age']].mean()
age    3.166667
dtype: float64
df3['age'].mean()
3.1666666666666665
# 대괄호를 두개 입력해야함. 타입이 달라짐
print(type(df3[['age']].mean()))
print(type(df3['age'].mean()))
<class 'pandas.core.series.Series'>
<class 'float'>
print(df3['age'])
type(df3['age'])
a       2.5
b       3.0
c       0.5
d       NaN
e       5.0
f       1.5
g       4.5
h       NaN
i       7.0
j       3.0
fage    1.5
Name: age, dtype: float64





pandas.core.series.Series
print(df3[['age']])
type(df3[['age']])
      age
a     2.5
b     3.0
c     0.5
d     NaN
e     5.0
f     1.5
g     4.5
h     NaN
i     7.0
j     3.0
fage  1.5





pandas.core.frame.DataFrame
print(df3['visits'].sum())
print(type(df3['visits'].sum()))
20.5
<class 'numpy.float64'>
df3[['visits']].sum()
print(type(df3[['visits']].sum()))
<class 'pandas.core.series.Series'>
# lowwer, upper
string = pd.Series(['A','C','D','Aaa','BaCa',np.nan,'CBA', 'cow', 'owl'])
string
0       A
1       C
2       D
3     Aaa
4    BaCa
5     NaN
6     CBA
7     cow
8     owl
dtype: object
string.str.upper()
0       A
1       C
2       D
3     AAA
4    BACA
5     NaN
6     CBA
7     COW
8     OWL
dtype: object
string.str.lower()
0       a
1       c
2       d
3     aaa
4    baca
5     NaN
6     cba
7     cow
8     owl
dtype: object

Operations for DataFrame missing values

df4 = df3.copy()
df4
animal age visits priority
a cat 2.5 1.0 yes
b cat 3.0 3.0 yes
c snake 0.5 2.0 no
d dog NaN 3.0 no
e dog 5.0 2.0 yes
f cat 1.5 3.0 yes
g snake 4.5 1.0 no
h cat NaN 1.0 no
i dog 7.0 2.0 yes
j dog 3.0 1.0 no
fage 1.5 1.5 1.5 1.5
df4.fillna(4, inplace=True) # NaN 값에 4를 넣어줌
df4
animal age visits priority
a cat 2.5 1.0 yes
b cat 3.0 3.0 yes
c snake 0.5 2.0 no
d dog 4.0 3.0 no
e dog 5.0 2.0 yes
f cat 1.5 3.0 yes
g snake 4.5 1.0 no
h cat 4.0 1.0 no
i dog 7.0 2.0 yes
j dog 3.0 1.0 no
fage 1.5 1.5 1.5 1.5

CSV file

  • CSV (comma separated values) file read
  • read.csv("csv file")
  • head
import pandas as pd

df = pd.read_csv("./8.intropandas/olympics.csv")
df.head()
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 NaN № Summer 01 ! 02 ! 03 ! Total № Winter 01 ! 02 ! 03 ! Total № Games 01 ! 02 ! 03 ! Combined total
1 Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
2 Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
3 Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
4 Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12

Index and Skiprows

df = pd.read_csv("./8.intropandas/olympics.csv", index_col=0) # index를 컬럼 0으로 지정
df.head()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0
NaN № Summer 01 ! 02 ! 03 ! Total № Winter 01 ! 02 ! 03 ! Total № Games 01 ! 02 ! 03 ! Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
df = pd.read_csv("./8.intropandas/olympics.csv", index_col=0, skiprows = 1) # 시작 row를 1번으로 지정
df.head()
№ Summer 01 ! 02 ! 03 ! Total № Winter 01 !.1 02 !.1 03 !.1 Total.1 № Games 01 !.2 02 !.2 03 !.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

Columns and indexs

df.columns
Index(['№ Summer', '01 !', '02 !', '03 !', 'Total', '№ Winter', '01 !.1',
       '02 !.1', '03 !.1', 'Total.1', '№ Games', '01 !.2', '02 !.2', '03 !.2',
       'Combined total'],
      dtype='object')
df.index
Index(['Afghanistan (AFG)', 'Algeria (ALG)', 'Argentina (ARG)',
       'Armenia (ARM)', 'Australasia (ANZ) [ANZ]', 'Australia (AUS) [AUS] [Z]',
       'Austria (AUT)', 'Azerbaijan (AZE)', 'Bahamas (BAH)', 'Bahrain (BRN)',
       ...
       'Uzbekistan (UZB)', 'Venezuela (VEN)', 'Vietnam (VIE)',
       'Virgin Islands (ISV)', 'Yugoslavia (YUG) [YUG]',
       'Independent Olympic Participants (IOP) [IOP]', 'Zambia (ZAM) [ZAM]',
       'Zimbabwe (ZIM) [ZIM]', 'Mixed team (ZZX) [ZZX]', 'Totals'],
      dtype='object', length=147)

Cleaning Dataframes

df.columns 를 보면, 01!, 02!, 03! 으로 금,은,동메달을 표현하는 등 데이터 저장을 위한 표기법을 사용한 경우가 있음. 데이터를 시각화 할 때 이와같은 데이터들을 보기좋게 표현할 수 있음.

for col in df.columns:
    if col[:2] == '01' :
        df.rename(columns={col:"Gold"+col[4:]}, inplace = True)
    if col[:2] == '02' : 
        df.rename(columns={col:"Silver"+col[4:]}, inplace = True)
    if col[:2] == '03' :
        df.rename(columns={col:"Bronze"+col[4:]}, inplace = True)
    if col[:1] == '№' :
        df.rename(columns={col:"#"+col[2:]}, inplace = True)
df.head()
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

Boolean Masking

df['Gold'].head()
Afghanistan (AFG)           0
Algeria (ALG)               5
Argentina (ARG)            18
Armenia (ARM)               1
Australasia (ANZ) [ANZ]     3
Name: Gold, dtype: int64
(df['Gold'] > 0).head()
Afghanistan (AFG)          False
Algeria (ALG)               True
Argentina (ARG)             True
Armenia (ARM)               True
Australasia (ANZ) [ANZ]     True
Name: Gold, dtype: bool
gold = df.where(df["Gold"]>0)
gold.head()
# True는 값이 나오고 false는 NaN 출력
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Algeria (ALG) 12.0 5.0 2.0 8.0 15.0 3.0 0.0 0.0 0.0 0.0 15.0 5.0 2.0 8.0 15.0
Argentina (ARG) 23.0 18.0 24.0 28.0 70.0 18.0 0.0 0.0 0.0 0.0 41.0 18.0 24.0 28.0 70.0
Armenia (ARM) 5.0 1.0 2.0 9.0 12.0 6.0 0.0 0.0 0.0 0.0 11.0 1.0 2.0 9.0 12.0
Australasia (ANZ) [ANZ] 2.0 3.0 4.0 5.0 12.0 0.0 0.0 0.0 0.0 0.0 2.0 3.0 4.0 5.0 12.0
#drop NaN -> dropna
gold["Gold"].count()
100
df["Gold"].count()
147
gold1 = gold.dropna()
gold1.head()
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12.0 5.0 2.0 8.0 15.0 3.0 0.0 0.0 0.0 0.0 15.0 5.0 2.0 8.0 15.0
Argentina (ARG) 23.0 18.0 24.0 28.0 70.0 18.0 0.0 0.0 0.0 0.0 41.0 18.0 24.0 28.0 70.0
Armenia (ARM) 5.0 1.0 2.0 9.0 12.0 6.0 0.0 0.0 0.0 0.0 11.0 1.0 2.0 9.0 12.0
Australasia (ANZ) [ANZ] 2.0 3.0 4.0 5.0 12.0 0.0 0.0 0.0 0.0 0.0 2.0 3.0 4.0 5.0 12.0
Australia (AUS) [AUS] [Z] 25.0 139.0 152.0 177.0 468.0 18.0 5.0 3.0 4.0 12.0 43.0 144.0 155.0 181.0 480.0
gold1["Gold"].count()
100

Boolean indexing

df.head()
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12
a = df["Gold"]>0
a.head()
Afghanistan (AFG)          False
Algeria (ALG)               True
Argentina (ARG)             True
Armenia (ARM)               True
Australasia (ANZ) [ANZ]     True
Name: Gold, dtype: bool
b = df[a]
b.head()
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12
Australia (AUS) [AUS] [Z] 25 139 152 177 468 18 5 3 4 12 43 144 155 181 480

And Or

  • 가능한 ( )를 사용하여 operation 순서를 정해주는 것이 필요함
df[ (df["Gold"]>0) | \
    (df["Gold.1"]>0) \
  ]
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12
Australia (AUS) [AUS] [Z] 25 139 152 177 468 18 5 3 4 12 43 144 155 181 480
Austria (AUT) 26 18 33 35 86 22 59 78 81 218 48 77 111 116 304
Azerbaijan (AZE) 5 6 5 15 26 5 0 0 0 0 10 6 5 15 26
Bahamas (BAH) 15 5 2 5 12 0 0 0 0 0 15 5 2 5 12
Belarus (BLR) 5 12 24 39 75 6 6 4 5 15 11 18 28 44 90
Belgium (BEL) 25 37 52 53 142 20 1 1 3 5 45 38 53 56 147
Brazil (BRA) 21 23 30 55 108 7 0 0 0 0 28 23 30 55 108
Bulgaria (BUL) [H] 19 51 85 78 214 19 1 2 3 6 38 52 87 81 220
Burundi (BDI) 5 1 0 0 1 0 0 0 0 0 5 1 0 0 1
Cameroon (CMR) 13 3 1 1 5 1 0 0 0 0 14 3 1 1 5
Canada (CAN) 25 59 99 121 279 22 62 56 52 170 47 121 155 173 449
Chile (CHI) [I] 22 2 7 4 13 16 0 0 0 0 38 2 7 4 13
China (CHN) [CHN] 9 201 146 126 473 10 12 22 19 53 19 213 168 145 526
Colombia (COL) 18 2 6 11 19 1 0 0 0 0 19 2 6 11 19
Costa Rica (CRC) 14 1 1 2 4 6 0 0 0 0 20 1 1 2 4
Croatia (CRO) 6 6 7 10 23 7 4 6 1 11 13 10 13 11 34
Cuba (CUB) [Z] 19 72 67 70 209 0 0 0 0 0 19 72 67 70 209
Czech Republic (CZE) [CZE] 5 14 15 15 44 6 7 9 8 24 11 21 24 23 68
Czechoslovakia (TCH) [TCH] 16 49 49 45 143 16 2 8 15 25 32 51 57 60 168
Denmark (DEN) [Z] 26 43 68 68 179 13 0 1 0 1 39 43 69 68 180
Dominican Republic (DOM) 13 3 2 1 6 0 0 0 0 0 13 3 2 1 6
Ecuador (ECU) 13 1 1 0 2 0 0 0 0 0 13 1 1 0 2
Egypt (EGY) [EGY] [Z] 21 7 9 10 26 1 0 0 0 0 22 7 9 10 26
Estonia (EST) 11 9 9 15 33 9 4 2 1 7 20 13 11 16 40
Ethiopia (ETH) 12 21 7 17 45 2 0 0 0 0 14 21 7 17 45
Finland (FIN) 24 101 84 117 302 22 42 62 57 161 46 143 146 174 463
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Russia (RUS) [RUS] 5 132 121 142 395 6 49 40 35 124 11 181 161 177 519
Russian Empire (RU1) [RU1] 3 1 4 3 8 0 0 0 0 0 3 1 4 3 8
Soviet Union (URS) [URS] 9 395 319 296 1010 9 78 57 59 194 18 473 376 355 1204
Unified Team (EUN) [EUN] 1 45 38 29 112 1 9 6 8 23 2 54 44 37 135
Serbia (SRB) [SRB] 3 1 2 4 7 2 0 0 0 0 5 1 2 4 7
Serbia and Montenegro (SCG) [SCG] 3 2 4 3 9 3 0 0 0 0 6 2 4 3 9
Slovakia (SVK) [SVK] 5 7 9 8 24 6 2 2 1 5 11 9 11 9 29
Slovenia (SLO) 6 4 6 9 19 7 2 4 9 15 13 6 10 18 34
South Africa (RSA) 18 23 26 27 76 6 0 0 0 0 24 23 26 27 76
Spain (ESP) [Z] 22 37 59 35 131 19 1 0 1 2 41 38 59 36 133
Suriname (SUR) [E] 11 1 0 1 2 0 0 0 0 0 11 1 0 1 2
Sweden (SWE) [Z] 26 143 164 176 483 22 50 40 54 144 48 193 204 230 627
Switzerland (SUI) 27 47 73 65 185 22 50 40 48 138 49 97 113 113 323
Syria (SYR) 12 1 1 1 3 0 0 0 0 0 12 1 1 1 3
Chinese Taipei (TPE) [TPE] [TPE2] 13 2 7 12 21 11 0 0 0 0 24 2 7 12 21
Thailand (THA) 15 7 6 11 24 3 0 0 0 0 18 7 6 11 24
Trinidad and Tobago (TRI) [TRI] 16 2 5 11 18 3 0 0 0 0 19 2 5 11 18
Tunisia (TUN) 13 3 3 4 10 0 0 0 0 0 13 3 3 4 10
Turkey (TUR) 21 39 25 24 88 16 0 0 0 0 37 39 25 24 88
Uganda (UGA) 14 2 3 2 7 0 0 0 0 0 14 2 3 2 7
Ukraine (UKR) 5 33 27 55 115 6 2 1 4 7 11 35 28 59 122
United Arab Emirates (UAE) 8 1 0 0 1 0 0 0 0 0 8 1 0 0 1
United States (USA) [P] [Q] [R] [Z] 26 976 757 666 2399 22 96 102 84 282 48 1072 859 750 2681
Uruguay (URU) 20 2 2 6 10 1 0 0 0 0 21 2 2 6 10
Uzbekistan (UZB) 5 5 5 10 20 6 1 0 0 1 11 6 5 10 21
Venezuela (VEN) 17 2 2 8 12 4 0 0 0 0 21 2 2 8 12
Yugoslavia (YUG) [YUG] 16 26 29 28 83 14 0 3 1 4 30 26 32 29 87
Zimbabwe (ZIM) [ZIM] 12 3 4 1 8 1 0 0 0 0 13 3 4 1 8
Mixed team (ZZX) [ZZX] 3 8 5 4 17 0 0 0 0 0 3 8 5 4 17
Totals 27 4809 4775 5130 14714 22 959 958 948 2865 49 5768 5733 6078 17579

101 rows × 15 columns

df[ (df["Gold.1"]>0) & \
    (df["Gold"] == 0) \
     ]
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total
Liechtenstein (LIE) 16 0 0 0 0 18 2 2 5 9 34 2 2 5 9

Add column

df.head()
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12
df.index
Index(['Afghanistan (AFG)', 'Algeria (ALG)', 'Argentina (ARG)',
       'Armenia (ARM)', 'Australasia (ANZ) [ANZ]', 'Australia (AUS) [AUS] [Z]',
       'Austria (AUT)', 'Azerbaijan (AZE)', 'Bahamas (BAH)', 'Bahrain (BRN)',
       ...
       'Uzbekistan (UZB)', 'Venezuela (VEN)', 'Vietnam (VIE)',
       'Virgin Islands (ISV)', 'Yugoslavia (YUG) [YUG]',
       'Independent Olympic Participants (IOP) [IOP]', 'Zambia (ZAM) [ZAM]',
       'Zimbabwe (ZIM) [ZIM]', 'Mixed team (ZZX) [ZZX]', 'Totals'],
      dtype='object', length=147)
df["Country"] = df.index
df.head()
#Summer Gold Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total Country
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2 Afghanistan (AFG)
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15 Algeria (ALG)
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70 Argentina (ARG)
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12 Armenia (ARM)
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12 Australasia (ANZ) [ANZ]

set_index, reset_index

  • 특정한 컬럼을 인덱스로 할 수 있음
df = df.set_index("Gold") # Gold 컬럼을 인덱스로 설정. 
df.head()
#Summer Silver Bronze Total #Winter Gold.1 Silver.1 Bronze.1 Total.1 #Games Gold.2 Silver.2 Bronze.2 Combined total Country
Gold
0 13 0 2 2 0 0 0 0 0 13 0 0 2 2 Afghanistan (AFG)
5 12 2 8 15 3 0 0 0 0 15 5 2 8 15 Algeria (ALG)
18 23 24 28 70 18 0 0 0 0 41 18 24 28 70 Argentina (ARG)
1 5 2 9 12 6 0 0 0 0 11 1 2 9 12 Armenia (ARM)
3 2 4 5 12 0 0 0 0 0 2 3 4 5 12 Australasia (ANZ) [ANZ]

좋은 웹페이지 즐겨찾기