Data Analysis Basics (1)
# coding: utf-8
# In[1]:
import pandas as pd
# In[4]:
data = pd.read_csv('pokemon.csv')
# In[6]:
data.head()
# In[12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
# Input data files are available in the current directory.
# For example, running this cell (click Run or press Shift+Enter) will list the files in that directory
from subprocess import check_output
print(check_output(["ls", "./"]).decode("utf8"))
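# In[ ]:
# (Added sketch) The subprocess/ls call above only works where an `ls` command exists.
# A portable, standard-library alternative (assuming pokemon.csv sits in the current
# working directory) is:
import os
print(os.listdir("./"))  # list files in the current directory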
# In[13]:
f,ax = plt.subplots(figsize = (18,18))
# correlation heatmap of the numeric columns
# (newer pandas versions may require data.corr(numeric_only=True))
sns.heatmap(data.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
# In[14]:
data.head(10)
# In[15]:
data.info()
# In[16]:
data.columns
# In[19]:
dictionary = {'spain' : 'madrid','usa' : 'vegas'}
print(dictionary.keys())
print(dictionary.values())
# In[20]:
dictionary['spain'] = "barcelona" # update existing entry
print(dictionary)
dictionary['france'] = "paris" # Add new entry
print(dictionary)
del dictionary['spain'] # remove entry with key 'spain'
print(dictionary)
print('france' in dictionary) # check include or not
dictionary.clear() # remove all entries in dict
print(dictionary)
# In[23]:
x = data['Defense'] > 200
data[x]
# In[27]:
x = (data['Defense'] > 200) & (data['Attack'] > 100)
# In[28]:
data[x]
# In[29]:
data.shape
# In[35]:
print(data['Type 2'].value_counts(dropna =False))
# In[36]:
print(data['Type 2'].value_counts(dropna =True))
# In[37]:
data.describe()
# In[38]:
# For example: compare the Attack of Pokemon that are Legendary vs. not
# Black line at top is max
# Blue line at top is 75%
# Red line is median (50%)
# Blue line at bottom is 25%
# Black line at bottom is min
# There are no outliers
data.boxplot(column='Attack',by = 'Legendary')
# In[39]:
data_new = data.head()
# In[164]:
data_new
# In[172]:
data_new[['Attack','Defense']]
# In[171]:
data_new.index = ['A','b','c','e','f']
data_new
# In[41]:
# lets melt
# id_vars = what we do not wish to melt
# value_vars = what we want to melt
melt = pd.melt(frame = data_new,id_vars = 'Name',value_vars=['Attack','Defense'])
# In[42]:
melt
# In[46]:
melt['variable'].value_counts()
# In[48]:
# The index will be Name
# The columns will come from the 'variable' column
# The cell values will come from the 'value' column
melt.pivot(index = 'Name', columns = 'variable',values='value')
# In[56]:
# First, let's create 2 data frames
data1 = data.head()
data2= data.tail()
conc_data_row = pd.concat([data1,data2],axis =0,ignore_index =True) # axis = 0 : adds dataframes in row
conc_data_row
# In[59]:
data1 = data['Attack'].head()
data2= data['Defense'].head()
# In[60]:
data1
# In[61]:
data2
# In[62]:
conc_data_col = pd.concat([data1,data2],axis =1) # axis = 1 : concatenates the two series side by side as columns
conc_data_col
# In[96]:
a = np.array([[1,2,3],[3,2,1]])
b = np.array([[4,5,6],[6,5,4]])
# In[66]:
x = np.arange(9.).reshape(3, 3)  # 3x3 array containing 0.0 ... 8.0
# In[69]:
y = np.where(x > 5)  # tuple of index arrays where the condition holds
# In[70]:
x[y]  # fancy indexing with those indices returns the matching values
# In[72]:
y
# In[73]:
x = np.random.randn(4,4)
print(np.where(x > 0, 2, -2))  # elementwise: 2 where positive, -2 otherwise
# In[74]:
x
# In[81]:
xarr = np.array([1.1,1.2,1.3,1.4,1.5])
yarr = np.array([2.1,2.2,2.3,2.4,2.5])
zarr = np.array([True,False,True,True,False])
result = [(x if c else y)
for x,y,c in zip(xarr,yarr,zarr)]
# In[82]:
result = np.where(zarr,xarr,yarr)  # vectorized equivalent of the list comprehension above
print(result)
# In[99]:
np.concatenate([a,b],axis = 1)  # join the two 2x3 arrays along columns -> 2x6
# In[100]:
data.dtypes
# In[101]:
# lets convert object(str) to categorical and int to float.
data['Type 1'] = data['Type 1'].astype('category')
data['Speed'] = data['Speed'].astype('float')
# In[102]:
data.dtypes
# In[103]:
# Lets drop nan values
data1 = data  # we will also use data later to fill missing values, so keep a reference as data1
data1["Type 2"].dropna(inplace = True)  # inplace = True means we do not assign the result to a new variable; the NaN values are dropped in place
# So, does it work?
# In[106]:
data1["Type 2"].value_counts()
# In[113]:
data['Type 1'].notnull().all()  # returns True because Type 1 has no NaN values
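# In[ ]:
# (Added sketch) assert is a compact way to make such checks fail loudly;
# this passes silently because Type 1 contains no missing values:
assert data['Type 1'].notnull().all()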
# In[128]:
# data frames from dictionary
country = ["Spain","France"]
population = ["11","12"]
list_label = ["country","population"]
list_col = [country,population]
zipped = list(zip(list_label,list_col))
data_dict = dict(zipped)
df = pd.DataFrame(data_dict)
df
# In[117]:
zipped
# In[118]:
data_dict
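# In[ ]:
# (Added sketch) The zip/dict construction above is one option; passing a plain
# dict of lists to pd.DataFrame builds the same frame more directly:
pd.DataFrame({"country": ["Spain", "France"], "population": ["11", "12"]})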
# In[129]:
a = ((1,2),(3,4))
a = dict(a)
# In[130]:
a
# In[131]:
df["capital"] = ["madrid","paris"]
df
# In[132]:
# Broadcasting
df["income"] = 0 #Broadcasting entire column
df
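# In[ ]:
# (Added sketch) Broadcasting also applies to arithmetic: a scalar operation is
# applied to every row of the column at once
df["income"] = df["income"] + 5000  # each row gets 5000 added
df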
# In[137]:
# Plotting all data
data1 = data.loc[:,["Attack","Defense","Speed"]]
data1.plot()
# In[192]:
data1.head()
# In[144]:
data1.iloc[[0]]
data1.loc[[0]]
# In[155]:
data.iloc[0:2,[1,2]]#
# In[158]:
data1.plot(subplots=True)
# In[161]:
data1.plot(kind = "scatter",x="Attack",y = "Defense")
# In[162]:
#hist plot
data1.plot(kind = "hist",y = "Defense",bins = 50,range= (0,250),density = True)  # density=True replaces the deprecated normed argument
# In[163]:
# histogram subplot with non cumulative and cumulative
fig, axes = plt.subplots(nrows=2,ncols=1)
data1.plot(kind = "hist",y = "Defense",bins = 50,range= (0,250),density = True,ax = axes[0])
data1.plot(kind = "hist",y = "Defense",bins = 50,range= (0,250),density = True,ax = axes[1],cumulative = True)
# In[173]:
time_list = ["1992-03-08","1992-04-12"]
print(type(time_list[1]))  # as you can see, the dates are plain strings
# however, we want them to be datetime objects
datetime_object = pd.to_datetime(time_list)
print(type(datetime_object))
# In[174]:
# To practice, let's take the head of the pokemon data and add a date column to it
data2 = data.head().copy()  # copy() avoids a SettingWithCopyWarning when we add the column below
# In[175]:
data2
# In[185]:
# now build a list of dates, convert it to datetimes, and attach it to data2
date_list = ["1992-01-10","1992-02-10","1992-03-10","1993-03-15","1993-03-16"]
datetime_object = pd.to_datetime(date_list)
data2["date"] = datetime_object
# let's make date the index
data2= data2.set_index("date")
data2
# In[183]:
date_list = ["1992-01-10","1992-02-10","1992-03-10","1993-03-15","1993-03-16"]
datetime_object = pd.to_datetime(date_list)
#data2["date"] = datetime_object
# let's make date the index
data2.index = datetime_object
data2.index.name = "date"
# In[184]:
data2
# In[196]:
print(data2.loc["1993-03-16"])
print(data2.loc["1992-03-10":"1993-03-16"])
# In[200]:
data2
# In[199]:
data2.resample('A').mean() #Needs string to specify frequency like "M" = month or "A" = year
# In[201]:
# Lets resample with month
data2.resample("M").mean()
# As you can see there are a lot of NaNs because data2 does not include every month
# In[204]:
data2.resample("M").first().interpolate('linear')
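# In[ ]:
# (Added sketch) The same gap-filling idea works with other aggregations, e.g.
# interpolating the monthly means instead of the first rows (assuming, as in the
# cells above, that .mean() drops the non-numeric columns):
data2.resample("M").mean().interpolate("linear")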
# In[210]:
# read data
data = pd.read_csv('pokemon.csv')
data= data.set_index("#")
data.head()
# In[211]:
# indexing using square brackets
data["HP"][1]
# In[212]:
# using column attribute and row label
data.HP[1]
# In[213]:
# using loc accessor
data.loc[1,["HP"]]
# In[215]:
# Selecting only some columns
data[["HP","Attack"]].head()
# In[216]:
# Slicing and indexing series
data.loc[1:10,"HP":"Defense"] # 10 and "Defense" are inclusive
# In[218]:
# Reverse slicing
data.loc[10:1:-1,"HP":"Defense"]
# In[219]:
# From a starting column to the end
data.loc[1:10,"Speed":]
# In[220]:
# Plain python functions
def div(n):
    return n/2
data.HP.apply(div)
# In[222]:
data['HP'].apply(lambda x:x/2)
# In[223]:
# Defining a new column using other columns
data["total_power"] = data.Attack + data.Defense
data.head()
# In[225]:
# our index name is this:
print(data.index.name)
# lets change it
data.index.name = "index_name"
data.head()
# In[226]:
# Overwrite index
# if we want to modify index we need to change all of them.
data.head()
# first copy our data to data3, then change the index
data3 = data.copy()
# let's make the index start from 100. It is not a remarkable change, just an example
data3.index = range(100,900,1)
data3.head()
# In[228]:
data = pd.read_csv('pokemon.csv')
# In[231]:
# Setting a hierarchical index: Type 1 is the outer level, Type 2 is the inner level
data1 = data.set_index(["Type 1","Type 2"])
data1.head(100)
# data1.loc["Fire","Flying"] # how to use the hierarchical index (see the sketch in the next cell)
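# In[ ]:
# (Added sketch) Selecting with the hierarchical index built above; this assumes the
# ("Fire", "Flying") combination exists in the dataset (e.g. Charizard):
data1.loc[("Fire", "Flying")].head()        # rows whose Type 1 is Fire and Type 2 is Flying
data1.xs("Flying", level="Type 2").head()   # cross-section on the inner level only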
# In[235]:
dic = {"treatment":["A","A","B","B"],"gender":["F","M","F","M"],"response":[10,45,5,9],"age":[15,4,72,65]}
df = pd.DataFrame(dic)
df
# In[236]:
df.pivot(index = 'treatment',columns='gender',values = 'age')
# In[237]:
df1 = df.set_index(["treatment","gender"])
df1
# lets unstack it
# In[239]:
df1.unstack(level=0)
# In[240]:
df1.unstack(level=1)
# In[241]:
# change inner and outer level index position
df2 = df1.swaplevel(0,1)
df2
# In[242]:
df
# In[243]:
# df.pivot(index="treatment",columns = "gender",values="response")
pd.melt(df,id_vars="treatment",value_vars=["age","response"])
# In[244]:
df
# In[247]:
# group by treatment and take the mean of the other features
df.groupby("treatment").mean()  # mean is an aggregation / reduction method
# there are other methods like sum, std, max or min (see the agg sketch in the next cell)
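# In[ ]:
# (Added sketch) agg() applies several of those reduction methods at once;
# here only the numeric columns age and response are aggregated:
df.groupby("treatment").agg({"age": ["mean", "max"], "response": "sum"})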
# In[248]:
# or we can choose just one feature
df.groupby("treatment").age.mean()
# In[249]:
# Or we can choose multiple features
df.groupby("treatment")[["age","response"]].mean()
# In[250]:
df.info()
# as you can see, gender and treatment are stored as object (string)
# however, we can convert them to categorical data
# categorical data uses less memory and speeds up operations like groupby
df["gender"] = df["gender"].astype("category")
df["treatment"] = df["treatment"].astype("category")
df.info()
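# In[ ]:
# (Added sketch) A quick way to see the memory effect of the categorical dtype:
# compare the same column stored as object vs. category. On this tiny 4-row frame
# the difference is negligible; the savings appear on large columns with few distinct values.
print(df["gender"].astype("object").memory_usage(deep=True))
print(df["gender"].memory_usage(deep=True))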
# In[ ]: