Chi Square Contingency를 사용한 탐색적 데이터 분석

7575 단어


Download Notebook Here

이 탐색적 데이터 분석은 kaggle.com에서 다운로드한 데이터 세트를 직접 분석해 본 개인 학습용 연습입니다.



시각화를 위한 Plotly



시각화를 위한 Seaborn



필요한 라이브러리 가져오기




# Libraries for data manipulation
import pandas as pd
import numpy as np

# Libraries for visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Libraries for the operating system and warning control
import warnings
import os
warnings.filterwarnings('ignore')


데이터세트 가져오기




# Load the abalone dataset from its local CSV copy
csv_path = r'C:\Users\user\dl-course-data\abalone.csv'
df = pd.read_csv(csv_path)



# Preview the first rows of the loaded table
df.head()




데이터 정보 확인




# Dimensions of the dataset as (rows, columns)
df.shape




# Number of missing values in each column
df.isna().sum()




# Summary of the dataset as printed by DataFrame.info()
df.info()




# Descriptive statistics, transposed so each described column becomes a row
df.describe().transpose()




# Distinct values present in the 'Type' column
type_column = df['Type']
a = type_column.unique()



print(a)




# Number of rows for each 'Type' value
b = type_column.value_counts()



print(b)




# Count the non-null 'Rings' entries inside each 'Type' group and expose the
# result as a regular table with a 'count' column
rings_by_type = df.groupby("Type")["Rings"]
rings_by_type.count().reset_index(name="count")




데이터 세트에 ID 열 추가




# Attach a 1-based sequential identifier to every row
n_rows = len(df)
df['id'] = range(1, n_rows + 1)



df.head()




상관관계




# Imported here because this cell draws with `px` before the later plotting
# cells import it; without this the cell fails with a NameError.
import plotly.express as px

# Pairwise correlation of the numeric columns only. Non-numeric columns
# (presumably 'Type', which the surrounding comments describe with values such
# as M and F) cannot be correlated and make DataFrame.corr() raise on
# pandas >= 2.0; older pandas dropped them silently, so the result is the same.
correlation = df.select_dtypes(include="number").corr()



# Observation from the original analysis: Longest Shell has the highest
# positive correlation value

# Heatmap of the correlation matrix with the coefficient printed in each cell
fig = px.imshow(correlation, text_auto=True, aspect="auto")
fig.show()




# Observation from the original analysis: Type M has the highest percentage

import plotly.express as px
import pandas as pd 

# Count the rows of each Type. Passing values='id' made the pie sum the
# sequential row ids per type, which does not represent the share of each type.
type_counts = df.groupby("Type").size().reset_index(name="count")

# `colors` was never defined in the original cell (NameError); use Plotly's
# default qualitative palette for the slice colours.
colors = px.colors.qualitative.Plotly

# NOTE(review): the title mentions height, but the chart only uses 'Type' -
# confirm the intended wording.
fig = px.pie(type_counts, values='count', names='Type', title='Abalone Type By Height')
fig.update_traces(hoverinfo='label+percent', textinfo='label+percent', textfont_size=20,
                  # one pull offset per slice, so the list always matches the number of types
                  pull=[0.1] * len(type_counts),
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.show()




# Observation from the original analysis: Type M has the highest number of counts

import plotly.express as px

# Plot the number of rows per Type. The original y='id' stacked the sequential
# row ids, so bar heights were sums of ids rather than counts.
type_counts = df.groupby("Type").size().reset_index(name="count")

fig = px.bar(type_counts, x='Type', y='count', color='Type')
fig.show()




# Histogram of the 'id' column, coloured by 'Type'.
# Pass nbins=<number of bins> to control the histogram shape.
px.histogram(data_frame=df, x="id", color="Type")




# Contingency table of Type (rows) against Rings (columns);
# margins=True adds the row and column totals for easier reading
cross_tab = pd.crosstab(index=df["Type"], columns=df["Rings"], margins=True)



cross_tab




# Author's note from the original analysis: The F type is the factor
# determinant for the whole parameters

# sns.factorplot() was renamed to catplot() in seaborn 0.9 and is no longer
# available in current releases, which also reject positional x/y arguments.
# kind="point" reproduces factorplot's default point plot.
sns.catplot(data=df, x="Type", y="Rings", kind="point")




import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


# Significance level used to decide the outcome of the test
alpha = 0.05

# Observed frequencies of Type x Rings WITHOUT margins. The display table
# built with margins=True carries an extra "All" row and column; feeding those
# totals to the test counts every observation twice and inflates the degrees
# of freedom, so the table is rebuilt here from the raw columns.
observed = pd.crosstab(df["Type"], df["Rings"])

# Chi-square test of independence between Type and Rings.
# NOTE(review): the test assumes reasonably large expected frequencies;
# inspect `expected` for sparse Rings categories before relying on the p-value.
stats,p_value,degrees_of_freedom,expected = chi2_contingency(observed)

if p_value > alpha:
    print(f'Accept Null Hypothesis\n p_value is {p_value}\n Rings are independent of Types')
else:
    print(f'Reject Null Hypothesis\n p_value is {p_value}\n Rings are not independent of Types')




참조

좋은 웹페이지 즐겨찾기