EDA (II)
64911 단어 EDA
EDA(2)
(4) 특징 분석과 가시화
특징 분석
(1) 단변수 분석
분류 변수
(2) 다변수 분석
분류 변수 + 기타(분류, 연속)
sns.boxplot(x="day", y="total_bill", hue="weekend", data=tips, dodge=False);
연속 변수 + 기타
sns.jointplot(x="total_bill", y="tip", data=tips, kind="reg");
다변량
sns.pairplot(tips, x_vars=["total_bill", "size"], y_vars=["tip"],hue="smoker", size=5, aspect=.8, kind="reg");
Summary
시각화
Python 통계 드로잉:matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
plt.style.use('classic')
%matplotlib inline
#first little example
x = np.linspace(0, 10, 100)
fig = plt.figure()
plt.plot(x, np.sin(x), '-')
plt.plot(x, np.cos(x), '--')
plt.show()
#matlab-style interface
plt.figure()
#create the first of two panels and set current axis
plt.subplot(2, 1, 1) # (rows, columns, panel number)
plt.plot(x, np.sin(x))
#create the second panel and set current axis
plt.subplot(2, 1, 2)
plt.plot(x, np.cos(x))
plt.show()
#Object-oriented interface
#ax will be an array of two Axes objects
fig, ax = plt.subplots(2)
#Call plot() method on the appropriate object
ax[0].plot(x, np.sin(x))
ax[1].plot(x, np.cos(x))
plt.show()
#
plt.style.use('seaborn-whitegrid')
fig = plt.figure()
ax = plt.axes()
plt.figure()
ax = plt.axes()
x = np.linspace(0, 10, 100)
ax.plot(x, np.sin(x))
plt.show()
#
plt.plot(x, np.sin(x - 0), color='blue') # specify color by name
plt.plot(x, np.sin(x - 1), color='g') # short color code (rgbcmyk)
plt.plot(x, np.sin(x - 2), color='0.75') # Grayscale between 0 and 1
plt.plot(x, np.sin(x - 3), color='#FFDD44') # Hex code (RRGGBB from 00 to FF)
plt.plot(x, np.sin(x - 4), color=(1.0,0.2,0.3)) # RGB tuple, values 0 to 1
plt.plot(x, np.sin(x - 5), color='chartreuse') # all HTML color names supporte
plt.show()
#
plt.plot(x, x + 0, linestyle='solid')
plt.plot(x, x + 1, linestyle='dashed')
plt.plot(x, x + 2, linestyle='dashdot')
plt.plot(x, x + 3, linestyle='dotted');
#For short, you can use the following codes:
plt.plot(x, x + 4, linestyle='-') # solid
plt.plot(x, x + 5, linestyle='--') # dashed
plt.plot(x, x + 6, linestyle='-.') # dashdot
plt.plot(x, x + 7, linestyle=':') # dotted
plt.show()
#
rng = np.random.RandomState(0)
for marker in ['o', '.', ',', 'x', '+', 'v', '^', ', '>', 's', 'd']:
plt.plot(rng.rand(5), rng.rand(5), marker,
label="marker='{0}'".format(marker))
plt.legend(numpoints=1)
plt.xlim(0, 1.8)
plt.show()
#
x = np.linspace(0, 10, 30)
y = np.sin(x)
plt.plot(x, y, 'o', color='black')
plt.show()
#
data = np.random.randn(1000)
plt.hist(data,color='g')
plt.show()
plt.hist(data, bins=30, normed=True, alpha=0.5,
histtype='stepfilled', color='steelblue',
edgecolor='none')
plt.show()
x1 = np.random.normal(0, 0.8, 1000)
x2 = np.random.normal(-2, 1, 1000)
x3 = np.random.normal(3, 2, 1000)
kwargs = dict(histtype='stepfilled', alpha=0.3, normed=True, bins=40)
plt.hist(x1, **kwargs)
plt.hist(x2, **kwargs)
plt.hist(x3, **kwargs);
#
men_means, men_std = (20, 35, 30, 35, 27), (2, 3, 4, 1, 2)
women_means, women_std = (25, 32, 34, 20, 25), (3, 5, 2, 3, 3)
ind = np.arange(len(men_means)) # the x locations for the groups
width = 0.35 # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind - width/2, men_means, width, yerr=men_std,
color='SkyBlue', label='Men')
rects2 = ax.bar(ind + width/2, women_means, width, yerr=women_std,
color='IndianRed', label='Women')
#Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Scores')
ax.set_title('Scores by group and gender')
ax.set_xticks(ind)
ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5'))
ax.legend()
def autolabel(rects, xpos='center'):
"""
Attach a text label above each bar in *rects*, displaying its height.
*xpos* indicates which side to place the text w.r.t. the center of
the bar. It can be one of the following {'center', 'right', 'left'}.
"""
xpos = xpos.lower() # normalize the case of the parameter
ha = {'center': 'center', 'right': 'left', 'left': 'right'}
offset = {'center': 0.5, 'right': 0.57, 'left': 0.43} # x_txt = x + w*off
for rect in rects:
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
'{}'.format(height), ha=ha[xpos], va='bottom')
autolabel(rects1, "left")
autolabel(rects2, "right")
plt.show()
# /
#Fixing random state for reproducibility
np.random.seed(19680801)
plt.rcdefaults()
fig, ax = plt.subplots()
#Example data
people = ('Tom', 'Dick', 'Harry', 'Slim', 'Jim')
y_pos = np.arange(len(people))
performance = 3 + 10 * np.random.rand(len(people))
error = np.random.rand(len(people))
ax.barh(y_pos, performance, xerr=error, align='center',
color='green', ecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.invert_yaxis() # labels read top-to-bottom
ax.set_xlabel('Performance')
ax.set_title('How fast do you want to go today?')
plt.show()
#
from matplotlib.patches import Polygon
#Fixing random state for reproducibility
np.random.seed(19680801)
#fake up some data
spread = np.random.rand(50) * 100
center = np.ones(25) * 50
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
data = np.concatenate((spread, center, flier_high, flier_low))
fig, axs = plt.subplots(2, 3)
#basic plot
axs[0, 0].boxplot(data)
axs[0, 0].set_title('basic plot')
#notched plot
axs[0, 1].boxplot(data, 1)
axs[0, 1].set_title('notched plot')
#change outlier point symbols
axs[0, 2].boxplot(data, 0, 'gD')
axs[0, 2].set_title('change outlier
point symbols')
#don't show outlier points
axs[1, 0].boxplot(data, 0, '')
axs[1, 0].set_title("don't show
outlier points")
#horizontal boxes
axs[1, 1].boxplot(data, 0, 'rs', 0)
axs[1, 1].set_title('horizontal boxes')
#change whisker length
axs[1, 2].boxplot(data, 0, 'rs', 0, 0.75)
axs[1, 2].set_title('change whisker length')
fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
hspace=0.4, wspace=0.3)
#fake up some more data
spread = np.random.rand(50) * 100
center = np.ones(25) * 40
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
d2 = np.concatenate((spread, center, flier_high, flier_low))
data.shape = (-1, 1)
d2.shape = (-1, 1)
#Making a 2-D array only works if all the columns are the
#same length. If they are not, then use a list instead.
#This is actually more efficient because boxplot converts
#a 2-D array into a list of vectors internally anyway.
data = [data, d2, d2[::2, 0]]
#Multiple box plots on one Axes
fig, ax = plt.subplots()
ax.boxplot(data)
plt.show()
파이썬 통계 드로잉:seaborn
#set style darkgrid,whitegrid,dark,white,ticks
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
plt.plot(np.arange(10))
plt.show()
import pandas as pd
df_iris = pd.read_csv('./iris.csv')
fig, axes = plt.subplots(1, 2)
sns.distplot(df_iris['petal length'], ax = axes[0], kde = True, rug = True)
sns.kdeplot(df_iris['petal length'], ax = axes[1], shade=True)
plt.show()
sns.set(palette="muted", color_codes=True)
rs = np.random.RandomState(10)
d = rs.normal(size=100)
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True)
sns.distplot(d, kde=False, color="b", ax=axes[0, 0])
sns.distplot(d, hist=False, rug=True, color="r", ax=axes[0, 1])
sns.distplot(d, hist=False, color="g", kde_kws={"shade": True}, ax=axes[1, 0])
sns.distplot(d, color="m", ax=axes[1, 1])
plt.show()
#
sns.boxplot(x = df_iris['class'], y = df_iris['sepal width'])
#
sns.set()
sns.pairplot(df_iris, hue="class")
plt.show()
(5) 보고서 생성
pandas_profiling은pandas의DataFrame 데이터 유형을 바탕으로 간단하고 신속하게 탐색적 데이터 분석을 할 수 있다.
데이터 세트의 각 열에 대해 pandasprofiling은 다음과 같은 통계 정보를 제공합니다.
pandas_profiling 사용법
#
import seaborn as sns
import pandas as pd
import pandas_profiling as pp
import matplotlib.pyplot as plt
#
data = sns.load_dataset('titanic')
data.head()
report = pp.ProfileReport(data)
report
report.to_file('report.html')
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
[프로그래머스 과제관] 채용 공고 추천 - EDA 및 전처리Programmers 채용 공고 페이지를 방문한 개발자들의 방문/지원 기록을 바탕으로 추천 모델을 만들어야 합니다. 전체 학습 데이터 중 applied = 1인 데이터와 applied = 0인 데이터의 수를 살펴보았...
텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.