SIGNATE Quest④

Visualization
Uses of histograms
Histograms are suited to visualizing quantitative (numeric) variables.
For non-numeric data such as 'Gender', a histogram would require converting the values to numbers first.
A bar chart needs no such conversion, so it is the better choice in that situation.
If the data is split into numeric columns and categorical columns, calling hist() on the numeric columns draws one histogram per column (with 10 columns you get 10 histograms at once).
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt

# Preprocessing
df = pd.read_csv('dataset.csv')
df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]), inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

#データフレームの分離
col_categoric = ["Gender", "disease"]
df_numeric = df.drop(col_categoric, axis=1)
df_categoric = df[col_categoric]

# Display histograms of the quantitative variables (the figsize option sets the figure size)
df_numeric.hist(figsize=(8, 6))

# Automatically adjust the layout so that the plot labels do not overlap
plt.tight_layout()
plt.show()
Result
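The histograms above cover only the numeric columns. For a categorical column such as 'Gender', a bar chart of the category counts needs no numeric conversion; here is a minimal sketch, assuming the df_categoric DataFrame built in the code above:

# Bar charts of the categorical columns (category counts need no numeric conversion)
for col in df_categoric.columns:
    df_categoric[col].value_counts().plot(kind="bar", title=col)
    plt.tight_layout()
    plt.show()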

Overlaid histograms
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt

# Import the seaborn library under the alias sns
import seaborn as sns

# Preprocessing
df = pd.read_csv('dataset.csv')
df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]), inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
col_categoric = ["Gender", "disease"]
df_numeric = df.drop(col_categoric, axis=1)
df_categoric = df[col_categoric]

# Concatenate the disease column with df_numeric
df_tmp = pd.concat([df_categoric["disease"], df_numeric], axis=1)

# Extract the "Age" data according to the value of disease
df_Age_non=df_tmp.query("disease==0")["Age"]
df_Age_diseased=df_tmp.query("disease==1")["Age"]

# Plot the histograms of the two series on the same axes
# (distplot is deprecated in recent seaborn releases; see the sketch below)
sns.distplot(df_Age_non)
sns.distplot(df_Age_diseased)
# Show the legend
plt.legend(labels=["non", "diseased"], loc='upper right')
plt.show()
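Note that distplot has been deprecated (and later removed) in newer seaborn versions. A minimal equivalent sketch using histplot with the hue argument, assuming seaborn 0.11 or later and the same df_tmp as above:

# Overlaid histograms with the current seaborn API; hue colors the bars by disease
sns.histplot(data=df_tmp, x="Age", hue="disease", kde=True)
plt.show()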

Data extraction
There are two main ways to extract rows from a DataFrame.
# Boolean indexing on the DataFrame
df_tmp[df_tmp["disease"] == 0]
# Using the query() method
df_tmp.query("disease == 0")
Important: the query() approach is used below. query() is especially recommended when filtering on multiple conditions (see the sketch after the code below).
# Extract the "Age" data according to the value of disease
df_Age_non=df_tmp.query("disease==0")["Age"]
df_Age_diseased=df_tmp.query("disease==1")["Age"]
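For reference, here is a multi-condition filter written both ways; query() stays readable while boolean indexing needs parentheses and the & operator. The Age threshold of 50 is only an illustrative value:

# query(): conditions are combined with and/or inside one string
df_tmp.query("disease == 1 and Age >= 50")

# Boolean indexing: each condition must be parenthesized and joined with &
df_tmp[(df_tmp["disease"] == 1) & (df_tmp["Age"] >= 50)]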


Displaying the result of corr() as a heatmap
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
df = pd.read_csv('dataset.csv')
df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]), inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

# Display the heatmap
# (note: in pandas 2.x, non-numeric columns must be excluded first, e.g. df.select_dtypes("number").corr())
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), vmin=-1.0, vmax=1.0, annot=True, cmap='coolwarm', linewidths=0.1)
plt.show()
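To read off exact values instead of judging colors, the same correlation matrix can be inspected numerically. A minimal sketch that sorts the correlations with the disease column by absolute strength, restricted to the numeric columns so it works regardless of the pandas version:

# Correlation of each numeric column with disease, strongest first
corr_with_disease = df.select_dtypes("number").corr()["disease"].drop("disease")
print(corr_with_disease.reindex(corr_with_disease.abs().sort_values(ascending=False).index))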

Disease prediction with logistic regression
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve

# Preprocessing
df = pd.read_csv('dataset.csv')
df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]), inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df["Gender"] = df["Gender"].apply(lambda x: 1 if x=="Male" else 0)

# Build the explanatory and target variables and split them into train/test sets
X = df.drop(["disease"], axis=1)
y = df["disease"]
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.3, random_state=0)

# Train the model
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Model prediction (probability of belonging to the diseased class (=1))
y_pred_prob = lr.predict_proba(X_test)[:, 1]

# Compute the AUC score
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(auc_score)

# Compute the elements of the ROC curve (false positive rate, true positive rate, thresholds)
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_prob)

# Plot the ROC curve
plt.plot(fpr, tpr, label='roc curve (area = %0.3f)' % auc_score)
plt.plot([0, 1], [0, 1], linestyle=':', label='random')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', label='ideal')
plt.legend()
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()
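As a sanity check on the metric, the AUC is simply the area under the (fpr, tpr) curve, so sklearn.metrics.auc applied to the roc_curve output should reproduce roc_auc_score. A minimal sketch, assuming the variables computed above:

# The area under the ROC curve computed from its points matches roc_auc_score
from sklearn.metrics import auc
print(auc(fpr, tpr))                        # area under the (fpr, tpr) curve
print(roc_auc_score(y_test, y_pred_prob))   # same value, computed directly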
Create dummy variables from the binned column and concatenate them horizontally to the original data, then re-run the modeling with the generated features.
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import pandas as pd
import matplotlib.pyplot as plt

# Preprocessing
df = pd.read_csv('dataset.csv')
df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]), inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df["Gender"] = df["Gender"].apply(lambda x: 1 if x=="Male" else 0)
X = df.drop(["disease"], axis=1)
y = df["disease"]

# Specify the bin boundaries
bins_T_Bil = [0, 0.5, 1.0, 100]

# Split the T_Bil column into bins, label them with consecutive integers starting at 0, and store the result in X_cut
X_cut, bin_indice = pd.cut(X["T_Bil"], bins=bins_T_Bil, retbins=True, labels=False)

# Dummy-encode the binned result (prefix=X_cut.name sets the prefix of the new column names)
X_dummies = pd.get_dummies(X_cut, prefix=X_cut.name)

# Horizontally concatenate the original explanatory variables (X) with the dummy-encoded result (X_dummies)
X_binned = pd.concat([X, X_dummies], axis=1)

# Split into training and test data (using the binned X_binned instead of the original X)
X_train, X_test, y_train, y_test = train_test_split(X_binned, y, test_size=0.3, random_state=0)

# Train the model and predict
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

# Plot the ROC curve (compute false positive rate, true positive rate, thresholds)
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
plt.plot(fpr, tpr, label='roc curve')
plt.plot([0, 1], [0, 1], linestyle=':', label='random')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', label='ideal')
plt.legend()
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()

# Compute the AUC score
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred)
print("AUC:", auc_score)
Generating polynomial and interaction features
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import PolynomialFeatures

# Preprocessing
df = pd.read_csv('dataset.csv')
df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]), inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df["Gender"] = df["Gender"].apply(lambda x: 1 if x=="Male" else 0)
X = df.drop(["disease"], axis=1)
y = df["disease"]

# Drop the Gender column (restrict to the quantitative variables)
X_target = X.drop(["Gender"], axis=1)

# Generate polynomial and interaction features
polynomial = PolynomialFeatures(degree=2, include_bias=False)
polynomial_arr = polynomial.fit_transform(X_target)

# Turn polynomial_arr into a DataFrame (shape[1] gives one generated column name per column of polynomial_arr)
X_polynomial = pd.DataFrame(polynomial_arr, columns=["poly" + str(x) for x in range(polynomial_arr.shape[1])])

# Display the generated polynomial and interaction features
print(X_polynomial.shape)
print(X_polynomial.head())
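The generated columns are easier to interpret when they carry the original column names. In scikit-learn 1.0 and later, PolynomialFeatures exposes get_feature_names_out for this (older versions used get_feature_names); a minimal sketch under that assumption:

# Name the generated features after the original columns, e.g. "Age TP" for an interaction term
feature_names = polynomial.get_feature_names_out(X_target.columns)
X_polynomial_named = pd.DataFrame(polynomial_arr, columns=feature_names)
print(X_polynomial_named.head())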
Feature selection
# Import libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel

# Preprocessing
df = pd.read_csv('dataset.csv')
df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]), inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df["Gender"] = df["Gender"].apply(lambda x: 1 if x=="Male" else 0)
X = df.drop(["disease"], axis=1)
y = df["disease"]
X_target = X.drop(["Gender"], axis=1)

# Generate polynomial and interaction features
polynomial = PolynomialFeatures(degree=2, include_bias=False)
polynomial_arr = polynomial.fit_transform(X_target)
X_polynomial = pd.DataFrame(polynomial_arr, columns=["poly" + str(x) for x in range(polynomial_arr.shape[1])])

# Model for the embedded (built-in) feature-selection method
# (the l1 penalty requires the liblinear or saga solver in current scikit-learn)
fs_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)
# Specify the threshold
fs_threshold = "mean"
# Initialize the embedded-method selector
selector = SelectFromModel(fs_model, threshold=fs_threshold)

# Run the feature selection
selector.fit(X_polynomial, y)
mask = selector.get_support()

# Keep only the selected features
X_polynomial_masked = X_polynomial.loc[:, mask]

print("選択された特徴量の表示(最初の5行)")
print(X_polynomial_masked.head())
print("選択された特徴量の数の確認")
print(X_polynomial_masked.shape)
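The fitted selector also exposes the values it used for the decision, which helps when checking why a given number of features survived. A minimal sketch using the threshold_ and estimator_ attributes of the fitted SelectFromModel:

# Threshold actually applied (the mean of the coefficient magnitudes) and the fitted coefficients
print("threshold:", selector.threshold_)
print("number of selected features:", mask.sum())
print("coefficients:", selector.estimator_.coef_)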

Putting it all together
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel

# Preprocessing
df = pd.read_csv('dataset.csv')
df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]), inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df["Gender"] = df["Gender"].apply(lambda x: 1 if x=="Male" else 0)
X = df.drop(["disease"], axis=1)
y = df["disease"]
X_target = X.drop(["Gender"], axis=1)

# Polynomial and interaction features
polynomial = PolynomialFeatures(degree=2, include_bias=False)
polynomial_arr = polynomial.fit_transform(X_target)
X_polynomial = pd.DataFrame(polynomial_arr, columns=["poly" + str(x) for x in range(polynomial_arr.shape[1])])

# Model and threshold for the embedded (built-in) feature-selection method
# (the l1 penalty requires the liblinear or saga solver in current scikit-learn)
fs_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)
fs_threshold = "mean"
# Initialize the embedded-method selector
selector = SelectFromModel(fs_model, threshold=fs_threshold)

# Run the feature selection
selector.fit(X_polynomial, y)
mask = selector.get_support()

# Keep only the selected features
X_polynomial_masked = X_polynomial.loc[:, mask]

# Split into training and test data (using X_polynomial_masked, after feature selection, instead of the original X)
X_train, X_test, y_train, y_test = train_test_split(X_polynomial_masked, y, test_size=0.3, random_state=0)

# Train the model and predict
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

# Plot the ROC curve (compute false positive rate, true positive rate, thresholds)
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
plt.plot(fpr, tpr, label='roc curve')
plt.plot([0, 1], [0, 1], linestyle=':', label='random')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', label='ideal')
plt.legend()
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()

# Compute the AUC score
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred)
print("AUC:", auc_score)
