SIGNATE Quest④
히스토그램의 용도
히스토그램은 수량 변수의 시각화에 적합하다.
'성별'처럼 데이터가 수치가 아닌 경우, 히스토그램을 그리려면 데이터를 수치로 변환해야 한다.
막대그래프는 수치 변환이 필요하지 않으므로, 이런 경우에는 막대그래프가 더 적합하다고 할 수 있다.
데이터를 수치 항목과 범주 항목으로 나눈 뒤 수치 항목의 데이터프레임에 대해 히스토그램을 그리면, 열마다 히스토그램이 하나씩 생성된다. (열이 10개이면 한 번에 10개의 히스토그램을 만들 수 있다)
#ライブラリのimport
import pandas as pd
import matplotlib.pyplot as plt
# Preprocessing
df = pd.read_csv('dataset.csv')
# Fill missing AG_ratio values with Alb / (TP - Alb). The result is assigned
# back to the column: calling fillna(inplace=True) on df["AG_ratio"] is
# chained assignment, which recent pandas warns about and which does not
# modify df once Copy-on-Write is enabled.
df["AG_ratio"] = df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]))
# Remove duplicated rows and renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
# Split the DataFrame into its numeric and categorical parts
col_categoric = ["Gender", "disease"]
df_numeric = df.drop(columns=col_categoric)
df_categoric = df[col_categoric]
# Draw one histogram per numeric column (figsize sets the figure size)
df_numeric.hist(figsize=(8, 6))
# Adjust the layout automatically so that the subplot labels do not overlap
plt.tight_layout()
plt.show()
겹쳐서 표시하는 히스토그램
# ライブラリのimport
import pandas as pd
import matplotlib.pyplot as plt
# seabornライブラリをsnsという省略名でインポート
import seaborn as sns
# Preprocessing
df = pd.read_csv('dataset.csv')
# Fill missing AG_ratio values with Alb / (TP - Alb). The result is assigned
# back to the column: calling fillna(inplace=True) on df["AG_ratio"] is
# chained assignment, which recent pandas warns about and which does not
# modify df once Copy-on-Write is enabled.
df["AG_ratio"] = df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]))
# Remove duplicated rows and renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
col_categoric = ["Gender", "disease"]
df_numeric = df.drop(columns=col_categoric)
df_categoric = df[col_categoric]
# Join the disease column with the numeric columns
df_tmp = pd.concat([df_categoric["disease"], df_numeric], axis=1)
# Extract the "Age" values separately for each value of disease
df_Age_non = df_tmp.query("disease==0")["Age"]
df_Age_diseased = df_tmp.query("disease==1")["Age"]
# Overlay both distributions on the same axes. sns.distplot is deprecated;
# histplot with kde=True and stat="density" draws the same kind of figure
# (normalized histogram plus KDE curve). alpha keeps the overlap readable.
hist_options = dict(kde=True, stat="density", kde_kws=dict(cut=3), alpha=0.4)
sns.histplot(df_Age_non, label="non", **hist_options)
sns.histplot(df_Age_diseased, label="diseased", **hist_options)
# Show the legend. The entries come from the label= arguments above, so each
# entry is tied to its own histogram rather than to artist creation order.
plt.legend(loc='upper right')
plt.show()
데이터 추출
데이터를 추출하는 방법은 주로 두 가지가 있다.
# Boolean indexing: pass a boolean mask back into the DataFrame's [] operator
df_tmp[df_tmp["disease"] == 0]
# Use the query method with a condition string
df_tmp.query("disease == 0")
↓ 중요: 여기서는 query를 사용하여 데이터를 추출합니다. 조건이 여러 개인 경우에도 query를 추천합니다. ↓
# diseaseの値に応じた"Age"データの抽出
df_Age_non=df_tmp.query("disease==0")["Age"]  # "Age" of the rows where disease == 0
df_Age_diseased=df_tmp.query("disease==1")["Age"]  # "Age" of the rows where disease == 1
히트맵으로 corr()의 결과를 표시합니다
# ライブラリのimport
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Preprocessing
df = pd.read_csv('dataset.csv')
# Fill missing AG_ratio values with Alb / (TP - Alb). The result is assigned
# back to the column: calling fillna(inplace=True) on df["AG_ratio"] is
# chained assignment, which recent pandas warns about and which does not
# modify df once Copy-on-Write is enabled.
df["AG_ratio"] = df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]))
# Remove duplicated rows and renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
# Correlate the numeric columns only. DataFrame.corr() in pandas 2.x raises
# on non-numeric columns (such as a text "Gender" column) instead of
# silently skipping them, so they are filtered out explicitly.
corr_matrix = df.select_dtypes(include="number").corr()
# Show the correlation matrix as an annotated heatmap on a fixed [-1, 1] scale
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, vmin=-1.0, vmax=1.0, annot=True, cmap='coolwarm', linewidths=0.1)
plt.show()
로지스틱 회귀를 이용한 질병 예측
# ライブラリのimport
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
# Preprocessing
df = pd.read_csv('dataset.csv')
# Fill missing AG_ratio values with Alb / (TP - Alb). The result is assigned
# back to the column: calling fillna(inplace=True) on df["AG_ratio"] is
# chained assignment, which recent pandas warns about and which does not
# modify df once Copy-on-Write is enabled.
df["AG_ratio"] = df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]))
# Remove duplicated rows and renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
# Encode Gender as 1 for "Male" and 0 for every other value
df["Gender"] = (df["Gender"] == "Male").astype(int)
# Build the explanatory variables / target and split them 70:30
X = df.drop(columns=["disease"])
y = df["disease"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Train the model
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Predict the probability of belonging to the diseased class (= 1)
y_pred_prob = lr.predict_proba(X_test)[:, 1]
# Compute the AUC score on the held-out data
# (the exercise placeholders are filled in with roc_auc_score)
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(auc_score)
# Compute the ROC curve elements: false positive rate, true positive rate
# and thresholds (the exercise placeholders are filled in with roc_curve)
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_prob)
# Draw the ROC curve together with the random and ideal reference lines
plt.plot(fpr, tpr, label='roc curve (area = %0.3f)' % auc_score)
plt.plot([0, 1], [0, 1], linestyle=':', label='random')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', label='ideal')
plt.legend()
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()
구간 분할(binning)한 결과를 더미 변수로 변환하여 원래 데이터에 가로로 연결합니다. 생성한 특징량을 사용하여 모델링을 다시 수행합니다.
# ライブラリのインポート
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import pandas as pd
import matplotlib.pyplot as plt
# Preprocessing
df = pd.read_csv('dataset.csv')
# Fill missing AG_ratio values with Alb / (TP - Alb). The result is assigned
# back to the column: calling fillna(inplace=True) on df["AG_ratio"] is
# chained assignment, which recent pandas warns about and which does not
# modify df once Copy-on-Write is enabled.
df["AG_ratio"] = df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]))
# Remove duplicated rows and renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
# Encode Gender as 1 for "Male" and 0 for every other value
df["Gender"] = (df["Gender"] == "Male").astype(int)
X = df.drop(columns=["disease"])
y = df["disease"]
# Bin edges for T_Bil
bins_T_Bil = [0, 0.5, 1.0, 100]
# Cut the T_Bil column into bins; labels=False stores the 0-based bin number
# in X_cut, and retbins=True also returns the bin edges that were used
X_cut, bin_indice = pd.cut(X["T_Bil"], bins=bins_T_Bil, retbins=True, labels=False)
# Turn the bin numbers into dummy variables
# (prefix=X_cut.name sets the prefix of the generated column names)
X_dummies = pd.get_dummies(X_cut, prefix=X_cut.name)
# Concatenate the original explanatory variables (X) and the dummy
# variables (X_dummies) side by side
X_binned = pd.concat([X, X_dummies], axis=1)
# Split into training and evaluation data
# (X_binned is used in place of the original X)
X_train, X_test, y_train, y_test = train_test_split(X_binned, y, test_size=0.3, random_state=0)
# Train the model and predict the probability of the diseased class (= 1)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
# Draw the ROC curve (false positive rate, true positive rate, thresholds)
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
plt.plot(fpr, tpr, label='roc curve')
plt.plot([0, 1], [0, 1], linestyle=':', label='random')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', label='ideal')
plt.legend()
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()
# Compute the AUC score
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred)
print("AUC:", auc_score)
다항식・상호작용 특징량의 생성
# ライブラリのimport
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
# Preprocessing
df = pd.read_csv('dataset.csv')
# Fill missing AG_ratio values with Alb / (TP - Alb). The result is assigned
# back to the column: calling fillna(inplace=True) on df["AG_ratio"] is
# chained assignment, which recent pandas warns about and which does not
# modify df once Copy-on-Write is enabled.
df["AG_ratio"] = df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]))
# Remove duplicated rows and renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
# Encode Gender as 1 for "Male" and 0 for every other value
df["Gender"] = (df["Gender"] == "Male").astype(int)
X = df.drop(columns=["disease"])
y = df["disease"]
# Exclude the Gender column (keep only the quantitative variables)
X_target = X.drop(columns=["Gender"])
# Generate polynomial and interaction features up to degree 2, without the
# constant bias column
polynomial = PolynomialFeatures(degree=2, include_bias=False)
polynomial_arr = polynomial.fit_transform(X_target)
# Wrap polynomial_arr in a DataFrame, naming the columns poly0, poly1, ...
# (one name per column of polynomial_arr)
poly_columns = [f"poly{i}" for i in range(polynomial_arr.shape[1])]
X_polynomial = pd.DataFrame(polynomial_arr, columns=poly_columns)
# Show the generated polynomial / interaction features
print(X_polynomial.shape)
print(X_polynomial.head())
특징량 선택
# ライブラリのimport
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
# Preprocessing
df = pd.read_csv('dataset.csv')
# Fill missing AG_ratio values with Alb / (TP - Alb). The result is assigned
# back to the column: calling fillna(inplace=True) on df["AG_ratio"] is
# chained assignment, which recent pandas warns about and which does not
# modify df once Copy-on-Write is enabled.
df["AG_ratio"] = df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]))
# Remove duplicated rows and renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
# Encode Gender as 1 for "Male" and 0 for every other value
df["Gender"] = (df["Gender"] == "Male").astype(int)
X = df.drop(columns=["disease"])
y = df["disease"]
X_target = X.drop(columns=["Gender"])
# Generate polynomial and interaction features
polynomial = PolynomialFeatures(degree=2, include_bias=False)
polynomial_arr = polynomial.fit_transform(X_target)
poly_columns = [f"poly{i}" for i in range(polynomial_arr.shape[1])]
X_polynomial = pd.DataFrame(polynomial_arr, columns=poly_columns)
# Model used by the embedded feature-selection method. The solver is set
# explicitly because the default solver (lbfgs) rejects penalty='l1';
# liblinear supports the L1 penalty.
fs_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)
# Threshold for the selection
fs_threshold = "mean"
# Initialize the embedded-method selector
selector = SelectFromModel(fs_model, threshold=fs_threshold)
# Run the feature selection
selector.fit(X_polynomial, y)
mask = selector.get_support()
# Keep only the columns of the selected features
X_polynomial_masked = X_polynomial.loc[:, mask]
print("選択された特徴量の表示(最初の5行)")
print(X_polynomial_masked.head())
print("選択された特徴量の数の確認")
print(X_polynomial_masked.shape)
총정리
# ライブラリのimport
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
# Preprocessing
df = pd.read_csv('dataset.csv')
# Fill missing AG_ratio values with Alb / (TP - Alb). The result is assigned
# back to the column: calling fillna(inplace=True) on df["AG_ratio"] is
# chained assignment, which recent pandas warns about and which does not
# modify df once Copy-on-Write is enabled.
df["AG_ratio"] = df["AG_ratio"].fillna(df["Alb"] / (df["TP"] - df["Alb"]))
# Remove duplicated rows and renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
# Encode Gender as 1 for "Male" and 0 for every other value
df["Gender"] = (df["Gender"] == "Male").astype(int)
X = df.drop(columns=["disease"])
y = df["disease"]
X_target = X.drop(columns=["Gender"])
# Polynomial and interaction features
polynomial = PolynomialFeatures(degree=2, include_bias=False)
polynomial_arr = polynomial.fit_transform(X_target)
poly_columns = [f"poly{i}" for i in range(polynomial_arr.shape[1])]
X_polynomial = pd.DataFrame(polynomial_arr, columns=poly_columns)
# Model and threshold used by the embedded feature-selection method. The
# solver is set explicitly because the default solver (lbfgs) rejects
# penalty='l1'; liblinear supports the L1 penalty.
fs_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)
fs_threshold = "mean"
# Initialize the embedded-method selector
selector = SelectFromModel(fs_model, threshold=fs_threshold)
# Run the feature selection
# NOTE(review): the selector is fit on the full data set before the
# train/test split below, so the evaluation rows also influence which
# features are selected.
selector.fit(X_polynomial, y)
mask = selector.get_support()
# Keep only the columns of the selected features
X_polynomial_masked = X_polynomial.loc[:, mask]
# Split into training and evaluation data
# (X_polynomial_masked is used in place of the original X)
X_train, X_test, y_train, y_test = train_test_split(X_polynomial_masked, y, test_size=0.3, random_state=0)
# Train the model and predict the probability of the diseased class (= 1)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
# Draw the ROC curve (false positive rate, true positive rate, thresholds)
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
plt.plot(fpr, tpr, label='roc curve')
plt.plot([0, 1], [0, 1], linestyle=':', label='random')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', label='ideal')
plt.legend()
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()
# Compute the AUC score
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred)
print("AUC:", auc_score)
Reference
이 문제(SIGNATE Quest④)에 관한 더 많은 자료는 다음 링크에서 확인할 수 있습니다: https://qiita.com/rbrf7321/items/b52311447ddf801db0c8 텍스트는 자유롭게 공유하거나 복사할 수 있습니다. 다만 이 문서의 URL은 참조 URL로 남겨 두십시오.
우수한 개발자 콘텐츠 발견에 전념 (Collection and Share based on the CC Protocol.)