Titanic Top 4% with ensemble modeling (2)

7691 단어 kagglekaggle

Filling missing values

Age

# Explore how Age is distributed across Sex, Pclass, Parch and SibSp using
# one box plot per grouping.
# NOTE(review): factorplot was renamed to catplot in seaborn 0.9 - confirm the
# installed seaborn version still provides factorplot.
age_box_groupings = (
    {"x": "Sex"},
    {"x": "Sex", "hue": "Pclass"},
    {"x": "Parch"},
    {"x": "SibSp"},
)
for grouping in age_box_groupings:
    g = sns.factorplot(y="Age", data=dataset, kind="box", **grouping)

# Encode Sex numerically (0 for male, 1 for female) so it can take part in the
# correlation matrix below.
sex_codes = {"male": 0, "female": 1}
dataset["Sex"] = dataset["Sex"].map(sex_codes)

# Annotated heatmap of the pairwise correlations between Age and the
# candidate predictor columns.
age_related_columns = ["Age", "Sex", "SibSp", "Parch", "Pclass"]
age_correlations = dataset[age_related_columns].corr()
g = sns.heatmap(age_correlations, cmap="BrBG", annot=True)

# Fill missing Age values.
# Each missing age is replaced by the median age of the rows that share the
# same SibSp, Parch and Pclass. When that median is NaN (no such row has a
# known age), the median of the whole Age column is used instead.

# Index labels of the rows whose Age is NaN.
index_NaN_age = list(dataset.index[dataset["Age"].isnull()])

for i in index_NaN_age:
    # Recomputed on every pass on purpose: the Age column changes as earlier
    # rows are filled, so this value is not loop-invariant.
    age_med = dataset["Age"].median()

    # `i` is an index label, so look the row up with .loc (the previous
    # positional .iloc lookup only matched on a default RangeIndex).
    # Assumes the index labels are unique.
    similar_rows = (
        (dataset["SibSp"] == dataset.loc[i, "SibSp"])
        & (dataset["Parch"] == dataset.loc[i, "Parch"])
        & (dataset["Pclass"] == dataset.loc[i, "Pclass"])
    )
    age_pred = dataset.loc[similar_rows, "Age"].median()

    # Write through a single .loc call. The chained form
    # dataset["Age"].iloc[i] = ... may modify a temporary object and never
    # reach `dataset` (it has no effect under pandas Copy-on-Write).
    if not np.isnan(age_pred):
        dataset.loc[i, "Age"] = age_pred
    else:
        dataset.loc[i, "Age"] = age_med

# NOTE(review): these plots read `train`, not `dataset`; whether `train`
# reflects the ages filled above depends on code outside this section.
g = sns.factorplot(x="Survived", y="Age", data=train, kind="box")
g = sns.factorplot(x="Survived", y="Age", data=train, kind="violin")


Feature Engineering

Name/Title

dataset["Name"].head()

# Get Title from Name: take the part after the first comma, up to the first
# period, with surrounding whitespace stripped.
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in dataset["Name"]]
# Assign the list directly so values are placed by position. Wrapping it in
# pd.Series would align on index labels and produce NaN for any row whose
# label is not in 0..n-1.
dataset["Title"] = dataset_title
dataset["Title"].head()

g = sns.countplot(x="Title", data=dataset)
g = plt.setp(g.get_xticklabels(), rotation=45)

# Convert Title to categorical codes:
# 0 = Master, 1 = Miss/Ms/Mme/Mlle/Mrs, 2 = Mr, 3 = Rare (all listed titles).
dataset["Title"] = dataset["Title"].replace(["Lady", "the Countess","Countess",'Capt',"Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"],"Rare")
dataset["Title"] = dataset["Title"].map({"Master": 0, "Miss":1, "Ms":1, "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
# A title missing from the mapping above becomes NaN, which makes this cast
# raise instead of silently keeping an unmapped value.
dataset["Title"] = dataset["Title"].astype(int)
# Pass the column by keyword, matching the earlier countplot call; seaborn
# reserves the first positional argument for `data`.
g = sns.countplot(x="Title", data=dataset)
g = g.set_xticklabels(["Master", "Miss/Ms/Mme/Mlle/Mrs","Mr","Rare"])

# Mean of Survived per title code, drawn as a bar chart.
g = sns.factorplot(x="Title", y="Survived", data=dataset, kind="bar")
g = g.set_xticklabels(["Master","Miss-Mrs","Mr","Rare"])
g = g.set_ylabels("survival probability")

# Drop the Name variable now that Title has been extracted from it.
dataset.drop(labels = ["Name"], axis = 1, inplace= True)

Family Size

# Family size descriptor: siblings/spouses + parents/children + 1.
family_size = dataset["SibSp"] + dataset["Parch"] + 1
dataset["Fsize"] = family_size
g = sns.factorplot(x="Fsize", y="Survived", data=dataset)
g = g.set_ylabels("Survival Probability")

# Derive four 0/1 indicator columns from the family size:
# Single (1), SmallF (2), MedF (3 or 4) and LargeF (5 or more).
fsize = dataset["Fsize"]
dataset["Single"] = fsize.map(lambda n: int(n == 1))
dataset["SmallF"] = fsize.map(lambda n: int(n == 2))
dataset["MedF"] = fsize.map(lambda n: int(3 <= n <= 4))
dataset["LargeF"] = fsize.map(lambda n: int(n >= 5))

# One bar chart of Survived per indicator column.
for size_flag in ("Single", "SmallF", "MedF", "LargeF"):
    g = sns.factorplot(x=size_flag, y="Survived", data=dataset, kind="bar")
    g = g.set_ylabels("Survival Probability")

# Replace Title and Embarked with indicator (dummy) columns in one pass.
# Title keeps its own name as the prefix; Embarked columns are prefixed "Em".
dummy_prefixes = {"Title": "Title", "Embarked": "Em"}
dataset = pd.get_dummies(
    dataset,
    columns=["Title", "Embarked"],
    prefix=dummy_prefixes,
)


Cabin

dataset["Cabin"].head()

dataset["Cabin"].describe()

dataset["Cabin"].isnull().sum()  # original author's note: 1007
dataset["Cabin"][dataset["Cabin"].notnull()].head()

# Replace each Cabin value by its first character (the cabin type), or by 'X'
# when the cabin is missing.
# The vectorized .str accessor keeps the result on dataset's own index;
# building a fresh pd.Series from a list would align on labels 0..n-1 and
# yield NaN for rows with any other index label.
dataset["Cabin"] = dataset["Cabin"].str[0].fillna("X")

cabin_order = ['A','B','C','D','E','F','G','T','X']
# Pass the column by keyword; seaborn reserves the first positional argument
# for `data`.
g = sns.countplot(x="Cabin", data=dataset, order=cabin_order)

# Mean of Survived per cabin type, drawn as a bar chart.
g = sns.factorplot(y="Survived", x="Cabin", data=dataset, kind='bar', order=cabin_order)
g = g.set_ylabels("Survival Probability")

# Replace Cabin with indicator columns prefixed "Cabin".
dataset = pd.get_dummies(dataset, columns = ["Cabin"], prefix="Cabin")

Ticket

dataset["Ticket"].head()

# Reduce each ticket to its prefix: with "." and "/" removed, the text before
# the first space. Tickets made only of digits have no prefix and become "X".
Ticket = [
    "X" if raw.isdigit()
    else raw.replace(".", "").replace("/", "").strip().split(' ')[0]
    for raw in dataset.Ticket
]

dataset['Ticket'] = Ticket
dataset['Ticket'].head()

# Replace Ticket with indicator columns prefixed "T".
dataset = pd.get_dummies(dataset, columns=["Ticket"], prefix="T")

# Treat Pclass as a categorical variable and replace it with indicator
# columns prefixed "Pc".
pclass_as_category = dataset["Pclass"].astype("category")
dataset["Pclass"] = pclass_as_category
dataset = pd.get_dummies(dataset, columns=["Pclass"], prefix="Pc")

# Remove the PassengerId column in place.
dataset.drop(labels=["PassengerId"], axis="columns", inplace=True)

dataset.head()

좋은 웹페이지 즐겨찾기