用 TensorFlow 解決 Kaggle 上的 Titanic 問題

用tensorflow 解決 Titanic v1

  • 庫文件

import pandas as pd
from tensorflow import keras
import matplotlib.pyplot as plt
import os
  • 引入數據

# Read the train/test CSV files from the data directory and print a column
# summary for each of them.
PATH = './titanic'
train_data, test_data = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)
for heading, frame in (('訓練集信息:', train_data), ('測試集信息:', test_data)):
    print(heading)
    frame.info()
# Keep the passenger ids aside: arrange() drops this column, but the ids are
# needed again when the prediction table is assembled.
predId = test_data['PassengerId']
  • 數據清洗

def arrange(df):
    """Turn a raw Titanic passenger table into an all-numeric feature table.

    The input frame is not modified; a new frame is returned.

    Processing steps:
      * drop the ``PassengerId``, ``Name`` and ``Ticket`` columns;
      * fill missing ``Age`` with the column mean and missing ``Fare`` with
        the column median (both computed on the frame that was passed in);
      * fill missing ``Embarked`` with ``'S'`` and missing ``Cabin`` with
        ``'U'`` (unknown, because most cabin values are absent);
      * encode ``Sex`` as 1 for ``'male'`` and 0 for anything else;
      * replace ``Cabin`` by the offset of its first letter from ``'A'``;
      * one-hot encode ``Embarked`` (C/Q/S) and ``Pclass`` (1/2/3).

    Args:
        df: DataFrame containing at least the columns named above. Any other
            column (for example ``Survived``) is passed through unchanged.

    Returns:
        A new DataFrame in which ``Embarked`` and ``Pclass`` are replaced by
        ``Embarked_C/Q/S`` and ``Pclass_1/2/3`` indicator columns appended at
        the end. The six indicator columns are always present, in that order,
        even when a category does not occur in ``df``.
    """
    df = df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

    df['Age'] = df['Age'].fillna(df['Age'].mean())
    # Without this a missing fare would reach the network as NaN.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Sex'] = (df['Sex'] == 'male').astype(int)
    # The first letter of the cabin is used as the cabin category; it is kept
    # as a single integer feature rather than being one-hot encoded.
    df['Cabin'] = df['Cabin'].fillna('U').apply(
        lambda cabin: ord(cabin[0]) - ord('A'))

    # Fixed category lists keep the set and order of indicator columns
    # independent of which values happen to occur in this particular frame,
    # so the train and test tables always have the same width.
    embarked = df['Embarked'].fillna('S').astype(
        pd.CategoricalDtype(['C', 'Q', 'S']))
    pclass = df['Pclass'].astype(pd.CategoricalDtype([1, 2, 3]))
    # dtype=int: newer pandas versions default to bool indicators, which turn
    # the mixed frame into an object array when converted to NumPy.
    indicator_frames = [
        pd.get_dummies(embarked, prefix='Embarked', dtype=int),
        pd.get_dummies(pclass, prefix='Pclass', dtype=int),
    ]

    df = df.drop(['Embarked', 'Pclass'], axis=1)
    return pd.concat([df, *indicator_frames], axis=1)
  • 整理訓練集和預測集

# Clean the training table, then separate the label column from the features.
train_data = arrange(train_data)
train_y = train_data['Survived']
train_x = train_data.drop(columns='Survived')
print('整理後訓練集:')
for preview in (train_x.head(), train_x.shape):
    print(preview)
# The test table goes through the same cleaning function.
test_data = arrange(test_data)
  • 建立模型

# Fully connected binary classifier: three hidden ReLU layers of 20 units
# each, followed by a single sigmoid output unit.
# The input width is read from the prepared feature table instead of being
# hard-coded, and is passed as a 1-tuple: `(12)` is just the integer 12,
# whereas `input_shape` expects a shape tuple.
n_features = train_x.shape[1]

model = keras.Sequential()
model.add(keras.layers.InputLayer(input_shape=(n_features,)))
for _ in range(3):
    model.add(keras.layers.Dense(20, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(
    optimizer='adam',
    loss=keras.losses.binary_crossentropy,
    metrics=['accuracy'],
)
model.summary()
  • 開始訓練

# Train for a fixed number of epochs; validation_split holds out a fraction
# of the training rows for validation.
epochs = 100
history = model.fit(
    # Cast explicitly so the feature table reaches the model as one
    # homogeneous float array regardless of the per-column dtypes.
    x=train_x.astype('float32'),
    y=train_y,
    batch_size=32,
    epochs=epochs,
    validation_split=0.2,
)
  • 取得結果

# Pull the per-epoch training and validation curves out of the fit history.
loss, acc, val_loss, val_acc = (
    history.history[metric_key]
    for metric_key in ('loss', 'accuracy', 'val_loss', 'val_accuracy')
)
  • 畫趨勢圖

# Draw accuracy (left) and loss (right) side by side, each with its training
# and validation curve over the epochs.
epoch_axis = range(epochs)
panels = (
    (1, acc, val_acc, 'Train acc', 'Val acc', 'Train and Val accuracy'),
    (2, loss, val_loss, 'Train loss', 'Val loss', 'Train and Val loss'),
)

plt.figure(figsize=(16, 8))
for position, train_curve, val_curve, train_label, val_label, title in panels:
    plt.subplot(1, 2, position)
    plt.plot(epoch_axis, train_curve, label=train_label)
    plt.plot(epoch_axis, val_curve, label=val_label)
    plt.legend(loc='lower right')
    plt.title(title)

plt.show()

趨勢圖

  • 保存模型並進行預測

  • 保存模型
# Write the trained model to an HDF5 file in the working directory.
model.save(filepath='model_v1.h5')
  • 進行預測
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability
# at 0.5.
pred_prob = model.predict(test_data.astype('float32'))
# predict() returns one column per output unit; flatten it to one 0/1 label
# per passenger.
pred_y = pd.Series((pred_prob > 0.5).astype(int).ravel())
  • 整理預測結果

# Pair each saved passenger id with its predicted label.
submission_columns = {'PassengerId': predId, 'Survived': pred_y}
predDf = pd.DataFrame(submission_columns)
print('測試結果預覽:', predDf.head())
  • 保存結果

# index=False: without it pandas writes the row index as an extra unnamed
# first column, so the file would no longer consist of just the
# PassengerId and Survived columns.
predDf.to_csv('pred_v1.csv', index=False)
發佈了5 篇原創文章 · 獲贊 7 · 訪問量 2666
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章