# Import the tooling modules first.
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------------------------------
# Read and parse the data in train.csv.
titanic = pd.read_csv("train.csv")

# Median age.
print(titanic.Age.median())
print("\n")
# Recorded output:
# 28.0

# -----------------------------------------------
# Fill every missing Age with the median. Without inplace, fillna returns a
# new Series, so the source data is left unchanged.
print(titanic.Age.fillna(titanic.Age.median()))
print("\n")

# Print the first five rows.
print(titanic.head())
print("\n")
# Recorded output:
#    PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
# 0            1         0       3  ...   7.2500   NaN         S
# 1            2         1       1  ...  71.2833   C85         C
# 2            3         1       3  ...   7.9250   NaN         S
# 3            4         1       1  ...  53.1000  C123         S
# 4            5         0       3  ...   8.0500   NaN         S
#
# [5 rows x 12 columns]

# -----------------------------------------------
# Inspect the column dtypes and non-null counts.
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the report
# (the originally recorded output ended with that "None").
titanic.info()
print("\n")
# Recorded output:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 891 entries, 0 to 890
# Data columns (total 12 columns):
# PassengerId    891 non-null int64
# Survived       891 non-null int64
# Pclass         891 non-null int64
# Name           891 non-null object
# Sex            891 non-null object
# Age            714 non-null float64
# SibSp          891 non-null int64
# Parch          891 non-null int64
# Ticket         891 non-null object
# Fare           891 non-null float64
# Cabin          204 non-null object
# Embarked       889 non-null object
# dtypes: float64(2), int64(5), object(5)
# memory usage: 83.6+ KB

# -----------------------------------------------
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
print(titanic.describe())
print("\n")
# Recorded output:
#        PassengerId    Survived      Pclass  ...       SibSp       Parch        Fare
# count   891.000000  891.000000  891.000000  ...  891.000000  891.000000  891.000000
# mean    446.000000    0.383838    2.308642  ...    0.523008    0.381594   32.204208
# std     257.353842    0.486592    0.836071  ...    1.102743    0.806057   49.693429
# min       1.000000    0.000000    1.000000  ...    0.000000    0.000000    0.000000
# 25%     223.500000    0.000000    2.000000  ...    0.000000    0.000000    7.910400
# 50%     446.000000    0.000000    3.000000  ...    0.000000    0.000000   14.454200
# 75%     668.500000    1.000000    3.000000  ...    1.000000    0.000000   31.000000
# max     891.000000    1.000000    3.000000  ...    8.000000    6.000000  512.329200
#
# [8 rows x 7 columns]

# -----------------------------------------------
# Fill the missing Age values with the median and store the result back in
# the frame. The column is assigned explicitly instead of calling
# titanic.Age.fillna(..., inplace=True): that form mutates a Series obtained
# through attribute access, which pandas treats as chained assignment (recent
# 2.x releases warn about it, and with Copy-on-Write the frame is not updated).
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Count the remaining missing values per column.
print(titanic.isnull().sum())
print("\n")
# Recorded output:
# PassengerId      0
# Survived         0
# Pclass           0
# Name             0
# Sex              0
# Age              0
# SibSp            0
# Parch            0
# Ticket           0
# Fare             0
# Cabin          687
# Embarked         2
# dtype: int64

# -----------------------------------------------
# Count victims and survivors by sex.
dead = titanic[titanic.Survived == 0].Sex.value_counts()
survived = titanic[titanic.Survived == 1].Sex.value_counts()

# Build a table with one row per outcome (index supplies the row labels),
# then transpose it so each sex is a row and each outcome a column.
df = pd.DataFrame([survived, dead], index=["survived", "dead"])
df = df.T

# Add the proportion columns.
df["p_survived"] = df.survived / (df.survived + df.dead)
df["p_dead"] = df.dead / (df.survived + df.dead)
print(df)
print("\n")
# Recorded output:
#         survived  dead  p_survived    p_dead
# female       233    81    0.742038  0.257962
# male         109   468    0.188908  0.811092

# -----------------------------------------------
# Effect of sex on the survival rate.
df[["p_survived", "p_dead"]].plot.bar()
plt.show()
# Effect of age on survival: stacked histogram of the ages of survivors and
# victims.
dead = titanic[titanic.Survived == 0].Age
survived = titanic[titanic.Survived == 1].Age
df = pd.DataFrame([survived, dead], index=["survived", "dead"])
df = df.T  # transpose: one column per outcome, one row per passenger
print(df)
df.plot.hist(stacked=True, bins=30)
plt.show()
print("\n")

# Effect of being an adult (Age >= 18) on survival.
adult = titanic[titanic.Age >= 18]["Survived"].value_counts()
child = titanic[titanic.Age < 18]["Survived"].value_counts()
df = pd.DataFrame([adult, child], index=["adult", "child"])
# value_counts() orders its result by frequency, so the position of the 0/1
# columns depends on the data. Select the columns by their Survived code and
# rename them explicitly instead of relabelling by position, which would
# silently swap "dead" and "survived" whenever survivors are the majority.
df = df.reindex(columns=[0, 1]).rename(columns={0: "dead", 1: "survived"})
df.plot.bar()
plt.show()

# Which fare / age combinations account for the most victims: scatter plot of
# age against fare for the passengers who did not survive.
ax = plt.subplot()
victims = titanic[titanic.Survived == 0]  # filter once, reuse for both axes
age = victims.Age
fare = victims.Fare
ax.scatter(age, fare, s=10)
ax.set_xlabel("age")
ax.set_ylabel("fare")
plt.show()