基於隨機森林模型的紅酒品質分析

看了南京大學的《用Python玩轉數據》視頻課程,覺得Python非常強大。下面的代碼加了一些註釋。

https://scipy.org/

# url: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
# 導入模塊
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore') 
 
# Load the red-wine data set; the file uses ';' as its field separator.
# Only a missing file is reported with the "cannot find" message; the error is
# then re-raised so the script stops here instead of failing later with a
# NameError on `wine`. Any other failure (e.g. a parse error) propagates as-is
# rather than being mislabelled as a missing file.
try:
    wine = pd.read_csv('winequality-red.csv', sep=';')
except FileNotFoundError:
    print("Cannot find the file!")
    raise
 
 
# Column overview. DataFrame.info() writes its report to stdout itself and
# returns None, so it must not be wrapped in print() (that emitted a stray
# "None" line after the report).
wine.info()
# Basic descriptive statistics of every column.
print(wine.describe())
# Drop fully duplicated rows.
wine = wine.drop_duplicates()
 
# Pie chart of how many samples fall into each quality score; each slice is
# annotated with its percentage share (two decimals).
quality_counts = wine['quality'].value_counts()
quality_counts.plot.pie(autopct='%.2f')
plt.show()
 
# Pearson correlation coefficient between quality and every column.
quality_corr = wine.corr()['quality']
print(quality_corr)
 
# Side-by-side bar plots, one bar per quality score:
# left  -> volatile acidity, right -> alcohol.
left_ax = plt.subplot(1, 2, 1)
sns.barplot(data=wine, x='quality', y='volatile acidity', ax=left_ax)
right_ax = plt.subplot(1, 2, 2)
sns.barplot(data=wine, x='quality', y='alcohol', ax=right_ax)
plt.show()
 
from sklearn.preprocessing import LabelEncoder

# Bucket the numeric quality score into three named groups.
# The bin edges form left-open, right-closed intervals: (2,4], (4,6], (6,8].
bins = (2, 4, 6, 8)
group_names = ['low', 'medium', 'high']
wine['quality_lb'] = pd.cut(
    wine['quality'],
    bins=bins,
    labels=group_names,
)

# Turn the group names into integer class codes stored in 'label'.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes are
# expected to be high=0, low=1, medium=2 rather than following low<medium<high.
lb_quality = LabelEncoder()
wine['label'] = lb_quality.fit_transform(wine['quality_lb'])

# Show how many samples ended up in each encoded class.
print(wine['label'].value_counts())
 
# Keep a snapshot that still contains 'quality' and 'quality_lb'.
wine_copy = wine.copy()

# Remove the raw score and its text bucket; what remains is the feature
# columns followed by 'label'.
wine = wine.drop(columns=['quality', 'quality_lb'])
# 'label' is the last column, so everything before it is a feature.
X = wine.iloc[:, :-1]
y = wine['label']
 
from sklearn.model_selection import train_test_split
# `scale` is no longer used below; the import is kept in case other code in
# this file still refers to it.
from sklearn.preprocessing import StandardScaler, scale

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Standardise the features. The scaler is fitted on the training split only
# and the SAME mean/std is then applied to the test split. Calling scale() on
# each split separately (as before) standardised the test set with its own
# statistics, so train and test features were not on a common scale.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
 
from sklearn.metrics import confusion_matrix

# Baseline model: a random forest of 200 trees, everything else at defaults.
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)

# Predict the held-out samples and compare them with the true labels through
# a confusion matrix.
y_pred = rfc.predict(X_test)
baseline_cm = confusion_matrix(y_test, y_pred)
print(baseline_cm)
 
# Hyper-parameter grid to search over.
param_rfc = {
    "n_estimators": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200],
    "criterion": ["gini", "entropy"],
}
# Exhaustive grid search with 5-fold cross-validation.
# The former `iid=False` argument is dropped: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Current versions always behave like iid=False, so results are unaffected.
grid_rfc = GridSearchCV(rfc, param_rfc, cv=5)
grid_rfc.fit(X_train, y_train)
# Parameter combination that achieved the best cross-validated score.
best_param_rfc = grid_rfc.best_params_
print(best_param_rfc)

# Retrain with the best parameters (fixed seed) and re-evaluate on the test
# split.
rfc = RandomForestClassifier(
    n_estimators=best_param_rfc['n_estimators'],
    criterion=best_param_rfc['criterion'],
    random_state=0,
)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, y_pred))

在這裏插入圖片描述

在這裏插入圖片描述

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章