Tianchi Competition > [Beginner Contest] Industrial Steam Volume Prediction: Modeling Algorithm


Offline cross-validation score: 0.1163…
Feature engineering approach (suggestions from more experienced competitors are welcome):
First, drop the features that correlate poorly with the target value;
then plot the distribution of each remaining feature in the training set against its distribution in the test set (a minimal sketch of this step follows), and standardize the corresponding features;
finally, apply PCA for dimensionality reduction, letting it choose the number of components automatically (n_components='mle').
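
The train/test distribution check does not appear in the full script below; the following is only a minimal sketch of how it could be done with seaborn, assuming the same zhengqi_train.txt and zhengqi_test.txt files:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

train = pd.read_table('zhengqi_train.txt')
test = pd.read_table('zhengqi_test.txt')

# Overlay the train and test distributions of every feature column.
# Features whose train/test distributions differ strongly are also
# candidates for removal.
for column in test.columns:   # the test file has no 'target' column
    plt.figure()
    sns.kdeplot(train[column], label='train')
    sns.kdeplot(test[column], label='test')
    plt.title(column)
    plt.legend()
    plt.show()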
Code:

# -*- coding: utf-8 -*-
"""
Created on Tue Dec 12 14:48:01 2018

@author: 李帥超
"""

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, PolynomialFeatures,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, learning_curve
from keras.models import load_model
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression,ARDRegression,BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import ShuffleSplit, cross_val_score
from keras.models import Sequential
from keras.layers import Dense, Activation,BatchNormalization, PReLU
from keras.layers import Input,add,multiply,maximum
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from keras import regularizers   # regularization

# Build the neural-network models
def Model_1(train_x, train_y):
    # Build a three-layer neural network (excluding the output layer)
    model = Sequential()
    model.add(Dense(500,input_shape=(train_x.shape[1],)))
    model.add(Activation('sigmoid'))
    
    model.add(Dense(100))
    model.add(Activation('relu'))
    
    model.add(Dense(50))
    model.add(Activation('tanh'))
    
    
    # Output layer
    model.add(Dense(1))
    model.add(Activation('linear'))
    
    # Candidate optimizers: SGD, Adam, RMSprop
    model.compile(optimizer = 'sgd',
                 loss = 'mean_squared_error')
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, 
                                  verbose=0, mode='auto', min_delta=0.001, 
                                  cooldown=0, min_lr=0)

    epochs = 50  # number of training epochs
    model.fit(train_x, train_y, epochs=epochs,
             batch_size = 20, validation_split = 0.0,
             callbacks = [reduce_lr],
             verbose = 0)
    return model

def Bn_prelu(x):
    # Batch normalization followed by a PReLU activation
    x = BatchNormalization()(x)
    x = PReLU()(x)
    return x

def Model_2(train_x,train_y):
    # Build a four-layer neural network (excluding the output layer)
    inputs = Input(shape=(train_x.shape[1],))
    
    x1 = Dense(500, activation = 'sigmoid')(inputs)
    x2 = Dense(500, activation = 'relu')(inputs)
    x = multiply([x1,x2])
    x = Dense(20, activation = 'relu')(x)
    x = Dense(20, activation = 'tanh')(x)
    
    predictions = Dense(1, activation = 'linear')(x)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
                 loss = 'mean_squared_error')
    early_stopping = EarlyStopping(monitor = 'loss',
                                  patience = 2,
                                  mode = 'auto')
    epochs = 2
    model.fit(train_x, train_y, epochs = epochs,
             batch_size = 200, 
            validation_split = 0.0,
             callbacks = [early_stopping],
             verbose = 0)
    return model

def Model_3(train_x, train_y):
    # Build a five-layer neural network (excluding the output layer)
    inputs = Input(shape = (train_x.shape[1],))
    
    x1 = Dense(500, activation = 'sigmoid')(inputs)
    x1 = Bn_prelu(x1)
    x2 = Dense(500, activation = 'relu')(inputs)
    x2 = Bn_prelu(x2)
    x = maximum([x1, x2])
    x = Dense(50, activation = 'relu')(x)
    x = Bn_prelu(x)
    x = Dense(50, activation = 'relu')(x)
    x = Dense(50, activation = 'tanh')(x)
    
    predictions = Dense(1, activation = 'linear')(x)
    model = Model(inputs = inputs, outputs = predictions)
    model.compile(optimizer = 'sgd',
                 loss = 'mean_squared_error')
    early_stopping = EarlyStopping(monitor = 'loss',
                                  patience = 2,
                                  mode = 'auto')
    epochs = 2
    model.fit(train_x, train_y, epochs = epochs,
             batch_size = 20, validation_split = 0.0,
             callbacks = [early_stopping],
             verbose = 0)
    return model


# Read a data file
#def GetData(filename):
#    '''
#    Read a data file (.txt) and return a DataFrame.
#    '''
#    temp_list = []
#    with open(filename, 'r',encoding = 'utf-8') as fp:
#        for item in fp.readlines():
#            temp_list.append(item.strip())
#
#    label_list = temp_list[0].split()
#    data_list = []
#    for i in range(1,len(temp_list)):
#        data_list.append(temp_list[i].split())
#    
#    df = pd.DataFrame(data_list,dtype = 'float') # convert data types
#    df.columns = label_list   # set the column names
#    return df

def kfold_loss(df_x, df_y):
    '''
    Input: feature data and label data (as DataFrames).
    Output: the mean loss obtained by K-fold cross-validation.
    '''
    loss_list = []
    df_x = pd.DataFrame(df_x,index = None)
    df_y = pd.DataFrame(df_y, index = None)
    sfloder = KFold(n_splits = 5, shuffle = False)
    
    for train_id, test_id in sfloder.split(df_x, df_y):
        model = Model_1(df_x.iloc[train_id], df_y.iloc[train_id])
        loss = model.evaluate(df_x.iloc[test_id], df_y.iloc[test_id], verbose=0)
        loss_list.append(loss)
    return np.array(loss_list).mean()

def HeatGrape(df):
    # Visualize pairwise feature correlations
    plt.figure(figsize=(20, 16))  # figure width and height
    colnm = df.columns.tolist()  # column names
    mcorr = df[colnm].corr()  # correlation matrix: correlation between every pair of variables
    mask = np.zeros_like(mcorr, dtype=bool)  # boolean mask with the same shape as mcorr
    mask[np.triu_indices_from(mask)] = True  # mask the upper triangle
    cmap = sns.diverging_palette(220, 10, as_cmap=True)  # diverging matplotlib colormap
    g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # heatmap of pairwise correlations
    plt.show()
    
# Build a polynomial regression pipeline (PolynomialFeatures + LinearRegression)
def Polynomial_model(degree = 1):
    polynomial_features = PolynomialFeatures(degree = degree,
                                             include_bias = False)
    linear_regression = LinearRegression(normalize = True)
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression",linear_regression)])
    return pipeline

# Plot the learning curve to diagnose overfitting vs. underfitting
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
 
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
 
    plt.legend(loc="best")
    plt.show()
    
def Pre_data_process(df_train, df_test):
    scale_column = ['V0','V1','V2','V3','V4','V5','V6','V7','V8',
                'V9','V10','V11','V13','V15','V16','V17','V19',
                'V20','V22','V23','V31','V35','V36','V37']
    min_max_column = ['V18','V24','V30']
    
    #mcorr = df_train.corr()
    #drop_label_list = [c for c in mcorr['target'].index if abs(mcorr['target'][c]) < 0.1]
    drop_label_list = ['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34','V28','V29']
    
    df_train = df_train.drop(drop_label_list, axis = 1)
    df_test = df_test.drop(drop_label_list, axis = 1)

    min_max_scaler = MinMaxScaler()
    # Min-max scale the selected features
    for column in min_max_column:
        # MinMaxScaler expects 2-D input, hence the reshape(-1, 1)
        df_train[column] = min_max_scaler.fit_transform(np.array(df_train[column]).reshape(-1,1))
        df_test[column] = min_max_scaler.fit_transform(np.array(df_test[column]).reshape(-1,1))
    
    for column in scale_column:
        df_train[column] = scale(df_train[column])
        df_test[column] = scale(df_test[column])

    
    return (df_train, df_test)

def Cross_validation(x, y, model):
    loss_list = cross_val_score(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    return -loss_list.mean()

def Predict(train_x, train_y, test):
    model = Model_1(train_x, train_y)
    scale_test = scale(test)
    pre = model.predict(scale_test)
    #test['target'] = pre
    #test.to_csv('predict1.csv', index = None, sep = '\t')
    with open('predict.txt', 'w') as fp:
        for item in pre:
            fp.write(str(item[0]) + '\n')
            
# Dimensionality reduction (Principal Component Analysis)
#n_components = 'mle' and svd_solver = 'full' to guess the dimension


    
df_train = pd.read_table('zhengqi_train.txt') # load the training data
df_test = pd.read_table('zhengqi_test.txt') # load the test data

df_train, df_test = Pre_data_process(df_train, df_test)
#drop_label_list = ['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34','V28','V29']
#df_train = df_train.drop(drop_label_list, axis = 1)
#df_test = df_test.drop(drop_label_list, axis = 1)

df_train_x = df_train.drop(['target'], axis = 1).values
df_train_y = df_train['target'].values
df_train_x = scale(df_train_x)    # standardize the feature matrix

#pca = PCA(n_components = 0.95)
pca = PCA(n_components = 'mle', svd_solver = 'full')
pca.fit(df_train_x)
df_train_x = pca.transform(df_train_x)
df_test = pca.transform(df_test)

#HeatGrape(df_train)


#LinearRegression
#line_R_model = LinearRegression()
ARD_model = ARDRegression(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, 
                          copy_X=True,fit_intercept=True, lambda_1=1e-06, 
                          lambda_2=1e-06, n_iter=300,
                          normalize=False, threshold_lambda=10000.0, 
                          tol=0.001, verbose=False)

BR_model = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, 
                         copy_X=True,fit_intercept=True, lambda_1=1e-06, 
                         lambda_2=1e-06, n_iter=300,
                         normalize=False, tol=0.0000001, verbose=False)

myGBR = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                  learning_rate=0.01, loss='huber', max_depth=14,
                                  max_features='sqrt', max_leaf_nodes=None,
                                  min_impurity_decrease=0.0, min_impurity_split=None,
                                  min_samples_leaf=10, min_samples_split=40,
                                  min_weight_fraction_leaf=0.0, n_estimators=300,
                                  presort='auto', random_state=10, subsample=0.8, verbose=0,
                                  warm_start=False)





linear_model = Polynomial_model(degree = 1)
#cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
#title = "learn curve "
#plot_learning_curve(BR_model, title, df_train_x, df_train_y, cv = cv)



#loss = kfold_loss(df_train_x, df_train_y)
#print('keras_model     : ', loss)
print('linearRegression: ',Cross_validation(df_train_x,df_train_y, linear_model))
print('BayesianRidge   : ',Cross_validation(df_train_x,df_train_y, BR_model))
#print('GradientBosting : ',Cross_validation(df_train_x,df_train_y, myGBR))
#print('ARDRegression   : ',Cross_validation(df_train_x,df_train_y, ARD_model))




# Save the final predictions to a submission file
#BR_model.fit(df_train_x, df_train_y)
#final_ans = BR_model.predict(df_test)
#pd.DataFrame(final_ans).to_csv('predict_2018_12_22(2).txt',index = False, header = False)

