分類算法——LR

       最近部門在進行一個項目的POC,需要利用LR建模,想到可以利用現成的機器學習庫scikit-learn(sklearn),但由於之前這方面知識接觸不多,所以整個建模過程存在很多不熟悉的知識,這裏主要記錄一下,以供後續學習。

#coding=utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler #區間縮放,返回值爲縮放到[0, 1]區間的數據
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

    
# Load the raw dataset (first row is the header).
finance = pd.read_csv('/Users/zhangsan/Documents/test_data.csv', header=0)
print('original data shape: %d,%d' % (finance.shape[0], finance.shape[1]))  # row/column counts

# Report which columns contain NaN values.
print('the columns contain nan are: %s' % finance.columns[finance.isna().any()].tolist())

# Drop identifier/bookkeeping columns that should not be used as features.
finance = finance.drop(labels=['Stkcd', 'firmname', 'year', 'retirenum'], axis=1)
print(finance.shape)

# 2. Preprocessing: missing values, one-hot encoding, scaling.

# Treat empty strings as missing, then drop any row with a missing value
# in any column.
finance = finance.replace(to_replace='', value=np.nan)
finance = finance.dropna(how='any')
print(finance.shape)

# Excel division-error markers ('#DIV/0!') are also missing values; drop those rows too.
finance = finance.replace(to_replace='#DIV/0!', value=np.nan)
finance = finance.dropna(how='any')
# Reset to a contiguous RangeIndex: dropna leaves gaps in the index, and the
# later pd.concat(..., axis=1) calls align on index labels, which would
# otherwise misalign rows and introduce NaNs.
finance = finance.reset_index(drop=True)
print(finance.shape)

# One-hot encode the categorical 'sic2name' (industry) field.
# Step 1: map category strings to integer labels.
encoder = LabelEncoder()
sic2name_label = encoder.fit_transform(finance['sic2name'].values)
sic2name_label = np.array([sic2name_label]).T  # reshape to a column vector for OneHotEncoder

# Step 2: expand the integer labels into one-hot vectors.
enc = OneHotEncoder()
sic2name_enc = enc.fit_transform(sic2name_label)
sic2name_enc = sic2name_enc.toarray()  # sparse matrix -> dense ndarray

# The raw categorical column is no longer needed; the remaining columns
# are all numeric, so cast the whole frame to float.
finance = finance.drop(['sic2name'], axis=1)
finance = finance.astype(float)

# Bucketize (discretize) the count-like fields.
bucket_fields = finance[['employeenum', 'director', 'inddirector', 'ipotime']]
print('bucket fields size:%d,%d' % (bucket_fields.shape[0], bucket_fields.shape[1]))

# Fixed cut-point bucketing. labels=False returns the integer bucket index;
# the -1000 lower edge is a catch-all so small/zero values fall in bucket 0.
employeenum_bucket = pd.cut(bucket_fields['employeenum'], [-1000, 500, 1000, 2000, 10000, 600000], labels=False)
director_bucket = pd.cut(bucket_fields['director'], [-1000, 5, 7, 11, 13, 30], labels=False)
inddirector_bucket = pd.cut(bucket_fields['inddirector'], [-1000, 2, 4, 10], labels=False)
ipotime_bucket = pd.cut(bucket_fields['ipotime'], [-1000, 10, 15, 40], labels=False)

# Assemble the bucketed columns side by side (axis=1 -> horizontal concat);
# these Series keep finance's index, so alignment is preserved.
bucket_df = pd.concat([pd.DataFrame(employeenum_bucket), pd.DataFrame(director_bucket),
                       pd.DataFrame(inddirector_bucket), pd.DataFrame(ipotime_bucket)], axis=1)
# Drop the raw columns now that bucketed versions exist.
finance = finance.drop(['employeenum', 'director', 'inddirector', 'ipotime'], axis=1)

# Binary-valued (0/1) fields need no transformation.
binary_value_fields = finance[['soe0', 'soecontrol', 'badindustry', 'negative_income']]
print('binary_value_fields:%d' % binary_value_fields.shape[1])

# Scale the continuous monetary fields into [0, 1].
mms = MinMaxScaler(feature_range=(0, 1))
min_max_scaler = finance[['shortloan', 'tangibleasset', 'payable', 'currentliability']]
min_max_scaler_columns = min_max_scaler.columns.values

min_max_scaler = mms.fit_transform(min_max_scaler)
# fit_transform returns a plain ndarray, so rebuild the DataFrame with
# finance's index — a default RangeIndex would misalign rows in the later
# pd.concat(axis=1) wherever dropna left gaps in finance's index.
min_max_scaler_df = pd.DataFrame(min_max_scaler, columns=min_max_scaler_columns, index=finance.index)
print(min_max_scaler_df.columns.values)

# Append the one-hot industry columns. The encoder output is a plain ndarray,
# so give it finance's index explicitly; otherwise pd.concat(axis=1) aligns a
# fresh RangeIndex against finance's (possibly gapped) index and produces
# misaligned/NaN rows.
finance = pd.concat([finance, pd.DataFrame(sic2name_enc, index=finance.index)], axis=1)

# Append the bucketed columns (bucket_df already carries finance's index).
finance = pd.concat([finance, bucket_df], axis=1)

# Replace the raw monetary columns with their [0, 1]-scaled versions.
finance = finance.drop(min_max_scaler_df.columns.values, axis=1)
finance = pd.concat([finance, min_max_scaler_df], axis=1)

columns_names_final = finance.columns.values
print('columns names after feature engineering:%s' % columns_names_final)

# Take every column except the last as the candidate feature frame.
X = finance[columns_names_final[0:(columns_names_final.size - 1)]]
print(X.columns[X.isna().any()].tolist())

# Drop any rows that picked up missing values during feature assembly.
X = X.replace(to_replace='', value=np.nan)
X = X.dropna(how='any')

# Extract the target variable before removing it from the features.
Y = X['next_negative_income']
Y = Y.astype(int)  # cast to integer class labels

# Remove the target column from the feature matrix (it was extracted from X,
# not from a separate frame, so it must not leak into the features).
X = X.drop(['next_negative_income'], axis=1)

# PCA: n_components='mle' picks the number of components automatically.
# NOTE(review): the transformed data `newX` is computed but never used below —
# the model is trained on the raw X. Presumably this was exploratory analysis;
# confirm whether the PCA output was meant to feed the train/test split.
pca = PCA(n_components='mle')
newX = pca.fit_transform(X) # transformed (projected) data
print(pca.explained_variance_ratio_) # per-component variance ratio; highlights important directions

# Split into training and test sets (75/25); fixed random_state for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1234)

# Fit an L2-regularized logistic regression on the training set.
lr = LogisticRegression(
        penalty='l2',
        dual=False,
        tol=0.0001,
        C=1.0,                # inverse of regularization strength
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver='liblinear',
        max_iter=100,         # was 25 — too few iterations risks non-convergence
        multi_class='ovr',
        verbose=0,
        warm_start=False,
        n_jobs=1)             # was 1234 — n_jobs is a CPU-core count, not a seed
                              # (liblinear ignores it, but 1234 was clearly a mistake)

lr.fit(X_train, Y_train)

# Predict class labels on the held-out test set.
lr_predict = lr.predict(X_test)

# Class-membership probabilities: two columns — column 0 is P(class 0),
# column 1 is P(class 1).
predict_prob_Y = lr.predict_proba(X_test)


# Scores (probabilities) for the positive class (class 1).
p_prob_Y = pd.DataFrame(predict_prob_Y).iloc[:, 1]
print(p_prob_Y)
# Persist as UTF-8 CSV; index=False drops the index column.
p_prob_Y.to_csv('/Users/zhangsan/Documents/p_prob_Y.csv', index=False, encoding='utf-8')

# AUC computed from the positive-class scores.
auc_result = roc_auc_score(Y_test, p_prob_Y)
print(auc_result)

# LogisticRegression's built-in score() reports accuracy on the test set.
print('Accuracy of LR Classifier:%f' % lr.score(X_test, Y_test))
print(classification_report(Y_test, lr_predict, target_names=['0', '1']))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章