分类算法——LR

       最近部门在进行一个项目的POC,需要利用LR建模,想到可以利用现成的机器学习库sklearn,但由于之前对这方面知识接触不多,整个建模过程中有很多不熟悉的知识点,这里主要记录一下,以供后续学习。

#coding=utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler #区间缩放,返回值为缩放到[0, 1]区间的数据
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

    
# 1. Load the raw data set.
# NOTE(review): hard-coded absolute path — consider making this configurable.
finance = pd.read_csv('/Users/zhangsan/Documents/test_data.csv', header=0)
print('original data shape: %d,%d' % (finance.shape[0], finance.shape[1]))  # rows, columns

# Report which columns contain NaN values before any cleaning.
print('the columns contain nan are: %s' % finance.columns[finance.isna().any()].tolist())

# Drop identifier/bookkeeping columns that are not model features.
finance = finance.drop(labels=['Stkcd', 'firmname', 'year', 'retirenum'], axis=1)
print(finance.shape)

# 2. Preprocessing: missing values, one-hot encoding, scaling.

# Treat both empty strings and Excel division errors ('#DIV/0!') as missing,
# then drop every row that has at least one missing value.
# (The original did this in two identical replace/dropna passes; one pass
# over both sentinel values yields the same surviving rows.)
finance = finance.replace(to_replace=['', '#DIV/0!'], value=np.nan)
finance = finance.dropna(how='any')
print(finance.shape)

# 3. One-hot encode the categorical 'sic2name' column.
# First map each category string to an integer label.
encoder = LabelEncoder()
sic2name_label = encoder.fit_transform(finance['sic2name'].values)
# OneHotEncoder expects a 2-D column vector, one sample per row.
sic2name_label = sic2name_label.reshape(-1, 1)

# Then expand the integer labels into a dense one-hot matrix.
enc = OneHotEncoder()
sic2name_enc = enc.fit_transform(sic2name_label).toarray()

# The raw categorical column is no longer needed; everything left is numeric.
finance = finance.drop(['sic2name'], axis=1)
finance = finance.astype(float)

# 4. Bucketize selected count/age features into ordinal bins.
bucket_fields = finance[['employeenum', 'director', 'inddirector', 'ipotime']]
print('bucket fields size:%d,%d' % (bucket_fields.shape[0], bucket_fields.shape[1]))

# Cut-point bucketing; labels=False makes pd.cut return the integer bin index.
# The -1000 lower edge is a sentinel so no valid value falls below the first bin.
employeenum_bucket = pd.cut(bucket_fields['employeenum'], [-1000, 500, 1000, 2000, 10000, 600000], labels=False)
director_bucket = pd.cut(bucket_fields['director'], [-1000, 5, 7, 11, 13, 30], labels=False)
inddirector_bucket = pd.cut(bucket_fields['inddirector'], [-1000, 2, 4, 10], labels=False)
ipotime_bucket = pd.cut(bucket_fields['ipotime'], [-1000, 10, 15, 40], labels=False)

# Stack the bucketized columns side by side (axis=1); pd.cut preserves the
# row index, so this frame stays aligned with `finance`.
bucket_df = pd.concat([pd.DataFrame(employeenum_bucket), pd.DataFrame(director_bucket),
                       pd.DataFrame(inddirector_bucket), pd.DataFrame(ipotime_bucket)], axis=1)

# Drop the raw columns; only the bucketized versions will be kept.
finance = finance.drop(['employeenum', 'director', 'inddirector', 'ipotime'], axis=1)

# 5. Binary indicator features are used as-is (no transformation needed).
binary_value_fields = finance[['soe0', 'soecontrol', 'badindustry', 'negative_income']]
print('binary_value_fields:%d' % binary_value_fields.shape[1])

# 6. Min-max scale the monetary columns into [0, 1].
mms = MinMaxScaler(feature_range=(0, 1))
scale_cols = finance[['shortloan', 'tangibleasset', 'payable', 'currentliability']]
scale_col_names = scale_cols.columns.values

# fit_transform returns a bare ndarray; rebuild a DataFrame with the original
# column names AND finance's row index. Without index=, the frame would get a
# default RangeIndex and the later pd.concat (which aligns on index) would
# mis-match rows, because dropna left gaps in finance's index.
scaled = mms.fit_transform(scale_cols)
min_max_scaler_df = pd.DataFrame(scaled, columns=scale_col_names, index=finance.index)
print(min_max_scaler_df.columns.values)

# 7. Assemble the final feature matrix.
# pd.concat(axis=1) aligns rows on the index. The one-hot array came out of
# sklearn as a bare ndarray, so give it finance's index explicitly — dropna
# left gaps in the index, and a default RangeIndex would mis-align rows
# (introducing spurious NaN rows).
finance = pd.concat([finance, pd.DataFrame(sic2name_enc, index=finance.index)], axis=1)

# Append the bucketized columns (pd.cut preserved finance's index).
finance = pd.concat([finance, bucket_df], axis=1)

# Replace the raw monetary columns with their min-max scaled versions,
# re-indexing the scaled frame for the same alignment reason as above.
finance = finance.drop(min_max_scaler_df.columns.values, axis=1)
finance = pd.concat([finance, min_max_scaler_df.set_index(finance.index)], axis=1)

columns_names_final = finance.columns.values
print('columns names after feature engineering:%s' % columns_names_final)

# 8. Extract the feature matrix X and the target Y.
# NOTE(review): this keeps every column except the LAST one. After feature
# assembly the last column is a scaled feature, not the target, so one real
# feature is silently dropped here — confirm whether that is intended.
X = finance[columns_names_final[0:(columns_names_final.size - 1)]]
print(X.columns[X.isna().any()].tolist())

# Normalize empty strings to NaN and drop any row still missing a value.
X = X.replace(to_replace='', value=np.nan)
X = X.dropna(how='any')

# Target variable: whether next-period income is negative.
Y = X['next_negative_income']
Y = Y.astype(int)  # cast to int for classification

# Remove the target column from the feature matrix.
X = X.drop(['next_negative_income'], axis=1)

# PCA with automatic dimensionality selection (Minka's MLE).
# NOTE(review): newX is computed but never used below — the train/test split
# runs on the raw X. Confirm whether the PCA-transformed data was intended.
pca = PCA(n_components='mle')
newX = pca.fit_transform(X)
print(pca.explained_variance_ratio_)  # per-component variance ratio, for feature importance

# Hold out 25% as a test set; fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1234)

# 9. Fit a logistic-regression classifier on the training set.
lr = LogisticRegression(
        penalty='l2',
        dual=False,
        tol=0.0001,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver='liblinear',
        # NOTE(review): 25 iterations is very low and may stop before
        # convergence (sklearn's default is 100).
        max_iter=25,
        multi_class='ovr',
        verbose=0,
        warm_start=False,
        # Bug fix: n_jobs was 1234, apparently confused with a random seed.
        # n_jobs is the number of parallel worker processes; use 1.
        n_jobs=1)

lr.fit(X_train, Y_train)

# Predicted class labels for the test set.
lr_predict = lr.predict(X_test)

# Class-membership probabilities, one row per sample:
# column 0 = P(y=0), column 1 = P(y=1).
predict_prob_Y = lr.predict_proba(X_test)


#提取预测类别为1的score
p_prob_Y = pd.DataFrame(predict_prob_Y).iloc[:,1]
print p_prob_Y 
# 以utf-8编码的方式保存到csv,并通过指定index=False滤掉索引列
p_prob_Y.to_csv('/Users/zhangsan/Documents/p_prob_Y.csv', index=False,encoding='utf-8')

# 计算auc值
auc_result = roc_auc_score(Y_test, p_prob_Y)
print auc_result

print ('Accuracy of LR Classifier:%f'%lr.score(X_test,Y_test))   # 使得逻辑回归模型自带的评分函数score获得模型在测试集上的准确性结果
print classification_report(Y_test,lr_predict ,target_names=['0','1'])

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章