def draw_result(filename, output_path='data/mimic3.jpg', n_folds=6,
                num_round=300):
    """Cross-validate an XGBoost binary classifier and plot its ROC curves.

    The samples are read from an ``.npz`` archive, split with stratified
    k-fold cross-validation, and a booster is trained on every fold.  One ROC
    curve is drawn per fold, plus the chance diagonal and the mean ROC curve
    (per-fold TPR interpolated onto a common FPR grid and averaged).  The
    figure is written to ``output_path`` and then shown.

    Args:
        filename: Path of an ``.npz`` archive holding the arrays
            ``positiveSample`` (label 1) and ``negSample`` (label 0).
            NOTE(review): both are presumably 2-D feature matrices with the
            same number of columns -- confirm against the producer.
        output_path: Where the figure is saved.  Missing parent directories
            are created.
        n_folds: Number of stratified cross-validation folds.
        num_round: Number of boosting iterations per fold.

    Returns:
        None.  The results are the saved image and the displayed plot.
    """
    # Imports stay local to the function, as in the original notebook cell.
    import os

    import matplotlib.pyplot as plt
    import numpy as np
    import xgboost as xgb
    from sklearn.metrics import auc, roc_curve

    # The original relied on the IPython magic ``%pylab inline`` to inject
    # ``matplotlib`` into the namespace; that is not valid Python inside a
    # function, so the style is selected through pyplot instead.
    plt.style.use('ggplot')

    # ---- XGBoost parameters -------------------------------------------
    param = {}
    # Binary logistic objective (use 'multi:softprob' for multi-class).
    param['objective'] = 'binary:logistic'
    # Learning rate; usually ends up somewhere in 0.01-0.2.
    param['eta'] = 0.03
    # Maximum tree depth; typical values are 3-10.
    param['max_depth'] = 6
    # 1 = run quietly, 0 = print run-time messages.
    # NOTE(review): newer xgboost releases replace this with 'verbosity'.
    param['silent'] = 1
    # Other knobs that were left at their defaults:
    #   eval_metric       metric used on the evaluation data (e.g. 'auc')
    #   min_child_weight  minimum sum of instance weights in a child; a leaf
    #                     whose weight sum falls below it is not split further
    #   alpha / lambda    L1 / L2 regularisation strength
    #   scale_pos_weight  values > 0 help with class imbalance
    #   colsample_bytree  fraction of features sampled per tree (default 1)
    #   subsample         fraction of rows sampled per tree (default 1)
    #   max_delta_step    may help logistic regression on very imbalanced data
    #   nthread           leave unset to use every available thread
    #   num_class         required for multi-class objectives

    # ---- Load the samples from the .npz file ---------------------------
    with np.load(filename) as data:
        positive = data['positiveSample']
        negative = data['negSample']
    X = np.concatenate((positive, negative))
    # Labels follow the row order of X: positives first, then negatives.
    y = np.concatenate((np.ones(len(positive), dtype=int),
                        np.zeros(len(negative), dtype=int)))

    # ---- Stratified k-fold split ---------------------------------------
    # ``sklearn.cross_validation`` was removed from scikit-learn; prefer
    # ``model_selection`` and fall back only on very old installations.
    try:
        from sklearn.model_selection import StratifiedKFold
    except ImportError:
        from sklearn.cross_validation import StratifiedKFold
        folds = list(StratifiedKFold(y, n_folds=n_folds))
    else:
        folds = list(StratifiedKFold(n_splits=n_folds).split(X, y))

    # ---- Train per fold and draw the ROC curves ------------------------
    mean_fpr = np.linspace(0, 1, 100)
    tpr_sum = np.zeros_like(mean_fpr)
    for i, (train, test) in enumerate(folds):
        xg_train = xgb.DMatrix(X[train], label=y[train])
        xg_test = xgb.DMatrix(X[test], label=y[test])
        watchlist = [(xg_train, 'train'), (xg_test, 'test')]
        bst = xgb.train(param, xg_train, num_boost_round=num_round,
                        evals=watchlist)
        # With 'binary:logistic' predict() yields one score per test row.
        probas_ = bst.predict(xg_test)

        fpr, tpr, _ = roc_curve(y[test], probas_)
        # Interpolate this fold's TPR onto the shared FPR grid so the folds
        # can be averaged point by point (np.interp is what the removed
        # scipy.interp aliased).
        tpr_sum += np.interp(mean_fpr, fpr, tpr)
        tpr_sum[0] = 0.0  # pin the accumulated curve to the origin
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='ROC fold %d (area = %0.6f)' % (i, roc_auc))

    # Chance diagonal.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    # Average the interpolated curves over the folds.
    mean_tpr = tpr_sum / len(folds)
    mean_tpr[-1] = 1.0  # the last point of the curve is (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.6f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    # savefig() does not create directories, so make sure the target exists.
    out_dir = os.path.dirname(output_path)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    plt.savefig(output_path, dpi=600)
    plt.show()
# Run the cross-validated ROC analysis on the sample archive "file.npz".
draw_result("file.npz")