Predicting the San Francisco Crime Dataset with Sklearn and PySpark

Dataset format
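Judging from the code below, processed_train.csv and processed_test.csv need at least the columns Dates, Category, DayOfWeek and PdDistrict; the two sample rows here are purely illustrative, not taken from the actual files.

Dates,Category,DayOfWeek,PdDistrict
2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN
2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN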

CrimePredict.py

import pandas as pd
import numpy as np

res_dic = []   # collects the result strings that get written to result.txt at the end

# 1. Load the data
train = pd.read_csv('processed_train.csv', parse_dates=['Dates'])   # input train path
test = pd.read_csv('processed_test.csv', parse_dates=['Dates'])     # input test path

# 2. Preprocessing: encode Category as integer labels
from sklearn import preprocessing
label = preprocessing.LabelEncoder()
crime = label.fit_transform(train.Category)   # map each category name to an integer code
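# For reference, label.classes_ now holds the sorted category names, and
# label.inverse_transform([0, 1]) maps the integer codes back to those names.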

# 3. One-hot (0/1) encode the Dates hour, DayOfWeek and PdDistrict features,
#    since all three appear in both the training set and the test set
days = pd.get_dummies(train.DayOfWeek)
district = pd.get_dummies(train.PdDistrict)
hour = pd.get_dummies(train.Dates.dt.hour)
#month = pd.get_dummies(train.Dates.dt.month)

train_data = pd.concat([days, district, hour], axis=1)   # with axis=1, concat aligns rows and joins the differently named columns side by side
train_data['crime'] = crime   # append the encoded category as the last column; it serves as the label
# Only three raw features plus the crime category as label are used, i.e. four columns of
# the original dataset, but the one-hot encoding expands them into a few dozen columns.
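# Sanity check (optional): with 7 weekday dummies, roughly 10 district dummies and
# 24 hour dummies plus the label column, train_data should have on the order of 42 columns.
#print(train_data.shape)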

crime = label.transform(test.Category)   # reuse the encoder fitted on the training set so the codes stay consistent (assumes the test file has no unseen categories)
# Apply the same transformations to the test set
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = pd.get_dummies(test.Dates.dt.hour)
#month = pd.get_dummies(test.Dates.dt.month)
test_data = pd.concat([days, district, hour], axis=1)
test_data['crime'] = crime

# 4. Split the samples into a training set and a validation set (70% training, 30% validation)
from sklearn.model_selection import train_test_split
training, validation = train_test_split(train_data, train_size=0.7)


# 5. Naive Bayes
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
#from pyspark.ml.classification import NaiveBayes   # only needed for the PySpark alternative below

model = BernoulliNB()
#model = NaiveBayes()
feature_list = training.columns.tolist()   # column names as a list
feature_list = feature_list[:-1]           # drop the last column ('crime'): it is the label, not a feature
model.fit(training[feature_list], training['crime'])   # fit the model on the training split

# predict_proba returns, for each validation sample (row), the predicted probability of every
# class (columns follow the encoded label order); these probabilities feed the log loss below
predicted = np.array(model.predict_proba(validation[feature_list]))
predicted_acc = np.array(model.predict(validation[feature_list]))
predicted_test_acc = np.array(model.predict(test_data[feature_list]))
logLoss = "BernoulliNB log loss: " + str(log_loss(validation['crime'], predicted))
valAcc = "BernoulliNB validation accuracy: " + str(accuracy_score(validation['crime'], predicted_acc))
testAcc = "BernoulliNB test accuracy: " + str(accuracy_score(test_data['crime'], predicted_test_acc))
print(logLoss)   # multi-class log loss
print(valAcc)
print(testAcc)
res_dic.append(logLoss)
res_dic.append(valAcc)
res_dic.append(testAcc)

# 6. Other models (logistic regression, random forest)
from sklearn.linear_model import LogisticRegression
#from pyspark.ml.classification import LogisticRegression
model_LR = LogisticRegression(C=0.1)
model_LR.fit(training[feature_list], training['crime'])
predicted = np.array(model_LR.predict_proba(validation[feature_list]))
predicted_acc = np.array(model_LR.predict(validation[feature_list]))
predicted_test_acc = np.array(model_LR.predict(test_data[feature_list]))
logLoss = "LogisticRegression log loss: " + str(log_loss(validation['crime'], predicted))
valAcc = "LogisticRegression validation accuracy: " + str(accuracy_score(validation['crime'], predicted_acc))
testAcc = "LogisticRegression test accuracy: " + str(accuracy_score(test_data['crime'], predicted_test_acc))
print(logLoss)   # multi-class log loss
print(valAcc)
print(testAcc)
res_dic.append(logLoss)
res_dic.append(valAcc)
res_dic.append(testAcc)

from sklearn.ensemble import RandomForestClassifier
#from pyspark.ml.regression import RandomForestRegressor   # only needed for the PySpark alternative below
model_RF = RandomForestClassifier()
#model_RF = RandomForestRegressor()
model_RF.fit(training[feature_list], training['crime'])
predicted = np.array(model_RF.predict_proba(validation[feature_list]))
predicted_acc = np.array(model_RF.predict(validation[feature_list]))
predicted_test_acc = np.array(model_RF.predict(test_data[feature_list]))
logLoss = "RandomForest log loss: " + str(log_loss(validation['crime'], predicted))
valAcc = "RandomForest validation accuracy: " + str(accuracy_score(validation['crime'], predicted_acc))
testAcc = "RandomForest test accuracy: " + str(accuracy_score(test_data['crime'], predicted_test_acc))
print(logLoss)   # multi-class log loss
print(valAcc)
print(testAcc)
res_dic.append(logLoss)
res_dic.append(valAcc)
res_dic.append(testAcc)


with open('result.txt', 'w') as f:   # output path
    for res in res_dic:
        f.write(res + '\n')
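Assuming the two processed CSV files sit in the working directory, the script can be run directly (no cluster is needed for the scikit-learn part); the metrics are printed and also written to result.txt:

python CrimePredict.py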

Run results

CrimePredict_pyspark.py

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('log_reg').getOrCreate()

# Read the training and test data from S3 (bucket: s3://my-cluster-zdy/)
df = spark.read.csv('s3://my-cluster-zdy/processed_train.csv', inferSchema=True, header=True)   # input path
df_t = spark.read.csv('s3://my-cluster-zdy/processed_test.csv', inferSchema=True, header=True)

print('-------------- train data transfer ------------------')

from pyspark.ml.feature import StringIndexer   # StringIndexer indexes a string column by frequency: the most frequent value gets index 0

## 2.1 Turn the string columns into numeric indexes
search_engine_indexer = StringIndexer(inputCol="DayOfWeek", outputCol="DayOfWeek_Num").fit(df)    # fit() returns a StringIndexerModel
df = search_engine_indexer.transform(df)       # transform() returns the dataset with the new index column appended

search_engine_indexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Num").fit(df)
df = search_engine_indexer.transform(df)

search_engine_indexer = StringIndexer(inputCol="Category", outputCol="Category_Num").fit(df)
df = search_engine_indexer.transform(df)

## 2.2 One-hot encoding
from pyspark.ml.feature import OneHotEncoder   # OneHotEncoder turns each category index into a sparse 0/1 vector

## One-hot encode DayOfWeek
search_engine_encoder = OneHotEncoder(inputCol="DayOfWeek_Num", outputCol="DayOfWeek_Vector")
df = search_engine_encoder.transform(df)

## One-hot encode PdDistrict
search_engine_encoder = OneHotEncoder(inputCol="PdDistrict_Num", outputCol="PdDistrict_Vector")
df = search_engine_encoder.transform(df)
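# Note: constructing OneHotEncoder and calling transform() directly, as above, matches the
# Spark 2.x API. On Spark 3.x OneHotEncoder is an Estimator, so the equivalent (untested
# here) would be:
#   df = OneHotEncoder(inputCol="PdDistrict_Num", outputCol="PdDistrict_Vector").fit(df).transform(df)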

print('-------------- test data transfer ------------------')

## 2.1 Turn the string columns into numeric indexes
## (note: the indexers are re-fit on the test set here; strictly speaking, the models fitted
## on the training data should be reused so that the indexes stay consistent)
search_engine_indexer = StringIndexer(inputCol="DayOfWeek", outputCol="DayOfWeek_Num").fit(df_t)    # fit() returns a StringIndexerModel
df_t = search_engine_indexer.transform(df_t)

search_engine_indexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Num").fit(df_t)
df_t = search_engine_indexer.transform(df_t)

search_engine_indexer = StringIndexer(inputCol="Category", outputCol="Category_Num").fit(df_t)
df_t = search_engine_indexer.transform(df_t)

## 2.2 One-hot encoding

## One-hot encode DayOfWeek
search_engine_encoder = OneHotEncoder(inputCol="DayOfWeek_Num", outputCol="DayOfWeek_Vector")
df_t = search_engine_encoder.transform(df_t)

## One-hot encode PdDistrict
search_engine_encoder = OneHotEncoder(inputCol="PdDistrict_Num", outputCol="PdDistrict_Vector")
df_t = search_engine_encoder.transform(df_t)


# 3 - Logistic regression training

print('-------------- LogisticRegression Training ------------------')

from pyspark.ml.feature import VectorAssembler     # VectorAssembler merges several columns into a single feature vector column

## 3.1 Combine the one-hot encoded DayOfWeek and PdDistrict vectors into the feature vector
df_assembler = VectorAssembler(inputCols=['DayOfWeek_Vector', 'PdDistrict_Vector'], outputCol="features")
df = df_assembler.transform(df)
df_t = df_assembler.transform(df_t)

## Keep only the feature vector and the label

model_df = df.select(['features', 'Category_Num'])
model_df_t = df_t.select(['features', 'Category_Num'])
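# Optionally inspect the assembled data before training (the corresponding output is not
# reproduced in this post):
model_df.printSchema()
model_df.show(5, truncate=False)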

## 3.2 Logistic regression

from pyspark.ml.classification import LogisticRegression                            # supports both multinomial (softmax) and binomial logistic regression

training_df, test_df = model_df.randomSplit([0.75, 0.25])                           # 75% of the data for training, 25% for validation

log_reg = LogisticRegression(labelCol='Category_Num').fit(training_df)              # fit() returns a LogisticRegressionModel

print('{}{}'.format('LogisticRegression Train accuracy:', log_reg.evaluate(training_df).accuracy))       # accuracy on the training split
print('{}{}'.format('LogisticRegression Test accuracy:', log_reg.evaluate(model_df_t).accuracy))         # accuracy on the test file


print('-------------- RandomForest Training ------------------')
from pyspark.ml.classification import RandomForestClassifier

training_df, test_df = model_df.randomSplit([0.75, 0.25])                           # 75% of the data for training, 25% for validation

rf_classifier = RandomForestClassifier(labelCol='Category_Num').fit(training_df)    # fit() returns a RandomForestClassificationModel

rf_predictions = rf_classifier.transform(test_df)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator  # multi-class evaluator; it expects a prediction column and a label column

rf_accuracy = MulticlassClassificationEvaluator(labelCol='Category_Num', metricName='accuracy').evaluate(rf_predictions)
print('{}{}'.format('RandomForestClassifier Validation accuracy:', rf_accuracy))    # accuracy on the 25% validation split

rf_predictions = rf_classifier.transform(model_df_t)

rf_accuracy = MulticlassClassificationEvaluator(labelCol='Category_Num', metricName='accuracy').evaluate(rf_predictions)
print('{}{}'.format('RandomForestClassifier Test accuracy:', rf_accuracy))          # accuracy on the test file
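The PySpark script reads from S3, which suggests it was run on an EMR or similar Spark cluster. A minimal way to submit it, assuming spark-submit is on the PATH and the cluster has access to the s3://my-cluster-zdy/ bucket (the exact flags depend on the cluster setup):

spark-submit CrimePredict_pyspark.py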

Run results
