Using Sklearn and PySpark to Predict on the San Francisco Crime Dataset

Dataset format
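Both processed_train.csv and processed_test.csv are assumed to keep the columns of the Kaggle "San Francisco Crime Classification" data; the scripts below rely on only four of them: Dates (incident timestamp), Category (the crime type, used as the label), DayOfWeek, and PdDistrict (the police district).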

CrimePredict.py

import pandas as pd
import numpy as np

res_dic = []   # collects the metric strings that are written to result.txt at the end

# 1. Load the data
train = pd.read_csv('processed_train.csv', parse_dates=['Dates'])   # input train_path
test = pd.read_csv('processed_test.csv', parse_dates=['Dates'])     # input test_path

# 2. Preprocessing: encode the Category column as integer class labels
from sklearn import preprocessing
label = preprocessing.LabelEncoder()
crime = label.fit_transform(train.Category)   # map each crime category to an integer id
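# For illustration (values not from the original data): LabelEncoder sorts the class names,
# so e.g. fit_transform(['ASSAULT', 'ARSON', 'ASSAULT']) returns array([1, 0, 1]) with classes_ == ['ARSON', 'ASSAULT']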

# 3. Binarize (one-hot encode) the Dates (hour), DayOfWeek and PdDistrict features, since all three appear in both the training and test sets
days = pd.get_dummies(train.DayOfWeek)
district = pd.get_dummies(train.PdDistrict)
hour = pd.get_dummies(train.Dates.dt.hour)
#month = pd.get_dummies(train.Dates.dt.month)

train_data = pd.concat([days, district, hour], axis=1)   # join days, district and hour into one table; with axis=1, concat aligns on the row index and places the differently named columns side by side
train_data['crime'] = crime   # append one more column to the DataFrame; here it serves as the label
# In effect only three features plus the crime category (as label) are used, i.e. only 4 columns of the original dataset,
# but train_data expands those 3 features into a few dozen binary columns, each row paired with one label
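# For example, DayOfWeek expands into 7 indicator columns, PdDistrict into 10 (one per SF police district) and the hour into 24, i.e. 41 binary features per row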

crime = label.transform(test.Category)   # reuse the encoder fitted on the training set so the class ids stay consistent (assumes the test set contains no unseen categories)
# Apply the same processing to the test set
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = pd.get_dummies(test.Dates.dt.hour)
#month = pd.get_dummies(test.Dates.dt.month)
test_data = pd.concat([days, district, hour], axis=1)
test_data['crime'] = crime

# 4. Split the samples into a training set and a validation set (70% train, 30% validation)
from sklearn.model_selection import train_test_split
training, validation = train_test_split(train_data, train_size=0.7)


# 5. Naive Bayes (Bernoulli)
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
#from pyspark.ml.classification import NaiveBayes   # only needed for the commented-out PySpark variant below

model = BernoulliNB()
#model = NaiveBayes()
feature_list = training.columns.tolist()   # column names as a list
feature_list = feature_list[:len(feature_list) - 1]   # keep only the feature columns; the last column is the 'crime' label (slicing excludes the end index)
model.fit(training[feature_list], training['crime'])    # fit the model on the training split

predicted = np.array(model.predict_proba(validation[feature_list]))   # validation[feature_list] excludes the 'crime' column; predict_proba returns, at row i and column j, the probability that sample i belongs to class j (columns are the class labels in ascending order); these are the probabilities on the validation split
predicted_acc = np.array(model.predict(validation[feature_list]))
predicted_test_acc = np.array(model.predict(test_data[feature_list]))
logLoss = "BernoulliNB log loss: " + str(log_loss(validation['crime'], predicted))
trainAcc = "BernoulliNB train accuracy: " + str(accuracy_score(validation['crime'], predicted_acc))
testAcc = "BernoulliNB test accuracy: " + str(accuracy_score(test_data['crime'], predicted_test_acc))
print (logLoss)   #多分類的對數損失
print (trainAcc)
print (testAcc)
res_dic.append(logLoss)
res_dic.append(trainAcc)
res_dic.append(testAcc)

# 6. Other models (logistic regression, random forest)
from sklearn.linear_model import LogisticRegression
#from pyspark.ml.classification import  LogisticRegression
model_LR = LogisticRegression(C=0.1)
model_LR.fit(training[feature_list], training['crime'])
predicted = np.array(model_LR.predict_proba(validation[feature_list]))
predicted_acc = np.array(model_LR.predict(validation[feature_list]))
predicted_test_acc = np.array(model_LR.predict(test_data[feature_list]))
logLoss = "LogisticRegression log loss: " + str(log_loss(validation['crime'], predicted))
trainAcc = "LogisticRegression train accuracy: " + str(accuracy_score(validation['crime'], predicted_acc))
testAcc = "LogisticRegression test accuracy: " + str(accuracy_score(test_data['crime'], predicted_test_acc))
print (logLoss)   #多分類的對數損失
print (trainAcc)
print (testAcc)
res_dic.append(logLoss)
res_dic.append(trainAcc)
res_dic.append(testAcc)

from sklearn.ensemble import RandomForestClassifier
#from pyspark.ml.regression import RandomForestRegressor   # only needed for the commented-out regressor variant below
model_RF = RandomForestClassifier()
#model_RF = RandomForestRegressor()
model_RF.fit(training[feature_list], training['crime'])
predicted = np.array(model_RF.predict_proba(validation[feature_list]))
predicted_acc = np.array(model_RF.predict(validation[feature_list]))
predicted_test_acc = np.array(model_RF.predict(test_data[feature_list]))
logLoss = "RandomForest log loss: " + str(log_loss(validation['crime'], predicted))
trainAcc = "RandomForest train accuracy: " + str(accuracy_score(validation['crime'], predicted_acc))
testAcc = "RandomForest test accuracy: " + str(accuracy_score(test_data['crime'], predicted_test_acc))
print (logLoss)   #多分類的對數損失
print (trainAcc)
print (testAcc)
res_dic.append(logLoss)
res_dic.append(trainAcc)
res_dic.append(testAcc)


with open('result.txt','w') as f:#output_path
    for res in res_dic:
        f.write(res + '\n')
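Running the script as python CrimePredict.py (with processed_train.csv and processed_test.csv in the working directory) prints log loss, validation accuracy and test accuracy for each of the three models and writes the same nine lines to result.txt.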

Run results

CrimePredict_pyspark.py

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('log_reg').getOrCreate()

# Read the training and test data from s3://my-cluster-zdy/
df = spark.read.csv('s3://my-cluster-zdy/processed_train.csv',inferSchema=True,header=True)#input_path
df_t = spark.read.csv('s3://my-cluster-zdy/processed_test.csv',inferSchema=True,header=True)

print('-------------- train data transform ------------------')

from pyspark.ml.feature import StringIndexer   # StringIndexer indexes a string column by value frequency; the most frequent value gets index 0
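# For illustration: if LARCENY/THEFT were the most frequent Category, it would be indexed as 0.0, the second most frequent as 1.0, and so on (the actual order depends on the data)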

## 2.1 Convert the string columns into numeric indices
search_engine_indexer = StringIndexer(inputCol="DayOfWeek", outputCol="DayOfWeek_Num").fit(df)    # fit returns the fitted model, a StringIndexerModel
df = search_engine_indexer.transform(df)       # apply the fitted model and return the transformed DataFrame

search_engine_indexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Num").fit(df)
df = search_engine_indexer.transform(df)

search_engine_indexer = StringIndexer(inputCol="Category", outputCol="Category_Num").fit(df)
df = search_engine_indexer.transform(df)

## 2.2 One-hot encoding
from pyspark.ml.feature import OneHotEncoder   # OneHotEncoder turns each category index into a sparse 0/1 indicator vector
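# For illustration: with 7 distinct DayOfWeek values, index 2.0 becomes the sparse vector (6,[2],[1.0]); the default dropLast=True drops the last category, hence length 6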

## one-hot encode DayOfWeek
search_engine_encoder = OneHotEncoder(inputCol="DayOfWeek_Num", outputCol="DayOfWeek_Vector").fit(df)   # on Spark 3.x OneHotEncoder is an estimator and must be fitted first (on Spark 2.x it can transform directly)
df = search_engine_encoder.transform(df)

## one-hot encode PdDistrict
search_engine_encoder = OneHotEncoder(inputCol="PdDistrict_Num", outputCol="PdDistrict_Vector").fit(df)
df = search_engine_encoder.transform(df)

print('-------------- test data transform ------------------')

## 2.1 Convert the string columns into numeric indices
## The indexers are fitted on the TRAINING data so that the test set gets exactly the same index mapping
## (the .select(...) is needed because df already contains the indexed columns and Spark refuses an existing output column)
search_engine_indexer = StringIndexer(inputCol="DayOfWeek", outputCol="DayOfWeek_Num").fit(df.select("DayOfWeek"))
df_t = search_engine_indexer.transform(df_t)

search_engine_indexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Num").fit(df.select("PdDistrict"))
df_t = search_engine_indexer.transform(df_t)

# the label mapping in particular must come from the training data, otherwise the reported test accuracy is meaningless
# (this assumes every Category value in the test file also appears in the training file)
search_engine_indexer = StringIndexer(inputCol="Category", outputCol="Category_Num").fit(df.select("Category"))
df_t = search_engine_indexer.transform(df_t)

## 2.2 One-hot encoding

## one-hot encode DayOfWeek, again fitting on the training data so the vector sizes match
search_engine_encoder = OneHotEncoder(inputCol="DayOfWeek_Num", outputCol="DayOfWeek_Vector").fit(df.select("DayOfWeek_Num"))
df_t = search_engine_encoder.transform(df_t)

## one-hot encode PdDistrict
search_engine_encoder = OneHotEncoder(inputCol="PdDistrict_Num", outputCol="PdDistrict_Vector").fit(df.select("PdDistrict_Num"))
df_t = search_engine_encoder.transform(df_t)


# 3 - Train a logistic regression model

print('-------------- LogisticRegression Training ------------------')

from pyspark.ml.feature import VectorAssembler     # VectorAssembler merges several columns into a single vector column; the assembled features go into one 'features' column, separate from the label
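# For illustration: with 7 weekdays and 10 districts, DayOfWeek_Vector (6 dims) and PdDistrict_Vector (9 dims) are concatenated into one 15-dimensional 'features' vector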

## 3.1 Combine the one-hot encoded DayOfWeek and PdDistrict vectors into a single feature vector
df_assembler = VectorAssembler(inputCols=['DayOfWeek_Vector','PdDistrict_Vector'], outputCol="features")
df = df_assembler.transform(df)
df_t = df_assembler.transform(df_t)

## Keep only the feature vector and the label

model_df=df.select(['features','Category_Num'])
model_df_t=df_t.select(['features','Category_Num'])

## 3.2 Logistic regression

from pyspark.ml.classification import LogisticRegression                            # supports both multinomial (softmax) and binomial logistic regression

training_df,test_df=model_df.randomSplit([0.75,0.25])                               # split the data: 75% for training, 25% for validation

log_reg=LogisticRegression(labelCol='Category_Num').fit(training_df)                      # returns a LogisticRegressionModel

print('{}{}'.format('LogisticRegression Train accuracy:',log_reg.evaluate(training_df).accuracy) )         # accuracy on the training split
print('{}{}'.format('LogisticRegression Test accuracy:',log_reg.evaluate(model_df_t).accuracy) )         # accuracy on the held-out test file


print('-------------- RandomForest Training ------------------')
from pyspark.ml.classification import RandomForestClassifier

training_df,test_df=model_df.randomSplit([0.75,0.25])                               # a fresh 75/25 split for the random forest

rf_classifier=RandomForestClassifier(labelCol='Category_Num').fit(training_df)                      # returns a RandomForestClassificationModel

rf_predictions=rf_classifier.transform(test_df)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator  # evaluator for multiclass classification; it expects a prediction column and a label column

rf_accuracy=MulticlassClassificationEvaluator(labelCol='Category_Num',metricName='accuracy').evaluate(rf_predictions)
#print('Random forest validation accuracy: {0:.0%}'.format(rf_accuracy))
print('{}{}'.format('RandomForestClassifier Validation accuracy:',rf_accuracy) )         # accuracy on the 25% validation split

rf_predictions=rf_classifier.transform(model_df_t)

rf_accuracy=MulticlassClassificationEvaluator(labelCol='Category_Num',metricName='accuracy').evaluate(rf_predictions)
#print('Random forest test accuracy: {0:.0%}'.format(rf_accuracy))
print('{}{}'.format('RandomForestClassifier Test accuracy:',rf_accuracy) )         # accuracy on the held-out test file

Run results
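As a closing note, the same preprocessing and model can also be written as a single pyspark.ml Pipeline, so every stage fitted on the training data is automatically reused on the test data. The sketch below is only an illustration, assuming Spark 3.x, the column names used above, and freshly loaded DataFrames (raw_train / raw_test are hypothetical names for the CSVs as read by spark.read.csv, reusing the SparkSession created above); it assumes the test file contains no Category values unseen in training.

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# re-read the CSVs so no derived columns exist yet (hypothetical variable names)
raw_train = spark.read.csv('s3://my-cluster-zdy/processed_train.csv', inferSchema=True, header=True)
raw_test = spark.read.csv('s3://my-cluster-zdy/processed_test.csv', inferSchema=True, header=True)

pipeline = Pipeline(stages=[
    StringIndexer(inputCol="DayOfWeek", outputCol="DayOfWeek_Num"),
    StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Num"),
    StringIndexer(inputCol="Category", outputCol="Category_Num"),
    OneHotEncoder(inputCol="DayOfWeek_Num", outputCol="DayOfWeek_Vector"),
    OneHotEncoder(inputCol="PdDistrict_Num", outputCol="PdDistrict_Vector"),
    VectorAssembler(inputCols=["DayOfWeek_Vector", "PdDistrict_Vector"], outputCol="features"),
    LogisticRegression(labelCol="Category_Num"),
])

pipeline_model = pipeline.fit(raw_train)                 # every stage is fitted on the training data only
test_predictions = pipeline_model.transform(raw_test)    # the fitted stages are reused on the test data

acc = MulticlassClassificationEvaluator(labelCol="Category_Num", metricName="accuracy").evaluate(test_predictions)
print('Pipeline LogisticRegression Test accuracy:', acc)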
