from pyspark import SparkConf
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.mllib.evaluation import RankingMetrics, RegressionMetrics
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType
read and clean data
# Obtain a Spark session named "MyApp", requesting the mmlspark 0.17 package
# through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "Azure:mmlspark:0.17")
    .getOrCreate()
)

# Load adult.csv from HDFS: the first row supplies the column names and the
# column types are inferred from the data.
data = spark.read.csv(
    'hdfs:///user/hadoop/adult.csv',
    header=True,
    inferSchema=True,
)

# Preview the first ten rows as a pandas DataFrame (shown as the cell output).
data.limit(10).toPandas()
age
workclass
fnlwgt
education
education.num
marital.status
occupation
relationship
race
sex
capital.gain
capital.loss
hours.per.week
native.country
income
0
90
?
77053
HS-grad
9
Widowed
?
Not-in-family
White
Female
0
4356
40
United-States
<=50K
1
82
Private
132870
HS-grad
9
Widowed
Exec-managerial
Not-in-family
White
Female
0
4356
18
United-States
<=50K
2
66
?
186061
Some-college
10
Widowed
?
Unmarried
Black
Female
0
4356
40
United-States
<=50K
3
54
Private
140359
7th-8th
4
Divorced
Machine-op-inspct
Unmarried
White
Female
0
3900
40
United-States
<=50K
4
41
Private
264663
Some-college
10
Separated
Prof-specialty
Own-child
White
Female
0
3900
40
United-States
<=50K
5
34
Private
216864
HS-grad
9
Divorced
Other-service
Unmarried
White
Female
0
3770
45
United-States
<=50K
6
38
Private
150601
10th
6
Separated
Adm-clerical
Unmarried
White
Male
0
3770
40
United-States
<=50K
7
74
State-gov
88638
Doctorate
16
Never-married
Prof-specialty
Other-relative
White
Female
0
3683
20
United-States
>50K
8
68
Federal-gov
422013
HS-grad
9
Divorced
Prof-specialty
Not-in-family
White
Female
0
3683
40
United-States
<=50K
9
41
Private
70037
Some-college
10
Never-married
Craft-repair
Unmarried
White
Male
0
3004
60
?
>50K
Rename the dotted column names with withColumnRenamed
# Swap the dots in six column names for underscores, then keep only the six
# columns listed in the select.
# NOTE(review): presumably the renames exist because dotted names need
# backtick quoting in Spark column expressions -- confirm.
data = (
    data
    .withColumnRenamed('education.num', 'education_num')
    .withColumnRenamed('marital.status', 'marital_status')
    .withColumnRenamed('capital.gain', 'capital_gain')
    .withColumnRenamed('capital.loss', 'capital_loss')
    .withColumnRenamed('hours.per.week', 'hours_per_week')
    .withColumnRenamed('native.country', 'native_country')
    .select([
        "age",
        "education",
        "education_num",
        "marital_status",
        "hours_per_week",
        "income",
    ])
)

# Random split with weights 0.75 / 0.25; the seed is fixed for repeatability.
train, test = data.randomSplit([0.75, 0.25], seed=20200420)
from mmlspark import TrainClassifier
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, NaiveBayes