import scala.io.Source
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * A small k-nearest-neighbours (KNN) classifier driven by two comma-separated text files.
 *
 * The data used by the original author is the Iris data set, available from
 * http://archive.ics.uci.edu/ml/datasets/Iris (the same site hosts many other common
 * machine-learning data sets).
 */
object knn {

  /** Test-set path used when no path is given on the command line. */
  private val DefaultTestFile: String = "C:/Users/Desktop/MLData/knntest.txt"

  /** Training-set path used when no path is given on the command line. */
  private val DefaultTrainFile: String = "C:/Users/Desktop/MLData/knntrain.txt"

  /** Number of neighbours used when none is given on the command line. */
  private val DefaultK: Int = 3

  /**
   * Classifies every sample of the test file against the training file and prints one
   * predicted label per test sample.
   *
   * Each test line supplies its first four comma-separated fields as features. Each training
   * line supplies fields 0-3 as features and field 4 as the label. Blank lines are skipped.
   *
   * @param args optional overrides: `args(0)` test-file path, `args(1)` training-file path,
   *             `args(2)` number of neighbours; missing entries fall back to the defaults above
   */
  def main(args: Array[String]): Unit = {
    val testFile = args.lift(0).getOrElse(DefaultTestFile)
    val trainFile = args.lift(1).getOrElse(DefaultTrainFile)
    val knumber = args.lift(2).map(_.trim.toInt).getOrElse(DefaultK)

    val conf = new SparkConf().setMaster("local").setAppName("ML")
    val sc = new SparkContext(conf)
    try {
      val testdata = sc.textFile(testFile)
        .filter(_.trim.nonEmpty)
        .map { line =>
          val fields = line.split(',')
          Array(fields(0), fields(1), fields(2), fields(3))
        }
        .collect()

      // Move the label (field 4) to the front so rows match the layout expected by `k`.
      val traindata = sc.textFile(trainFile)
        .filter(_.trim.nonEmpty)
        .map { line =>
          val fields = line.split(',')
          Array(fields(4), fields(0), fields(1), fields(2), fields(3))
        }
        .collect()

      testdata.map(sample => k(sample, traindata, knumber, sc)).foreach(println)
    } finally {
      // Always release the context, even when parsing or classification fails.
      sc.stop()
    }
  }

  /**
   * Predicts the label of one sample by majority vote among its `knumber` nearest
   * training samples (Euclidean distance, see [[caldis]]).
   *
   * When several labels share the highest vote count, the label of the closest neighbour
   * among them wins, so the result is deterministic.
   *
   * @param testData  feature values of the sample to classify, e.g. `5.1,3.5,1.4,0.2`
   * @param trainData training rows laid out as `label,5.1,3.5,1.4,0.2`; the label is parsed
   *                  with `toDouble`, so it must be numeric
   * @param knumber   number of nearest neighbours that vote; must be positive
   * @param sc        unused; kept only so existing callers keep compiling
   * @return the winning label rendered as the string form of a `Double`, e.g. `"1.0"`
   * @throws IllegalArgumentException if `knumber` is not positive or `trainData` is empty
   */
  def k(testData: Array[String], trainData: Array[Array[String]], knumber: Int, sc: SparkContext): String = {
    require(knumber > 0, s"knumber must be positive, got $knumber")
    require(trainData.nonEmpty, "trainData must contain at least one labelled sample")

    val query = testData.map(_.toDouble)

    // (label, distance to the query point) for every training row.
    val labelledDistances = trainData.map { row =>
      (row(0).toDouble, caldis(query, row.drop(1).map(_.toDouble)))
    }

    // Ascending by distance; the sort is stable, so equally distant rows keep training order.
    val nearestLabels = labelledDistances
      .sortWith((a, b) => a._2 < b._2)
      .take(knumber)
      .map(_._1.toString)

    val votes = nearestLabels.groupBy(identity).map { case (label, hits) => label -> hits.length }

    // `distinct` preserves distance order and `maxBy` returns the first maximum, so a tie is
    // resolved in favour of the closest neighbour.
    nearestLabels.distinct.maxBy(votes)
  }

  /**
   * Computes the Euclidean distance between two points.
   *
   * @param x1 n-dimensional coordinates of the first point
   * @param x2 n-dimensional coordinates of the second point
   * @return the Euclidean distance between `x1` and `x2`
   * @throws IllegalArgumentException if the two points do not have the same number of dimensions
   */
  def caldis(x1: Array[Double], x2: Array[Double]): Double = {
    require(x1.length == x2.length, s"dimension mismatch: ${x1.length} vs ${x2.length}")
    val sumOfSquares = x1.indices.foldLeft(0.0) { (acc, i) =>
      val diff = x1(i) - x2(i)
      acc + diff * diff
    }
    math.sqrt(sumOfSquares)
  }
}
// 算法小白的第一次嘗試---KNN
// 發表評論
// 所有評論
// 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.