算法小白的第一次嘗試---KNN

import scala.io.Source
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * k-nearest-neighbours (KNN) classifier demo.
 *
 * Data source: the Iris data set from the UCI machine learning repository
 * (which also hosts many other common machine-learning data sets):
 * http://archive.ics.uci.edu/ml/datasets/Iris
 */
object knn {
  /**
   * Reads the test and training files, classifies every test row with [[k]]
   * and prints one predicted label per line.
   *
   * Each input line is comma separated. Test rows use columns 0-3 as the
   * features; training rows use columns 0-3 as the features and column 4 as
   * the label.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("ML")
    val sc = new SparkContext(conf)
    try {
      val fileName = "C:/Users/Desktop/MLData/knntest.txt"
      val testdata = sc.textFile(fileName)
        .filter(_.trim.nonEmpty) // skip blank lines instead of failing on strArr(1)
        .map { line =>
          val strArr = line.split(',')
          Array(strArr(0), strArr(1), strArr(2), strArr(3))
        }.collect()

      val fileName2 = "C:/Users/Desktop/MLData/knntrain.txt"
      val traindata = sc.textFile(fileName2)
        .filter(_.trim.nonEmpty)
        .map { line =>
          val strArr = line.split(',')
          // Move the label to the front: label, f0, f1, f2, f3.
          Array(strArr(4), strArr(0), strArr(1), strArr(2), strArr(3))
        }.collect()

      val knumber = 3
      testdata.map(row => k(row, traindata, knumber, sc)).foreach(println)
    } finally {
      // Always release the context, even when reading or parsing fails.
      sc.stop()
    }
  }

  /**
   * Classifies one test point by majority vote among its `knumber` nearest
   * training points (Euclidean distance, see [[caldis]]).
   *
   * When several labels share the highest vote count, the one whose closest
   * occurrence is nearest to the test point wins.
   *
   * @param testData  feature values only, e.g. `5.1,3.5,1.4,0.2`
   * @param trainData rows of label followed by feature values, e.g. `label,5.1,3.5,1.4,0.2`
   * @param knumber   number of neighbours that vote; must be positive
   * @param sc        no longer used (voting is done locally); kept so existing callers still compile
   * @return the winning label; numeric labels are rendered as a `Double` (`"1"` becomes `"1.0"`),
   *         any other label is returned verbatim
   * @throws IllegalArgumentException if `trainData` is empty or `knumber` is not positive
   * @throws NumberFormatException    if a feature value is not a number
   */
  def k(testData: Array[String], trainData: Array[Array[String]], knumber: Int, sc: SparkContext): String = {
    require(trainData.nonEmpty, "trainData must contain at least one row")
    require(knumber > 0, "knumber must be positive")

    val query = testData.map(_.toDouble)

    // (label, distance to the test point) for every training row, closest first.
    // sortWith is stable, so equally distant rows keep their input order.
    val nearest = trainData
      .map { row =>
        val rawLabel = row(0)
        val label = scala.util.Try(rawLabel.toDouble).map(_.toString).getOrElse(rawLabel)
        (label, caldis(query, row.drop(1).map(_.toDouble)))
      }
      .sortWith(_._2 < _._2)
      .take(knumber)
      .map(_._1)

    val votes = nearest.groupBy(identity).map { case (label, hits) => label -> hits.length }
    val topVotes = votes.values.max

    // `nearest` is ordered by distance, so the first label holding the top
    // vote count is also the tie-break winner.
    nearest.find(label => votes(label) == topVotes).getOrElse(nearest.head)
  }

  /**
   * Euclidean distance between two points.
   *
   * Only the first `x1.length` coordinates of `x2` are used.
   *
   * @param x1 coordinates of the first point
   * @param x2 coordinates of the second point; must have at least `x1.length` entries
   * @return the Euclidean distance between the two points
   * @throws IllegalArgumentException if `x2` has fewer entries than `x1`
   */
  def caldis(x1: Array[Double], x2: Array[Double]): Double = {
    require(x2.length >= x1.length, "x2 must have at least as many coordinates as x1")
    val squaredSum = x1.indices.foldLeft(0.0) { (acc, i) =>
      val diff = x1(i) - x2(i)
      acc + diff * diff
    }
    math.sqrt(squaredSum)
  }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章