SparkML之分類(二)logistic迴歸

前面已經介紹過logistic迴歸的理論，在此不再贅述（見 http://blog.csdn.net/legotime/article/details/51312393）

Logistic 函數（用於分類時又稱 Sigmoid 函數）如下：$h(z) = \dfrac{1}{1 + e^{-z}}$

logistic函數早期用於人口預測。隨着應用範圍的擴展，它逐漸被用於分類問題，同時也是神經網絡中經常使用的過渡函數（激活函數）。

圖1是logistic函數的圖像：

圖1

它的原理是：在二分類的情況下，當 h 的計算值大於 0.5 時，令 h 等於 1；當 h 的計算值小於等於 0.5 時，令 h 等於 0。這樣，對於任意一個輸入 X，輸出結果只會是 0 或 1，從而達到分類的效果。當然，logistic函數也可以推廣到多分類的情況。

-----------------------------------------------------------------------------------------------------

spark Logistic模型訓練圖

-------------------------------------------------------------------------------------------------------

源碼分析

package org.apache.spark.mllib.classification

import org.apache.spark.SparkContext
import org.apache.spark.annotation.Since
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.classification.impl.GLMClassificationModel
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.linalg.BLAS.dot
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.pmml.PMMLExportable
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.util.{DataValidators, Loader, Saveable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel

/**
 * Classification model trained with binary or multinomial logistic regression.
 *
 * @param weights Weights computed for every feature. Must be a dense vector.
 * @param intercept Intercept used by the binary model. The multinomial prediction path does
 *                  not read this value; it looks for per-class intercepts packed into
 *                  `weights` instead.
 * @param numFeatures Dimension of the feature vectors.
 * @param numClasses Number of possible outcomes of the classification problem. The
 *                   two-argument constructor sets it to 2 (binary logistic regression).
 */
@Since("0.8.0")
class LogisticRegressionModel @Since("1.3.0") (
    @Since("1.0.0") override val weights: Vector,
    @Since("1.0.0") override val intercept: Double,
    @Since("1.3.0") val numFeatures: Int,
    @Since("1.3.0") val numClasses: Int)
  extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable
  with Saveable with PMMLExportable {

  /**
   * Builds a binary model: `numFeatures` is taken from `weights.size` and `numClasses` is 2.
   */
  @Since("1.0.0")
  def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2)

  // Validate that the length of `weights` is consistent with numFeatures / numClasses.
  numClasses match {
    case 2 =>
      // Binary case: exactly one weight per feature.
      require(weights.size == numFeatures,
        s"LogisticRegressionModel with numClasses = 2 was given non-matching values:" +
        s" numFeatures = $numFeatures, but weights.size = ${weights.size}")
    case _ =>
      // Other cases: (numClasses - 1) rows, each holding numFeatures weights and
      // optionally one trailing intercept.
      val weightsSizeWithoutIntercept = (numClasses - 1) * numFeatures
      val weightsSizeWithIntercept = (numClasses - 1) * (numFeatures + 1)
      val lengthIsAcceptable =
        weights.size == weightsSizeWithoutIntercept || weights.size == weightsSizeWithIntercept
      require(lengthIsAcceptable,
        s"LogisticRegressionModel.load with numClasses = $numClasses and numFeatures = $numFeatures" +
        s" expected weights of length $weightsSizeWithoutIntercept (without intercept)" +
        s" or $weightsSizeWithIntercept (with intercept)," +
        s" but was given weights of length ${weights.size}")
  }

  // Number of entries of `weights` that belong to one of the (numClasses - 1) rows.
  private val dataWithBiasSize: Int = weights.size / (numClasses - 1)

  // Raw backing array of the weights; only dense vectors are accepted.
  private val weightsArray: Array[Double] = weights match {
    case dense: DenseVector =>
      dense.values
    case _ =>
      throw new IllegalArgumentException(
        s"weights only supports dense vector but got type ${weights.getClass}.")
  }

  // Cut-off applied to the binary score; None means "return the score itself".
  private var threshold: Option[Double] = Some(0.5)

  /**
   * Sets the threshold used in binary classification: a score strictly greater than the
   * threshold is predicted as 1.0, anything else as 0.0. The initial threshold is 0.5.
   */
  @Since("1.0.0")
  def setThreshold(threshold: Double): this.type = {
    this.threshold = Some(threshold)
    this
  }

  /**
   * Returns the threshold (if any) used to turn the score into a 0/1 prediction.
   * It is only read by the binary prediction path.
   */
  @Since("1.3.0")
  def getThreshold: Option[Double] = threshold

  /**
   * Removes the threshold so that binary predictions return the score instead of a 0/1 label.
   * It is only read by the binary prediction path.
   */
  @Since("1.0.0")
  def clearThreshold(): this.type = {
    threshold = None
    this
  }

  override protected def predictPoint(
      dataMatrix: Vector,
      weightMatrix: Vector,
      intercept: Double): Double = {
    require(dataMatrix.size == numFeatures)

    if (numClasses != 2) {
      /**
       * Compute the margin of every non-pivot class and keep the largest one. If no margin is
       * positive, the prediction is the first class (index 0).
       *
       * PS, if you want to compute the probabilities for each outcome instead of the outcome
       * with maximum probability, remember to subtract the maxMargin from margins if maxMargin
       * is positive to prevent overflow.
       */
      val hasPackedIntercept = dataMatrix.size + 1 == dataWithBiasSize
      var winner = 0
      var winningMargin = 0.0
      var k = 0
      while (k < numClasses - 1) {
        val rowStart = k * dataWithBiasSize
        var acc = 0.0
        dataMatrix.foreachActive { (j, x) =>
          if (x != 0.0) acc += x * weightsArray(rowStart + j)
        }
        // The per-class intercept, when present, sits right after the feature weights.
        if (hasPackedIntercept) {
          acc += weightsArray(rowStart + dataMatrix.size)
        }
        if (acc > winningMargin) {
          winningMargin = acc
          winner = k + 1
        }
        k += 1
      }
      winner.toDouble
    } else {
      // Binary logistic regression: sigmoid of the linear margin.
      val margin = dot(weightMatrix, dataMatrix) + intercept
      val score = 1.0 / (1.0 + math.exp(-margin))
      threshold.fold(score)(cutoff => if (score > cutoff) 1.0 else 0.0)
    }
  }

  /**
   * Persists the model (dimensions, weights, intercept and threshold) under `path` through
   * `GLMClassificationModel.SaveLoadV1_0`.
   */
  @Since("1.3.0")
  override def save(sc: SparkContext, path: String): Unit = {
    val modelClassName = this.getClass.getName
    GLMClassificationModel.SaveLoadV1_0.save(
      sc, path, modelClassName, numFeatures, numClasses, weights, intercept, threshold)
  }

  override protected def formatVersion: String = "1.0"

  override def toString: String = {
    s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}"
  }
}

@Since("1.3.0")
object LogisticRegressionModel extends Loader[LogisticRegressionModel] {

  /**
   * Reads back a model stored under `path`.
   *
   * Only metadata whose class name is the hard-coded V1.0 name and whose format version is
   * "1.0" is accepted; anything else raises an `Exception`. The stored threshold (or its
   * absence) is re-applied to the returned model.
   */
  @Since("1.3.0")
  override def load(sc: SparkContext, path: String): LogisticRegressionModel = {
    // Hard-code class name string in case it changes in the future
    val classNameV1_0 = "org.apache.spark.mllib.classification.LogisticRegressionModel"
    val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path)

    if (version != "1.0" || loadedClassName != classNameV1_0) {
      throw new Exception(
        s"LogisticRegressionModel.load did not recognize model with (className, format version):" +
        s"($loadedClassName, $version).  Supported:\n" +
        s"  ($classNameV1_0, 1.0)")
    }

    val (numFeatures, numClasses) = ClassificationModel.getNumFeaturesClasses(metadata)
    val stored = GLMClassificationModel.SaveLoadV1_0.loadData(sc, path, classNameV1_0)
    // numFeatures, numClasses and weights are validated by the model constructor.
    val restored =
      new LogisticRegressionModel(stored.weights, stored.intercept, numFeatures, numClasses)
    stored.threshold match {
      case None => restored.clearThreshold()
      case Some(cutoff) => restored.setThreshold(cutoff)
    }
    restored
  }
}

/**
 * Trains a logistic regression classification model with stochastic gradient descent.
 * The optimizer is created with a [[SquaredL2Updater]] (L2 regularization); its settings can be
 * changed through [[LogisticRegressionWithSGD.optimizer]].
 * NOTE: input labels are checked with `DataValidators.binaryLabelValidator`.
 */
@Since("0.8.0")
@deprecated("Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS", "2.0.0")
class LogisticRegressionWithSGD private[mllib] (
    private var stepSize: Double,
    private var numIterations: Int,
    private var regParam: Double,
    private var miniBatchFraction: Double)
  extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {

  /**
   * Builds a trainer with the default parameters
   * {stepSize: 1.0, numIterations: 100, regParam: 0.01, miniBatchFraction: 1.0}.
   */
  @Since("0.8.0")
  def this() = this(stepSize = 1.0, numIterations = 100, regParam = 0.01, miniBatchFraction = 1.0)

  private val gradient = new LogisticGradient()
  private val updater = new SquaredL2Updater()

  // Gradient descent optimizer seeded with the constructor arguments.
  @Since("0.8.0")
  override val optimizer = {
    val sgd = new GradientDescent(gradient, updater)
    sgd.setStepSize(stepSize)
    sgd.setNumIterations(numIterations)
    sgd.setRegParam(regParam)
    sgd.setMiniBatchFraction(miniBatchFraction)
    sgd
  }

  override protected val validators = DataValidators.binaryLabelValidator :: Nil

  // Wraps the trained weights and intercept in a binary LogisticRegressionModel.
  override protected[mllib] def createModel(weights: Vector, intercept: Double) =
    new LogisticRegressionModel(weights, intercept)
}

/**
 * Top-level helper methods for training logistic regression with stochastic gradient descent.
 * NOTE: Labels used in Logistic Regression should be {0, 1}
 */
@Since("0.8.0")
@deprecated("Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS", "2.0.0")
object LogisticRegressionWithSGD {
  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
  // Java programs.

  /**
   * Trains a logistic regression model on an RDD of (label, features) pairs, running a fixed
   * number of gradient descent iterations with the given step size. Each iteration uses
   * `miniBatchFraction` of the data to compute the gradient, and the optimization starts from
   * the supplied initial weights. The regularization parameter passed to the trainer is 0.0.
   * NOTE: Labels used in Logistic Regression should be {0, 1}
   *
   * @param input RDD of (label, array of features) pairs.
   * @param numIterations Number of iterations of gradient descent to run.
   * @param stepSize Step size to be used for each iteration of gradient descent.
   * @param miniBatchFraction Fraction of data to be used per iteration.
   * @param initialWeights Initial set of weights to be used. Array should be equal in size to
   *        the number of features in the data.
   * @return the LogisticRegressionModel produced by the trainer.
   */
  @Since("1.0.0")
  def train(
      input: RDD[LabeledPoint],
      numIterations: Int,
      stepSize: Double,
      miniBatchFraction: Double,
      initialWeights: Vector): LogisticRegressionModel = {
    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction)
      .run(input, initialWeights)
  }

  /**
   * Trains a logistic regression model on an RDD of (label, features) pairs, running a fixed
   * number of gradient descent iterations with the given step size. Each iteration uses
   * `miniBatchFraction` of the data to compute the gradient. The regularization parameter
   * passed to the trainer is 0.0.
   * NOTE: Labels used in Logistic Regression should be {0, 1}
   *
   * @param input RDD of (label, array of features) pairs.
   * @param numIterations Number of iterations of gradient descent to run.
   * @param stepSize Step size to be used for each iteration of gradient descent.
   * @param miniBatchFraction Fraction of data to be used per iteration.
   * @return the LogisticRegressionModel produced by the trainer.
   */
  @Since("1.0.0")
  def train(
      input: RDD[LabeledPoint],
      numIterations: Int,
      stepSize: Double,
      miniBatchFraction: Double): LogisticRegressionModel = {
    new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction)
      .run(input)
  }

  /**
   * Trains a logistic regression model on an RDD of (label, features) pairs, running a fixed
   * number of gradient descent iterations with the given step size. Delegates to the
   * four-argument overload with a mini-batch fraction of 1.0.
   * NOTE: Labels used in Logistic Regression should be {0, 1}
   *
   * @param input RDD of (label, array of features) pairs.
   * @param numIterations Number of iterations of gradient descent to run.
   * @param stepSize Step size to be used for each iteration of gradient descent.
   * @return a LogisticRegressionModel which has the weights and offset from training.
   */
  @Since("1.0.0")
  def train(
      input: RDD[LabeledPoint],
      numIterations: Int,
      stepSize: Double): LogisticRegressionModel = {
    train(input, numIterations, stepSize, 1.0)
  }

  /**
   * Trains a logistic regression model on an RDD of (label, features) pairs, running a fixed
   * number of gradient descent iterations. Delegates to the four-argument overload with a
   * step size of 1.0 and a mini-batch fraction of 1.0.
   * NOTE: Labels used in Logistic Regression should be {0, 1}
   *
   * @param input RDD of (label, array of features) pairs.
   * @param numIterations Number of iterations of gradient descent to run.
   * @return a LogisticRegressionModel which has the weights and offset from training.
   */
  @Since("1.0.0")
  def train(
      input: RDD[LabeledPoint],
      numIterations: Int): LogisticRegressionModel = {
    train(input, numIterations, 1.0, 1.0)
  }
}

/**
 * Trains a binary or multinomial (K-class) logistic regression classification model with
 * Limited-memory BFGS. The optimizer is created with a [[SquaredL2Updater]] (L2 regularization).
 * NOTE: for a K-class problem, labels should be {0, 1, ..., K - 1}.
 *
 * For binary problems whose updater is one of the standard ones (L1Updater or
 * SquaredL2Updater), training is handed over to ml.classification.LogisticRegression;
 * in every other case the mllib GeneralizedLinearAlgorithm trainer is used.
 */
@Since("1.1.0")
class LogisticRegressionWithLBFGS
  extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {

  this.setFeatureScaling(true)

  @Since("1.1.0")
  override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater)

  override protected val validators = List(multiLabelValidator)

  // Picks the label validator matching the configured number of classes:
  // multi-label when more than one linear predictor is used, binary otherwise.
  private def multiLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
    if (numOfLinearPredictor > 1) {
      DataValidators.multiLabelValidator(numOfLinearPredictor + 1)(data)
    } else {
      DataValidators.binaryLabelValidator(data)
    }
  }

  /**
   * Sets the number of possible outcomes for a K-class classification problem in
   * multinomial logistic regression. Must be greater than 1; a value above 2 also swaps the
   * optimizer's gradient for a `LogisticGradient` built for that many classes.
   */
  @Since("1.3.0")
  def setNumClasses(numClasses: Int): this.type = {
    require(numClasses > 1)
    numOfLinearPredictor = numClasses - 1
    if (numClasses > 2) {
      optimizer.setGradient(new LogisticGradient(numClasses))
    }
    this
  }

  // Binary training yields the two-argument model; otherwise the class count is
  // numOfLinearPredictor + 1.
  override protected def createModel(weights: Vector, intercept: Double) = {
    if (numOfLinearPredictor == 1) {
      new LogisticRegressionModel(weights, intercept)
    } else {
      new LogisticRegressionModel(weights, intercept, numFeatures, numOfLinearPredictor + 1)
    }
  }

  /**
   * Run Logistic Regression with the configured parameters on an input RDD
   * of LabeledPoint entries, starting from generated initial weights.
   *
   * If the problem is binary and the updater is an L1Updater or a SquaredL2Updater, the ml
   * implementation is used; otherwise the mllib implementation is used.
   */
  override def run(input: RDD[LabeledPoint]): LogisticRegressionModel = {
    run(input, generateInitialWeights(input), userSuppliedWeights = false)
  }

  /**
   * Run Logistic Regression with the configured parameters on an input RDD
   * of LabeledPoint entries, starting from the weights provided by the caller.
   *
   * If the problem is binary and the updater is an L1Updater or a SquaredL2Updater, the ml
   * implementation is used; otherwise the mllib implementation is used.
   *
   * NOTE: the ml route below does not forward the number of LBFGS corrections from the
   * optimizer, so `optimizer.setNumCorrections()` has no effect when that route is taken.
   */
  override def run(input: RDD[LabeledPoint], initialWeights: Vector): LogisticRegressionModel = {
    run(input, initialWeights, userSuppliedWeights = true)
  }

  // Shared implementation of both public `run` overloads. `userSuppliedWeights` tells whether
  // `initialWeights` came from the caller (and must seed the ml model) or was generated.
  private def run(input: RDD[LabeledPoint], initialWeights: Vector, userSuppliedWeights: Boolean):
      LogisticRegressionModel = {
    // ml's Logistic regression only supports binary classification currently.
    if (numOfLinearPredictor == 1) {
      def runWithMlLogisticRegression(elasticNetParam: Double) = {
        // Prepare the ml LogisticRegression based on our settings
        val lr = new org.apache.spark.ml.classification.LogisticRegression()
        lr.setRegParam(optimizer.getRegParam())
        lr.setElasticNetParam(elasticNetParam)
        lr.setStandardization(useFeatureScaling)
        if (userSuppliedWeights) {
          val uid = Identifiable.randomUID("logreg-static")
          lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(
            uid, initialWeights.asML, 1.0))
        }
        lr.setFitIntercept(addIntercept)
        lr.setMaxIter(optimizer.getNumIterations())
        lr.setTol(optimizer.getConvergenceTol())
        // Convert our input into a DataFrame
        val sqlContext = new SQLContext(input.context)
        import sqlContext.implicits._
        val df = input.map(_.asML).toDF()
        // Determine if we should cache the DF
        val handlePersistence = input.getStorageLevel == StorageLevel.NONE
        // Train our model
        val mlLogisticRegressionModel = lr.train(df, handlePersistence)
        // convert the model
        val weights = Vectors.dense(mlLogisticRegressionModel.coefficients.toArray)
        createModel(weights, mlLogisticRegressionModel.intercept)
      }
      // Elastic-net mixing: 0.0 for the L2 updater, 1.0 for the L1 updater.
      optimizer.getUpdater() match {
        case _: SquaredL2Updater => runWithMlLogisticRegression(0.0)
        case _: L1Updater => runWithMlLogisticRegression(1.0)
        case _ => super.run(input, initialWeights)
      }
    } else {
      super.run(input, initialWeights)
    }
  }
}

SparkML實驗

import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils


/**
 * Example driver: trains a binary logistic regression model with
 * `LogisticRegressionWithLBFGS` on a LIBSVM file, prints the (prediction, label) pairs and the
 * precision, then saves the model and loads it back.
 *
 * NOTE(review): despite the object/app name, no linear regression or SGD is involved; the
 * name is kept so existing launch configurations keep working.
 *
 * Usage: an optional first argument gives the path of the LIBSVM data file; without it the
 * original hard-coded sample path is used.
 */
object LinearRegressionWithSGDExample {

  // Data file used when no path is given on the command line.
  private val DefaultDataPath =
    "C:\\Users\\alienware\\IdeaProjects\\sparkCore\\data\\mllib\\sample_libsvm_data.txt"

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample").setMaster("local")
    val sc = new SparkContext(conf)

    // Always stop the context, even when loading, training or saving fails.
    try {
      val dataPath = args.headOption.getOrElse(DefaultDataPath)
      val data = MLUtils.loadLibSVMFile(sc, dataPath)

      // Split data into training (60%) and test (40%).
      val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
      val training = splits(0).cache()
      val test = splits(1)

      // Run training algorithm to build the model
      val model = new LogisticRegressionWithLBFGS()
        .setNumClasses(2)
        .run(training)

      // Predict on the test set. The model's threshold is not cleared here, so `predict` is
      // expected to return class labels rather than raw scores.
      val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
        val prediction = model.predict(features)
        (prediction, label)
      }
      predictionAndLabels.foreach(println)

      // Get evaluation metrics.
      // NOTE(review): `precision` may be deprecated in newer Spark releases - confirm against
      // the Spark version in use before upgrading.
      val metrics = new MulticlassMetrics(predictionAndLabels)
      val precision = metrics.precision
      println("Precision = " + precision)

      // Save and load model
      model.save(sc, "target/tmp/scalaLogisticRegressionWithLBFGSModel")
      val sameModel = LogisticRegressionModel.load(sc,
        "target/tmp/scalaLogisticRegressionWithLBFGSModel")
    } finally {
      sc.stop()
    }
  }
}
// 輸出結果：每一行爲（預測值，實際標籤）

(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)