
詞項加權(Term Weighting)的目的是給分詞後的詞語加上權重。重要的詞項給予更高的權重。那麼當我們對文本

進行檢索的時候。比如當我們在淘寶購物,輸入“那本語義分析類的書最好”,那麼我們進行Term Weighting可能






圖1:Local Weight formulas

圖2:Local weight formulas

圖3:Global Weight formulas

TF-IDF是一種簡單高效的詞項加權方法。在上面的公式體系裏,TF-IDF的local weight是FREQ,global weight是IDF(逆文檔頻率,Inverse Document Frequency)。



步驟1:計算一篇文檔中“數學”一詞出現的頻率,即 $TF = \frac{\text{該詞在文檔中出現的次數}}{\text{文檔的總詞語數}}$。比如“數學”出現了30次,而這篇文件的總詞語數是1000個,那麼

“數學”一詞的詞頻爲 $TF = 30 / 1000 = 0.03$。
步驟2:計算文件頻率的逆,即IDF。比如“數學”一詞在1000份文件中出現過,總文件數是10000000,那麼 $IDF = \log_{10}(10000000 / 1000) = 4$。

       步驟3:計算 TF-IDF = TF × IDF = 0.03 × 4 = 0.12




/**
 * 用hash表的方式,把詞項的頻率放入到序列表中
 * @param numFeatures number of features (default: 2^20^)
 */
class HashingTF(val numFeatures: Int) extends Serializable {

  import HashingTF._

  // When true, `transform` writes 1.0 for every term that occurs instead of its count.
  private var binary = false
  // Name of the hash algorithm; matched against Murmur3 / Native in getHashFunction.
  private var hashAlgorithm = HashingTF.Murmur3

  /** Creates an instance with the default number of features: 1 << 20 = 2^20 = 1048576. */
  def this() = this(1 << 20)

  /**
   * If true, the term frequency vector becomes binary: every non-zero term count is set to 1.
   * (default: false)
   *
   * @param value the new binary flag
   * @return this instance, so calls can be chained
   */
  def setBinary(value: Boolean): this.type = {
    binary = value
    this
  }

  /**
   * Sets the hash algorithm used when mapping a term to an integer.
   * (default: murmur3)
   *
   * The value is not validated here; an unrecognized name makes `getHashFunction`
   * throw an IllegalArgumentException the next time a term is hashed.
   *
   * @param value the algorithm name
   * @return this instance, so calls can be chained
   */
  def setHashAlgorithm(value: String): this.type = {
    hashAlgorithm = value
    this
  }

  /**
   * Returns the index of the input term: its hash reduced to a non-negative value
   * modulo `numFeatures`.
   */
  def indexOf(term: Any): Int = {
    Utils.nonNegativeMod(getHashFunction(term), numFeatures)
  }

  /**
   * Get the hash function corresponding to the current [[hashAlgorithm]] setting.
   */
  private def getHashFunction: Any => Int = hashAlgorithm match {
    case Murmur3 => murmur3Hash
    case Native => nativeHash
    case _ =>
      // This should never happen.
      throw new IllegalArgumentException(
        s"HashingTF does not recognize hash algorithm $hashAlgorithm")
  }

  /**
   * Transforms the input document into a sparse term frequency vector.
   *
   * The result is `Vectors.sparse(numFeatures, termFrequencies.toSeq)`. With the default
   * numFeatures of 1 << 20 it prints as
   * (1048576, [hashed term indices, ascending], [frequency of the term at each index]).
   * Distinct terms that hash to the same index share one counter.
   */
  def transform(document: Iterable[_]): Vector = {
    val termFrequencies = mutable.HashMap.empty[Int, Double]
    val setTF = if (binary) (i: Int) => 1.0 else (i: Int) => termFrequencies.getOrElse(i, 0.0) + 1.0
    // Resolve the hash function once instead of once per term.
    val hashFunc: Any => Int = getHashFunction
    document.foreach { term =>
      val i = Utils.nonNegativeMod(hashFunc(term), numFeatures)
      termFrequencies.put(i, setTF(i))
    }
    Vectors.sparse(numFeatures, termFrequencies.toSeq)
  }

  /**
   * Transforms the input document into a sparse term frequency vector (Java version).
   */
  def transform(document: JavaIterable[_]): Vector = {
    // NOTE(review): `asScala` needs the Java collection converters in scope
    // (scala.collection.JavaConverters._ in Spark's own file) - the import list is not
    // part of this excerpt, confirm it is present.
    transform(document.asScala)
  }

  /**
   * Transforms the input document to term frequency vectors.
   */
  def transform[D <: Iterable[_]](dataset: RDD[D]): RDD[Vector] = {
    dataset.map(this.transform)
  }

  /**
   * Transforms the input document to term frequency vectors (Java version).
   */
  def transform[D <: JavaIterable[_]](dataset: JavaRDD[D]): JavaRDD[Vector] = {
    dataset.rdd.map(this.transform).toJavaRDD()
  }
}

object HashingTF {

  // Algorithm names accepted by HashingTF.setHashAlgorithm.
  private[spark] val Native: String = "native"

  private[spark] val Murmur3: String = "murmur3"

  // Fixed seed passed to every murmur3 call below; also the hash returned for a null term.
  private val seed = 42

  /**
   * Calculate a hash code value for the term object using the native Scala implementation.
   * This is the default hash algorithm used in Spark 1.6 and earlier.
   */
  private[spark] def nativeHash(term: Any): Int = term.##

  /**
   * Calculate a hash code value for the term object using
   * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32).
   * This is the default hash algorithm used from Spark 2.0 onwards.
   *
   * NOTE(review): hashInt / hashLong / hashUnsafeBytes and UTF8String are not defined in
   * this excerpt - presumably imported from Spark's unsafe module; confirm the imports.
   *
   * @throws SparkException if the term is not null, a primitive wrapper listed below,
   *                        or a String
   */
  private[spark] def murmur3Hash(term: Any): Int = {
    term match {
      case null => seed
      case b: Boolean => hashInt(if (b) 1 else 0, seed)
      case b: Byte => hashInt(b, seed)
      case s: Short => hashInt(s, seed)
      case i: Int => hashInt(i, seed)
      case l: Long => hashLong(l, seed)
      // Floating-point values are hashed through their bit patterns.
      case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed)
      case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed)
      case s: String =>
        val utf8 = UTF8String.fromString(s)
        hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed)
      case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " +
        s"support type ${term.getClass.getCanonicalName} of input data.")
    }
  }
}

/**
 * 逆文檔頻率 (IDF).
 * The standard formulation is used: `idf = log((m + 1) / (d(t) + 1))`, where `m` is the total
 * number of documents and `d(t)` is the number of documents that contain term `t`.
 * This implementation supports filtering out terms which do not appear in a minimum number
 * of documents (controlled by the variable `minDocFreq`). For terms that are not in
 * at least `minDocFreq` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
 * @param minDocFreq minimum of documents in which a term
 *                   should appear for filtering
 */
class IDF @Since("1.2.0") (@Since("1.2.0") val minDocFreq: Int) {

  /** Creates an IDF with `minDocFreq = 0`. */
  def this() = this(0)

  // TODO: Allow different IDF formulations.

  /**
   * Computes the inverse document frequency.
   *
   * Folds every vector of `dataset` into a DocumentFrequencyAggregator with
   * `treeAggregate`, then asks the merged aggregator for its IDF vector.
   *
   * @param dataset an RDD of term frequency vectors
   * @return an IDFModel wrapping the computed IDF vector
   */
  def fit(dataset: RDD[Vector]): IDFModel = {
    val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
          minDocFreq = minDocFreq))(
      seqOp = (df, v) => df.add(v),
      combOp = (df1, df2) => df1.merge(df2)
    ).idf()
    new IDFModel(idf)
  }

  /**
   * Computes the inverse document frequency (Java version).
   *
   * @param dataset a JavaRDD of term frequency vectors
   * @return an IDFModel wrapping the computed IDF vector
   */
  def fit(dataset: JavaRDD[Vector]): IDFModel = {
    fit(dataset.rdd)
  }
}

private object IDF {

  /**
   * Document frequency aggregator.
   *
   * @param minDocFreq terms seen in fewer documents than this get an IDF of 0
   */
  class DocumentFrequencyAggregator(val minDocFreq: Int) extends Serializable {

    /** number of documents */
    private var m = 0L
    /** document frequency vector */
    private var df: BDV[Long] = _

    def this() = this(0)

    /**
     * Adds a new document: bumps the document frequency of every index whose value is
     * positive, then bumps the document count.
     *
     * @throws UnsupportedOperationException if `doc` is neither sparse nor dense
     */
    def add(doc: Vector): this.type = {
      if (isEmpty) {
        // First document seen by this aggregator: size the counter vector from it.
        df = BDV.zeros(doc.size)
      }
      doc match {
        case SparseVector(size, indices, values) =>
          val nnz = indices.length
          var k = 0
          while (k < nnz) {
            // Explicitly stored zeros (and negatives) do not count as an occurrence.
            if (values(k) > 0) {
              df(indices(k)) += 1L
            }
            k += 1
          }
        case DenseVector(values) =>
          val n = values.length
          var j = 0
          while (j < n) {
            if (values(j) > 0.0) {
              df(j) += 1L
            }
            j += 1
          }
        case other =>
          throw new UnsupportedOperationException(
            s"Only sparse and dense vectors are supported but got ${other.getClass}.")
      }
      m += 1L
      this
    }

    /** Merges another aggregator into this one; an empty `other` is ignored. */
    def merge(other: DocumentFrequencyAggregator): this.type = {
      if (!other.isEmpty) {
        m += other.m
        if (df == null) {
          // This side has seen no document yet: take a copy so `other` is not aliased.
          df = other.df.copy
        } else {
          df += other.df
        }
      }
      this
    }

    private def isEmpty: Boolean = m == 0L

    /**
     * Returns the current IDF vector.
     *
     * @throws IllegalStateException if no document has been added yet
     */
    def idf(): Vector = {
      if (isEmpty) {
        throw new IllegalStateException("Haven't seen any document yet.")
      }
      val n = df.length
      val inv = new Array[Double](n)
      var j = 0
      while (j < n) {
        /*
         * If the term is not present in the minimum
         * number of documents, set IDF to 0. This
         * will cause multiplication in IDFModel to
         * set TF-IDF to 0.
         *
         * Since arrays are initialized to 0 by default,
         * we just omit changing those entries.
         */
        if (df(j) >= minDocFreq) {
          inv(j) = math.log((m + 1.0) / (df(j) + 1.0))
        }
        j += 1
      }
      Vectors.dense(inv)
    }
  }
}

/**
 * Represents an IDF model that can transform term frequency vectors.
 */
class IDFModel private[spark] (@Since("1.1.0") val idf: Vector) extends Serializable {

  /**
   * Transforms term frequency (TF) vectors to TF-IDF vectors.
   *
   * If `minDocFreq` was set for the IDF calculation, terms that occur in fewer than
   * `minDocFreq` documents get a TF-IDF value of 0.
   *
   * @param dataset an RDD of term frequency vectors
   * @return an RDD of TF-IDF vectors
   */
  def transform(dataset: RDD[Vector]): RDD[Vector] = {
    // Broadcast the IDF vector and read it through the broadcast handle inside the tasks.
    val bcIdf = dataset.context.broadcast(idf)
    dataset.mapPartitions(iter => iter.map(v => IDFModel.transform(bcIdf.value, v)))
  }

  /**
   * Transforms a term frequency (TF) vector to a TF-IDF vector
   *
   * @param v a term frequency vector
   * @return a TF-IDF vector
   */
  def transform(v: Vector): Vector = IDFModel.transform(idf, v)

  /**
   * Transforms term frequency (TF) vectors to TF-IDF vectors (Java version).
   *
   * @param dataset a JavaRDD of term frequency vectors
   * @return a JavaRDD of TF-IDF vectors
   */
  def transform(dataset: JavaRDD[Vector]): JavaRDD[Vector] = {
    transform(dataset.rdd).toJavaRDD()
  }
}

private object IDFModel {

  /**
   * Transforms a term frequency (TF) vector to a TF-IDF vector with a IDF vector
   *
   * Each stored TF value is multiplied by the IDF entry at the same index; a sparse
   * input yields a sparse result, a dense input a dense result.
   *
   * @param idf an IDF vector
   * @param v a term frequency vector
   * @return a TF-IDF vector
   * @throws UnsupportedOperationException if `v` is neither sparse nor dense
   */
  def transform(idf: Vector, v: Vector): Vector = {
    val n = v.size
    v match {
      case SparseVector(size, indices, values) =>
        val nnz = indices.length
        val newValues = new Array[Double](nnz)
        var k = 0
        while (k < nnz) {
          newValues(k) = values(k) * idf(indices(k))
          k += 1
        }
        // The index array is passed through as-is; only the values are new.
        Vectors.sparse(n, indices, newValues)
      case DenseVector(values) =>
        val newValues = new Array[Double](n)
        var j = 0
        while (j < n) {
          newValues(j) = values(j) * idf(j)
          j += 1
        }
        Vectors.dense(newValues)
      case other =>
        throw new UnsupportedOperationException(
          s"Only sparse and dense vectors are supported but got ${other.getClass}.")
    }
  }
}



(數據:鏈接: 密碼:v00d)

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

/**
 * Small driver that reads text lines, splits each line into words, and prints the TF-IDF
 * vector of every line twice: once with the default IDF and once with `minDocFreq = 2`.
 */
object TFIDFExample {
  type Word = String
  type Sentence = List[String]

  /** Returns every maximal run of ASCII letters in `content`, in order of appearance. */
  def splitWords(content: String): List[Word] =
    ("[a-zA-Z]+".r findAllIn content).toList

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("TFIDFExample").setMaster("local")
    val sc = new SparkContext(conf)

    try {
      // Load documents (one per line).
      // NOTE(review): the path looks like a directory (a file name may have been lost);
      // confirm it points at the intended input.
      val documents = sc.textFile("/root/application/upload/")
      /*Innovation can be viewed as a adoption and dissemination of something new in
a given context. E-commerce is thus an innovation when it is introduced to a
new environment in an emerging market or when adopted by a new class of
user industries. As a techno-managerial innovation, it requires business
adaption, organizational learning, and supportive environment that could lead to
wide diffusion and transformational impact. Several global forces drive the
adoption of e-commerce such as global competition, trade liberalization, and
increasingly, ICT advances and Internet diffusion. National factors, such as
governance, education, and infrastructure, then shape and differentiate the
speed of adoption across enterprises within a country, the breadth and depth of
use within an enterprise, and ultimately the impact on the firm and the nation.
Understanding the national environment, the policy, technological and
infrastructural contexts, and the common drivers and barriers to adoption and
effective use within firms should provide a guide to promoting e-commerce as a
techno-managerial innovation, and realizing its full potential for the nation.*/

      // One word list per input line; each line is treated as a separate document.
      val parsedData: RDD[Sentence] = documents.map(splitWords)
      /*List(Innovation, can, be, viewed, as, a, adoption, and, dissemination, of, something, new, in)
List(a, given, context, E, commerce, is, thus, an, innovation, when, it, is, introduced, to, a)
List(new, environment, in, an, emerging, market, or, when, adopted, by, a, new, class, of)
List(user, industries, As, a, techno, managerial, innovation, it, requires, business)
List(adaption, organizational, learning, and, supportive, environment, that, could, lead, to)
List(wide, diffusion, and, transformational, impact, Several, global, forces, drive, the)
List(adoption, of, e, commerce, such, as, global, competition, trade, liberalization, and)
List(increasingly, ICT, advances, and, Internet, diffusion, National, factors, such, as)
List(governance, education, and, infrastructure, then, shape, and, differentiate, the)
List(speed, of, adoption, across, enterprises, within, a, country, the, breadth, and, depth, of)
List(use, within, an, enterprise, and, ultimately, the, impact, on, the, firm, and, the, nation)
List(Understanding, the, national, environment, the, policy, technological, and)
List(infrastructural, contexts, and, the, common, drivers, and, barriers, to, adoption, and)
List(effective, use, within, firms, should, provide, a, guide, to, promoting, e, commerce, as, a)
List(techno, managerial, innovation, and, realizing, its, full, potential, for, the, nation)*/
      val hashingTF = new HashingTF()
      val tf: RDD[Vector] = hashingTF.transform(parsedData)
      // tf feeds two fits and two transforms below; cache it so the input is hashed once.
      tf.cache()

      // The recorded run printed "a indexOf:97", which is the native String hash of "a";
      // a build that hashes with murmur3 prints a different index.
      println("a indexOf:"+hashingTF.indexOf("a"))

      val idf = new IDF().fit(tf)
      val tfidf: RDD[Vector] = idf.transform(tf)
      // Same pipeline, but terms found in fewer than 2 documents get an IDF of 0.
      val idfIgnore = new IDF(minDocFreq = 2).fit(tf)
      val tfidfIgnore: RDD[Vector] = idfIgnore.transform(tf)

      // Collect before printing so the vectors are written by the driver, in input order.
      println("tfidf: ")
      tfidf.collect().foreach(println)

      println("tfidfIgnore: ")
      tfidfIgnore.collect().foreach(println)
    } finally {
      // Release the SparkContext even when a job above fails.
      sc.stop()
    }
  }
}
(1048576,[3117,3551,81222,96727,114801,116103,151367,233886,471905,959595,1046078],[1.3862943611198906,0.0,1.6739764335716716,0.5753641449035617,2.0794415416798357,1.6739764335716716,1.6739764335716716,0.0,0.0,1.3862943611198906,0.0])
16/06/05 19:25:10 INFO Executor: Finished task 0.0 in stage 6.0 (TID 6). 2044 bytes result sent to driver




