簡介
CountVectorizer 統計文檔中每個單詞出現的次數，並以這些次數組成一個向量。
代碼
/**
 * Example: converting tokenized documents into term-count vectors with
 * Spark ML's `CountVectorizer`.
 */
object CountVectorizerExample {

  /**
   * Builds a two-row DataFrame of tokenized documents, fits a
   * `CountVectorizerModel` on it, and prints the transformed DataFrame.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().getOrCreate()

    try {
      // Each row is (document id, tokens of that document).
      val df = spark.createDataFrame(Seq(
        (0, Array("a", "b", "c")),
        (1, Array("a", "b", "b", "c", "a", "a"))
      )).toDF("id", "words")

      // Fit a CountVectorizerModel from the corpus: keep at most 3 terms, and
      // only terms that appear in at least 2 documents.
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("words")
        .setOutputCol("features")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df)

      // Alternatively, define a CountVectorizerModel with an a-priori vocabulary.
      // Each term is listed exactly once; its position in the array is its index
      // in the output vector. This model is shown for illustration only and is
      // not used below.
      val cvm = new CountVectorizerModel(Array("a", "b", "c"))
        .setInputCol("words")
        .setOutputCol("features")

      // `false` disables truncation so the full feature vectors are printed.
      cvModel.transform(df).show(false)
    } finally {
      // Release the session's resources even if the example fails part-way.
      spark.stop()
    }
  }
}
輸出
+---+------------------+-------------------------+
|id |words             |features                 |
+---+------------------+-------------------------+
|0  |[a, b, c]         |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a, a]|(3,[0,1,2],[3.0,2.0,1.0])|
+---+------------------+-------------------------+