/*
import org.apache.spark.sql.functions._
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
*/
/*
https://stackoverflow.com/questions/34614239/how-to-apply-a-function-to-a-column-of-a-spark-dataframe
https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql-Column.html
https://www.jianshu.com/p/833b72adb2b6
*/
import org.apache.spark.sql.functions.udf
// Small demo DataFrame: an Int column "number" and a String column "polish".
// (Requires spark.implicits._ / sqlContext.implicits._ in scope for .toDF.)
val df = Seq(1 -> "jeden", 2 -> "dwa", 3 -> "jerry", 0 -> "tom").toDF("number", "polish")
scala> df.show
+------+------+
|number|polish|
+------+------+
| 1| jeden|
| 2| dwa|
| 3| jerry|
| 0| tom|
+------+------+
val label_class = udf((x:Int) => if(x>0) 1 else 0)
scala> df.withColumn("number", label_class($"number")).show
+------+------+
|number|polish|
+------+------+
| 1| jeden|
| 1| dwa|
| 1| jerry|
| 0| tom|
+------+------+
scala> val data = df.withColumn("number", label_class($"number"))
data: org.apache.spark.sql.DataFrame = [number: int, polish: string]
scala> data
res3: org.apache.spark.sql.DataFrame = [number: int, polish: string]
scala> data.show
+------+------+
|number|polish|
+------+------+
| 1| jeden|
| 1| dwa|
| 1| jerry|
| 0| tom|
+------+------+
/*
 * Article title: 利用UDF對dataframe列數據進行修改
 *   ("Using a UDF to modify DataFrame column data")
 * The lines below are comment-widget boilerplate scraped along with the page:
 *   發表評論 — "Post a comment"
 *   所有評論 — "All comments"
 *   還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
 *     — "No comments yet. Want to be the first? Type in the box above and click publish."
 */