DataFrame與RDD的互操作
/**
 * Interoperation between DataFrame and RDD.
 *
 * Demonstrates the two standard ways to build a DataFrame from an RDD:
 * an explicitly programmed schema ([[program]]) and case-class
 * reflection inference ([[inferReflection]]).
 */
object DataFrameRDDApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate()
    //inferReflection(spark)
    program(spark)
    spark.stop()
  }

  /**
   * RDD ==> DataFrame using an explicitly constructed schema (StructType).
   */
  def program(spark: SparkSession): Unit = {
    val rdd = spark.sparkContext.textFile("file:///Users/data/infos.txt")
    // Each input line is "id,name,age"; toInt assumes well-formed records — TODO confirm
    val infoRDD = rdd.map(_.split(",")).map(line => Row(line(0).toInt, line(1), line(2).toInt))
    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))
    val infoDF = spark.createDataFrame(infoRDD, structType)
    infoDF.printSchema()
    infoDF.show()
    // Filter through the DataFrame API
    infoDF.filter(infoDF.col("age") > 30).show()
    // Same filter through SQL against a temp view
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where age > 30").show()
  }

  /**
   * RDD ==> DataFrame using case-class reflection (schema inferred from Info).
   */
  def inferReflection(spark: SparkSession): Unit = {
    val rdd = spark.sparkContext.textFile("file:///Users/data/infos.txt")
    // Note: the implicit conversions are required for .toDF()
    import spark.implicits._
    val infoDF = rdd.map(_.split(",")).map(line => Info(line(0).toInt, line(1), line(2).toInt)).toDF()
    infoDF.show()
    infoDF.filter(infoDF.col("age") > 30).show()
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where age > 30").show()
  }

  /** Schema carrier for reflection-based inference. */
  case class Info(id: Int, name: String, age: Int)
}
DataFrame API操作
/**
 * Basic DataFrame API operations: load a JSON file and run the most
 * common projections, filters and aggregations against it.
 */
object DataFrameApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameApp").master("local[2]").getOrCreate()
    // Load the JSON file into a DataFrame
    val peopleDF = spark.read.format("json").load("file:///Users/data/people.json")
    // Print the DataFrame's schema
    peopleDF.printSchema()
    // Show the first 20 records of the dataset
    peopleDF.show()
    // Select a single column: select name from table
    peopleDF.select("name").show()
    // Select several columns with a computed expression:
    //   select name, age+10 as age2 from table
    peopleDF.select(peopleDF.col("name"), (peopleDF.col("age") + 10).as("age2")).show()
    // Filter on a column value: select * from table where age > 19
    peopleDF.filter(peopleDF.col("age") > 19).show()
    // Group by a column and aggregate:
    //   select age, count(1) from table group by age
    peopleDF.groupBy("age").count().show()
    spark.stop()
  }
}
/**
 * A tour of common DataFrame operations (show/take/filter/sort/join)
 * against a pipe-delimited student data file.
 */
object DataFrameCase {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate()
    // RDD ==> DataFrame
    val rdd = spark.sparkContext.textFile("file:///Users/data/student.data")
    // Note: the implicit conversions are required for .toDF()
    import spark.implicits._
    // Records are "id|name|phone|email"; the pipe must be regex-escaped for split
    val studentDF = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF()
    // show() displays only the first 20 rows by default
    studentDF.show()
    studentDF.show(30)
    // false disables column truncation at 20 characters
    studentDF.show(30, false)
    studentDF.take(10)
    studentDF.first()
    studentDF.head(3)
    studentDF.select("email").show(30, false)
    // Rows with an empty name
    studentDF.filter("name=''").show()
    // Rows whose name is empty or the literal string 'NULL'
    studentDF.filter("name='' OR name='NULL'").show()
    // People whose name starts with M
    studentDF.filter("SUBSTR(name,0,1)='M'").show()
    // Sorting: ascending (default), descending, multi-column, mixed direction
    studentDF.sort(studentDF("name")).show()
    studentDF.sort(studentDF("name").desc).show()
    studentDF.sort("name", "id").show()
    studentDF.sort(studentDF("name").asc, studentDF("id").desc).show()
    // Column aliasing
    studentDF.select(studentDF("name").as("student_name")).show()
    // Self-join on id to demonstrate the join API
    val studentDF2 = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF()
    studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id")).show()
    spark.stop()
  }

  /** Schema carrier for the student.data records. */
  case class Student(id: Int, name: String, phone: String, email: String)
}
DataSet操作
/**
 * Dataset operations: read a CSV into a DataFrame, convert it to a
 * typed Dataset[Sales], and contrast the compile-time safety of the
 * Dataset API with the runtime-only checking of SQL strings.
 */
object DatasetApp {
def main(args: Array[String]) {
val spark = SparkSession.builder().appName("DatasetApp")
.master("local[2]").getOrCreate()
// Note: the implicit conversions are required for .as[Sales] and typed map
import spark.implicits._
val path = "file:///Users/data/sales.csv"
// How Spark parses a CSV file:
// header=true tells Spark the file has a header row;
// inferSchema=true makes Spark derive column types from the data
val df = spark.read.option("header","true").option("inferSchema","true").csv(path)
df.show
// Convert the untyped DataFrame to a typed Dataset[Sales]
val ds = df.as[Sales]
// Typed access: column name/type errors are caught at compile time
ds.map(line => line.itemId).show
// NOTE(review): "seletc" is a typo that fails only at runtime (SQL analysis) —
// presumably intentional, to contrast with the Dataset API; confirm before fixing
spark.sql("seletc name from person").show
// The same typo on the DataFrame API would not even compile:
//df.seletc("name")
// NOTE(review): "nname" is a wrong column name that fails at analysis time, not
// compile time — again presumably a deliberate demonstration; confirm
df.select("nname")
ds.map(line => line.itemId)
spark.stop()
}
// Schema carrier matching the sales.csv columns
case class Sales(transactionId:Int,customerId:Int,itemId:Int,amountPaid:Double)
}
訪問外部數據源
方便快速從不同的數據源(json、parquet、rdbms),經過混合處理(json join parquet),再將處理結果以特定的格式(json、parquet)寫回到指定的系統(HDFS、S3)上去
內置數據源:json/parquet/jdbc/csv(2.0+版本提供)
處理parquet數據
/**
 * Parquet file operations: the standard read form, a format-free read
 * (parquet is the default), and writing the result back out as JSON.
 */
object ParquetApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkSessionApp")
      .master("local[2]").getOrCreate()

    // spark.read.format("parquet").load(...) is the canonical way to read parquet
    val userDF = spark.read.format("parquet").load("file:///home/hadoop/app/spark-2.1.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet")
    userDF.printSchema()
    userDF.show()
    userDF.select("name", "favorite_color").show()

    // Write the selected columns back out as JSON
    // (original line was malformed: "###寫文件" prefixed the statement, which is not valid Scala)
    userDF.select("name", "favorite_color").write.format("json").save("file:///home/hadoop/tmp/jsonout")

    // No format() needed here: Spark SQL's default format is parquet
    spark.read.load("file:///home/hadoop/app/spark-2.1.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet").show()
    // This one fails, because the default (parquet) format cannot read a JSON file
    spark.read.load("file:///home/hadoop/app/spark-2.1.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/people.json").show()
    // Alternative: supply the path through option("path", ...) and call load() with no args
    spark.read.format("parquet").option("path", "file:///home/hadoop/app/spark-2.1.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet").load().show()

    spark.stop()
  }
}
hive與mysql數據源的使用
/**
 * Cross-source query: join a Hive table with a MySQL table through the
 * external data source API.
 */
object HiveMySQLApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("HiveMySQLApp")
      .master("local[2]").getOrCreate()

    // Load the Hive table
    val hiveDF = spark.table("dept")

    // Load the MySQL table via JDBC
    // NOTE(review): credentials are hard-coded; move them to configuration in real code
    val mysqlDF = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://192.168.7.2:3306")
      .option("dbtable", "sparksql.dept")
      .option("user", "root")
      .option("password", "root")
      .option("driver", "com.mysql.jdbc.Driver")
      .load()

    // Join the two sources on the department number
    val resultDF = hiveDF.join(mysqlDF, hiveDF.col("deptno") === mysqlDF.col("DEPTNO"))
    resultDF.show()

    resultDF.select(hiveDF.col("empno"), hiveDF.col("ename"),
      mysqlDF.col("deptno"), mysqlDF.col("dname")).show()

    spark.stop()
  }
}
## 運行結果
+------+------------+----------+------+----------+--------+
|deptno| dname| loc|DEPTNO| DNAME| LOC|
+------+------------+----------+------+----------+--------+
| 20| 'RESEARCH'| 'DALLAS'| 20| RESEARCH| DALLAS|
| 40|'OPERATIONS'| 'BOSTON'| 40|OPERATIONS| BOSTON|
| 10|'ACCOUNTING'|'NEW YORK'| 10|ACCOUNTING|NEW YORK|
| 30| 'SALES'| 'CHICAGO'| 30| SALES| CHICAGO|
+------+------------+----------+------+----------+--------+
寫文件
使用 DataFrame.write.save 方法或 saveAsTable,可以將處理結果寫回到文件或表