Hive //hadoop mr sql
Phoenix //在HBase之上構建SQL交互層
該模塊能在spark運行sql語句。
DataFrame //數據框.表.
SparkSQL //SQL | DataFrame API.
使用類似SQL方式訪問hadoop,實現MR計算。RDD
df = spark.createDataFrame(rdd); //createDataFrame在SparkSession(spark)上,不在SparkContext(sc)上
DataSet<Row> === DataFrame ==> //類似於table操作。
RDD[Customer]==>
$scala>df = spark.createDataFrame(rdd);
//創建樣例類
$scala>case class Customer(id:Int,name:String,age:Int)
//構造數據
$scala>val arr = Array("1,tom,12","2,tomas,13","3,tomasLee,14")
$scala>val rdd1 = sc.makeRDD(arr)
//創建對象rdd
$scala>val rdd2 = rdd1.map(e=>{val fields = e.split(",") ; Customer(fields(0).toInt,fields(1),fields(2).toInt)})
//通過rdd創建數據框
$scala>val df = spark.createDataFrame(rdd2);
//打印表結構
$scala>df.printSchema
$scala>df.show //查詢數據
//創建臨時視圖
$scala>df.createTempView("customers")
$scala>val df2 = spark.sql("select * from customers")
$scala>spark.sql("select * from customers").show //使用sql語句
$scala>val df1 = spark.sql("select * from customers where id < 2");
$scala>val df2 = spark.sql("select * from customers where id > 2");
$scala>df1.createTempView("c1")
$scala>df2.createTempView("c2")
$scala>spark.sql("select * from c1 union select * from c2").show()
$scala>df1.union(df2);
$scala>spark.sql("select id,name from customers").show
$scala>df.selectExpr("id","name")
$scala>df.where("name like 't%'")
//映射
$scala>df.map(_.getAs[Int]("age")).reduce(_+_) //聚合操作DataSet[Int]
$scala>df.agg(sum("age"),max("age"),min("age")) //聚合函數
spark java操作json數據
package com.mao.scala.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import java.util.function.Consumer;
/**
* Created by Administrator on 2017/4/3.
*/
/**
 * Demonstrates Spark SQL from Java: reads a JSON file into a DataFrame,
 * filters it with both the DataFrame API and raw SQL, iterates the rows
 * through the RDD interop layer, and writes a result back out as JSON.
 */
public class SQLJava {
    public static void main(String[] args) {
        // Build the session directly; master and app name are supplied via
        // the builder, so no separate SparkConf is needed (the original
        // created one but never used it).
        SparkSession session = SparkSession.builder()
                .appName("SQLJava")
                .config("spark.master", "local")
                .getOrCreate();

        // Read the JSON file into a DataFrame (Dataset<Row>).
        // NOTE(review): assumes the input file exists at this Windows path — confirm.
        Dataset<Row> df1 = session.read().json("file:///d:/mr/sparkJson.dat");

        // Register a temporary view so the same data is queryable via SQL.
        df1.createOrReplaceTempView("customers");
        df1.show();

        // Filter expressed through the DataFrame API.
        Dataset<Row> df3 = df1.where("age > 13");
        df3.show();

        // The equivalent filter expressed as a SQL statement.
        Dataset<Row> df2 = session.sql("select * from customers where age > 13");
        df2.show();
        System.out.println("=================");

        // Aggregate query.
        Dataset<Row> dfCount = session.sql("select count(1) from customers");
        dfCount.show();

        // DataFrame <-> RDD interoperability.
        // Column indices follow the JSON schema's alphabetical field order:
        // 0 = age, 1 = id, 2 = name.
        JavaRDD<Row> rdd = df1.toJavaRDD();
        rdd.collect().forEach(row -> {
            long age = row.getLong(0);
            long id = row.getLong(1);
            String name = row.getString(2);
            System.out.println(age + "," + id + "," + name);
        });

        // Persist the filtered result; SaveMode.Append adds to existing output.
        df2.write().mode(SaveMode.Append).json("file:///d:/mr/json/out.dat");
    }
}
json文件的讀寫
//讀取json文件形成df對象 (read()是SparkSession實例方法)
Dataset<Row> df = sparkSession.read().json("..");
//將數據框數據寫入json文件 (write()在Dataset/DataFrame上,不在SparkSession上)
df.write().json("..");