一、Spark SQL定義:Spark中專門用於處理結構化數據的模塊
作用:
1 在Spark程序中使用SQL或者DataFrame進行結構化數據處理
2 可以處理任何數據源包括:Hive,Parquet,ORC,json, 和jdbc,甚至可以將這些數據進行join操作
3 對於Hive中已經存在的數據進行查詢和UDF的處理
4 可以進行JDBC連接進行數據處理。
二、流程:
1 創建SparkConf
2 設置配置參數:conf.setAppName("appname").setMaster("local[*]");
3 創建JavaSparkContext
4 創建SQLContext 上下文環境
5 通過SQLContext來讀取結構化數據
6 進行DataFrame或者SQL操作來處理數據
7 處理完成後,stop或close已打開的JavaSparkContext(SQLContext隨之失效)
8 .setMaster("local[*]")僅用於本地運行;如果在集羣中運行,則去掉這個配置
三、json數據讀取
理解DataFrame中的數據結構:
DataFrame中包含RDD和Schema,其中RDD是它的數據,Schema是數據的結構
Schema的層次是StructType->StructField->字段名稱、類型、是否可爲空
/**
 * Reads a JSON file into a DataFrame and prints its rows, its schema, two column
 * projections, and the name and type of (up to) its first two fields.
 *
 * @param sqlContext the SQL context used to read the JSON file
 */
public void testShow(SQLContext sqlContext) {
    // Original note: files are read from HDFS by default; a local file needs the file: prefix.
    DataFrame df = sqlContext.read().json("file:\\E:\\sparktestplace\\json.txt");
    //DataFrame df = sqlContext.read().parquet("/examples_testr/data/resources17/part-r-00000-a3ec949c-cc17-4db6-8e73-0e0ea673af53.gz.parquet") ;
    df.show();
    // Print the schema.
    df.printSchema();
    df.select("name").show();
    df.select(df.col("name"), df.col("age").plus(1)).show();

    StructField[] sfs = df.schema().fields();
    // Print the name and type of the first two fields. The bound is the actual field
    // count, so a schema with fewer than two fields no longer throws
    // ArrayIndexOutOfBoundsException.
    int shown = Math.min(2, sfs.length);
    for (int i = 0; i < shown; i++) {
        System.out.println(sfs[i].name() + " " + sfs[i].dataType().typeName());
    }
}
四、數據庫讀取數據
/**
 * Loads a MySQL table through the JDBC data source, writes it out in Parquet format
 * (overwriting any existing output), and prints every loaded row.
 *
 * @param sqlContext the SQL context used to read the table
 */
public void testSQL(SQLContext sqlContext) {
    String sourceTable = "test_mysql";
    String outputName = "spark_test_table";

    // Basic connection parameters for the JDBC data source.
    HashMap<String, String> jdbcOptions = new HashMap<String, String>();
    jdbcOptions.put("driver", "com.mysql.jdbc.Driver");
    jdbcOptions.put("url", "jdbc:mysql://192.168.0.213:3306/data");
    jdbcOptions.put("user", "username");
    jdbcOptions.put("password", "123456");
    jdbcOptions.put("dbtable", sourceTable);

    // Read the table from the database.
    DataFrame tableData = sqlContext.read()
            .format("jdbc")
            .options(jdbcOptions)
            .load();

    // Alternative: read with a filter predicate.
    //DataFrame df3 = sqlContext.read().jdbc("jdbc:mysql://192.168.0.213:3306/data",
    // tableName,new String[]{"sex='nan'"}, prop);
    //DataFrame df3 = sqlContext.sql( "SELECT * FROM "+tableName ) ;
    //df3.show();

    // Write the data out.
    String outputPath = "file:\\E:\\" + outputName + ".txt";
    tableData.write()
            .mode(SaveMode.Overwrite)
            .format("parquet")
            .save(outputPath);

    printRdd(tableData.javaRDD());
}
五、打印RDD
/**
 * Prints the lineage of an RDD of rows, followed by the schema and the values of
 * every row.
 *
 * <p>All rows are collected before printing, so this is intended for small RDDs.
 *
 * @param rdd the RDD to print; for {@code null} only a notice is printed
 */
public static void printRdd(JavaRDD<Row> rdd) {
    if (rdd == null) {
        System.out.println(" rdd is null ");
        return;
    }
    // toDebugString() only returns the lineage description; it must be printed
    // explicitly, otherwise the call has no visible effect.
    System.out.println(rdd.toDebugString());
    for (Row row : rdd.collect()) {
        printStructField(row.schema());
        printRow(row);
    }
}
六、打印Row數據
/**
 * Prints every column value of a row together with its ordinal position.
 *
 * @param row the row whose values are printed
 */
public static void printRow(Row row) {
    // Ask the row itself for its column count: row.schema() is null for rows created
    // without a schema, so deriving the count from the schema would fail on them.
    int columnCount = row.length();
    for (int i = 0; i < columnCount; i++) {
        System.out.println("the row of " + i + " value is " + row.get(i));
    }
}
七、獲得row的列數
/**
 * Returns the number of fields (columns) declared by a schema.
 *
 * @param st the schema to inspect
 * @return the length of the schema's field array
 */
public static Integer getRowSize(StructType st) {
    return st.fields().length;
}
八、打印數據結構
/**
 * Prints the name and type name of every field of a schema.
 *
 * @param st the schema to print; nothing is printed when it is {@code null} or has no fields
 */
public static void printStructField(StructType st) {
    // Guard the schema itself as well as its field array: callers pass row.schema(),
    // which can be null for rows that were created without a schema.
    if (st == null) {
        return;
    }
    StructField[] sfs = st.fields();
    if (sfs == null || sfs.length == 0) {
        return;
    }
    for (StructField sf : sfs) {
        System.out.println(" the struct of field " + sf.name() + " ,type is " + sf.dataType().typeName());
    }
}
九、讀取parquet數據
/**
 * Reads a Parquet data set, prints its row count, queries it through a temporary
 * table, and appends its rows to a MySQL table over JDBC.
 *
 * @param sqlContext the SQL context used to read and query the data
 */
public void parqueFile(SQLContext sqlContext) {
    Properties prop = new Properties();
    prop.setProperty("user", "username");
    prop.setProperty("password", "pwd");
    String tableName = "spark_test_table";

    // Original note: this setting gives better compatibility for Parquet schemas.
    sqlContext.setConf("spark.sql.parquet.binaryAsString", "true");

    // Use the DataFrameReader API, as the rest of this file does, instead of the
    // deprecated SQLContext.parquetFile(...).
    DataFrame df = sqlContext.read().parquet("E:\\spark_test_table.txt");
    System.out.println("count==" + df.count());

    df.registerTempTable(tableName);
    sqlContext.sql("select * from " + tableName).show();

    df.write().mode(SaveMode.Append).jdbc("jdbc:mysql://192.168.0.213:3306/data", tableName, prop);
    //sqlContext.table
}
十、RDD 與 DataFrame之間的轉換操作
/**
 * Converts a text file of {@code name,age} lines into an RDD of {@code Person} beans,
 * turns it into a DataFrame via bean reflection, and shows the names of all people
 * aged 13 to 19.
 *
 * @param sc         the Spark context used to load the text file
 * @param sqlContext the SQL context used to build and query the DataFrame
 */
public void rddToDataFrame(JavaSparkContext sc, SQLContext sqlContext) {
    // Load a text file and convert each line to a JavaBean.
    // A lambda is used rather than an anonymous inner class: inside an instance method
    // an anonymous class keeps a reference to the enclosing object, which Spark would
    // then have to serialize together with the function.
    JavaRDD<Person> people = sc.textFile("e://sparktestplace/people.txt").map(
            (String line) -> {
                String[] parts = line.split(",");
                if (parts.length < 2) {
                    // Fail with the offending line instead of a bare index error.
                    throw new IllegalArgumentException("Expected \"name,age\" but got: " + line);
                }
                Person person = new Person();
                person.setName(parts[0]);
                person.setAge(Integer.parseInt(parts[1].trim()));
                System.out.println("=============" + person);
                return person;
            });

    // Apply a schema to an RDD of JavaBeans and register it as a table.
    DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
    schemaPeople.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
    teenagers.show();

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    // List<String> teenagerNames = teenagers.javaRDD().map(new Function<Row, String>() {
    // public String call(Row row) {
    // return "Name: " + row.getString(0);
    // }
    // }).collect();
    // for (int i = 0; i <teenagerNames.size(); i++) {
    // System.out.println( teenagerNames.get(i) );
    // }
}
十一、自己定義數據結構
/**
 * Builds a DataFrame from a text file of {@code name,age} lines using a schema that
 * is defined programmatically (all columns typed as strings), then queries the names
 * through SQL and prints them.
 *
 * @param sc         the Spark context used to load the text file
 * @param sqlContext the SQL context used to build and query the DataFrame
 */
public void defineSchema(JavaSparkContext sc, SQLContext sqlContext) {
    // Load the raw text lines; each line is split on "," further below.
    JavaRDD<String> people = sc.textFile("e://sparktestplace/people.txt");

    // The schema is encoded in a string.
    String schemaString = "name age";

    // Generate the schema based on the string of schema.
    List<StructField> fields = new ArrayList<StructField>();
    for (String fieldName : schemaString.split(" ")) {
        fields.add(DataTypes.createStructField(fieldName, DataTypes.StringType, true));
    }
    StructType schema = DataTypes.createStructType(fields);

    // Convert records of the RDD (people) to Rows. Lambdas are used so the functions
    // do not hold a reference to the enclosing object, which an anonymous inner class
    // in an instance method would.
    JavaRDD<Row> rowRDD = people.map((String record) -> {
        String[] parts = record.split(",");
        if (parts.length < 2) {
            // Fail with the offending line instead of a bare index error.
            throw new IllegalArgumentException("Expected \"name,age\" but got: " + record);
        }
        return RowFactory.create(parts[0], parts[1].trim());
    });

    // Apply the schema to the RDD.
    DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);

    // Register the DataFrame as a table.
    peopleDataFrame.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame results = sqlContext.sql("SELECT name FROM people");

    // The results of SQL queries are DataFrames and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    List<String> names = results.javaRDD()
            .map((Row row) -> "Name: " + row.getString(0))
            .collect();

    // Print the collected names; previously the list was built and then discarded.
    for (String name : names) {
        System.out.println(name);
    }
}
十二、加載parquet數據
/**
 * Loads a data set without naming a format, registers it as the temporary table
 * {@code user}, and shows first the rows with {@code age=2} and then all rows.
 *
 * @param sqlContext the SQL context used to load and query the data
 */
public void loadAndSave(SQLContext sqlContext) {
    DataFrame users = sqlContext.read().load("E:\\spark_test_table.txt");

    // Process the data with SQL.
    users.registerTempTable("user");
    String filterQuery = "select * from user where age=2";
    sqlContext.sql(filterQuery).show();

    // Writing the data (disabled).
    //df.select("name", "favorite_color").write().save("namesAndFavColors.parquet");
    //df.select("id", "username", "age").write().save("E:\\sparktestplace\\spark_test_table_write.parquet");

    users.show();
}
十三、 自定義UDF使用
/**
 * Demonstrates user-defined functions: loads a JSON file, registers it as the table
 * {@code user}, registers the UDFs {@code strLen} and {@code replace}, and shows the
 * result of a query that uses both.
 *
 * <p>Notes carried over from the original: supported formats include json, parquet
 * and jdbc; save modes are SaveMode.ErrorIfExists, SaveMode.Append,
 * SaveMode.Overwrite and SaveMode.Ignore.
 *
 * @param sqlContext the SQL context used to load the data and register the UDFs
 */
public void specifyingOptions(SQLContext sqlContext) {
    DataFrame df = sqlContext.read().format("json").load("file:\\E:\\sparktestplace\\json.txt");
    //df.select("name", "email").write().mode(SaveMode.Overwrite).format("parquet").save("E:\\sparktestplace\\spark_test_table_write2.parquet");
    sqlContext.registerDataFrameAsTable(df, "user");

    // strLen: length of a string. Null-safe, so a record whose column is missing
    // yields null instead of failing the query with a NullPointerException.
    sqlContext.udf().register("strLen",
            (String s) -> s == null ? null : s.length(),
            DataTypes.IntegerType);

    // replace: concatenates both arguments with "== " in between.
    sqlContext.udf().register("replace",
            (String t1, String t2) -> t1 + "== " + t2,
            DataTypes.StringType);

    DataFrame df2 = sqlContext.sql("select strLen(country),replace(country,'==') ,email,id,ip ,name from user");
    df2.show();
    //DataFrame df2 = sqlContext.read().format("parquet").load("E:\\sparktestplace\\spark_test_table_write2.parquet");
    //df2.show();
}
十四、sql 直接讀取parquet文件數據
/**
 * Runs a SQL query directly against a Parquet file path (no table registration),
 * then shows the rows and prints the schema of the result.
 *
 * @param sqlContext the SQL context used to run the query
 */
public void sqlReadFile(SQLContext sqlContext) {
    String query = "SELECT * FROM parquet.`E:\\sparktestplace\\spark_test_table_write2.parquet`";
    DataFrame parquetRows = sqlContext.sql(query);
    parquetRows.show();
    parquetRows.printSchema();
}