Spark

一、Spark SQL定義：Spark中針對結構化數據操作的一個模塊

  作用：
  1 在Spark程序中用SQL或者DataFrame進行結構化數據處理
  2 可以處理多種數據源，包括：Hive、Parquet、ORC、JSON和JDBC，甚至可以將這些數據源的數據進行join操作
  3 對於Hive中已經存在的數據進行查詢和UDF的處理
  4 可以進行JDBC連接進行數據處理。

二、流程：

1 創建SparkConf
2 設置配置參數：conf.setAppName("appname").setMaster("local[*]");
3 創建JavaSparkContext
4 創建SQLContext 上下文環境 
5 通過SQLContext來讀取結構化數據
6 進行DataFrame或者SQL操作來處理數據
7 stop和close已打開的JavaSparkContext（SQLContext本身沒有stop方法）
8 配置.setMaster("local[*]")，如果在集羣中跑則去掉這個配置

三、json數據讀取

理解DataFrame中的數據結構：

DataFrame中包含RDD和Schema，其中RDD是它的數據，Schema是數據的結構

Schema的結構是：StructType -> StructField -> 字段名稱、類型、是否可爲空

	/**
	 * Reads a JSON file into a DataFrame and prints its rows, its schema, two
	 * column projections, and the name and type of the first two schema fields.
	 *
	 * @param sqlContext the SQL context used to read the JSON file
	 */
	public void testShow(SQLContext sqlContext){
		// Files are read from HDFS by default; a local file needs the file:\\ prefix.
		String jsonPath = "file:\\E:\\sparktestplace\\json.txt";
		DataFrame df = sqlContext.read().json(jsonPath);
		// Parquet alternative:
		//DataFrame df = sqlContext.read().parquet("/examples_testr/data/resources17/part-r-00000-a3ec949c-cc17-4db6-8e73-0e0ea673af53.gz.parquet") ;

		df.show();
		// Print the schema.
		df.printSchema();
		df.select("name").show();
		df.select(df.col("name"), df.col("age").plus(1)).show();

		// Print the name and the type of the first two fields of the schema.
		StructField[] schemaFields = df.schema().fields();
		for (int i = 0; i < 2; i++) {
			StructField field = schemaFields[i];
			System.out.println(field.name() +" "+field.dataType().typeName() );
		}
	}

四、數據庫讀取數據

	/**
	 * Loads a MySQL table over JDBC into a DataFrame, saves it in parquet
	 * format (overwriting any previous output), and prints every row through
	 * printRdd.
	 *
	 * @param sqlContext the SQL context used for the JDBC read
	 */
	public void testSQL(SQLContext sqlContext){
		String sourceTable = "test_mysql";
		String targetName = "spark_test_table";

		// Basic parameters of the database connection.
		HashMap<String, String> jdbcOptions = new HashMap<String, String>();
		jdbcOptions.put("url", "jdbc:mysql://192.168.0.213:3306/data");
		jdbcOptions.put("driver", "com.mysql.jdbc.Driver");
		jdbcOptions.put("dbtable", sourceTable);
		jdbcOptions.put("user", "username");
		jdbcOptions.put("password", "123456");

		// Read the table from the database.
		DataFrame df = sqlContext.read()
				.format("jdbc")
				.options(jdbcOptions)
				.load();

		// Alternative reads, kept for reference (the first one applies a filter predicate):
		//DataFrame df3 = sqlContext.read().jdbc("jdbc:mysql://192.168.0.213:3306/data",
		//		tableName, new String[]{"sex='nan'"}, prop);
		//DataFrame df3 = sqlContext.sql( "SELECT * FROM "+tableName ) ;
		//df3.show();

		// Write the data out.
		String outputPath = "file:\\E:\\" + targetName + ".txt";
		df.write()
				.mode(SaveMode.Overwrite)
				.format("parquet")
				.save(outputPath);

		printRdd(df.javaRDD());
	}

五、打印RDD

	/**
	 * Prints the lineage of an RDD, followed by the schema and the column
	 * values of each of its rows.
	 *
	 * @param rdd the RDD to print; when {@code null} only a notice is printed
	 */
	public static void printRdd(JavaRDD<Row> rdd ){
		if(rdd==null){
			System.out.println( " rdd is null ");
			return ;
		}
		// toDebugString() only builds the lineage description, so it has to be
		// printed explicitly; the original call discarded the result.
		System.out.println(rdd.toDebugString());

		// collect() materializes the rows as a local list that is walked in order.
		for (Row row : rdd.collect()) {
			printStructField(row.schema());
			printRow(row);
		}
	}

六、打印Row數據

	/**
	 * Prints every column value of a row, one line per column, prefixed with
	 * the column index.
	 *
	 * @param row the row whose values are printed
	 */
	public static void printRow(Row  row ){
		// Ask the row itself for its column count: row.schema() may be null for
		// rows created without a schema, which made the previous lookup throw.
		int columnCount = row.length();
		for(int i=0;i<columnCount;i++){
			System.out.println( "the row of "+ i+"  value is "+row.get(i));
		}
	}

七、獲得row的列數

	/**
	 * Returns the number of fields declared in a schema.
	 *
	 * @param st the schema to inspect
	 * @return the length of the schema's field array
	 */
	public static Integer getRowSize( StructType  st  ){
		return st.fields().length;
	}

八、打印數據結構

	/**
	 * Prints the name and the type name of every field of a schema.
	 * Nothing is printed when the schema is {@code null} or has no fields.
	 *
	 * @param st the schema to describe; may be {@code null}
	 */
	public static void printStructField( StructType  st  ){
		// Callers pass row.schema(), which can be null for rows without a schema.
		if (st == null) {
			return;
		}
		StructField[] sfs = st.fields();
		if (sfs == null || sfs.length == 0) {
			return;
		}
		for (StructField sf : sfs) {
			System.out.println(" the struct of field "+ sf.name() + " ,type is  "+sf.dataType().typeName()  );
		}
	}

九、讀取parquet數據

	/**
	 * Reads a parquet file into a DataFrame, prints its row count, registers it
	 * as a temporary table, shows the result of a full select over that table,
	 * and appends the rows to the MySQL table of the same name over JDBC.
	 *
	 * @param sqlContext the SQL context used to read and query the data
	 */
	public void parqueFile(SQLContext sqlContext){
		// Credentials handed to the JDBC writer.
		Properties prop = new Properties();
		prop.setProperty("user", "username");
		prop.setProperty("password", "pwd");

		String tableName = "spark_test_table";
		// Per the original note, this setting makes parquet schemas more compatible.
		sqlContext.setConf("spark.sql.parquet.binaryAsString", "true");

		// Use the DataFrameReader API, as the other examples here do, instead of
		// the deprecated SQLContext.parquetFile(...).
		DataFrame df = sqlContext.read().parquet("E:\\spark_test_table.txt");
		System.out.println("count=="+ df.count() );
		df.registerTempTable(tableName);
		sqlContext.sql("select * from "+tableName).show();

		// Append the rows to the database table.
		df.write().mode(SaveMode.Append).jdbc("jdbc:mysql://192.168.0.213:3306/data", tableName, prop);
	}

十、RDD 與 DataFrame之間的轉換操作

	/**
	 * Converts an RDD of Person beans, parsed from a comma-separated text file,
	 * into a DataFrame, registers it as the temporary table "people", and shows
	 * the names of everyone aged 13 to 19.
	 *
	 * @param sc         the Spark context used to load the text file
	 * @param sqlContext the SQL context used to build and query the DataFrame
	 */
	public void rddToDataFrame(JavaSparkContext sc , SQLContext  sqlContext ){

		// Turns one "name,age" line into a Person bean.
		Function<String, Person> toPerson = new Function<String, Person>() {
			public Person call(String line) throws Exception {
				String[] columns = line.split(",");
				String name = columns[0];
				int age = Integer.parseInt(columns[1].trim());

				Person parsed = new Person();
				parsed.setName(name);
				parsed.setAge(age);
				System.out.println("============="+parsed );
				return parsed;
			}
		};

		// Load the text file and convert each line to a JavaBean.
		JavaRDD<String> lines = sc.textFile("e://sparktestplace/people.txt");
		JavaRDD<Person> people = lines.map(toPerson);

		// Apply the bean's schema to the RDD and register the result as a table.
		DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
		schemaPeople.registerTempTable("people");

		// SQL can be run over RDDs that have been registered as tables.
		String teenagerQuery = "SELECT name FROM people WHERE age >= 13 AND age <= 19";
		DataFrame teenagers = sqlContext.sql(teenagerQuery);
		teenagers.show();

		// Query results are DataFrames and support the normal RDD operations;
		// columns of a result row can be accessed by ordinal, for example:
		//	List<String> teenagerNames = teenagers.javaRDD().map(new Function<Row, String>() {
		//		public String call(Row row) {
		//			return "Name: " + row.getString(0);
		//		}
		//	}).collect();
		//	for (int i = 0; i < teenagerNames.size(); i++) {
		//		System.out.println( teenagerNames.get(i) );
		//	}
	}

十一、自己定義數據結構

	/**
	 * Builds a DataFrame from a comma-separated text file by applying a
	 * programmatically defined schema (two nullable string columns, "name" and
	 * "age"), registers it as the temporary table "people", and collects the
	 * names selected from it.
	 *
	 * @param sc         the Spark context used to load the text file
	 * @param sqlContext the SQL context used to build and query the DataFrame
	 */
	public void defineSchema(JavaSparkContext sc , SQLContext  sqlContext ){

		// Load the text file; each element of the RDD is one raw line.
		JavaRDD<String> people = sc.textFile("e://sparktestplace/people.txt");

		// The schema is encoded in a string of space-separated column names.
		String schemaString = "name age";

		// Generate one nullable string field per column name.
		String[] columnNames = schemaString.split(" ");
		List<StructField> schemaFields = new ArrayList<StructField>();
		for (int i = 0; i < columnNames.length; i++) {
			schemaFields.add(DataTypes.createStructField(columnNames[i], DataTypes.StringType, true));
		}
		StructType schema = DataTypes.createStructType(schemaFields);

		// Converts one "name,age" record into a two-column Row.
		Function<String, Row> toRow = new Function<String, Row>() {
			public Row call(String record) throws Exception {
				String[] columns = record.split(",");
				return RowFactory.create(columns[0], columns[1].trim());
			}
		};
		JavaRDD<Row> rowRDD = people.map(toRow);

		// Apply the schema to the RDD and register the DataFrame as a table.
		DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);
		peopleDataFrame.registerTempTable("people");

		// SQL can be run over RDDs that have been registered as tables.
		DataFrame results = sqlContext.sql("SELECT name FROM people");

		// Formats the first column of a result row, accessed by ordinal.
		Function<Row, String> toLabel = new Function<Row, String>() {
			public String call(Row row) {
				return "Name: " + row.getString(0);
			}
		};
		// Query results are DataFrames and support the normal RDD operations.
		List<String> names = results.javaRDD().map(toLabel).collect();
	}

十二、加載parquet數據

	/**
	 * Loads a data file through the reader's generic load method, registers it
	 * as the temporary table "user", shows the rows whose age equals 2, and
	 * then shows the full DataFrame.
	 *
	 * @param sqlContext the SQL context used to load and query the data
	 */
	public void loadAndSave( SQLContext  sqlContext ){
		String sourcePath = "E:\\spark_test_table.txt";
		DataFrame users = sqlContext.read().load(sourcePath);

		// Process the data with SQL.
		users.registerTempTable("user");
		String filterQuery = "select * from user  where age=2";
		DataFrame matching = sqlContext.sql(filterQuery);
		matching.show();

		// Examples of writing the data back out:
		//df.select("name", "favorite_color").write().save("namesAndFavColors.parquet");
		//df.select("id", "username", "age").write().save("E:\\sparktestplace\\spark_test_table_write.parquet");
		users.show();
	}

十三、自定義UDF使用

	/**
	 * Demonstrates user-defined functions (UDFs): loads a JSON file, registers
	 * it as the table "user", registers the UDFs "strLen" and "replace", and
	 * shows the result of a query that uses both.
	 *
	 * <p>Formats accepted by {@code format(...)} include json, parquet and jdbc.
	 * Available save modes are SaveMode.ErrorIfExists, SaveMode.Append,
	 * SaveMode.Overwrite and SaveMode.Ignore.
	 *
	 * @param sqlContext the SQL context used to load the data and register the UDFs
	 */
	public void  specifyingOptions( SQLContext  sqlContext ){

		DataFrame df = sqlContext.read().format("json").load("file:\\E:\\sparktestplace\\json.txt");
		//df.select("name", "email").write().mode(SaveMode.Overwrite).format("parquet").save("E:\\sparktestplace\\spark_test_table_write2.parquet");
		sqlContext.registerDataFrameAsTable(df, "user");

		// strLen: length of a string. A record may lack the column, so a null
		// input yields null instead of throwing a NullPointerException.
		sqlContext.udf().register("strLen", (String s) -> s == null ? null : s.length(), DataTypes.IntegerType);

		// replace: joins its two arguments with "== " in between.
		sqlContext.udf().register("replace", new  UDF2<String,String,String>(){
			@Override
			public String call(String t1, String t2) throws Exception {
				return t1+"== "+t2;
			}
		}, DataTypes.StringType);

		DataFrame df2 = sqlContext.sql("select strLen(country),replace(country,'==') ,email,id,ip ,name  from user");
		df2.show();

		//DataFrame df2 = sqlContext.read().format("parquet").load("E:\\sparktestplace\\spark_test_table_write2.parquet");
		//df2.show();
	}

十四、SQL直接讀取parquet文件數據

	/**
	 * Queries a parquet file directly from SQL, without registering a table
	 * first, then shows the rows and prints the schema of the result.
	 *
	 * @param sqlContext the SQL context used to run the query
	 */
	public void  sqlReadFile( SQLContext  sqlContext ){
		String query = "SELECT * FROM parquet.`E:\\sparktestplace\\spark_test_table_write2.parquet`";
		DataFrame fileRows = sqlContext.sql(query);
		fileRows.show();
		fileRows.printSchema();
	}