sparkSQL的RDD轉換成DataFrame

1、爲什麼要將RDD轉換成DataFrame?

轉換後可以直接針對HDFS等任何可以構建爲RDD的數據,進行SparkSQL的SQL查詢。

2、SparkSQL支持RDD轉換成DataFrame的方式如下:

1>反射方式;

2>通過編程接口創建DataFrame;

方法一:使用createDataFrame方法;

// Programmatically build the schema: two nullable columns, name (String) and age (Int).
val schema = StructType(
  Seq(
    StructField("name", StringType, true),
    StructField("age", IntegerType, true)
  )
)

// Build an RDD[Row] from the text file; each line is "name,age".
val rowRDD = sparkSession.sparkContext.textFile("/temp/person.txt", 2)
  .map(_.split(","))
  .map(x => Row(x(0), x(1).trim.toInt))

// Combine the Row RDD with the schema to obtain a DataFrame.
val personDF = sparkSession.createDataFrame(rowRDD, schema)

eg[java]:

calss RDDToDataFrame{

 public static void main(String [] args){

 SparkConf conf =new SparkConf().setMaster("local").setAppName("RDDToDataFrame")

JavaSparkContext sc=new JavaSparkContext(conf);

SQLContext sqlcontext=new SQLContext(sc);

//創建rdd,rdd->RDD(Row)

JavaRDD<String> lines=sc.textFile(./person.txt);

javaRDD<Row> rows=lines.map(new Function<String,Row>){

 public Row call(String line) throws Exception{

 String[] lineSplited=line.split(",");

//數據進行封裝到一個一個row中

return RowFactory.create(Integer.valueOf(lineSplited[0]),lineSplited[1],

   Integer.valueOf(lineSplited[2]));

}

});

//動態構造元數據

List<StructField> structFields=new ArrayList<StructField>();

structFields.add(DataTypes.createStructField("id",DataTypes.IntegerType,true));

structFields.add(DataTypes.createStructField("name",DataTypes.StringType,true));

structFields.add(DataTypes.createStructField("age",DataTypes.IntegerType,true));

structType structType=DataTypes.createStructType(structFields);

//動態構造的元數據,將RDD轉換成DataFrame

Dataset<Row> personDF=sqlContext.createDataFrame(rows,structType);

personDF.regidterTempTable("person");

Dataset<Row> teenagerDF=sqlContext.sql("select * from person where age<=18")

List<Row> teenagerRDD=teenagerDF.javaRDD().collect();

for(Row row:teenagerRDD){

  System.out.println(row);

}

}

scala版本;

/**
 * Convert an RDD to a DataFrame by programmatically building the schema
 * (StructType of id/name/age), then query it through a temporary table.
 */
object RDD2DataFrame {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrame")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Step 1: build a plain RDD whose elements are Rows (id, name, age).
    // Split each line once instead of three times.
    val personRDD = sc.textFile("./person.txt")
      .map { line =>
        val fields = line.split(",")
        Row(fields(0).toInt, fields(1), fields(2).toInt)
      }

    // Step 2: programmatically build the schema.
    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))

    // Step 3: combine the Row RDD with the schema to obtain a DataFrame.
    val personDF = sqlContext.createDataFrame(personRDD, structType)
    personDF.registerTempTable("person")

    val teenagerDF = sqlContext.sql("select * from person where age<=18")

    // Collect the result back to the driver and print each row.
    teenagerDF.rdd.collect().foreach(row => println(row))
  }
}

方法二:使用反射來推斷包含特定數據類型的RDD的元數據。

java版:

class RDD2DataFrame{

 public static void main(String[] args){

   SparkConf conf =new SparkConf().setAppName("RDD2DataFrame").setMaster("local");

   JavaSparkContext sc=new javaSparkContext(conf);

  SQLContext sqlcontext=new SQLContext(sc);

 

  javaRDD<String> lines=sc.textFile("./person.txt");

  javaRDD<person> persons=lines.map(new Function<String,Student>(){

 

 public Person call(String line) throws Exception{

  String[] lineSplited=line.split(",");

  Person per=new Person();

  per.setAge(Integer.valueOf(lineSplited[0]));

  per.setId(Integer.valueOf(lineSplited[1]));

  per.setName(lineSplited[2]);

 

 return per;

}

});

//使用反射的方式,將RDD轉換成DataFrame

Dataset<Row> personDF=sqlContext.createDataFrame(Person,person.calss);

//註冊中間表

personDF.registerTempTable("Person");

//針對臨時表執行sql語句

Dataset<Row> teenagerDF=sqlContext.sql("select * from person where age<=19");

//將查詢出來的DataFrame,再次轉換成RDD

JavaRDD<Row> teenagerRDD=teenagerDF.javaRDD();

JavaRDD<person>teenagerPersonRDD=teengerRDD.map(new Function<Row,person>(){

public Person call(Row row) throws Exception{

 person per =new person();

per.setAge(row.getInt(0));

per.setId(row.getInt(1));

per.setName(row.getString(2));

return per

}

});

//將數據collect回來,打印出來

List<person> personList=teenagerStudentRDD.collect();

for(person per:personList){

 System.out.println(per);

}

}

}

實體person:

/**
 * JavaBean used for reflection-based schema inference. It must be Serializable
 * (instances are shipped to executors) and must expose getters/setters for
 * every field so Spark can discover the columns.
 */
class Person implements Serializable {

    private int id;
    private int age;
    private String name;

    // ... getters, setters and toString omitted ...
}

scala版本

/**
 * Convert an RDD to a DataFrame via reflection on a case class: an RDD of
 * case-class instances can be turned into a DataFrame directly with toDF().
 */
object RDD2DataFrame {

  // The case class must be declared outside main: toDF's reflection-based
  // encoder cannot handle a case class local to a method body.
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrame")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Import the implicits that provide the rdd.toDF() conversion.
    import sqlContext.implicits._

    // Parse "id,name,age" lines into Person instances, then convert with toDF.
    val personDF = sc.textFile("./person.txt", 1)
      .map(line => line.split(","))
      .map(arr => Person(arr(0).trim.toInt, arr(1), arr(2).trim.toInt))
      .toDF()

    // Register a temporary table and query it.
    personDF.registerTempTable("person")
    val teenagerDF = sqlContext.sql("select * from person where age<=18")

    // Map the resulting Rows back to Person, collect, and print.
    teenagerDF.rdd
      .map(row => Person(row(0).toString.toInt, row(1).toString, row(2).toString.toInt))
      .collect()
      .foreach(per => println(per.id + ":" + per.name + ":" + per.age))
  }
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章