1、爲什麼要將RDD轉換成DataFrame,
直接針對HDFS等任何可以構建爲RDD的數據,進行SparkSQL的sql查詢
2、SparkSQL支持RDD轉換成DataFrame的方式如下:
1>反射方式;
2>通過編程接口創建DataFrame;
方法一:使用createDataFrame方法;
// Method 1: build the schema programmatically, then call createDataFrame.
// Fixed: `seq` -> `Seq`, `sparksession.creteDataFrame` -> `sparkSession.createDataFrame`,
// and removed two stray unmatched closing braces.
val schema = StructType(
  Seq(
    StructField("name", StringType, true),
    StructField("age", IntegerType, true)
  )
)
// Each text line "name,age" is split and wrapped into a Row matching the schema above.
val rowRDD = sparkSession.sparkContext.textFile("/temp/person.txt", 2)
  .map(x => x.split(",")).map(x => Row(x(0), x(1).trim().toInt))
sparkSession.createDataFrame(rowRDD, schema)
eg[java]:
calss RDDToDataFrame{
public static void main(String [] args){
SparkConf conf =new SparkConf().setMaster("local").setAppName("RDDToDataFrame")
JavaSparkContext sc=new JavaSparkContext(conf);
SQLContext sqlcontext=new SQLContext(sc);
//創建rdd,rdd->RDD(Row)
JavaRDD<String> lines=sc.textFile(./person.txt);
javaRDD<Row> rows=lines.map(new Function<String,Row>){
public Row call(String line) throws Exception{
String[] lineSplited=line.split(",");
//數據進行封裝到一個一個row中
return RowFactory.create(Integer.valueOf(lineSplited[0]),lineSplited[1],
Integer.valueOf(lineSplited[2]));
}
});
//動態構造元數據
List<StructField> structFields=new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("id",DataTypes.IntegerType,true));
structFields.add(DataTypes.createStructField("name",DataTypes.StringType,true));
structFields.add(DataTypes.createStructField("age",DataTypes.IntegerType,true));
structType structType=DataTypes.createStructType(structFields);
//動態構造的元數據,將RDD轉換成DataFrame
Dataset<Row> personDF=sqlContext.createDataFrame(rows,structType);
personDF.regidterTempTable("person");
Dataset<Row> teenagerDF=sqlContext.sql("select * from person where age<=18")
List<Row> teenagerRDD=teenagerDF.javaRDD().collect();
for(Row row:teenagerRDD){
System.out.println(row);
}
}
scala版本;
// Scala example: convert an RDD to a DataFrame with a programmatically built schema.
// Fixed: `peresonRDD` declared but `personRDD` used, `creatDataFrame` -> `createDataFrame`,
// `registerTemple` -> `registerTempTable`.
object RDD2DataFrame {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrame")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Step 1: build a plain RDD whose elements are Rows parsed from "id,name,age" lines.
    val personRDD = sc.textFile("./person.txt")
      .map(line => Row(line.split(",")(0).toInt, line.split(",")(1), line.split(",")(2).toInt))
    // Step 2: dynamically construct the metadata (schema).
    val structType = StructType(Array(
      StructField("id", IntegerType, true), StructField("name", StringType, true),
      StructField("age", IntegerType, true)))
    // Step 3: convert the RDD to a DataFrame, register it, and query it.
    val personDF = sqlContext.createDataFrame(personRDD, structType)
    personDF.registerTempTable("person")
    val teenagerDF = sqlContext.sql("select * from person where age<=18")
    teenagerDF.rdd.collect().foreach(row => println(row))
  }
}
eg:方法二使用反射來推斷包含了特定數據類型的RDD的元數據;
java版:
class RDD2DataFrame{
public static void main(String[] args){
SparkConf conf =new SparkConf().setAppName("RDD2DataFrame").setMaster("local");
JavaSparkContext sc=new javaSparkContext(conf);
SQLContext sqlcontext=new SQLContext(sc);
javaRDD<String> lines=sc.textFile("./person.txt");
javaRDD<person> persons=lines.map(new Function<String,Student>(){
public Person call(String line) throws Exception{
String[] lineSplited=line.split(",");
Person per=new Person();
per.setAge(Integer.valueOf(lineSplited[0]));
per.setId(Integer.valueOf(lineSplited[1]));
per.setName(lineSplited[2]);
return per;
}
});
//使用反射的方式,將RDD轉換成DataFrame
Dataset<Row> personDF=sqlContext.createDataFrame(Person,person.calss);
//註冊中間表
personDF.registerTempTable("Person");
//針對臨時表執行sql語句
Dataset<Row> teenagerDF=sqlContext.sql("select * from person where age<=19");
//將查詢出來的DataFrame,再次轉換成RDD
JavaRDD<Row> teenagerRDD=teenagerDF.javaRDD();
JavaRDD<person>teenagerPersonRDD=teengerRDD.map(new Function<Row,person>(){
public Person call(Row row) throws Exception{
person per =new person();
per.setAge(row.getInt(0));
per.setId(row.getInt(1));
per.setName(row.getString(2));
return per
}
});
//將數據collect回來,打印出來
List<person> personList=teenagerStudentRDD.collect();
for(person per:personList){
System.out.println(per);
}
}
}
實體person:
// JavaBean entity used by the reflection-based conversion. It must implement
// Serializable and expose getters/setters so Spark can infer the schema.
// Fixed: class renamed `person` -> `Person` (Java convention, matches usages),
// and the invalid `......` placeholder replaced by a comment.
class Person implements Serializable {
    private int id;
    private int age;
    private String name;
    // ... getters, setters, and toString omitted in these notes ...
}
scala版本
// Scala example: reflection-based conversion — a case class drives schema inference
// via toDF(). Fixed: `objct` -> `object`; the case class moved OUT of main (a case
// class defined inside a method has no TypeTag, so .toDF() fails to resolve in
// Scala 2); and the broken method chain at the end (a stray semicolon split
// `.map(...)` from `.collect()`, discarding the mapped RDD).
case class Person(id: Int, name: String, age: Int)

object RDD2DataFrame {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrame")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Scala needs this explicit import to bring the toDF() implicit into scope.
    import sqlContext.implicits._
    // An RDD of case-class instances converts directly to a DataFrame with toDF().
    val personDF = sc.textFile("./person.txt", 1)
      .map(line => line.split(","))
      .map(arr => Person(arr(0).trim().toInt, arr(1), arr(2).trim().toInt))
      .toDF()
    // Register a temp table and query it with SQL.
    personDF.registerTempTable("person")
    val teenagerDF = sqlContext.sql("select * from person where age<=18")
    // Map the result rows back to case-class instances, collect, and print.
    teenagerDF.rdd
      .map(row => Person(row(0).toString.toInt, row(1).toString, row(2).toString.toInt))
      .collect()
      .foreach(per => println(per.id + ":" + per.name + ":" + per.age))
  }
}