1. Reading a Parquet data source
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by Administrator on 2017/2/3.
 */
object ParquetLoadData {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ParquetLoadData")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val usersDF = sqlContext.read.parquet("hdfs://master:9000/student/2016113012/spark/users.parquet")
    usersDF.registerTempTable("t_users")
    // Query the name column via SQL
    val usersNameDF = sqlContext.sql("select name from t_users")
    // Convert to an RDD and operate on it
    usersNameDF.rdd.map(row => "Name:" + row(0)).collect().foreach(println)
  }
}
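The same SQLContext API also supports the reverse direction: saving a DataFrame back to Parquet. A minimal sketch, continuing inside main above; the output path here is an assumption, not taken from the original example.

    // Sketch only: persist the query result back to Parquet.
    // The output path is an assumed example, not from the original.
    usersNameDF.write.parquet("hdfs://master:9000/student/2016113012/spark/users_names.parquet")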
2. Reading a JSON data source
import org.apache.spark.sql.{DataFrame, Dataset, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

// Case classes describing the JSON records
case class Person(name: String, age: Long)
case class PersonScore(n: String, score: Long)

object DataFrame_Chapter_13 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("chapter_13").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val persons: DataFrame = sqlContext.read.json("src/people.json")
    // Convert the untyped DataFrame into a typed Dataset[Person]
    val personsDS: Dataset[Person] = persons.as[Person]
    personsDS.show()
    // +---+-------+
    // |age|   name|
    // +---+-------+
    // | 16|Michael|
    // | 16|Michael|
    // | 30|   Andy|
    // | 19| Justin|
    // | 29| Justin|
    // | 46|Michael|
    // +---+-------+
  }
}
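Because personsDS is a typed Dataset[Person], the functional operators work directly on the case class fields instead of on untyped Rows. A minimal sketch, continuing inside main above; the age threshold of 18 is an arbitrary example value.

    // Typed operations on Dataset[Person]: filter and map access
    // the case class fields directly (the threshold 18 is assumed).
    val adultsDS: Dataset[Person] = personsDS.filter(_.age >= 18)
    adultsDS.map(p => p.name + ": " + p.age).collect().foreach(println)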
3. Reading a CSV data source
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

// Approach 1: build Datasets from an RDD + case class, in order to count
// how many males and females of each age watched a given movie
case class Rating(userId: String, movieId: String, rating: Double, timestamp: String)
case class User(userId: String, age: String, gender: String, job: String)

object DataFrame_12_6 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("user_action_analysis_12_1_3").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val ratingRdd = sc.textFile("src/ratings.csv").map(_.split(","))
      .map(line => Rating(line(0).trim, line(1), line(2).toDouble, line(3)))
    val ratingDF = ratingRdd.toDF().cache()
  }
}
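The User case class above is defined but not yet used. A hedged sketch of the stated goal (counting viewers by gender and age for one movie), continuing inside main above; it assumes a src/users.csv laid out as userId,age,gender,job and an example movieId of "242", neither of which appears in the original.

    // Sketch only: assumes src/users.csv with columns userId,age,gender,job
    // and the example movieId "242"; both are assumptions.
    val userDF = sc.textFile("src/users.csv").map(_.split(","))
      .map(line => User(line(0).trim, line(1), line(2), line(3))).toDF()
    ratingDF.filter($"movieId" === "242")
      .join(userDF, "userId")      // attach user attributes to each rating
      .groupBy("gender", "age")    // one row per (gender, age) combination
      .count()
      .show()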