SparkR跑通的函數(持續更新中...)

spark1.4.0的sparkR的思路:用spark從大數據集中抽取小數據(sparkR的DataFrame),然後到R裏分析(DataFrame)。

這兩個DataFrame是不同的,前者是分佈式的,集羣上的DF,R裏的那些包都不能用;後者是單機版的DF,包裏的函數都能用。

sparkR的開發計劃,個人覺得是將目前包裏的函數,遷移到sparkR的DataFrame裏,這樣就打開一片天地。


> a<- sql(hiveContext, "SELECT count(*) FROM anjuke_scores where restaurant>=10");

> a<- sql(hiveContext, "SELECT * FROM anjuke_scores limit 5")
> a
DataFrame[city:string, housingname:string, ori_traffic_score:int, ori_traffic_score_normal:double, metro_station:double, metro_station_normal:double,...
> first(a)  #顯示Formal Data Frame的第一行

> head(a) ;  #列出a的前6行
> columns(a)      # 列出全部的列
[1] "city"                      "housingname"               "ori_traffic_score"         "ori_traffic_score_normal"

[5] "metro_station"             "metro_station_normal"      "bus_station"               "bus_station_normal"  ...

> showDF(a)
> b<-filter(a, a$ori_comfort>8); # 行篩選, ori_comfort_normal:double 

> print(a);    #打印列名及類型  
DataFrame[city:string, housingname:string, ori_traffic_score:int, ......

> printSchema(a); # 打印列名的樹形框架概要
root
 |-- city: string (nullable = true)
 |-- housingname: string (nullable = true)
 |-- ori_traffic_score: integer (nullable = true)
 |-- ori_traffic_score_normal: double (nullable = true)
 |-- metro_station: double (nullable = true)
> take(a,10)   ;  # 提取Formal class DataFrame的前面num行,成爲R中普通的 data frame , take(x, num)

     city                  housingname ori_traffic_score ori_traffic_score_normal metro_station metro_station_normal
1  \t\x9a                   \xddrw\xb8                NA                        0            NA                    0
2  \t\x9a         \xe4\xf04\u03a2\021~                NA                        0            NA                    0
3  \t\x9a                \xf6\xe3w\xb8                NA                        0            NA                    0
4  \t\x9a               \x8e=\xb0w\xb8                NA                        0            NA                    0
5  \t\x9a \t\x9a\xe4\xf04\xce\xe4\xf0~                NA                        0            NA                    0
6  \t\x9a                      q4\xfdE                NA                        0            NA                    0
7  \t\x9a                \xe4\xf04\xce                NA                        0            NA                    0
8  \t\x9a                      )\xfdVT                NA                        0            NA                    0
9  \t\x9a                       q\177V                NA                        0            NA                    0
10 \t\x9a           \xe4\xf04\xceW\xb8                NA                        0            NA                    0

> b<-take(a,10) 
> dim(b)
[1] 10 41

> aa <- withColumn(a, "ori_comfort_aa", a$ori_comfort * 5)   #用現有的列生成新的列, 新增一列,ori_comfort_aa,結果還是Formal data frame結構
> printSchema(aa)
root
 |-- city: string (nullable = true)
.........
 |-- comfort_normal: double (nullable = true)
 |-- ori_comfort_aa: double (nullable = true)
> aa <- mutate(a, newCol1 = a$commerce_normal * 5, newCol2 = a$bank_normal * 2) ;   #與withColumn類似
> printSchema(aa)
root
 |-- city: string (nullable = true)
 。。。。。。。。。。。。。。。。。。
 |-- comfort_normal: double (nullable = true)
 |-- newCol1: double (nullable = true)
 |-- newCol2: double (nullable = true)


a1<-arrange(a,asc(a$level_tow));  # 按列排序, asc升序,desc降序

a1<-orderBy(a,asc(a$level_tow));  # 按列排序

count(a) ;  # 統計 Formal Data Frame有多少行數據

> dtypes(a);  #以list的形式列出Formal Data Frame的全部列名及類型
[[1]]
[1] "city"   "string"

[[2]]
[1] "housingname" "string"  
> a<-withColumnRenamed(a,"comfort_normal","AA");  # 更改列名  
> printSchema(a)
root
 |-- city: string (nullable = true)
 |-- housingname: string (nullable = true)
..........
 |-- AA: double (nullable = true)


創建sparkR的數據框的函數createDataFrame

> df<-createDataFrame(sqlContext,a.df);  # a.df是R中的數據框, df是sparkR的數據框,注意:使用sparkR的數據框,需要sqlContext
> str(a.df)
'data.frame':    5 obs. of  41 variables:

> str(df)
Formal class 'DataFrame' [package "SparkR"] with 2 slots
  ..@ env:<environment: 0x4fce350>
  ..@ sdf:Class 'jobj' <environment: 0x4fc70b0>

> destDF <- select(SFO_DF, "dest", "cancelled");  #選擇列

> showDF(destDF);   #顯示sparkR的DF
+----+---------+
|dest|cancelled|
+----+---------+
| SFO|        0|
................

> registerTempTable(SFO_DF, "flightsTable");  #要對sparkDF使用SQL語句,首先需要將DF註冊成一個table
 
> wa <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable"); #在sqlContext下使用SQL語句

> showDF(wa);   #查詢的結果還是sparkDF
+----+---------+
|dest|cancelled|
+----+---------+
| SFO|        0|
................
> local_df <- collect(wa);   #將sparkDF轉換成R中的DF
> str(local_df)
'data.frame':    2818 obs. of  2 variables:
 $ dest     : chr  "SFO" "SFO" "SFO" "SFO" ...
 $ cancelled: int  0 0 0 0 0 0 0 0 0 0 ...

> wa<-flights_df[1:1000,];   #wa是R中的DF
> flightsDF<-createDataFrame(sqlContext,wa) ;   #flightsDF是sparkR的DF
> library(magrittr); #管道函數的包對sparkRDF適用
> groupBy(flightsDF, flightsDF$date) %>%
+     summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay)) -> dailyDelayDF;  #注意,語法和dplyr中的有所不同,結果還是sparkRDF

> str(dailyDelayDF)
Formal class 'DataFrame' [package "SparkR"] with 2 slots
  ..@ env:<environment: 0x4cd3118>
  ..@ sdf:Class 'jobj' <environment: 0x4cd6968>
> showDF(dailyDelayDF)
+----------+--------------------+--------------------+
|      date|      AVG(dep_delay)|      AVG(arr_delay)|
+----------+--------------------+--------------------+
|2011-01-01|                 5.2|                 5.8|
|2011-01-02|  1.8333333333333333|                -2.0|
................

在39機器上跑的

collect將sparkDF轉化成DF
Collects all the elements of a Spark DataFrame and coerces them into an R data.frame.
collect(x, stringsAsFactors = FALSE),x:A SparkSQL DataFrame

> dist_df<- sql(hiveContext, "SELECT * FROM anjuke_scores where restaurant<=1");
> local_df <- dist_df %>% 
      groupBy(dist_df$city) %>% 
      summarize(count = n(dist_df$housingname)) %>% 
      collect
> local_df
           city count
1        \t\x9a     5
2         8\xde     7
3      \xf0\xde     2
..........
..........

take也可將sparkDF轉化成DF
Take the first NUM rows of a DataFrame and return the results as a data.frame
take(x, num)


> local_df <- dist_df %>% 
      groupBy(dist_df$city) %>% 
      summarize(count = n(dist_df$housingname))
> a<-take(local_df,100)
[Stage 16:=========================================>            (154 + 1) / 199]
> View(a)
> a
           city count
1        \t\x9a     5
2         8\xde     7
3      \xf0\xde     2
..........
..........



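不通的函數(注意:filter在前文對sparkR的DataFrame是可以正常使用的,這裏的報錯可能是因爲同名函數被其他包(如stats、Hmisc等)遮蔽,調用到了非SparkR的版本,可嘗試寫成SparkR::filter(...)來確認):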

> describe(a)
Error in x[present, drop = FALSE] : 
  object of type 'S4' is not subsettable
> jfkDF <- filter(flightsDF, flightsDF$dest == "DFW")
Error in filter(flightsDF, flightsDF$dest == "DFW") : 
  no method for coercing this S4 class to a vector




發佈了32 篇原創文章 · 獲贊 11 · 訪問量 15萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章