//本地模式,如果是集羣模式需要啓動master和worker node
./bin/spark-submit
--class com.package.test.app.demo01 //直接是對象名稱
--deploy-mode cluster //界定驅動進程是在集羣上運行(cluster),還是在集羣之外的客戶端上運行(client)
analyzer-logs-1.0.jar //日誌分析
spark://localhost:7077 //spark master地址
-----------------------------------------------------------------
//spark 運行在yarn 集羣模式
./bin/spark-submit
--class com.package.test.app.demo0
analyzer-logs-1.0.jar //日誌分析
--master yarn-client/yarn-cluster
----------------------------------------------------------------
Eg:./bin/spark-submit --master spark://master.hadoop:7077,slave1.hadoop:7077 --executor-memory 1024MB --total-executor-cores 4
--class com.package.test.app.demo01 spark-1.0.jar hdfs://master.hadoop:9000/input hdfs://master.hadoop:9000/output
解釋:
--master 指定master地址,有多個時用逗號分隔
--executor-memory 指定運行時候的內存
--total-executor-cores 指定核數
--class 指定main方法的類名 jar包的路徑 輸入路徑 輸出路徑
---------------------------------------------------------------
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import java.util.Date;
import java.text.SimpleDateFormat;
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions;
import org.apache.spark.rdd.RDD;
// Log-analysis demo: computes a weighted PV/UV score per (day, module)
// and keeps the top 3 modules for each day.
object demo01 {                                       // was: `Object` — Scala keyword is lowercase
  def main(args: Array[String]): Unit = {
    // Spark configuration: local mode for this demo.
    val conf = new SparkConf()
    conf.setAppName("UV+PV")
    conf.setMaster("local")
    val sc = new SparkContext(conf)
    // Load the raw log data (tab-separated lines).
    val rdd = sc.textFile("d:/data/logdata")
    // Run the analysis and persist the result.
    val rdd2 = first(rdd)
    rdd2.saveAsTextFile("d:/data/log2")
    sc.stop()
  }

  /** Splits and cleans the raw log lines, joins the PV and UV counts,
    * and returns an RDD of (day, top-3 modules) pairs.
    */
  def first(rdd: RDD[String]) = {
    // Split each line on tabs.
    val splitRDD = rdd.map { _.split("\t") }          // was: rdd.amp — typo
    // Drop dirty records: a valid record has exactly 5 fields.
    val filterRDD = splitRDD.filter { _.length == 5 }
    // PV model: (day_module, pvCount)
    val reduceRDD = pv(filterRDD)                     // was: pv(firstRDD) — undefined name
    // UV model: (day_module, uvCount)
    val reduceRDD2 = uv(filterRDD)                    // was: uv(firstRDD) — undefined name
    // Join PV and UV counts on the day_module key.
    val unionRDD = reduceRDD.join(reduceRDD2)
    // Weighted score (30% PV, 70% UV), sorted descending, then regrouped
    // by day keeping the 3 best-scoring modules.
    val endRDD = unionRDD
      .map(x => {
        val value = x._2._1 * 0.3 + x._2._2 * 0.7
        (x._1, value)                                 // was: x._a — typo
      })
      .sortBy(_._2, false)
      .map(x => {
        val day = x._1.split("_")(0)
        val model = x._1.split("_")(1)
        (day, model)
      })
      .groupByKey()
      .map(x => {
        val list = x._2.take(3)
        (x._1, list)
      })
    // Debug output. NOTE: foreach returns Unit, so it must not terminate
    // the chain assigned to endRDD (the original returned Unit, breaking
    // rdd2.saveAsTextFile in main).
    endRDD.foreach { println }                        // was: printIn — typo
    endRDD
  }

  // PV: count page views per (day, module).
  def pv(filterRDD: RDD[Array[String]]) = {
    val mapRDD = filterRDD.map {
      x => {
        // Field 2 holds an epoch-millis timestamp; normalise to yyyy-MM-dd.
        val time = x(2).toLong
        val date = new Date(time)
        val format = new SimpleDateFormat("yyyy-MM-dd") // was: SimpleDateFormate — typo
        val dateStr = format.format(date)
        x(2) = dateStr
        // Key: day_module, one hit per record.
        (x(2) + "_" + x(4), 1)
      }
    }
    val reduceRDD = mapRDD.reduceByKey(_ + _)           // was: reducceByKey — typo
    reduceRDD
  }

  // UV: count distinct users per (day, module).
  def uv(filterRDD: RDD[Array[String]]) = {
    val mapRDD2 = filterRDD.map {
      x => {
        val time = x(2).toLong
        val date = new Date(time)
        val format = new SimpleDateFormat("yyyy-MM-dd")
        val dateStr = format.format(date)
        x(2) = dateStr
        // Key: userId_day_module.
        (x(1) + "_" + x(2) + "_" + x(4), null)
      }
    }
    // Deduplicate so each user counts at most once per (day, module).
    val disRDD = mapRDD2.distinct()
    // Keep only the key and strip the leading userId, leaving day_module.
    val tupleRDD = disRDD.map(x => {
      val key = x._1
      // was: key.substrinng(key.inndexOf("_"+1,key.length())) — typos and
      // misplaced parentheses; intent is to cut everything up to the first "_".
      val newKey = key.substring(key.indexOf("_") + 1)
      (newKey, 1)
    })
    // Accumulate distinct users per key.
    val reduceRDD2 = tupleRDD.reduceByKey(_ + _)        // was: tupleRDD.reduceRDD.reduceByKey — typo
    reduceRDD2
  }
}