import org.apache.spark.HashPartitioner
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{Column, DataFrame, Row, columnar}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.mllib.linalg.Matrix
import scala.math.Ordering
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.storage.StorageLevel
// Hadoop input configuration: records in the raw dumps are delimited by the
// control character \1 (fields are \2-separated — see dataToDF/dataToDF1).
@transient
val conf = {
  val hadoopConf = new Configuration
  hadoopConf.set("textinputformat.record.delimiter", "\1")
  hadoopConf
}
// Hive-backed SQL context; `sc` is the SparkContext supplied by spark-shell.
val sqlContext = new HiveContext(sc)
/** Converts the raw \1-delimited dump at `path` into a cleaned DataFrame.
 *
 *  The first record is used as the header (column names, \2-separated). The
 *  frame is registered as temp table TMPTBL, then filtered via Hive
 *  regexp_extract so that IPADDRESS looks like an IPv4 address and both
 *  event-time columns look like times, dropping dirty rows.
 *
 *  @param path HDFS path of the raw dump
 *  @return the cleaned DataFrame
 */
def dataToDF(path:String) = {
  val datatblRDD1 = sc.newAPIHadoopFile(path, classOf[TextInputFormat],
    classOf[LongWritable], classOf[Text], conf).map(_._2.toString)
  // Header row: column names separated by \2.
  val brdsegs1 = sc.broadcast(datatblRDD1.first().split("\2"))
  // Drop the header by comparing each record's first character with the first
  // character of the first header field, then split records into columns.
  // NOTE(review): this also drops data rows that happen to start with that
  // character — TODO confirm the data never does.
  val rowRDD1 = datatblRDD1.filter(_(0) != brdsegs1.value(0)(0)).map(x =>
    Row(x.split("\2",brdsegs1.value.size):_*))
  // Every column is ingested as a nullable string.
  val segments1 = sc.broadcast(StructType(for(x <- brdsegs1.value) yield
    StructField(x,StringType,true)))
  val alarmDF1tmp = sqlContext.createDataFrame(rowRDD1,segments1.value)
  alarmDF1tmp.registerTempTable("TMPTBL")
  // Regex patterns for IPv4 addresses and hh:mm:ss-style times. The doubled
  // backslashes survive Hive's string-literal unescaping as single ones.
  // BUG FIX: the original triple-quoted literals were wrapped across lines,
  // embedding a literal newline inside the pattern — reassembled onto one
  // line (matching the intact copies in dataToDF1).
  val ipadrs_r = """.*((25[0-5]|2[0-4]\\d|1\\d{2}|[1-9]?\\d)($|(?!\\.$)\\.)){4}$""".r.toString()
  val date_r = """.*(\\d{1,2}($|(?!:$):)){3}""".r.toString()
  // Keep only rows whose IP and both event times are well-formed.
  val returnDF = sqlContext.sql(
    s"SELECT * FROM TMPTBL where length(regexp_extract(IPADDRESS,'$ipadrs_r',0)) > 6 AND " +
    s"length(regexp_extract(ORIGINALEVENTTIME,'$date_r',0)) > 4 AND " +
    s"length(regexp_extract(EVENTTIME,'$date_r',0)) > 4")
  returnDF
}
/** Same loader as dataToDF, but registers its frame as temp table TMPTBL1 so
 *  both dumps can stay registered at once (DataFrames are lazy: reusing one
 *  temp-table name for two inputs would make the first result read the second
 *  input's data when finally evaluated).
 *
 *  NOTE(review): near-duplicate of dataToDF — a shared helper parameterized
 *  by table name would remove the duplication; kept separate to preserve the
 *  existing interface.
 *
 *  @param path HDFS path of the raw dump
 *  @return the cleaned DataFrame
 */
def dataToDF1(path:String) = {
  val datatblRDD1 = sc.newAPIHadoopFile(path, classOf[TextInputFormat],
    classOf[LongWritable], classOf[Text], conf).map(_._2.toString)
  // Header row: column names separated by \2.
  val brdsegs1 = sc.broadcast(datatblRDD1.first().split("\2"))
  // Drop the header by first-character comparison, then split into columns.
  val rowRDD1 = datatblRDD1.filter(_(0) != brdsegs1.value(0)(0)).map(x =>
    Row(x.split("\2",brdsegs1.value.size):_*))
  // Every column is ingested as a nullable string.
  val segments1 = sc.broadcast(StructType(for(x <- brdsegs1.value) yield
    StructField(x,StringType,true)))
  val alarmDF1tmp = sqlContext.createDataFrame(rowRDD1,segments1.value)
  alarmDF1tmp.registerTempTable("TMPTBL1")
  // IPv4 / time regexes; `\\` survives Hive's string unescaping as `\`.
  val ipadrs_r = """.*((25[0-5]|2[0-4]\\d|1\\d{2}|[1-9]?\\d)($|(?!\\.$)\\.)){4}$""".r.toString()
  val date_r = """.*(\\d{1,2}($|(?!:$):)){3}""".r.toString()
  // Keep only rows whose IP and both event times are well-formed.
  // (BUG FIX: the original SQL string literal was wrapped across source
  // lines, which is not valid Scala — reassembled.)
  val returnDF = sqlContext.sql(
    s"SELECT * FROM TMPTBL1 where length(regexp_extract(IPADDRESS,'$ipadrs_r',0)) > 6 AND " +
    s"length(regexp_extract(ORIGINALEVENTTIME,'$date_r',0)) > 4 AND " +
    s"length(regexp_extract(EVENTTIME,'$date_r',0)) > 4")
  returnDF
}
/** Registers the two DataFrames as temp tables and derives, via HiveQL, the
 *  four frames consumed by filterDfs:
 *    0: per (hour, ip) power-down (DG) alarm counts
 *    1: per (hour, ip) overall alarm counts
 *    2: per ip, number of hours with at least one DG alarm
 *    3: distinct device ips
 *
 *  BUG FIXES vs. original:
 *  - the body referenced the outer script vals `alarmdf`/`similardf` instead
 *    of its own parameters, leaving the parameters dead;
 *  - part 4 joined GET_PL_SIM instead of GET_OL_SIM (GET_OL_SIM was
 *    registered but never used — copy-paste error), matching the '2.1.6'
 *    similar alarms against the wrong parents;
 *  - returns an Array so the result matches filterDfs' declared parameter
 *    (the original returned a List, which did not type-check there).
 *
 *  Note: SQLContext's shuffle-partition default is 200 (Spark 1.3.0 docs).
 *
 *  @param alarmdf1   alarm-object dump (expects ALARMOBJECTID, OCNAME,
 *                    ORIGINALEVENTTIME, EVENTTIME, EQUIPMENTNAME, SATOTAL,
 *                    IPADDRESS, ALARMINFO columns)
 *  @param similardf2 similar-alarm dump (additionally PARENTALARMOBJECT)
 */
def tableToDF(alarmdf1:DataFrame,similardf2:DataFrame) = {
  alarmdf1.registerTempTable("ALARMOBJECTTBL")
  similardf2.registerTempTable("SIMILARALARMTBL")
  // Part 1: primary power-down alarms (ALARMINFO '2.1.5'), flagged IS_DG = 1.
  val ponalarmTOfObjPart1: DataFrame = sqlContext.sql(
    "SELECT ALARMOBJECTID,OCNAME,ORIGINALEVENTTIME as EVENTTIME,substr(EVENTTIME,0,13) as EVENTDATEHOUR," +
    "EQUIPMENTNAME,SATOTAL,IPADDRESS,1 as IS_DG " +
    "FROM ALARMOBJECTTBL WHERE ALARMINFO = '2.1.5'")
  ponalarmTOfObjPart1.registerTempTable("PON_ALARM_TBL1")
  // Part-1 parents with a positive SATOTAL — their similar alarms are pulled in.
  val getPlSim: DataFrame = sqlContext.sql(
    "SELECT ALARMOBJECTID, OCNAME FROM PON_ALARM_TBL1 " +
    "WHERE cast(SATOTAL as int) > 0 AND cast(IS_DG as int) = 1")
  getPlSim.registerTempTable("GET_PL_SIM")
  // Part 2: similar alarms attached to part-1 parents (SATOTAL forced to -1).
  val ponalarmTOfObjPart2 = sqlContext.sql(
    "SELECT s.ALARMOBJECTID,s.OCNAME,s.ORIGINALEVENTTIME as EVENTTIME,substr(s.EVENTTIME,0,13) as EVENTDATEHOUR," +
    "s.EQUIPMENTNAME,-1 as SATOTAL,s.IPADDRESS,1 as IS_DG FROM SIMILARALARMTBL s ,GET_PL_SIM g " +
    "WHERE s.PARENTALARMOBJECT = g.ALARMOBJECTID AND s.OCNAME = g.OCNAME ")
  // Part 3: other alarms (ALARMINFO '2.1.6'), flagged IS_DG = 0.
  val ponalarmTOfObjPart3: DataFrame = sqlContext.sql(
    "SELECT ALARMOBJECTID,OCNAME,ORIGINALEVENTTIME as EVENTTIME,substr(EVENTTIME,0,13) as EVENTDATEHOUR," +
    "EQUIPMENTNAME,SATOTAL,IPADDRESS,0 as IS_DG FROM ALARMOBJECTTBL WHERE ALARMINFO = '2.1.6'")
  ponalarmTOfObjPart3.registerTempTable("PON_ALARM_TBL2")
  val getOlSim = sqlContext.sql(
    "SELECT ALARMOBJECTID, OCNAME FROM PON_ALARM_TBL2 " +
    "WHERE cast(SATOTAL as int) > 0 AND cast(IS_DG as int) = 0")
  getOlSim.registerTempTable("GET_OL_SIM")
  // Part 4: similar alarms attached to part-3 parents.
  // BUG FIX: join GET_OL_SIM here (original joined GET_PL_SIM).
  val ponalarmTOfObjPart4: DataFrame = sqlContext.sql(
    "SELECT s.ALARMOBJECTID,s.OCNAME,s.ORIGINALEVENTTIME as EVENTTIME,substr(s.EVENTTIME,0,13) as EVENTDATEHOUR," +
    "s.EQUIPMENTNAME,-1 as SATOTAL,s.IPADDRESS,0 as IS_DG FROM SIMILARALARMTBL s, GET_OL_SIM g " +
    "WHERE s.PARENTALARMOBJECT = g.ALARMOBJECTID AND s.OCNAME = g.OCNAME")
  // Union of all four alarm sets, keyed queries below read from PON_ALARM_TBL.
  val pon_alarm_tbl: DataFrame = ponalarmTOfObjPart1.unionAll(ponalarmTOfObjPart2).
    unionAll(ponalarmTOfObjPart3).unionAll(ponalarmTOfObjPart4)
  pon_alarm_tbl.registerTempTable("PON_ALARM_TBL")
  // Distinct (ip, trimmed equipment name) pairs, then distinct ips.
  val ponAllDeviceLable = sqlContext.sql(
    "SELECT DISTINCT IPADDRESS as IPADDRESS,trim(EQUIPMENTNAME) as NAME FROM PON_ALARM_TBL")
  ponAllDeviceLable.registerTempTable("PON_ALL_DEVICE_LABLE_TBL")
  val ponAllDevice = sqlContext.sql(
    "SELECT DISTINCT IPADDRESS as IPADDRESS FROM PON_ALL_DEVICE_LABLE_TBL")
  ponAllDevice.registerTempTable("PON_ALL_DEVICE_TBL")
  // DG alarm counts per (hour, ip).
  val ponDgAlarmCount = sqlContext.sql(
    "SELECT p.EVENTDATEHOUR as EVENTDATEHOUR,p.IPADDRESS as IPADDRESS,COUNT(1) AS ALARMCOUNT " +
    "FROM PON_ALARM_TBL p WHERE IS_DG = 1 GROUP BY p.EVENTDATEHOUR,p.IPADDRESS")
  ponDgAlarmCount.registerTempTable("PON_DG_ALARM_COUNT_TBL")
  // Overall alarm counts per (hour, ip).
  val ponAllAlaramCount = sqlContext.sql(
    "SELECT p.EVENTDATEHOUR as EVENTDATEHOUR,p.IPADDRESS as IPADDRESS,COUNT(1) AS ALARMCOUNT " +
    "FROM PON_ALARM_TBL p GROUP BY p.EVENTDATEHOUR,p.IPADDRESS")
  ponAllAlaramCount.registerTempTable("PON_ALL_ALARM_COUNT_TBL")
  // Per ip: number of hours that saw at least one DG alarm.
  val ponDgAlarmTime = sqlContext.sql(
    "SELECT p.IPADDRESS as IPADDRESS," +
    "sum(case when cast(p.ALARMCOUNT as int) > 0 then 1 else 0 end) as ALARMTIMECOUNT " +
    "FROM PON_DG_ALARM_COUNT_TBL p GROUP BY p.IPADDRESS")
  ponDgAlarmTime.registerTempTable("PON_DG_ALALRM_TIME_TBL")
  val curDgAlarmCount = sqlContext.sql(
    "SELECT EVENTDATEHOUR,IPADDRESS,ALARMCOUNT FROM PON_DG_ALARM_COUNT_TBL")
  val curAllAlarmCount = sqlContext.sql(
    "SELECT EVENTDATEHOUR,IPADDRESS,ALARMCOUNT FROM PON_ALL_ALARM_COUNT_TBL")
  val curDgAlarmTime = sqlContext.sql(
    "SELECT IPADDRESS,ALARMTIMECOUNT FROM PON_DG_ALALRM_TIME_TBL")
  val curAllDevice = sqlContext.sql("SELECT IPADDRESS FROM PON_ALL_DEVICE_TBL")
  Array(curDgAlarmCount,curAllAlarmCount,curDgAlarmTime,curAllDevice)
}
/** Filters out rows with empty keys and converts the four DataFrames produced
 *  by tableToDF into typed RDDs.
 *
 *  BUG FIX: the parameter was declared Array[DataFrame], but the only caller
 *  passes tableToDF's result; Seq[DataFrame] accepts both a List and an Array
 *  (arrays convert implicitly), so existing call sites keep compiling.
 *
 *  @param fourDFs frames in tableToDF's order: dg counts, all counts,
 *                 dg hour counts, all devices
 *  @return ((hour, ip, count) DG triples, (hour, ip, count) overall triples,
 *          (ip, hour-count) pairs, ip strings)
 */
def filterDfs(fourDFs:Seq[DataFrame]) = {
  // (hour, ip, count) with non-empty ip and hour keys; counts parsed to Int.
  val a_dg_alarm_count = fourDFs(0)
    .filter("length(IPADDRESS) > 0 AND length(EVENTDATEHOUR) > 0")
    .rdd.map(e => (e(0).toString,e(1).toString,e(2).toString.toInt))
  val a_all_alarm_count = fourDFs(1)
    .filter("length(IPADDRESS) > 0 AND length(EVENTDATEHOUR) > 0")
    .rdd.map(e => (e(0).toString,e(1).toString,e(2).toString.toInt))
  // (ip, hours-with-DG-alarm) pairs.
  val a_dg_alarm_time = fourDFs(2).filter("length(IPADDRESS) > 0").rdd.
    map(e => (e(0).toString,e(1).toString.toInt))
  // Plain ip strings.
  val a_all_device = fourDFs(3).filter("length(IPADDRESS) > 0").rdd.
    map(e => e(0).toString)
  (a_dg_alarm_count,a_all_alarm_count,a_dg_alarm_time,a_all_device)
}
// Load the two raw dumps.
// BUG FIX: similardf must come from dataToDF1 — dataToDF registers temp table
// TMPTBL, and because DataFrames evaluate lazily, loading both inputs through
// dataToDF would make the first result silently read the second input's data
// when it is finally evaluated. dataToDF1 uses TMPTBL1 for exactly this reason.
// (The original sc.broadcast(alarmdf)/sc.broadcast(similardf) calls were
// removed: broadcasting a DataFrame and discarding the handle has no effect.)
val alarmdf = dataToDF("hdfs://10.0.0.1:9000/zh_test/alarmobjecttbl.txt")
val similardf = dataToDF1("hdfs://10.0.0.1:9000/zh_test/similaralarmtbl.txt")
val fourDFs1 = tableToDF(alarmdf,similardf)
val fourDFs2 = filterDfs(fourDFs1)
/* Phase: hour-keyed joins to estimate, for every (dg-ip, other-ip) pair, the
   ratio of co-occurrence hours to the dg-ip's power-down hours.
   NOTE(review): persist/unpersist pairing and exact statement order are
   load-bearing, so the code below is untouched; comments only. */
val a_dg_alarm_count_kv = fourDFs2._1.map(e => (e._1,e._2))// (hour, ip) pairs from DG counts
a_dg_alarm_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
val a_all_alarm_count_kv = fourDFs2._2.map(e => (e._1,e._2))// (hour, ip) pairs from all counts
a_all_alarm_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
val a_dg_alarm_time_kv = fourDFs2._3.map(e => (e._1,e._2)).repartition(10)// ip -> hours-with-DG-alarm
a_dg_alarm_time_kv.persist(StorageLevel.MEMORY_ONLY_SER)
val a_all_device_kv = fourDFs2._4.repartition(10)//.map(e => (0,e))
a_all_device_kv.persist(StorageLevel.MEMORY_ONLY_SER)
// NOTE(review): distinct_time is never used below — dead value.
val distinct_time = a_all_alarm_count_kv.map(e => e._1).distinct().map(e => (e,""))
// Join on the hour key (50 partitions): in each pair the first ip is the
// power-down (DG) device, the second the overall-alarm device; then count
// co-occurrences per ((dg_ip, all_ip)) key.
val dg_all_count_kv = a_dg_alarm_count_kv.join(a_all_alarm_count_kv,50).map(e =>
((e._2._1,e._2._2),1)).reduceByKey(_+_)//half
dg_all_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
a_dg_alarm_count_kv.unpersist()
a_all_alarm_count_kv.unpersist()
/* If a device never lost power, the probability of any alarm given power-down
   is zero; we have the DG devices and the overall alarming devices. */
/* Non-zero case: (dg_ip, other_ip) co-occurrence count divided by dg_ip's
   power-down hour count (dg_time_count: ip -> dgcount). */
val ip_value_kv_notzero = dg_all_count_kv.map(e => e._1._1 -> (e._1._2,e._2)).
leftOuterJoin(a_dg_alarm_time_kv).map(e => e._2._2 match {
// s is the Int hour count (see filterDfs), so "> 0.8" effectively means >= 1.
case Some(s) => (e._1,e._2._1._1,if (s.toDouble > 0.8) e._2._1._2.toDouble/s.toDouble else 0.0)
case scala.None => (e._1,e._2._1._1,0.0)
}).map(e => ((e._1,e._2),e._3))
// Commutative, associative combiner for aggregateByKey below.
def addi(i:Double,j:Double) = {i+j}
/* Start from an all-zero value for every device pair (cartesian product),
   then overlay the non-zero ratios; aggregateByKey sums the two layers. */
val ip_value_kv = a_all_device_kv.cartesian(a_all_device_kv).map(e => (e,0.0)).union(ip_value_kv_notzero).
aggregateByKey(0.0)(addi,addi)
/* TODO(review): consider filtering pairs with identical ips (ratio 1) and
   pairs observed fewer than 2 times. */
ip_value_kv.persist(StorageLevel.MEMORY_ONLY_SER)
import org.apache.hadoop.io.compress.GzipCodec
// CSV lines "dg_ip,other_ip,value" for downstream Hive loading.
val ip_value_kv_hive = ip_value_kv.map(e => Array(e._1._1,e._1._2,e._2).mkString(","))
//ip_value_kv_hive
// Write gzip-compressed text files. NOTE(review): saveAsTextFile returns
// Unit, so result_sorted_ip carries no value.
val result_sorted_ip = ip_value_kv_hive.saveAsTextFile("hdfs://10.0.0.1:9000/zh_test/result",
classOf[GzipCodec])// produces gzip-compressed part files
// (Non-code residue from the web capture, preserved as a comment so the file
// parses: blog post title "Spark in practice — processing tables" followed by
// an empty comment-section footer.)