Spark in Practice: Working with Tables

import org.apache.spark.HashPartitioner
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.mllib.linalg.Matrix
import scala.math.Ordering
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.storage.StorageLevel
  @transient
  val conf = new Configuration
  conf.set("textinputformat.record.delimiter", "\1")
  val sqlContext = new HiveContext(sc) // HiveContext from org.apache.spark.sql.hive; sc is the spark-shell SparkContext
  /* Convert a raw data file into a DataFrame */
  def dataToDF(path: String) = {
    val datatblRDD1 = sc.newAPIHadoopFile(path, classOf[TextInputFormat],
      classOf[LongWritable], classOf[Text], conf).map(_._2.toString)
    // the first record is the header row: column names separated by \2
    val brdsegs1 = sc.broadcast(datatblRDD1.first().split("\2"))
    // drop the header: keep only lines whose first character differs from the header's
    val rowRDD1 = datatblRDD1.filter(_(0) != brdsegs1.value(0)(0)).map(x =>
      Row(x.split("\2", brdsegs1.value.size): _*))
    // every column is a nullable string named after its header field
    val segments1 = sc.broadcast(StructType(for (x <- brdsegs1.value) yield
      StructField(x, StringType, true)))
    val alarmDF1tmp = sqlContext.createDataFrame(rowRDD1, segments1.value)
    alarmDF1tmp.registerTempTable("TMPTBL")
    /* Regex-match the IP and time fields to filter out dirty rows */
    // the doubled backslashes survive Scala so Hive's regexp_extract sees \d, \. etc.
    val ipadrs_r = """.*((25[0-5]|2[0-4]\\d|1\\d{2}|[1-9]?\\d)($|(?!\\.$)\\.)){4}$""".r.toString() // ends in an IPv4 address
    val date_r = """.*(\\d{1,2}($|(?!:$):)){3}""".r.toString() // ends in an hh:mm:ss-style time
    val returnDF = sqlContext.sql(s"SELECT * FROM TMPTBL WHERE length(regexp_extract(IPADDRESS,'$ipadrs_r',0)) > 6 AND " +
      s"length(regexp_extract(ORIGINALEVENTTIME,'$date_r',0)) > 4 AND length(regexp_extract(EVENTTIME,'$date_r',0)) > 4")
    returnDF
  }
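  /* A minimal sketch of the input layout dataToDF assumes, with hypothetical
     values (the real files carry IPADDRESS, ORIGINALEVENTTIME, EVENTTIME, ...):

       NAME\u0002IPADDRESS\u0001olt-01\u000210.1.1.1\u0001olt-02\u000210.1.1.2

     The first record becomes the header, every column is typed as a nullable
     string, and each row is split into at most header-width fields. */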
  /* Identical to dataToDF except that it registers its temp table as TMPTBL1,
     so both DataFrames can stay registered at the same time */
  def dataToDF1(path: String) = {
    val datatblRDD1 = sc.newAPIHadoopFile(path, classOf[TextInputFormat],
      classOf[LongWritable], classOf[Text], conf).map(_._2.toString)
    val brdsegs1 = sc.broadcast(datatblRDD1.first().split("\2"))
    val rowRDD1 = datatblRDD1.filter(_(0) != brdsegs1.value(0)(0)).map(x =>
      Row(x.split("\2", brdsegs1.value.size): _*))
    val segments1 = sc.broadcast(StructType(for (x <- brdsegs1.value) yield
      StructField(x, StringType, true)))
    val alarmDF1tmp = sqlContext.createDataFrame(rowRDD1, segments1.value)
    alarmDF1tmp.registerTempTable("TMPTBL1")
    /* Regex-match the IP and time fields to filter out dirty rows */
    val ipadrs_r = """.*((25[0-5]|2[0-4]\\d|1\\d{2}|[1-9]?\\d)($|(?!\\.$)\\.)){4}$""".r.toString()
    val date_r = """.*(\\d{1,2}($|(?!:$):)){3}""".r.toString()
    val returnDF = sqlContext.sql(s"SELECT * FROM TMPTBL1 WHERE length(regexp_extract(IPADDRESS,'$ipadrs_r',0)) > 6 AND " +
      s"length(regexp_extract(ORIGINALEVENTTIME,'$date_r',0)) > 4 AND length(regexp_extract(EVENTTIME,'$date_r',0)) > 4")
    returnDF
  }

  /* Register the two DataFrames as temp tables, then derive the DataFrames we need by querying them */
  def tableToDF(alarmdf: DataFrame, similardf: DataFrame) = {
    alarmdf.registerTempTable("ALARMOBJECTTBL")
    similardf.registerTempTable("SIMILARALARMTBL")

    /* spark.sql.shuffle.partitions defaults to 200 (per the Spark 1.3.0 docs) */
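    // It can be tuned per job if needed, e.g.:
    //   sqlContext.setConf("spark.sql.shuffle.partitions", "50")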
    // IS_DG = 1 marks power-down alarms (ALARMINFO = '2.1.5')
    val ponalarmTOfObjPart1: DataFrame = sqlContext.sql("SELECT ALARMOBJECTID,OCNAME,ORIGINALEVENTTIME as EVENTTIME,substr(EVENTTIME,0,13) as EVENTDATEHOUR," +
      "EQUIPMENTNAME,SATOTAL,IPADDRESS,1 as IS_DG FROM ALARMOBJECTTBL WHERE ALARMINFO = '2.1.5'")
    ponalarmTOfObjPart1.registerTempTable("PON_ALARM_TBL1")

    val getPlSim: DataFrame = sqlContext.sql("SELECT ALARMOBJECTID, OCNAME FROM PON_ALARM_TBL1 WHERE cast(SATOTAL as int) > 0 AND cast(IS_DG as int) = 1")
    getPlSim.registerTempTable("GET_PL_SIM")

    val ponalarmTOfObjPart2 = sqlContext.sql("SELECT s.ALARMOBJECTID,s.OCNAME,s.ORIGINALEVENTTIME as EVENTTIME,substr(s.EVENTTIME,0,13) as EVENTDATEHOUR," +
      "s.EQUIPMENTNAME,-1 as SATOTAL,s.IPADDRESS,1 as IS_DG FROM SIMILARALARMTBL s, GET_PL_SIM g WHERE s.PARENTALARMOBJECT = g.ALARMOBJECTID AND s.OCNAME = g.OCNAME")

    val ponalarmTOfObjPart3: DataFrame = sqlContext.sql("SELECT ALARMOBJECTID,OCNAME,ORIGINALEVENTTIME as EVENTTIME,substr(EVENTTIME,0,13) as EVENTDATEHOUR," +
      "EQUIPMENTNAME,SATOTAL,IPADDRESS,0 as IS_DG FROM ALARMOBJECTTBL WHERE ALARMINFO = '2.1.6'")
    ponalarmTOfObjPart3.registerTempTable("PON_ALARM_TBL2")

    val getOlSim = sqlContext.sql("SELECT ALARMOBJECTID, OCNAME FROM PON_ALARM_TBL2 WHERE cast(SATOTAL as int) > 0 AND cast(IS_DG as int) = 0")
    getOlSim.registerTempTable("GET_OL_SIM")

    // the OL counterpart of part 2: join against GET_OL_SIM registered above
    val ponalarmTOfObjPart4: DataFrame = sqlContext.sql("SELECT s.ALARMOBJECTID,s.OCNAME,s.ORIGINALEVENTTIME as EVENTTIME,substr(s.EVENTTIME,0,13) as EVENTDATEHOUR," +
      "s.EQUIPMENTNAME,-1 as SATOTAL,s.IPADDRESS,0 as IS_DG FROM SIMILARALARMTBL s, GET_OL_SIM g WHERE s.PARENTALARMOBJECT = g.ALARMOBJECTID AND s.OCNAME = g.OCNAME")

    // all four parts share the same column order, as unionAll requires
    val pon_alarm_tbl: DataFrame = ponalarmTOfObjPart1.unionAll(ponalarmTOfObjPart2)
      .unionAll(ponalarmTOfObjPart3).unionAll(ponalarmTOfObjPart4)
    pon_alarm_tbl.registerTempTable("PON_ALARM_TBL")

    val ponAllDeviceLabel = sqlContext.sql("SELECT DISTINCT IPADDRESS as IPADDRESS,trim(EQUIPMENTNAME) as NAME FROM PON_ALARM_TBL")
    ponAllDeviceLabel.registerTempTable("PON_ALL_DEVICE_LABEL_TBL")

    val ponAllDevice = sqlContext.sql("SELECT DISTINCT IPADDRESS as IPADDRESS FROM PON_ALL_DEVICE_LABEL_TBL")
    ponAllDevice.registerTempTable("PON_ALL_DEVICE_TBL")

    // per-device, per-hour count of power-down (IS_DG = 1) alarms
    val ponDgAlarmCount = sqlContext.sql("SELECT p.EVENTDATEHOUR as EVENTDATEHOUR,p.IPADDRESS as IPADDRESS,COUNT(1) AS ALARMCOUNT " +
      "FROM PON_ALARM_TBL p WHERE IS_DG = 1 GROUP BY p.EVENTDATEHOUR,p.IPADDRESS")
    ponDgAlarmCount.registerTempTable("PON_DG_ALARM_COUNT_TBL")

    // per-device, per-hour count of all alarms
    val ponAllAlarmCount = sqlContext.sql("SELECT p.EVENTDATEHOUR as EVENTDATEHOUR,p.IPADDRESS as IPADDRESS,COUNT(1) AS ALARMCOUNT " +
      "FROM PON_ALARM_TBL p GROUP BY p.EVENTDATEHOUR,p.IPADDRESS")
    ponAllAlarmCount.registerTempTable("PON_ALL_ALARM_COUNT_TBL")

    // per-device count of hours that saw at least one power-down alarm
    val ponDgAlarmTime = sqlContext.sql("SELECT p.IPADDRESS as IPADDRESS,sum(case when cast(p.ALARMCOUNT as int) > 0 then 1 else 0 end) as ALARMTIMECOUNT " +
      "FROM PON_DG_ALARM_COUNT_TBL p GROUP BY p.IPADDRESS")
    ponDgAlarmTime.registerTempTable("PON_DG_ALARM_TIME_TBL")

    val curDgAlarmCount = sqlContext.sql("SELECT EVENTDATEHOUR,IPADDRESS,ALARMCOUNT FROM PON_DG_ALARM_COUNT_TBL")
    val curAllAlarmCount = sqlContext.sql("SELECT EVENTDATEHOUR,IPADDRESS,ALARMCOUNT FROM PON_ALL_ALARM_COUNT_TBL")
    val curDgAlarmTime = sqlContext.sql("SELECT IPADDRESS,ALARMTIMECOUNT FROM PON_DG_ALARM_TIME_TBL")
    val curAllDevice = sqlContext.sql("SELECT IPADDRESS FROM PON_ALL_DEVICE_TBL")
    List(curDgAlarmCount,curAllAlarmCount,curDgAlarmTime,curAllDevice)
  }


  /* Filter the DataFrames and convert them to typed RDDs */
  def filterDfs(fourDFs: Seq[DataFrame]) = {
    val a_dg_alarm_count = fourDFs(0).filter("length(IPADDRESS) > 0 AND length(EVENTDATEHOUR) > 0")
      .rdd.map(e => (e(0).toString, e(1).toString, e(2).toString.toInt))
    val a_all_alarm_count = fourDFs(1).filter("length(IPADDRESS) > 0 AND length(EVENTDATEHOUR) > 0")
      .rdd.map(e => (e(0).toString, e(1).toString, e(2).toString.toInt))
    val a_dg_alarm_time = fourDFs(2).filter("length(IPADDRESS) > 0")
      .rdd.map(e => (e(0).toString, e(1).toString.toInt))
    val a_all_device = fourDFs(3).filter("length(IPADDRESS) > 0")
      .rdd.map(e => e(0).toString)
    (a_dg_alarm_count, a_all_alarm_count, a_dg_alarm_time, a_all_device)
  }
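  /* Resulting shapes: the two count RDDs hold (hour, ip, count) triples,
     a_dg_alarm_time holds (ip, dg-hour-count) pairs, and a_all_device holds
     bare ip strings. */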
  
    val alarmdf = dataToDF("hdfs://10.0.0.1:9000/zh_test/alarmobjecttbl.txt")
    val similardf = dataToDF("hdfs://10.0.0.1:9000/zh_test/similaralarmtbl.txt")
    val fourDFs1 = tableToDF(alarmdf, similardf)


    val fourDFs2 = filterDfs(fourDFs1)

    /* each record: (hour, ip, count) */
    val a_dg_alarm_count_kv = fourDFs2._1.map(e => (e._1, e._2)) // key: hour, value: ip
    a_dg_alarm_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
    val a_all_alarm_count_kv = fourDFs2._2.map(e => (e._1,e._2))
    a_all_alarm_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
    val a_dg_alarm_time_kv = fourDFs2._3.map(e => (e._1,e._2)).repartition(10)
    a_dg_alarm_time_kv.persist(StorageLevel.MEMORY_ONLY_SER)
    val a_all_device_kv = fourDFs2._4.repartition(10)//.map(e => (0,e))
    a_all_device_kv.persist(StorageLevel.MEMORY_ONLY_SER)
    val distinct_time = a_all_alarm_count_kv.map(e => e._1).distinct().map(e => (e, ""))
    // join by hour: the left side is power-down (DG) alarms, the right side all alarms;
    // then count, per (dg_ip, all_ip) pair, the hours in which both alarmed
    val dg_all_count_kv = a_dg_alarm_count_kv.join(a_all_alarm_count_kv, 50)
      .map(e => ((e._2._1, e._2._2), 1)).reduceByKey(_ + _)
    dg_all_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
    a_dg_alarm_count_kv.unpersist()
    a_all_alarm_count_kv.unpersist()

    /* If a device never had a power-down alarm, the probability of any alarm
       occurring given power-down is zero for it; we work from the set of
       power-down devices and the full set of alarming devices. */
    /* non-zero case: (dg_ip, all_ip) -> co-occurrence count, joined with
       a_dg_alarm_time_kv (ip -> number of hours with a power-down alarm) */
    val ip_value_kv_notzero = dg_all_count_kv.map(e => e._1._1 -> (e._1._2, e._2))
      .leftOuterJoin(a_dg_alarm_time_kv).map(e => e._2._2 match {
        case Some(s) => (e._1, e._2._1._1, if (s.toDouble > 0.8) e._2._1._2.toDouble / s.toDouble else 0.0)
        case None => (e._1, e._2._1._1, 0.0)
      }).map(e => ((e._1, e._2), e._3))
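    // The value is the empirical conditional probability
    //   P(alarm on all_ip | power-down on dg_ip)
    //     = (# hours both alarmed) / (# hours dg_ip had a power-down alarm),
    // forced to 0.0 when the denominator is missing or zero (the > 0.8 guard
    // above rejects a zero count while accepting any count >= 1).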

    def addi(i: Double, j: Double) = i + j
    /* seed every (ip, ip) pair with 0.0, then overlay the non-zero values */
    val ip_value_kv = a_all_device_kv.cartesian(a_all_device_kv).map(e => (e, 0.0))
      .union(ip_value_kv_notzero).aggregateByKey(0.0)(addi, addi)
    /* to decide: filter out same-ip pairs (always 1) and pairs co-occurring fewer than 2 times? */
    ip_value_kv.persist(StorageLevel.MEMORY_ONLY_SER)
    import org.apache.hadoop.io.compress.GzipCodec
    val ip_value_kv_hive = ip_value_kv.map(e => Array(e._1._1, e._1._2, e._2).mkString(","))
    ip_value_kv_hive.saveAsTextFile("hdfs://10.0.0.1:9000/zh_test/result",
      classOf[GzipCodec]) // write gzip-compressed part files
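    // The gzip parts can be read back transparently with sc.textFile, e.g.:
    //   sc.textFile("hdfs://10.0.0.1:9000/zh_test/result").map(_.split(","))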
