Using Spark Streaming to save data to HDFS with Snappy compression

At work I needed to save data coming in from Kafka to HDFS, compressed with Snappy.

Without further ado, here is the code.

import java.io.DataOutputStream

import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.compress.{CompressionCodec, GzipCodec}
import org.apache.hadoop.mapred.{FileOutputFormat, JobConf, RecordWriter, TextOutputFormat}
import org.apache.hadoop.mapred.lib.MultipleOutputFormat
import org.apache.hadoop.util.{Progressable, ReflectionUtils}

/**
  * Custom multi-directory output with append support, using Snappy compression.
  * @author demon
  * @version 2019/05/05
  */
class AppendTextOutputFormat extends TextOutputFormat[Any, Any] {
  override def getRecordWriter(ignored: FileSystem, job: JobConf, iname: String, progress: Progressable): RecordWriter[Any, Any] = {
    val isCompressed: Boolean = FileOutputFormat.getCompressOutput(job)
    val keyValueSeparator: String = job.get("mapreduce.output.textoutputformat.separator", "\t")
    // Custom output file name (falls back to the name Hadoop passes in)
    val name = job.get("filename", iname)
    if (!isCompressed) {
      val file: Path = FileOutputFormat.getTaskOutputPath(job, name)
      val fs: FileSystem = file.getFileSystem(job)
      val newFile: Path = new Path(FileOutputFormat.getOutputPath(job), name)
      val fileOut: FSDataOutputStream = if (fs.exists(newFile)) {
        // File already exists: append to it
        fs.append(newFile)
      } else {
        fs.create(file, progress)
      }
      new TextOutputFormat.LineRecordWriter[Any, Any](fileOut, keyValueSeparator)
    } else {
      val codecClass: Class[_ <: CompressionCodec] = FileOutputFormat.getOutputCompressorClass(job, classOf[GzipCodec])
      // Create the configured codec (Snappy here, Gzip by default)
      val codec: CompressionCodec = ReflectionUtils.newInstance(codecClass, job)
      // Build the file name including the codec's extension (e.g. .snappy)
      val file: Path = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension)
      val fs: FileSystem = file.getFileSystem(job)
      val newFile: Path = new Path(FileOutputFormat.getOutputPath(job), name + codec.getDefaultExtension)

      val fileOut: FSDataOutputStream = if (fs.exists(newFile)) {
        // File already exists: append to it
        fs.append(newFile)
      } else {
        fs.create(file, progress)
      }
      new TextOutputFormat.LineRecordWriter[Any, Any](new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator)
    }
  }
}

class RDDMultipleAppendTextOutputFormat extends MultipleOutputFormat[Any, Any] {
  private var theTextOutputFormat: AppendTextOutputFormat = null

  // Generate the partition directory / file name for each key-value pair
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
    // TODO: partition directory; see the sketch after this class.
    // Placeholder: keep the default file name (no partitioning) until real logic is filled in.
    name
  }

  // Append write: reuse a single AppendTextOutputFormat instance
  override def getBaseRecordWriter(fs: FileSystem, job: JobConf, name: String, arg3: Progressable): RecordWriter[Any, Any] = {
    if (this.theTextOutputFormat == null) {
      this.theTextOutputFormat = new AppendTextOutputFormat()
    }
    this.theTextOutputFormat.getRecordWriter(fs, job, name, arg3)
  }

  // Drop the key from the output: replace it with NullWritable
  override def generateActualKey(key: Any, value: Any): Any =
    NullWritable.get()
}
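
For reference, a minimal sketch of what the partition logic could look like, assuming the RDD key carries a date string such as 2019-05-05 (the key layout here is illustrative, not part of the original code):

// Hypothetical example: one sub-directory per key, file name kept as Hadoop generated it.
override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
  val dir = key.toString   // e.g. "2019-05-05"
  dir + "/" + name         // ends up as <output-path>/2019-05-05/part-00000.snappy
}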

Usage:

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.rdd.RDD

/**
  * Utility class for HDFS operations.
  * @author demon
  * @version 2019/03/29
  */
object HdfsOperationUtil {

  /**
    * Save an RDD to HDFS.
    * @param rdd the (key, value) RDD to write
    * @param path the target path on HDFS
    */
  def saveToHDFS(rdd: RDD[(String, String)], path: String): Unit = {
    // if (!rdd.isEmpty())
    //   rdd.saveAsTextFile(path)
    val job = new JobConf()
    job.set("mapred.output.compress", "true")
    job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec")
    rdd.saveAsHadoopFile(path,
      classOf[Text], classOf[Text], classOf[RDDMultipleAppendTextOutputFormat], job)
  }

  /**
    * Delete data from HDFS.
    * @param path the path to delete
    */
  def deleteToHDFS(path: String): Unit = {
    // Delete the output directory recursively
    val output = new Path(path)
    val hdfs = FileSystem.get(
      new URI(PropertiesUtil.getPropertiesToStr("hdfs.hosts")), new Configuration())
    if (hdfs.exists(output)) hdfs.delete(output, true)
  }
}
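
To tie this back to the Kafka-to-HDFS pipeline mentioned at the top, here is a minimal sketch of how the utility could be driven from a Spark Streaming job. The topic name, broker list, group id, and output path are placeholders, not from the original post:

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object KafkaToHdfsApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("kafka-to-hdfs-snappy")
    val ssc = new StreamingContext(conf, Seconds(60))

    // Placeholder Kafka settings
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "broker1:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "hdfs-writer",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Seq("my-topic"), kafkaParams))

    // For every micro-batch, write the records out through the append/Snappy output format
    stream.map(record => (record.key, record.value)).foreachRDD { rdd =>
      if (!rdd.isEmpty())
        HdfsOperationUtil.saveToHDFS(rdd, "/data/output/my-topic")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}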
