Spark計算結果繼續追加在HDFS目錄下，不會覆蓋之前的文件

原創

2020-02-20 23:32

由於工作需要，我用scala實現在已將有的目錄下面繼續寫入文件。需要重寫MultipleTextOutputFormat這個類，具體的請看下面代碼，需要交流可以聯繫我

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.hadoop.mapred.{InvalidJobConfException, JobConf}
import org.apache.hadoop.mapreduce.security.TokenCache
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD;


/**
  * 在HDFS目錄下繼續追加文件，不會覆蓋以前的文件
  * Created by csw on 2017/6/23.
  */
object MultipleTextOutput {
  def main(args: Array[String]) {
    val filePath = "hdfs://master:9000/csw/tmp/data";
    val savePath = "hdfs://master:9000/hxzj/mydata/tatol";
    val conf = new SparkConf().setAppName("Spark shell")
    val sc = new SparkContext(conf)
    //讀取文件後，不進行split操作，直接將整行內容看作key,
    val rdd: RDD[(String, String)] = sc.textFile(filePath).map(x => (x, ""))
    //rdd必須是(key,value)形式的
    RDD.rddToPairRDDFunctions(rdd).partitionBy(new HashPartitioner(1)).saveAsHadoopFile(savePath, classOf[String], classOf[String], classOf[RDDMultipleTextOutputFormat])
    sc.stop()
  }

  /**
    * 自定義一個輸出文件類
    */
  case class RDDMultipleTextOutputFormat() extends MultipleTextOutputFormat[Any, Any] {

    val currentTime: Date = new Date()
    val formatter = new SimpleDateFormat("yyyy-MM-dd-HHmmss");
    val dateString = formatter.format(currentTime);

    //自定義保存文件名
    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
      //key 和 value就是rdd中的(key,value)，name是part-00000默認的文件名
      //保存的文件名稱，這裏用字符串拼接系統生成的時間戳來區分文件名，可以自己定義
      "HTLXYFY" + dateString
    }

    override def checkOutputSpecs(ignored: FileSystem, job: JobConf): Unit = {
      val name: String = job.get(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.OUTDIR)
      var outDir: Path = if (name == null) null else new Path(name)
      //當輸出任務不等於0 且輸出的路徑爲空，則拋出異常
      if (outDir == null && job.getNumReduceTasks != 0) {
        throw new InvalidJobConfException("Output directory not set in JobConf.")
      }
      //當有輸出任務和輸出路徑不爲null時
      if (outDir != null) {
        val fs: FileSystem = outDir.getFileSystem(job)
        outDir = fs.makeQualified(outDir)
        outDir = new Path(job.getWorkingDirectory, outDir)
        job.set(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.OUTDIR, outDir.toString)
        TokenCache.obtainTokensForNamenodes(job.getCredentials, Array[Path](outDir), job)
        //下面的註釋掉，就不會出現這個目錄已經存在的提示了
        /* if (fs.exists(outDir)) {
             throw new FileAlreadyExistsException("Outputdirectory"
                     + outDir + "alreadyexists");
         }
      }*/
      }
    }
  }

}

守貓de人

發佈了45 篇原創文章 · 獲贊 23 · 訪問量 7萬+

私信關注

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Spark計算結果繼續追加在HDFS目錄下，不會覆蓋之前的文件

使用c#強大的表達式樹實現對象的深克隆之解決循環引用的問題

free AI online tools All In One

痞子衡嵌入式：恩智浦i.MX RT1xxx系列MCU啓動那些事（12.A）- uSDHC eMMC啓動時間(RT1170)

linux安裝cuda和cudnn

Mellanox網卡開啓SR-IOV

模擬手機設備：使用 Playwright 實現移動端自動化測試

HTML 00 Tutorial

全面系統的AI學習路徑，幫助普通人也能玩轉AI

從零開始：使用 Playwright 腳本錄製實現自動化測試

騰訊面試：什麼鎖比讀寫鎖性能更高？

本地多級文件合併上傳到hdfs（遞歸上傳）

本地多級文件原樣上傳到hdfs

Cannot detect ES version - typically this happens if the network/Elasticsearch cluster is not acces

Spark Streaming 將數據保存在msyql中

elasticsearch 之Aggregation聚合

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結