hdfs sync的同步代碼重構

這個版本仍然有性能問題:當本地文件與hdfs文件的size相同時,會以流的形式讀取整個文件來比較md5,這個代價太高;如果重複同步同一個文件,每次的耗時都會一直很高。所以之後又重構了一個新的版本,見「sync 再次重構」。

原文在hdfs file md5 計算,實現本地與hdfs同步文件

啥話都不說,直接上代碼:

主要是幾個問題:
第一個是slf4j日誌改用{}佔位符;第二個是needUpdate方法的重寫;第三個是異常的處理;第四個是sameFile:先比較size,只有size相同時才比較md5,這樣資源消耗較小。

import java.io.{File, FileInputStream, IOException}

import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.slf4j.LoggerFactory

/**
  * One-way synchronisation of a local file or directory tree into HDFS.
  *
  * The local side is the source of truth: entries that exist only in HDFS are deleted, entries
  * that exist only locally are uploaded, and files present on both sides are re-uploaded when
  * their length or MD5 digest differs. A file/directory kind mismatch between the two sides is
  * reported with an `IllegalArgumentException` and never resolved automatically.
  *
  * Created by todd.chen on 16/3/15.
  * email : todd.chen@ximalaya.com
  */
object PathSyncer {
  lazy val logger = LoggerFactory.getLogger(this.getClass)

  /** (local is a file, hdfs is a file, contents are equal). Kept for source compatibility. */
  type IsSameFile = (Boolean, Boolean, Boolean)

  /**
    * Synchronises `localFile` into `hdfsPath` using the file system obtained from `configuration`.
    *
    * @param localFile     local file or directory to mirror
    * @param hdfsPath      target path in HDFS; it is created when it does not exist yet
    * @param configuration Hadoop configuration passed to `FileSystem.get`
    * @throws IOException              on any local or HDFS I/O failure, including a missing
    *                                  local path
    * @throws IllegalArgumentException when one side is a file and the other a directory
    */
  @throws(classOf[IOException])
  def sync(localFile: File, hdfsPath: Path, configuration: Configuration): Unit = {
    implicit val fileSystem: FileSystem = FileSystem.get(configuration)
    syncPath(localFile, hdfsPath)
  }

  /**
    * Same as the three-argument `sync`, with the Hadoop configuration taken from implicit scope.
    * Note that the explicit arguments are in the opposite order (HDFS path first).
    *
    * @param hdfsPath      target path in HDFS; it is created when it does not exist yet
    * @param localFile     local file or directory to mirror
    * @param configuration Hadoop configuration passed to `FileSystem.get`
    * @throws IOException              on any local or HDFS I/O failure, including a missing
    *                                  local path
    * @throws IllegalArgumentException when one side is a file and the other a directory
    */
  @throws(classOf[IOException])
  def sync(hdfsPath: Path, localFile: File)(implicit configuration: Configuration): Unit = {
    implicit val fileSystem: FileSystem = FileSystem.get(configuration)
    syncPath(localFile, hdfsPath)
  }

  /**
    * Synchronises one local entry (file or directory) with its HDFS counterpart, recursing into
    * directories. The content comparison is only performed once both sides are known to be
    * files, so no digest is computed for directories or for mismatching kinds.
    */
  private def syncPath(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    val name = localFile.getName
    if (!localFile.exists()) {
      throw new java.io.FileNotFoundException(
        s"${localFile.getAbsolutePath} does not exist in local")
    }
    if (!fileSystem.exists(hdfsPath)) {
      createInHdfs(localFile, hdfsPath)
    } else {
      (localFile.isFile, fileSystem.isFile(hdfsPath)) match {
        case (true, true) =>
          if (sameFile(localFile, hdfsPath)) {
            logger.info("the file : {} in local and hdfs are same one", name)
          } else {
            logger.debug(
              "the file: {} in local and hdfs have same name,but they are different file", name)
            // arguments: delSrc = false, overwrite = true
            fileSystem.copyFromLocalFile(false, true, new Path(localFile.toURI), hdfsPath)
          }
        case (true, false) =>
          logger.debug("the file: {} in local is file and in hdfs is dir", name)
          throw new IllegalArgumentException(s"$name in local is file and in hdfs is dir")
        case (false, true) =>
          logger.debug("in local {} is a dir and in hdfs is a file", name)
          throw new IllegalArgumentException(s"$name in local is dir and in hdfs is file")
        case (false, false) =>
          logger.debug("both local and hdfs this is dir:{}", name)
          syncDirectory(localFile, hdfsPath)
      }
    }
  }

  /**
    * Handles a target that does not exist in HDFS yet: a file is uploaded directly, a directory
    * is created first and then populated through the regular directory sync.
    */
  private def createInHdfs(localFile: File, hdfsPath: Path)
                          (implicit fileSystem: FileSystem): Unit = {
    val name = localFile.getName
    if (localFile.isFile) {
      logger.debug("the file: {} does not exist in hdfs yet, uploading it", name)
      // arguments: delSrc = false, overwrite = true
      fileSystem.copyFromLocalFile(false, true, new Path(localFile.toURI), hdfsPath)
    } else {
      logger.debug("the dir: {} does not exist in hdfs yet, creating it", name)
      if (!fileSystem.mkdirs(hdfsPath)) {
        throw new IOException(s"failed to create hdfs dir $hdfsPath")
      }
      syncDirectory(localFile, hdfsPath)
    }
  }

  /** Syncs the direct children of a directory pair, then recurses into local sub-directories. */
  private def syncDirectory(localDir: File, hdfsDir: Path)
                           (implicit fileSystem: FileSystem): Unit = {
    syncChildren(localDir, hdfsDir)
    listLocal(localDir)
      .filter(_.isDirectory)
      .foreach(dir => syncPath(dir, new Path(hdfsDir, dir.getName)))
  }

  /**
    * Brings the direct children of `hdfsPath` in line with those of `localFile`:
    * deletes HDFS entries that have no local counterpart, uploads local files missing in HDFS
    * and re-uploads files whose content differs. Both directories are listed once. Kind
    * conflicts among the children are rejected before anything is modified at this level.
    */
  private def syncChildren(localFile: File, hdfsPath: Path)
                          (implicit fileSystem: FileSystem): Unit = {
    val localEntries = listLocal(localFile)
    val hdfsEntries = fileSystem.listStatus(hdfsPath)

    val localFileNames = localEntries.filter(_.isFile).map(_.getName)
    val localDirNames = localEntries.filter(_.isDirectory).map(_.getName)
    val hdfsFileNames = hdfsEntries.filter(_.isFile).map(_.getPath.getName)
    val hdfsDirNames = hdfsEntries.filter(_.isDirectory).map(_.getPath.getName)

    // Uploading a file onto an existing HDFS directory would nest it as name/name, so a kind
    // conflict is rejected up front with the same error the top-level sync uses.
    localFileNames.intersect(hdfsDirNames).headOption.foreach { name =>
      throw new IllegalArgumentException(s"$name in local is file and in hdfs is dir")
    }
    localDirNames.intersect(hdfsFileNames).headOption.foreach { name =>
      throw new IllegalArgumentException(s"$name in local is dir and in hdfs is file")
    }

    val deleteList = hdfsEntries.map(_.getPath.getName).diff(localEntries.map(_.getName)).toList
    val uploadList = localFileNames.diff(hdfsFileNames).toList
    val updateList = localFileNames.intersect(hdfsFileNames).toList
      .filterNot(name => sameFile(new File(localFile, name), new Path(hdfsPath, name)))

    val localParentMappingHdfs = new Path(localFile.toURI)
    logger.debug("deleting which file need delete")
    deleteList.foreach { name =>
      // recursive = true: stale directories are removed together with their content
      if (!fileSystem.delete(new Path(hdfsPath, name), true)) {
        logger.warn("failed to delete {} in hdfs", name)
      }
    }
    logger.debug("deleted and uploading which file need upload or update")
    (updateList ++ uploadList).foreach { child =>
      // arguments: delSrc = false, overwrite = true
      fileSystem.copyFromLocalFile(false, true,
        new Path(localParentMappingHdfs, child), new Path(hdfsPath, child))
    }
    logger.debug("uploaded")
  }

  /**
    * Lists the entries of a local directory, turning the `null` that `File.listFiles` returns
    * for a non-directory or an I/O error into an `IOException`.
    */
  private def listLocal(dir: File): Array[File] =
    Option(dir.listFiles()).getOrElse(
      throw new IOException(s"cannot list local directory ${dir.getAbsolutePath}"))

  /**
    * Returns true when both sides are regular files with identical content. The lengths are
    * compared first, so the MD5 digests (which stream both files completely) are only computed
    * when the sizes match.
    */
  private def sameFile(file: File, path: Path)(implicit fileSystem: FileSystem): Boolean = {
    val dfsStatus = fileSystem.getFileStatus(path)
    file.isFile && dfsStatus.isFile &&
      (file.length() == dfsStatus.getLen) &&
      (getHdfsFileMd5(dfsStatus.getPath) == getLocalFileMd5(file))
  }

  /** Computes the hex MD5 digest of an HDFS file; the stream is closed in all cases. */
  @throws(classOf[IOException])
  private[sync] def getHdfsFileMd5(path: Path)(implicit dfs: FileSystem): String = {
    val in = dfs.open(path)
    try DigestUtils.md5Hex(in) finally in.close()
  }

  /** Computes the hex MD5 digest of a local file; the stream is closed in all cases. */
  @throws(classOf[IOException])
  private[sync] def getLocalFileMd5(file: File): String = {
    val in = new FileInputStream(file)
    try DigestUtils.md5Hex(in) finally in.close()
  }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章