hdfs sync的同步代码重构

这个版本仍然有性能问题:当本地文件和hdfs文件的size相同时,会把两边的文件都以流的形式读出来比较md5,这个代价太高;如果重复同步同一个文件,每次的耗时都会很高。所以后来又重构了一个新的版本,见「sync 再次重构」。

原文在hdfs file md5 计算,实现本地与hdfs同步文件

啥话都不说,直接上代码:

主要是几个问题:
第一个是日志改用slf4j的{}占位符;第二个是重写了needUpdate方法;第三个是异常的处理;第四个是sameFile先比较size,size相同时再比较md5,这样资源消耗更小。

import java.io.{File, FileInputStream, IOException}

import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.slf4j.LoggerFactory

/**
  * One-way synchronisation of a local file or directory tree to HDFS.
  *
  * Rules applied by the private `sync`:
  *  - target missing on HDFS: a local file is uploaded; a local directory is created on HDFS
  *    and then synchronised like any other directory;
  *  - file vs. file: the local file is uploaded unless [[sameFile]] reports identical content;
  *  - file vs. directory (either way round): `IllegalArgumentException`;
  *  - directory vs. directory: HDFS entries with no local counterpart are deleted, new and
  *    changed files are uploaded, then every local sub-directory is synchronised recursively.
  *
  * The `FileSystem` obtained from the `Configuration` is not closed by this object.
  *
  * Created by todd.chen on 16/3/15.
  * email : todd.chen@ximalaya.com
  */
object PathSyncer {
  lazy val logger = LoggerFactory.getLogger(this.getClass)

  /**
    * (local path is a file, HDFS path is a file, both have identical content).
    * No longer used internally; retained so existing references to the alias keep compiling.
    */
  type IsSameFile = (Boolean, Boolean, Boolean)

  /**
    * Synchronises `localFile` to `hdfsPath` on the file system described by `configuration`.
    *
    * @param localFile     local file or directory to publish
    * @param hdfsPath      destination path on HDFS
    * @param configuration Hadoop configuration used to obtain the `FileSystem`
    * @throws IOException              on any local or HDFS I/O failure
    * @throws IllegalArgumentException if one side is a file and the other a directory
    */
  @throws(classOf[IOException])
  def sync(localFile: File, hdfsPath: Path, configuration: Configuration): Unit = {
    implicit val fileSystem: FileSystem = FileSystem.get(configuration)
    sync(localFile, hdfsPath)
  }

  /**
    * Same as the three-argument `sync`, with the `Configuration` taken from implicit scope.
    * Note that the explicit arguments are in the opposite order (HDFS path first).
    *
    * @param hdfsPath      destination path on HDFS
    * @param localFile     local file or directory to publish
    * @param configuration Hadoop configuration used to obtain the `FileSystem`
    * @throws IOException              on any local or HDFS I/O failure
    * @throws IllegalArgumentException if one side is a file and the other a directory
    */
  @throws(classOf[IOException])
  def sync(hdfsPath: Path, localFile: File)(implicit configuration: Configuration): Unit = {
    implicit val fileSystem: FileSystem = FileSystem.get(configuration)
    sync(localFile, hdfsPath)
  }

  /** Dispatches on what exists on each side; see the object documentation for the rules. */
  private def sync(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    if (!fileSystem.exists(hdfsPath)) {
      // Must be checked first: getFileStatus/listStatus fail on a missing path, which
      // previously made it impossible to publish anything that was new on the local side.
      createOnHdfs(localFile, hdfsPath)
    } else {
      (localFile.isFile, fileSystem.isFile(hdfsPath)) match {
        case (true, true) =>
          // Content comparison is only attempted once both sides are known to be files.
          if (sameFile(localFile, hdfsPath)) {
            logger.info("the file : {} in local and hdfs are same one", localFile.getName)
          } else {
            logger.debug("the file: {} in local and hdfs have same name,but they are different file",
              localFile.getName)
            upload(localFile, hdfsPath)
          }
        case (true, false) =>
          logger.debug("the file: {} in local is file and in hdfs is dir", localFile.getName)
          throw new IllegalArgumentException(s"${localFile.getName} in local is file and in hdfs is dir")
        case (false, true) =>
          logger.debug("in local {} is a dir and in hdfs is a file", localFile.getName)
          throw new IllegalArgumentException(s"${localFile.getName} in local is dir and in hdfs is file")
        case (false, false) =>
          logger.debug("both local and hdfs this is dir:{}", localFile.getName)
          syncDirectory(localFile, hdfsPath)
      }
    }
  }

  /** Publishes a local path whose HDFS counterpart does not exist yet. */
  private def createOnHdfs(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    if (localFile.isFile) {
      logger.debug("the file: {} does not exist in hdfs yet, uploading", localFile.getName)
      upload(localFile, hdfsPath)
    } else if (localFile.isDirectory) {
      logger.debug("the dir: {} does not exist in hdfs yet, creating", localFile.getName)
      if (!fileSystem.mkdirs(hdfsPath)) {
        throw new IOException(s"could not create hdfs directory: $hdfsPath")
      }
      syncDirectory(localFile, hdfsPath)
    } else {
      // Neither a file nor a directory: fail before touching HDFS.
      throw new java.io.FileNotFoundException(s"local path does not exist: ${localFile.getPath}")
    }
  }

  /** Copies one local file to `hdfsPath`, keeping the source and overwriting the target. */
  private def upload(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    // arguments: delSrc = false, overwrite = true
    fileSystem.copyFromLocalFile(false, true, new Path(localFile.toURI), hdfsPath)
  }

  /** Synchronises the direct children of a directory, then recurses into local sub-directories. */
  private def syncDirectory(localDir: File, hdfsDir: Path)(implicit fileSystem: FileSystem): Unit = {
    val localChildren = listLocalChildren(localDir)
    syncChildren(localDir, localChildren, hdfsDir)
    localChildren.filter(_.isDirectory)
      .foreach(dir => sync(dir, new Path(hdfsDir, dir.getName)))
  }

  /**
    * Lists the entries of a local directory.
    * `File.listFiles` returns null when the path cannot be listed; that is turned into an
    * `IOException` here so the caller never proceeds to delete HDFS entries on a failed listing.
    */
  private def listLocalChildren(dir: File): Array[File] =
    Option(dir.listFiles()).getOrElse(
      throw new IOException(s"cannot list local directory: ${dir.getPath}"))

  /**
    * Brings the direct children of `hdfsDir` in line with `localChildren`.
    * Three name lists are computed up front from a single listing of each side:
    * which entries to delete, which files to upload, and which files to update.
    */
  private def syncChildren(localDir: File, localChildren: Array[File], hdfsDir: Path)
                          (implicit fileSystem: FileSystem): Unit = {
    val remoteChildren = fileSystem.listStatus(hdfsDir)
    val localFileNames = localChildren.filter(_.isFile).map(_.getName)
    val remoteFileNames = remoteChildren.filter(_.isFile).map(_.getPath.getName)

    // On HDFS (file or directory) but no entry of that name locally.
    val deleteList = remoteChildren.map(_.getPath.getName).diff(localChildren.map(_.getName))
    // Local files with no file of that name on HDFS.
    val uploadList = localFileNames.diff(remoteFileNames)
    // Files present on both sides whose content differs.
    val updateList = remoteFileNames.intersect(localFileNames)
      .filter(name => !sameFile(new File(localDir, name), new Path(hdfsDir, name)))

    val localParentMappingHdfs = new Path(localDir.toURI)
    logger.debug("deleting which file need delete")
    // second argument: recursive = true
    deleteList.foreach(name => fileSystem.delete(new Path(hdfsDir, name), true))
    logger.debug("deleted and uploading which file need upload or update")
    (updateList ++ uploadList).foreach { child =>
      // arguments: delSrc = false, overwrite = true
      fileSystem.copyFromLocalFile(false, true,
        new Path(localParentMappingHdfs, child), new Path(hdfsDir, child))
    }
    logger.debug("uploaded")
  }

  /**
    * True when both sides are regular files of equal length with equal MD5 digests.
    * The length check runs first so the digests (which read both files completely) are only
    * computed when the sizes already match. `path` must exist on HDFS.
    */
  private def sameFile(file: File, path: Path)(implicit fileSystem: FileSystem): Boolean = {
    val dfsStatus = fileSystem.getFileStatus(path)
    file.isFile && dfsStatus.isFile &&
      (file.length() == dfsStatus.getLen) &&
      (getHdfsFileMd5(dfsStatus.getPath) == getLocalFileMd5(file))
  }

  /** MD5 hex digest (as produced by `DigestUtils.md5Hex`) of the HDFS file at `path`. */
  @throws(classOf[IOException])
  private[sync] def getHdfsFileMd5(path: Path)(implicit dfs: FileSystem): String =
    md5HexAndClose(dfs.open(path))

  /** MD5 hex digest (as produced by `DigestUtils.md5Hex`) of a local file. */
  @throws(classOf[IOException])
  private[sync] def getLocalFileMd5(file: File): String =
    md5HexAndClose(new FileInputStream(file))

  /** Digests the stream and always closes it, whether or not digesting succeeded. */
  private def md5HexAndClose(in: java.io.InputStream): String =
    try DigestUtils.md5Hex(in) finally in.close()
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章