這個版本仍然有性能問題:當本地與 hdfs 上的文件 size 相同時,會以流的形式讀取整個文件來比較 md5,這個代價太高;如果重複同步同一個文件,每次耗時都會很長。所以後來又重構了一個新的版本,見「sync 再次重構」。
原文在hdfs file md5 計算,實現本地與hdfs同步文件
啥話都不說,直接上代碼:
主要是幾個問題:
第一個是 slf4j 的 {} 佔位符用法;第二個是 needUpdate 方法的重寫;第三個是異常的處理;第四個是 sameFile:先比較 size,只有 size 相同時才比較 md5,這樣資源消耗較小。
import java.io.{File, FileInputStream, IOException}
import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.slf4j.LoggerFactory
/**
  * One-way synchronisation of a local file or directory tree into HDFS.
  *
  * For a directory, HDFS entries whose name has no local counterpart are deleted,
  * local files missing from HDFS are uploaded, files present on both sides are
  * re-uploaded when their content differs, and sub-directories are handled
  * recursively. Two files are considered equal when their lengths match and
  * their MD5 digests match; the digest is only computed when the lengths agree.
  *
  * Created by todd.chen on 16/3/15.
  * email : todd.chen@ximalaya.com
  */
object PathSyncer {

  lazy val logger = LoggerFactory.getLogger(this.getClass)

  /**
    * (local is a file, hdfs is a file, both have the same content).
    * Retained for source compatibility; the implementation no longer builds this
    * triple because its third element forced a content comparison up front.
    */
  type IsSameFile = (Boolean, Boolean, Boolean)

  /**
    * Synchronises `localFile` into `hdfsPath` using the file system obtained from
    * `configuration`.
    *
    * @param localFile     local file or directory to publish
    * @param hdfsPath      target path on HDFS
    * @param configuration Hadoop configuration used to obtain the [[FileSystem]]
    * @throws IOException              if a file system operation fails
    * @throws IllegalArgumentException if one side is a file and the other a directory
    */
  @throws(classOf[IOException])
  def sync(localFile: File, hdfsPath: Path, configuration: Configuration): Unit = {
    implicit val fileSystem: FileSystem = FileSystem.get(configuration)
    sync(localFile, hdfsPath)
  }

  /**
    * Same as the three-argument overload, with the configuration taken from
    * implicit scope. Despite the parameter order, the direction is still
    * local -> HDFS.
    *
    * @param hdfsPath      target path on HDFS
    * @param localFile     local file or directory to publish
    * @param configuration Hadoop configuration used to obtain the [[FileSystem]]
    * @throws IOException              if a file system operation fails
    * @throws IllegalArgumentException if one side is a file and the other a directory
    */
  @throws(classOf[IOException])
  def sync(hdfsPath: Path, localFile: File)(implicit configuration: Configuration): Unit = {
    implicit val fileSystem: FileSystem = FileSystem.get(configuration)
    sync(localFile, hdfsPath)
  }

  /**
    * Recursive worker behind the public overloads.
    *
    * A target that does not exist on HDFS yet is uploaded as a whole. Without
    * this, `getFileStatus` fails with `FileNotFoundException` for every new file,
    * and new local sub-directories could never be synchronised because
    * `needUpload` only handles plain files.
    */
  private def sync(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    val localName = localFile.getName
    if (!fileSystem.exists(hdfsPath)) {
      logger.debug("{} does not exist in hdfs yet, uploading it", hdfsPath)
      // arguments: delSrc = false, overwrite = true
      fileSystem.copyFromLocalFile(false, true, new Path(localFile.toURI), hdfsPath)
    } else {
      (localFile.isFile, fileSystem.isFile(hdfsPath)) match {
        // The content comparison is a guard so that it only runs for file/file pairs.
        case (true, true) if sameFile(localFile, hdfsPath) =>
          logger.info("the file : {} in local and hdfs are same one", localName)
        case (true, true) =>
          logger.debug("the file: {} in local and hdfs have same name,but they are different file",
            localName)
          fileSystem.copyFromLocalFile(false, true, new Path(localFile.toURI), hdfsPath)
        case (true, false) =>
          logger.debug("the file: {} in local is file and in hdfs is dir", localName)
          throw new IllegalArgumentException(s"$localName in local is file and in hdfs is dir")
        case (false, true) =>
          logger.debug("in local {} is a dir and in hdfs is a file", localName)
          throw new IllegalArgumentException(s"$localName in local is dir and in hdfs is file")
        case (false, false) =>
          logger.debug("both local and hdfs this is dir:{}", localName)
          // First reconcile the direct children (delete / upload / update),
          // then descend into every local sub-directory.
          syncChildren(localFile, hdfsPath)
          localChildren(localFile)
            .filter(_.isDirectory)
            .foreach(dir => sync(dir, new Path(hdfsPath, dir.getName)))
      }
    }
  }

  /**
    * Reconciles the direct children of a directory pair: removes stale HDFS
    * entries, then uploads new and changed local files. All three work lists are
    * computed before anything is modified.
    *
    * NOTE: a name that is a file on one side and a directory on the other is not
    * resolved here.
    */
  private def syncChildren(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): Unit = {
    val deleteList = needDelete(localFile, hdfsPath)
    val uploadList = needUpload(localFile, hdfsPath)
    val updateList = needUpdate(localFile, hdfsPath)
    val localDirAsPath = new Path(localFile.toURI)

    logger.debug("deleting which file need delete")
    deleteList.foreach(name => fileSystem.delete(new Path(hdfsPath, name), true))

    logger.debug("deleted and uploading which file need upload or update")
    (updateList ++ uploadList).foreach { child =>
      // arguments: delSrc = false, overwrite = true
      fileSystem.copyFromLocalFile(false, true,
        new Path(localDirAsPath, child), new Path(hdfsPath, child))
    }
    logger.debug("uploaded")
  }

  /**
    * Lists the entries of a local directory. `File.listFiles` returns `null`
    * when the path is not a directory or cannot be read; that is reported as an
    * `IOException` instead of surfacing later as a `NullPointerException`.
    */
  private def localChildren(dir: File): Array[File] =
    Option(dir.listFiles()).getOrElse(
      throw new IOException(s"cannot list local directory ${dir.getAbsolutePath}"))

  /** Names (files and directories) present under `hdfsPath` but absent locally. */
  private def needDelete(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): List[String] = {
    val localNames = localChildren(localFile).map(_.getName)
    fileSystem.listStatus(hdfsPath).map(_.getPath.getName).diff(localNames).toList
  }

  /** Names of local plain files that have no plain-file counterpart under `hdfsPath`. */
  private def needUpload(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): List[String] = {
    val hdfsFileNames = fileSystem.listStatus(hdfsPath).filter(_.isFile).map(_.getPath.getName)
    localChildren(localFile).filter(_.isFile).map(_.getName).diff(hdfsFileNames).toList
  }

  /** Names of plain files that exist on both sides but whose content differs. */
  private def needUpdate(localFile: File, hdfsPath: Path)(implicit fileSystem: FileSystem): List[String] = {
    val localFileNames = localChildren(localFile).filter(_.isFile).map(_.getName)
    fileSystem.listStatus(hdfsPath)
      .filter(_.isFile)
      .map(_.getPath.getName)
      .intersect(localFileNames)
      .filterNot(name => sameFile(new File(localFile, name), new Path(hdfsPath, name)))
      .toList
  }

  /**
    * True when both sides are plain files with equal length and equal MD5.
    * The `&&` chain short-circuits, so both files are only streamed for the
    * digest when the cheap checks have passed. `path` must exist on HDFS,
    * otherwise `getFileStatus` throws `FileNotFoundException`.
    */
  private def sameFile(file: File, path: Path)(implicit fileSystem: FileSystem): Boolean = {
    val dfsStatus = fileSystem.getFileStatus(path)
    file.isFile && dfsStatus.isFile &&
      file.length() == dfsStatus.getLen &&
      getHdfsFileMd5(dfsStatus.getPath) == getLocalFileMd5(file)
  }

  /** Hex MD5 of everything readable from `in`; the stream is always closed. */
  private def md5AndClose(in: java.io.InputStream): String =
    try DigestUtils.md5Hex(in) finally in.close()

  /**
    * Hex-encoded MD5 digest of the HDFS file at `path`.
    *
    * @throws IOException if the file cannot be opened or read
    */
  @throws(classOf[IOException])
  private[sync] def getHdfsFileMd5(path: Path)(implicit dfs: FileSystem): String =
    md5AndClose(dfs.open(path))

  /**
    * Hex-encoded MD5 digest of the local `file`.
    *
    * @throws IOException if the file cannot be opened or read
    */
  @throws(classOf[IOException])
  private[sync] def getLocalFileMd5(file: File): String =
    md5AndClose(new FileInputStream(file))
}