// file:///home/text1.txt              -> read from the local filesystem
// hdfs://clusterA/direct1/text1.txt   -> read from the HDFS cluster
import java.io.OutputStreamWriter
/**
 * Reads a text file from HDFS (hdfs://...) or the local filesystem (file://...).
 *
 * @param aPath path of the file to read, e.g. hdfs://clusterA/direct1/text1.txt
 * @return all lines of the file, or an empty array when the path does not exist
 */
def readHdfsFile(aPath: String): Array[String] = {
  val sc = SparkSession.builder().enableHiveSupport().getOrCreate()
  // Derive the filesystem URI (scheme + authority) from the full path,
  // e.g. "hdfs://clusterA" out of "hdfs://clusterA/direct1/text1.txt".
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head
  val path = new Path(aPath)
  val hdfs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(uri),
    new org.apache.hadoop.conf.Configuration())
  if (hdfs.exists(path))
    sc.sparkContext.textFile(aPath).collect() // collect() pulls all lines to the driver
  else
    Array.empty[String]
}
/**
 * Recursively deletes an HDFS path when it exists; a no-op otherwise.
 *
 * @param aPath path to delete, e.g. hdfs://clusterA/direct1
 */
def deleteHdfsPath(aPath: String) = {
  // Extract the filesystem URI (scheme + authority), e.g. "hdfs://clusterA".
  val parts = aPath.split("//")
  val fsUri = parts.head + "//" + parts(1).split("/").head
  val target = new Path(aPath)
  val fs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(fsUri),
    new org.apache.hadoop.conf.Configuration())
  // true: delete directories and their contents recursively.
  if (fs.exists(target)) fs.delete(target, true)
}
/**
 * Looks up a file's last-modification time.
 *
 * @param aPath file to inspect, e.g. hdfs://clusterA/direct1/file.txt
 * @return Some(modification time as epoch milliseconds) when the path exists,
 *         None otherwise
 */
def getMdfTime(aPath: String): Option[Long] = {
  // Derive the filesystem URI (scheme + authority) from the full path.
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head
  val path = new Path(aPath)
  val hdfs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(uri),
    new org.apache.hadoop.conf.Configuration())
  if (hdfs.exists(path))
    // getModificationTime is an epoch-millisecond Long.
    Some(hdfs.getFileStatus(path).getModificationTime)
  else
    None
}
/**
 * Writes text lines to an HDFS file in overwrite mode, encoded as UTF-8 so
 * that Chinese (and any other non-ASCII) text round-trips correctly.
 *
 * @param aPath   destination path, e.g. hdfs://clusterA/direct1/file.txt
 * @param content lines to write; each line gets a trailing "\n"
 * @param hdfs    filesystem handle used to create the file
 */
def writeOverwrite(aPath: String, content: Iterator[String], hdfs: FileSystem) = {
  val path = new Path(aPath)
  // create(path, true): overwrite an existing file
  // (with false, an existing path raises an exception).
  // Explicit UTF-8: without a charset OutputStreamWriter uses the platform
  // default, which can mangle Chinese text on non-UTF-8 JVMs.
  val out = new OutputStreamWriter(hdfs.create(path, true), "UTF-8")
  try {
    content.foreach(str => out.write(str + "\n"))
    out.flush()
  } finally {
    out.close() // always release the stream, even when a write fails
  }
}
/**
 * Appends text lines to an HDFS file (creating it first when it does not
 * exist), encoded as UTF-8 so Chinese text is preserved.
 *
 * @param filename destination path, e.g. hdfs://clusterA/direct1/file.txt
 * @param content  lines to write; each line gets a trailing "\n"
 * @param hdfs     filesystem handle used to open the file
 */
def writeAppend(filename: String, content: Iterator[String], hdfs: FileSystem) = {
  val path = new Path(filename)
  var fileOutputStream: FSDataOutputStream = null
  try {
    // Append when the file already exists, otherwise create it fresh.
    fileOutputStream =
      if (hdfs.exists(path)) hdfs.append(path) else hdfs.create(path)
    // Explicit UTF-8: getBytes() without a charset uses the platform default
    // encoding, which can corrupt Chinese text on non-UTF-8 JVMs.
    content.foreach(x => fileOutputStream.write((x + "\n").getBytes("UTF-8")))
  } finally {
    if (fileOutputStream != null) fileOutputStream.close()
  }
}
/**
 * Saves a DataFrame as ORC files under a day/hour partition directory and
 * registers that partition on the Hive table (database biads).
 *
 * @param sqlContext    Hive context used to run the partition DDL
 * @param tableNme      target table name
 * @param hdfsPath      table root directory on HDFS
 * @param DF            data to save
 * @param day           partition value for pt_d
 * @param hour          partition value for pt_h
 * @param numPartitions number of output files to write
 */
def saveDF(sqlContext: HiveContext, tableNme: String, hdfsPath: String, DF: DataFrame, day: String, hour: String, numPartitions: Int): Unit = {
  val savePath = hdfsPath + "/pt_d=" + day + "/pt_h=" + hour
  val sqlcode = "alter table " + tableNme + " add if not exists partition (pt_d='" + day + "', pt_h='" + hour + "')"
  // Overwrite semantics: clear the partition directory before writing.
  deleteHdfsPath(savePath)
  DF.repartition(numPartitions).write.format("orc").save(savePath)
  // Append instead:
  // DF.repartition(numPartitions).write.mode("append").format("orc").save(savePath)
  sqlContext.sql("use biads")
  sqlContext.sql(sqlcode)
  // Alternative approach: register DF as a temp view and run
  //   INSERT OVERWRITE TABLE biads.<table> PARTITION (pt_d=..., pt_h=...) SELECT * FROM <view>
  // (use INSERT INTO instead of INSERT OVERWRITE for append semantics).
}