Scenario
The company has a production Hadoop (2.7) cluster and now needs to build a new Hadoop (3.1) cluster; the plan is to retire MapReduce and move to other execution engines. Spark SQL and Presto were chosen and need to be tested. The new cluster has no data, so some data has to be brought over from production.
Strategy
1. Use the hadoop command to copy a table's data directory from the original cluster to a directory on the new cluster
2. Create the Hive table and its partitions
3. Attach the copied files on the new cluster to the Hive table
The whole process is fairly simple, but for a newcomer it is worth writing down.
First, take a look at the Hive table and its partition information; there are 3 partition columns: family, operation and event_date.
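For reference, assuming the source table on the production cluster carries the same name, its definition and partitions can be listed from the Hive CLI or beeline there; this is also the easiest way to confirm the storage format that step 2 has to match:

show create table P250_ERROR_RATE_BY_ZONE;   -- columns, STORED AS format and LOCATION
show partitions P250_ERROR_RATE_BY_ZONE;     -- family/operation/event_date values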
Step 1: based on what the table actually needs, copy the data from 2019-03-22 to 2019-04-05. Locate the source path /warehouse/output/tb/626/20190324, decide where it should be placed on the new cluster (/tmp/output/tb/626/20190324), and import the data with the following command:
nohup hadoop distcp hdfs://seadoop108.wux.chin.seagate.com:8020/warehouse/output/tb/626/20190324 hdfs://seadoop-test125.wux.chin.seagate.com:8020/tmp/output/tb/626/20190324 &
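Since the whole range from 2019-03-22 to 2019-04-05 is needed, distcp can also take several source directories in one run instead of being launched once per day; a sketch with just two of the dates, using the same namenodes as above:

nohup hadoop distcp \
  hdfs://seadoop108.wux.chin.seagate.com:8020/warehouse/output/tb/626/20190324 \
  hdfs://seadoop108.wux.chin.seagate.com:8020/warehouse/output/tb/626/20190325 \
  hdfs://seadoop-test125.wux.chin.seagate.com:8020/tmp/output/tb/626 &

With multiple sources the last argument is the target directory and each source directory is copied underneath it, which keeps the /tmp/output/tb/626/<date> layout.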
The data directory on HDFS:
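The copied layout on the new cluster can be double-checked from the command line before going any further, for example:

hdfs dfs -ls -R /tmp/output/tb/626/20190324 | head -n 20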
Step 2: create the table P250_ERROR_RATE_BY_ZONE in the new cluster's Hive. When creating it, pay attention to the file format of the source cluster's data; the common formats are textfile, sequencefile, rcfile and orcfile (see https://blog.csdn.net/TOMOCAT/article/details/81673154). Since the source data is sequence files, the table is created with STORED AS sequencefile.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import com.seagate.client.common.DataSource;

public class HiveDao {

    private static String hiveUrl = "jdbc:hive2://10.38.149.128:10120/testdb";
    private static String hiveUser = "hive";
    private static String hivePassword = "";
    // shared connection/statement, reused by executeMultiSql and released by closeConnection
    private static Connection conn = null;
    private static Statement stmt = null;

    static {
        try {
            conn = getHiveConnection();
            stmt = conn.createStatement();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    // run a single statement on its own connection
    public static void executeSql(String sql) throws SQLException {
        Connection conn = null;
        Statement stmt = null;
        try {
            System.out.println("execute sql: " + sql);
            conn = getHiveConnection();
            stmt = conn.createStatement();
            stmt.execute(sql);
        } catch (SQLException e) {
            e.printStackTrace();
            System.out.println("execute sql error, sql = " + sql);
            throw e;
        } finally {
            DataSource.close(conn, stmt);
        }
    }

    // run a batch of ';'-separated statements on the shared connection
    // (used in step 3 to add partitions in batches)
    public static void executeMultiSql(String multiSql) throws SQLException {
        for (String sql : multiSql.split(";")) {
            if (sql.trim().isEmpty()) {
                continue;
            }
            System.out.println("execute sql: " + sql);
            stmt.execute(sql);
        }
    }

    public static Connection getHiveConnection() throws SQLException {
        return DriverManager.getConnection(hiveUrl, hiveUser, hivePassword);
    }

    public static void closeConnection() {
        DataSource.close(conn, stmt);
    }

    public static void main(String[] args) throws Exception {
        // external table over the copied files; LOCATION is the parent directory from step 1
        String sql = "create external table P250_ERROR_RATE_BY_ZONE( "
                + " serial_num string "
                + " ,trans_seq string "
                + " ,seq string "
                + " ,spc_id string "
                + " ,test_seq_event string "
                + " ,state_name string "
                + " ,occurrence string "
                + " ,hd_phys_psn string "
                + " ,data_zone string "
                + " ,num_sova_iterations string "
                + " ,error_rate_type string "
                + " ,hd_lgc_psn string "
                + " ,start_trk_num string "
                + " ,ecc_level string "
                + " ,bits_read_log10 string "
                + " ,raw_error_rate string "
                + " ,data_err_cnt string "
                + " ,sync_err_cnt string "
                + " ,fail_code string "
                + " ,bits_in_error_cnt string "
                + " ,avg_itrtn_per_cword string ) "
                + " PARTITIONED BY (family STRING, operation STRING, event_date STRING) "
                + " row format delimited "
                + " fields terminated by ',' "
                + " STORED AS sequencefile"
                + " location '/tmp/output/tb/626' ";
        executeSql(sql);
    }
}
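After running this main method the table exists but holds no partition metadata yet, so a quick check against the new cluster's Hive (same hiveUrl as above) comes back empty until step 3 is done:

show partitions P250_ERROR_RATE_BY_ZONE;   -- empty at this point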
Step 3: attach the partition information to the Hive table. Because the copied directories follow an <event_date>/<family>/<operation> layout rather than Hive's partition key=value naming, MSCK REPAIR TABLE cannot discover them automatically; instead the HDFS tree is walked and every leaf directory is registered with an explicit ALTER TABLE ... ADD PARTITION ... LOCATION statement.
import java.io.IOException;
import java.net.URI;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsDao {

    private static String url = "hdfs://10.38.149.125:8020/";
    private static String comPath2 = "/tmp/output/tb/626";
    private static Configuration conf;
    private static FileSystem fileSystem;
    private static int partitionCount = 0;
    private static StringBuffer sql = new StringBuffer();

    static {
        conf = new Configuration();
        conf.set("fs.defaultFS", url);
        // set the implementations explicitly because the Maven-built jar merges service files
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        System.setProperty("HADOOP_USER_NAME", "hdfs");
        try {
            fileSystem = FileSystem.get(URI.create(url), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws IOException, SQLException {
        // walk one day's directory, register its partitions, then flush whatever is left in the buffer
        listPathP250_ERROR_RATE_BY_ZONE(comPath2 + "/20190325");
        HiveDao.executeMultiSql(sql.toString());
        HiveDao.closeConnection();
    }

    // Directory layout copied from the old cluster: /tmp/output/tb/626/<event_date>/<family>/<operation>/<files>.
    // Recurse until a directory containing files is reached, then add that directory as one partition.
    public static void listPathP250_ERROR_RATE_BY_ZONE(String path) throws IOException, SQLException {
        Path taskPath = new Path(path);
        FileStatus[] fileStatus = fileSystem.listStatus(taskPath);
        if (fileStatus == null) {
            return;
        }
        for (FileStatus file : fileStatus) {
            if (file.isDirectory()) {
                listPathP250_ERROR_RATE_BY_ZONE(path + "/" + file.getPath().getName());
            } else {
                // path looks like /tmp/output/tb/626/20190325/<family>/<operation>
                String eventDate = path.split("/")[5];
                String family = path.split("/")[6];
                String operation = path.split("/")[7];
                sql.append("ALTER TABLE P250_ERROR_RATE_BY_ZONE ADD IF NOT EXISTS PARTITION(family='" + family
                        + "',operation='" + operation + "',event_date='" + eventDate + "') LOCATION '" + path + "';");
                partitionCount++;
                // flush in batches so the buffer and the JDBC round trips stay bounded
                if (partitionCount > 1000) {
                    HiveDao.executeMultiSql(sql.toString());
                    sql.setLength(0);
                    partitionCount = 0;
                }
                // one partition per leaf directory is enough, no matter how many files it holds
                break;
            }
        }
    }
}
Finally, query the table data to verify it; the output formatting below is a bit messy because the console width was not adjusted.
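As a simple sanity check, counting rows per partition (a sketch; 20190325 is one of the copied days) confirms that both the partition metadata and the underlying sequencefile data are readable:

select family, operation, event_date, count(*) as cnt
from P250_ERROR_RATE_BY_ZONE
where event_date = '20190325'
group by family, operation, event_date;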