Scenario
The company has a production Hadoop (2.7) cluster and now needs to build a new Hadoop (3.1) cluster; the plan is to retire MapReduce and move to other execution engines. Spark SQL and Presto were chosen and need to be tested. The new cluster has no data, so some data has to be brought over from production.
Strategy
1. Use the hadoop command to copy a table's data directory from the original cluster to a directory on the new cluster
2. Create the Hive table and its partitions
3. Attach the copied files on the new cluster to the Hive table
The whole process is fairly simple, but for a newcomer it is worth writing down.
First, take a look at the Hive table and its partition information; there are 3 partition columns: family, operation and event_date.
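For reference, assuming the source table on the production cluster carries the same name, its definition and partitions can be listed from the Hive CLI or beeline there; this is also the easiest way to confirm the storage format that step 2 has to match:

show create table P250_ERROR_RATE_BY_ZONE;   -- columns, STORED AS format and LOCATION
show partitions P250_ERROR_RATE_BY_ZONE;     -- family/operation/event_date values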
Step 1: based on what the table actually needs, copy the data from 2019-03-22 to 2019-04-05. Locate the source path /warehouse/output/tb/626/20190324, decide where it should be placed on the new cluster (/tmp/output/tb/626/20190324), and import the data with the following command:
nohup hadoop distcp hdfs://seadoop108.wux.chin.seagate.com:8020/warehouse/output/tb/626/20190324 hdfs://seadoop-test125.wux.chin.seagate.com:8020/tmp/output/tb/626/20190324 &
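Since the whole range from 2019-03-22 to 2019-04-05 is needed, distcp can also take several source directories in one run instead of being launched once per day; a sketch with just two of the dates, using the same namenodes as above:

nohup hadoop distcp \
  hdfs://seadoop108.wux.chin.seagate.com:8020/warehouse/output/tb/626/20190324 \
  hdfs://seadoop108.wux.chin.seagate.com:8020/warehouse/output/tb/626/20190325 \
  hdfs://seadoop-test125.wux.chin.seagate.com:8020/tmp/output/tb/626 &

With multiple sources the last argument is the target directory and each source directory is copied underneath it, which keeps the /tmp/output/tb/626/<date> layout.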
The data directory on HDFS:
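The copied layout on the new cluster can be double-checked from the command line before going any further, for example:

hdfs dfs -ls -R /tmp/output/tb/626/20190324 | head -n 20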
Step 2: create the table P250_ERROR_RATE_BY_ZONE in the new cluster's Hive. When creating it, pay attention to the file format of the source cluster's data; the common formats are textfile, sequencefile, rcfile and orcfile (see https://blog.csdn.net/TOMOCAT/article/details/81673154). Since the source data is sequence files, the table is created with STORED AS sequencefile.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import com.seagate.client.common.DataSource;

public class HiveDao {

    private static String hiveUrl = "jdbc:hive2://10.38.149.128:10120/testdb";
    private static String hiveUser = "hive";
    private static String hivePassword = "";
    // shared connection/statement, reused by executeMultiSql and released by closeConnection
    private static Connection conn = null;
    private static Statement stmt = null;

    static {
        try {
            conn = getHiveConnection();
            stmt = conn.createStatement();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    // run a single statement on its own connection
    public static void executeSql(String sql) throws SQLException {
        Connection conn = null;
        Statement stmt = null;
        try {
            System.out.println("execute sql: " + sql);
            conn = getHiveConnection();
            stmt = conn.createStatement();
            stmt.execute(sql);
        } catch (SQLException e) {
            e.printStackTrace();
            System.out.println("execute sql error, sql = " + sql);
            throw e;
        } finally {
            DataSource.close(conn, stmt);
        }
    }

    // run a batch of ';'-separated statements on the shared connection
    // (used in step 3 to add partitions in batches)
    public static void executeMultiSql(String multiSql) throws SQLException {
        for (String sql : multiSql.split(";")) {
            if (sql.trim().isEmpty()) {
                continue;
            }
            System.out.println("execute sql: " + sql);
            stmt.execute(sql);
        }
    }

    public static Connection getHiveConnection() throws SQLException {
        return DriverManager.getConnection(hiveUrl, hiveUser, hivePassword);
    }

    public static void closeConnection() {
        DataSource.close(conn, stmt);
    }

    public static void main(String[] args) throws Exception {
        // external table over the copied files; LOCATION is the parent directory from step 1
        String sql = "create external table P250_ERROR_RATE_BY_ZONE( "
                + " serial_num string "
                + " ,trans_seq string "
                + " ,seq string "
                + " ,spc_id string "
                + " ,test_seq_event string "
                + " ,state_name string "
                + " ,occurrence string "
                + " ,hd_phys_psn string "
                + " ,data_zone string "
                + " ,num_sova_iterations string "
                + " ,error_rate_type string "
                + " ,hd_lgc_psn string "
                + " ,start_trk_num string "
                + " ,ecc_level string "
                + " ,bits_read_log10 string "
                + " ,raw_error_rate string "
                + " ,data_err_cnt string "
                + " ,sync_err_cnt string "
                + " ,fail_code string "
                + " ,bits_in_error_cnt string "
                + " ,avg_itrtn_per_cword string ) "
                + " PARTITIONED BY (family STRING, operation STRING, event_date STRING) "
                + " row format delimited "
                + " fields terminated by ',' "
                + " STORED AS sequencefile"
                + " location '/tmp/output/tb/626' ";
        executeSql(sql);
    }
}
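After running this main method the table exists but holds no partition metadata yet, so a quick check against the new cluster's Hive (same hiveUrl as above) comes back empty until step 3 is done:

show partitions P250_ERROR_RATE_BY_ZONE;   -- empty at this point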
Step 3: attach the partition information to the Hive table. Because the copied directories follow an <event_date>/<family>/<operation> layout rather than Hive's partition key=value naming, MSCK REPAIR TABLE cannot discover them automatically; instead the HDFS tree is walked and every leaf directory is registered with an explicit ALTER TABLE ... ADD PARTITION ... LOCATION statement.
import java.io.IOException;
import java.net.URI;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsDao {

    private static String url = "hdfs://10.38.149.125:8020/";
    private static String comPath2 = "/tmp/output/tb/626";
    private static Configuration conf;
    private static FileSystem fileSystem;
    private static int partitionCount = 0;
    private static StringBuffer sql = new StringBuffer();

    static {
        conf = new Configuration();
        conf.set("fs.defaultFS", url);
        // set the implementations explicitly because the Maven-built jar merges service files
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        System.setProperty("HADOOP_USER_NAME", "hdfs");
        try {
            fileSystem = FileSystem.get(URI.create(url), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws IOException, SQLException {
        // walk one day's directory, register its partitions, then flush whatever is left in the buffer
        listPathP250_ERROR_RATE_BY_ZONE(comPath2 + "/20190325");
        HiveDao.executeMultiSql(sql.toString());
        HiveDao.closeConnection();
    }

    // Directory layout copied from the old cluster: /tmp/output/tb/626/<event_date>/<family>/<operation>/<files>.
    // Recurse until a directory containing files is reached, then add that directory as one partition.
    public static void listPathP250_ERROR_RATE_BY_ZONE(String path) throws IOException, SQLException {
        Path taskPath = new Path(path);
        FileStatus[] fileStatus = fileSystem.listStatus(taskPath);
        if (fileStatus == null) {
            return;
        }
        for (FileStatus file : fileStatus) {
            if (file.isDirectory()) {
                listPathP250_ERROR_RATE_BY_ZONE(path + "/" + file.getPath().getName());
            } else {
                // path looks like /tmp/output/tb/626/20190325/<family>/<operation>
                String eventDate = path.split("/")[5];
                String family = path.split("/")[6];
                String operation = path.split("/")[7];
                sql.append("ALTER TABLE P250_ERROR_RATE_BY_ZONE ADD IF NOT EXISTS PARTITION(family='" + family
                        + "',operation='" + operation + "',event_date='" + eventDate + "') LOCATION '" + path + "';");
                partitionCount++;
                // flush in batches so the buffer and the JDBC round trips stay bounded
                if (partitionCount > 1000) {
                    HiveDao.executeMultiSql(sql.toString());
                    sql.setLength(0);
                    partitionCount = 0;
                }
                // one partition per leaf directory is enough, no matter how many files it holds
                break;
            }
        }
    }
}
Finally, query the table data to verify it; the output formatting below is a bit messy because the console width was not adjusted.
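As a simple sanity check, counting rows per partition (a sketch; 20190325 is one of the copied days) confirms that both the partition metadata and the underlying sequencefile data are readable:

select family, operation, event_date, count(*) as cnt
from P250_ERROR_RATE_BY_ZONE
where event_date = '20190325'
group by family, operation, event_date;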