數據:
[customers.txt]
1,tom,12
2,tom,13
3,tom,14
4,tom,15
[orders.txt]
1,no001,12.23,1
2,no001,12.23,1
3,no001,12.23,2
4,no001,12.23,2
5,no001,12.23,2
6,no001,12.23,3
7,no001,12.23,3
8,no001,12.23,3
9,no001,12.23,3
map端join
1.創建Mapper
package com.it18zhang.hdfs.mr.mapjoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
/**
* join操作,map端連接。
*/
public class MapJoinMapper extends Mapper<LongWritable,Text,Text,NullWritable> {
private Map<String,String> allCustomers = new HashMap<String,String>();
//啓動,初始化客戶信息
protected void setup(Context context) throws IOException, InterruptedException {
try {
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.get(conf);
FSDataInputStream fis = fs.open(new Path("file:///d:/mr/mapjoin/customers.txt"));
//得到緩衝區閱讀器
BufferedReader br = new BufferedReader(new InputStreamReader(fis));
String line = null ;
while((line = br.readLine()) != null){
//得到cid
String cid = line.substring(0,line.indexOf(","));
allCustomers.put(cid,line);
}
} catch (Exception e) {
e.printStackTrace();
}
}
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//訂單信息
String line = value.toString();
//提取customer id
String cid = line.substring(line.lastIndexOf(",") + 1);
//訂單信息
String orderInfo = line.substring(0,line.lastIndexOf(","));
//連接customer + "," + order
String customerInfo = allCustomers.get(cid);
context.write(new Text(customerInfo + "," + orderInfo),NullWritable.get());
}
}
2.創建App
package com.it18zhang.hdfs.mr.mapjoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the map-side join job: a map-only job (zero reducers) that runs
 * {@link MapJoinMapper} over the input path and writes the joined lines to the
 * output path.
 *
 * <p>Usage: {@code MapJoinApp <input path> <output path>}
 */
public class MapJoinApp {
    /**
     * Configures and runs the job, then exits with 0 on success, 1 if the job
     * failed, or 2 if the required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: MapJoinApp <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        // Run against the local file system.
        conf.set("fs.defaultFS","file:///");
        Job job = Job.getInstance(conf);

        job.setJobName("MapJoinApp");
        job.setJarByClass(MapJoinApp.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // The join happens entirely in the mapper, so no reduce phase is needed.
        job.setNumReduceTasks(0);
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Report the job outcome through the process exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
reduce端連接
-----------------------
1.自定義key
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
*/
public class ComboKey2 implements WritableComparable<ComboKey2> {
//0-customer 1-order
private int type ;
private int cid ;
private int oid ;
private String customerInfo = "" ;
private String orderInfo = "" ;
public int compareTo(ComboKey2 o) {
int type0 = o.type ;
int cid0= o.cid;
int oid0 = o.oid;
String customerInfo0 = o.customerInfo;
String orderInfo0 = o.orderInfo ;
//是否同一個customer的數據
if(cid == cid0){
//同一個客戶的兩個訂單
if(type == type0){
return oid - oid0 ;
}
//一個Customer + 他的order
else{
if(type ==0)
return -1 ;
else
return 1 ;
}
}
//cid不同
else{
return cid - cid0 ;
}
}
public void write(DataOutput out) throws IOException {
out.writeInt(type);
out.writeInt(cid);
out.writeInt(oid);
out.writeUTF(customerInfo);
out.writeUTF(orderInfo);
}
public void readFields(DataInput in) throws IOException {
this.type = in.readInt();
this.cid = in.readInt();
this.oid = in.readInt();
this.customerInfo = in.readUTF();
this.orderInfo = in.readUTF();
}
}
2.自定義分區類
public class CIDPartitioner extends Partitioner<ComboKey2,NullWri
table>{
public int getPartition(ComboKey2 key, NullWritable nullWritable, int numPartitions) {
return key.getCid() % numPartitions;
}
}
3.創建Mapper
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
* mapper
*/
public class ReduceJoinMapper extends Mapper<LongWritable,Text,ComboKey2,NullWritable> {
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//
String line = value.toString() ;
//判斷是customer還是order
FileSplit split = (FileSplit)context.getInputSplit();
String path = split.getPath().toString();
//客戶信息
ComboKey2 key2 = new ComboKey2();
if(path.contains("customers")){
String cid = line.substring(0,line.indexOf(","));
String custInfo = line ;
key2.setType(0);
key2.setCid(Integer.parseInt(cid));
key2.setCustomerInfo(custInfo);
}
//order info
else{
String cid = line.substring(line.lastIndexOf(",") + 1);
String oid = line.substring(0, line.indexOf(","));
String oinfo = line.substring(0, line.lastIndexOf(","));
key2.setType(1);
key2.setCid(Integer.parseInt(cid));
key2.setOid(Integer.parseInt(oid));
key2.setOrderInfo(oinfo);
}
context.write(key2,NullWritable.get());
}
}
4.創建Reducer
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
/**
* ReduceJoinReducer,reducer端連接實現。
*/
public class ReduceJoinReducer extends Reducer<ComboKey2,NullWritable,Text,NullWritable> {
protected void reduce(ComboKey2 key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
Iterator<NullWritable> it = values.iterator();
it.next();
int type = key.getType();
int cid = key.getCid() ;
String cinfo = key.getCustomerInfo() ;
while(it.hasNext()){
it.next();
String oinfo = key.getOrderInfo();
context.write(new Text(cinfo + "," + oinfo),NullWritable.get());
}
}
}
5.創建排序對比器
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;
import com.it18zhang.hdfs.maxtemp.allsort.secondarysort.ComboKey;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* 組合Key排序對比器
*/
public class ComboKey2Comparator extends WritableComparator {
protected ComboKey2Comparator() {
super(ComboKey2.class, true);
}
public int compare(WritableComparable a, WritableComparable b) {
ComboKey2 k1 = (ComboKey2) a;
ComboKey2 k2 = (ComboKey2) b;
return k1.compareTo(k2);
}
}
6.分組對比器
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;
import com.it18zhang.hdfs.maxtemp.allsort.secondarysort.ComboKey;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* CID分組對比器
*/
public class CIDGroupComparator extends WritableComparator{
protected CIDGroupComparator() {
super(ComboKey2.class, true);
}
public int compare(WritableComparable a, WritableComparable b) {
ComboKey2 k1 = (ComboKey2) a;
ComboKey2 k2 = (ComboKey2) b;
return k1.getCid() - k2.getCid();
}
}
7.App
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;
import com.it18zhang.hdfs.maxtemp.allsort.secondarysort.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the reduce-side join job. Wires together the mapper, reducer,
 * customer-id partitioner, grouping comparator and sort comparator, and runs
 * the job with two reduce tasks.
 *
 * <p>Usage: {@code ReduceJoinApp [input dir] [output dir]}. Both arguments are
 * optional and default to the paths used in these notes.
 */
public class ReduceJoinApp {
    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     *
     * @param args optional: {@code args[0]} input directory, {@code args[1]} output directory
     */
    public static void main(String[] args) throws Exception {
        // Keep the original locations as defaults so running without arguments
        // behaves as before.
        String inputDir = args.length > 0 ? args[0] : "D:\\mr\\reducejoin";
        String outputDir = args.length > 1 ? args[1] : "D:\\mr\\reducejoin\\out";

        Configuration conf = new Configuration();
        // Run against the local file system.
        conf.set("fs.defaultFS","file:///");
        Job job = Job.getInstance(conf);

        job.setJobName("ReduceJoinApp");
        job.setJarByClass(ReduceJoinApp.class);

        FileInputFormat.addInputPath(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReducer.class);

        // Map output types.
        job.setMapOutputKeyClass(ComboKey2.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Reduce output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Partition and group by customer id; sort with the full key order.
        job.setPartitionerClass(CIDPartitioner.class);
        job.setGroupingComparatorClass(CIDGroupComparator.class);
        job.setSortComparatorClass(ComboKey2Comparator.class);

        job.setNumReduceTasks(2);

        // Report the job outcome through the process exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
hive
------------------
在hadoop上處理結構化數據的數據倉庫。
不是關係數據庫。
不是OLTP(聯機事務處理)。
不支持實時查詢和行級更新。
hive特點
----------
hive存儲數據結構(schema)在數據庫中,處理的數據進入hdfs.
OLAP
HQL / HiveQL
hive安裝
-------------
1.下載hive-2.1.0.tar.gz
2.tar開
$>tar -xzvf hive-2.1.0.tar.gz -C /soft //tar開
$>cd /soft //進入安裝目錄的上級目錄
$>ln -s hive-2.1.0 hive //符號連接
3.配置環境變量
[/etc/profile]
HIVE_HOME=/soft/hive
PATH=$PATH:$HIVE_HOME/bin
執行 :source /etc/profile
4.驗證hive安裝成功
$>hive --version
5.配置hive,使用win7的mysql存放hive的元數據.
a)複製mysql驅動程序到hive的lib目錄下。
...
b)配置hive-site.xml
複製hive-default.xml.template爲hive-site.xml
修改連接信息爲mysql鏈接地址,將${system:...字樣替換成具體路徑。
[hive/conf/hive-site.xml]
<!-- JDBC settings for the database that holds the Hive metastore. -->
<!-- NOTE(review): the password and user name are stored here in plain text; root/root looks like a development default, confirm before reuse. -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
<description>password to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
<description>Username to use against metastore database</description>
</property>
<!-- JDBC URL of the metastore database: MySQL at 192.168.231.1:3306, database "hive2". -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.231.1:3306/hive2</value>
</property>
<!-- The MySQL driver jar must be on Hive's classpath for this class to load. -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
6.在mysql中創建存放hive信息的數據庫
mysql>create database hive2 ;
7.初始化hive的元數據(表結構)到mysql中。
$>cd /soft/hive/bin
$>schematool -dbType mysql -initSchema
hive命令行操作
------------------------
1.創建hive的數據庫
$>hive --version //在shell中執行,查看版本
$>hive --help //在shell中執行,查看幫助
$hive>create database mydb2 ; //
$hive>show databases ;
$hive>use mydb2 ;
$hive>create table mydb2.t(id int,name string,age int);
$hive>drop table t ;
$hive>drop table mydb2.t ;
$hive>select * from mydb2.t ; //查看指定庫的表
$hive>exit ; //退出
$>hive //hive --service cli
$>hive //hive --service cli
通過遠程jdbc方式連接到hive數據倉庫
--------------------------------
1.啓動hiveserver2服務器,監聽端口10000
$>hive --service hiveserver2 &
2.通過beeline命令行連接到hiveserver2
$>beeline //進入beeline命令行(等價於hive --service beeline)
$beeline>!help //查看幫助
$beeline>!quit //退出
$beeline>!connect jdbc:hive2://localhost:10000/mydb2 //連接到hive數據庫
$beeline>show databases ;
$beeline>use mydb2 ;
$beeline>show tables; //顯示表
使用Hive-jdbc驅動程序採用jdbc方式訪問遠程數據倉庫
----------------------------------------------------
1.創建java模塊
2.引入maven
3.添加hive-jdbc依賴
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven project descriptor for the HiveDemo module; its only declared dependency is the Hive JDBC driver. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.it18zhang</groupId>
<artifactId>HiveDemo</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Hive JDBC client driver, version 2.1.0. NOTE(review): presumably chosen to match the Hive server version; confirm. -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>2.1.0</version>
</dependency>
</dependencies>
</project>
4.App
package com.it18zhang.hivedemo;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
/**
 * Connects to a Hive data warehouse over JDBC and prints the id and name of
 * every row of table {@code t}. The warehouse must be running the hiveserver2
 * service.
 */
public class App {
    /**
     * Runs the query and prints one {@code id,name} line per row.
     *
     * @throws Exception if the driver class cannot be loaded or any JDBC call fails
     */
    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // try-with-resources closes the result set, statement and connection in
        // reverse order, even when the query or the iteration throws.
        try (Connection conn = DriverManager.getConnection("jdbc:hive2://192.168.231.201:10000/mydb2");
             Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery("select id , name ,age from t")) {
            while (rs.next()) {
                System.out.println(rs.getInt(1) + "," + rs.getString(2)) ;
            }
        }
    }
}
hive中表
-------------------
1.managed table
託管表。
刪除表時,數據也刪除了。
2.external table
外部表。
刪除表時,數據不刪。
hive命令
----------------
//創建表,external 外部表
$hive>CREATE external TABLE IF NOT EXISTS t2(id int,name string,age int)
COMMENT 'xx' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE ;
//查看表結構
$hive>desc t2 ;
$hive>desc formatted t2 ;
//加載數據到hive表
$hive>load data local inpath '/home/centos/customers.txt' into table t2 ; //local上傳文件
$hive>load data inpath '/user/centos/customers.txt' [overwrite] into table t2 ; //移動文件
//複製表
mysql>create table tt as select * from users ; //攜帶數據和表結構
mysql>create table tt like users ; //不帶數據,只有表結構
hive>create table tt as select * from users ;
hive>create table tt like users ;
//count()查詢要轉成mr
$hive>select count(*) from t2 ;
$hive>select id,name from t2 ;
//
$hive>select * from t2 order by id desc ; //MR
//啓用/禁用表
$hive>ALTER TABLE t2 ENABLE NO_DROP; //不允許刪除
$hive>ALTER TABLE t2 DISABLE NO_DROP; //允許刪除
//分區表,優化手段之一,從目錄的層面控制搜索數據的範圍。
//創建分區表.
$hive>CREATE TABLE t3(id int,name string,age int) PARTITIONED BY (Year INT, Month INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//顯示表的分區信息
$hive>SHOW PARTITIONS t3;
//添加分區,創建目錄
$hive>alter table t3 add partition (year=2014, month=12);
//刪除分區
hive>ALTER TABLE t3 DROP IF EXISTS PARTITION (year=2014, month=11);
//分區結構
hive>/user/hive/warehouse/mydb2.db/t3/year=2014/month=11
hive>/user/hive/warehouse/mydb2.db/t3/year=2014/month=12
//加載數據到分區表
hive>load data local inpath '/home/centos/customers.txt' into table t3 partition(year=2014,month=11);
//創建桶表
$hive>CREATE TABLE t4(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//加載數據不會進行分桶操作
$hive>load data local inpath '/home/centos/customers.txt' into table t4 ;
//查詢t3表數據插入到t4中。
$hive>insert into t4 select id,name,age from t3 ;
//桶表的數量如何設置?
//評估數據量,保證每個桶的數據量約爲block大小的2倍。
//連接查詢
$hive>CREATE TABLE customers(id int,name string,age int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
$hive>CREATE TABLE orders(id int,orderno string,price float,cid int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
//加載數據到表
//內連接查詢
hive>select a.*,b.* from customers a , orders b where a.id = b.cid ;
//左外連接
hive>select a.*,b.* from customers a left outer join orders b on a.id = b.cid ;
//右外連接
hive>select a.*,b.* from customers a right outer join orders b on a.id = b.cid ;
//全外連接
hive>select a.*,b.* from customers a full outer join orders b on a.id = b.cid ;
//explode,炸裂,表生成函數。
//使用hive實現單詞統計
//1.建表
$hive>CREATE TABLE doc(line string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;