大数据MR模型以及代码实现

数据：
[customers.txt]

    1,tom,12
    2,tom,13
    3,tom,14
    4,tom,15

[orders.txt]

    1,no001,12.23,1
    2,no001,12.23,1
    3,no001,12.23,2
    4,no001,12.23,2
    5,no001,12.23,2
    6,no001,12.23,3
    7,no001,12.23,3
    8,no001,12.23,3
    9,no001,12.23,3

map端join
1.创建Mapper

        package com.it18zhang.hdfs.mr.mapjoin;
        import org.apache.hadoop.conf.Configuration;
        import org.apache.hadoop.fs.FSDataInputStream;
        import org.apache.hadoop.fs.FileSystem;
        import org.apache.hadoop.fs.Path;
        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.NullWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.Mapper;
        import java.io.BufferedReader;
        import java.io.IOException;
        import java.io.InputStreamReader;
        import java.util.HashMap;
        import java.util.Map;

        /**
         * join操作，map端连接。
         */
        public class MapJoinMapper extends Mapper<LongWritable,Text,Text,NullWritable> {

            private Map<String,String> allCustomers = new HashMap<String,String>();

            //启动,初始化客户信息
            protected void setup(Context context) throws IOException, InterruptedException {
                try {
                    Configuration conf = context.getConfiguration();
                    FileSystem fs = FileSystem.get(conf);
                    FSDataInputStream fis = fs.open(new Path("file:///d:/mr/mapjoin/customers.txt"));
                    //得到缓冲区阅读器
                    BufferedReader br = new BufferedReader(new InputStreamReader(fis));
                    String line = null ;
                    while((line = br.readLine()) != null){
                        //得到cid
                        String cid = line.substring(0,line.indexOf(","));
                        allCustomers.put(cid,line);
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }

            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                //订单信息
                String line = value.toString();
                //提取customer id
                String cid = line.substring(line.lastIndexOf(",") + 1);
                //订单信息
                String orderInfo = line.substring(0,line.lastIndexOf(","));

                //连接customer + "," + order
                String customerInfo = allCustomers.get(cid);
                context.write(new Text(customerInfo + "," + orderInfo),NullWritable.get());
            }

        }

2.创建App

        package com.it18zhang.hdfs.mr.mapjoin;
        import org.apache.hadoop.conf.Configuration;
        import org.apache.hadoop.fs.Path;
        import org.apache.hadoop.io.NullWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.Job;
        import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
        import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

        /**
         *
         */
        public class MapJoinApp {
            public static void main(String[] args) throws Exception {

                Configuration conf = new Configuration();
                conf.set("fs.defaultFS","file:///");
                Job job = Job.getInstance(conf);

                //设置job的各种属性
                job.setJobName("MapJoinApp");                        //作业名称
                job.setJarByClass(MapJoinApp.class);                 //搜索类

                //添加输入路径
                FileInputFormat.addInputPath(job,new Path(args[0]));
                //设置输出路径
                FileOutputFormat.setOutputPath(job,new Path(args[1]));

                //没有reduce

                job.setNumReduceTasks(0);

                job.setMapperClass(MapJoinMapper.class);             //mapper类

                job.setMapOutputKeyClass(Text.class);           //
                job.setMapOutputValueClass(NullWritable.class);  //

                job.waitForCompletion(true);
            }
        }

reduce端连接
-----------------------
1.自定义key

        package com.it18zhang.hdfs.mr.mapjoin.reducejoin;
        import org.apache.hadoop.io.WritableComparable;
        import java.io.DataInput;
        import java.io.DataOutput;
        import java.io.IOException;
        /**
         */
        public class ComboKey2 implements WritableComparable<ComboKey2> {
            //0-customer 1-order
            private int type ;
            private int cid ;
            private int oid ;
            private String customerInfo = "" ;
            private String orderInfo = "" ;
            public int compareTo(ComboKey2 o) {
                int type0 = o.type ;
                int cid0= o.cid;
                int oid0 = o.oid;
                String customerInfo0 = o.customerInfo;
                String orderInfo0 = o.orderInfo ;
                //是否同一个customer的数据
                if(cid == cid0){
                    //同一个客户的两个订单
                    if(type == type0){
                        return oid - oid0 ;
                    }
                    //一个Customer + 他的order
                    else{
                        if(type ==0)
                            return -1 ;
                        else
                            return 1 ;
                    }
                }
                //cid不同
                else{
                    return cid - cid0 ;
                }
            }

            public void write(DataOutput out) throws IOException {
                out.writeInt(type);
                out.writeInt(cid);
                out.writeInt(oid);
                out.writeUTF(customerInfo);
                out.writeUTF(orderInfo);
            }

            public void readFields(DataInput in) throws IOException {
                this.type = in.readInt();
                this.cid = in.readInt();
                this.oid = in.readInt();
                this.customerInfo = in.readUTF();
                this.orderInfo = in.readUTF();
            }
        }

2.自定义分区类

    public class CIDPartitioner extends Partitioner<ComboKey2,NullWri
            table>{
    public int getPartition(ComboKey2 key, NullWritable nullWritable, int numPartitions) {
                return key.getCid() % numPartitions;
            }
        }

3.创建Mapper

        package com.it18zhang.hdfs.mr.mapjoin.reducejoin;
        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.NullWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.InputSplit;
        import org.apache.hadoop.mapreduce.Mapper;
        import org.apache.hadoop.mapreduce.lib.input.FileSplit;
        import java.io.IOException;

        /**
         * mapper
         */
        public class ReduceJoinMapper extends Mapper<LongWritable,Text,ComboKey2,NullWritable> {

            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                //
                String line = value.toString() ;

                //判断是customer还是order
                FileSplit split = (FileSplit)context.getInputSplit();
                String path = split.getPath().toString();
                //客户信息
                ComboKey2 key2 = new ComboKey2();
                if(path.contains("customers")){
                    String cid = line.substring(0,line.indexOf(","));
                    String custInfo = line ;
                    key2.setType(0);
                    key2.setCid(Integer.parseInt(cid));
                    key2.setCustomerInfo(custInfo);
                }
                //order info
                else{
                    String cid = line.substring(line.lastIndexOf(",") + 1);
                    String oid = line.substring(0, line.indexOf(","));
                    String oinfo = line.substring(0, line.lastIndexOf(","));
                    key2.setType(1);
                    key2.setCid(Integer.parseInt(cid));
                    key2.setOid(Integer.parseInt(oid));
                    key2.setOrderInfo(oinfo);
                }
                context.write(key2,NullWritable.get());
            }
        }

4.创建Reducer
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;

       import org.apache.hadoop.io.NullWritable;
       import org.apache.hadoop.io.Text;
       import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

       /**
       * ReduceJoinReducer,reducer端连接实现。
       */
       public class ReduceJoinReducer extends Reducer<ComboKey2,NullWritable,Text,NullWritable> {

           protected void reduce(ComboKey2 key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
               Iterator<NullWritable> it = values.iterator();
               it.next();
               int type = key.getType();
               int cid = key.getCid() ;
               String cinfo = key.getCustomerInfo() ;
               while(it.hasNext()){
                   it.next();
                   String oinfo = key.getOrderInfo();
                   context.write(new Text(cinfo + "," + oinfo),NullWritable.get());
               }
           }
       }

5.创建排序对比器
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;

       import com.it18zhang.hdfs.maxtemp.allsort.secondarysort.ComboKey;
       import org.apache.hadoop.io.WritableComparable;
       import org.apache.hadoop.io.WritableComparator;

       /**
        * Sort comparator for the composite key: delegates to ComboKey2.compareTo.
        */
       public class ComboKey2Comparator extends WritableComparator {
           /** Registers ComboKey2 as the key class and asks the base class to create key instances. */
           protected ComboKey2Comparator() {
               super(ComboKey2.class, true);
           }

           /**
            * @param a left key, must be a ComboKey2
            * @param b right key, must be a ComboKey2
            * @return the result of a.compareTo(b)
            */
           public int compare(WritableComparable a, WritableComparable b) {
               final ComboKey2 left = (ComboKey2) a;
               final ComboKey2 right = (ComboKey2) b;
               int order = left.compareTo(right);
               return order;
           }
       }

6.分组对比器
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;

       import com.it18zhang.hdfs.maxtemp.allsort.secondarysort.ComboKey;
       import org.apache.hadoop.io.WritableComparable;
       import org.apache.hadoop.io.WritableComparator;

       /**
        * Grouping comparator: two keys belong to the same reduce group when their
        * cid is equal, whatever their type or oid.
        */
       public class CIDGroupComparator extends WritableComparator{

           /** Registers ComboKey2 as the key class and asks the base class to create key instances. */
           protected CIDGroupComparator() {
               super(ComboKey2.class, true);
           }

           /**
            * @param a left key, must be a ComboKey2
            * @param b right key, must be a ComboKey2
            * @return negative, zero or positive according to the order of the two cids
            */
           public int compare(WritableComparable a, WritableComparable b) {
               int cid1 = ((ComboKey2) a).getCid();
               int cid2 = ((ComboKey2) b).getCid();
               // Explicit comparison: subtraction can overflow for large or negative ids.
               return (cid1 < cid2) ? -1 : ((cid1 == cid2) ? 0 : 1);
           }
       }

7.App
package com.it18zhang.hdfs.mr.mapjoin.reducejoin;

       import com.it18zhang.hdfs.maxtemp.allsort.secondarysort.*;
       import org.apache.hadoop.conf.Configuration;
       import org.apache.hadoop.fs.Path;
       import org.apache.hadoop.io.IntWritable;
       import org.apache.hadoop.io.NullWritable;
       import org.apache.hadoop.io.Text;
       import org.apache.hadoop.mapreduce.Job;
       import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
       import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
       import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

       /**
        * Driver of the reduce-side join job.
        * Usage: ReduceJoinApp [input path] [output path]
        * Both arguments are optional and fall back to the original hard-coded locations.
        */
       public class ReduceJoinApp {
           /** Input directory used when no argument is given. */
           private static final String DEFAULT_INPUT = "D:\\mr\\reducejoin";
           /** Output directory used when fewer than two arguments are given. */
           private static final String DEFAULT_OUTPUT = "D:\\mr\\reducejoin\\out";

           /**
            * Configures and runs the job on the local file system.
            *
            * @param args optional: args[0] = input path, args[1] = output path
            * @throws Exception if the job cannot be configured or submitted
            */
           public static void main(String[] args) throws Exception {
               String input = (args.length > 0) ? args[0] : DEFAULT_INPUT;
               String output = (args.length > 1) ? args[1] : DEFAULT_OUTPUT;

               Configuration conf = new Configuration();
               conf.set("fs.defaultFS","file:///");

               Job job = Job.getInstance(conf);

               // Job identity
               job.setJobName("ReduceJoinApp");
               job.setJarByClass(ReduceJoinApp.class);

               // Input and output locations
               FileInputFormat.addInputPath(job,new Path(input));
               FileOutputFormat.setOutputPath(job,new Path(output));

               job.setMapperClass(ReduceJoinMapper.class);
               job.setReducerClass(ReduceJoinReducer.class);

               // Map output types
               job.setMapOutputKeyClass(ComboKey2.class);
               job.setMapOutputValueClass(NullWritable.class);

               // Reduce output types
               job.setOutputKeyClass(Text.class);
               job.setOutputValueClass(NullWritable.class);

               // Partition by cid, group by cid, sort by the full composite key.
               job.setPartitionerClass(CIDPartitioner.class);
               job.setGroupingComparatorClass(CIDGroupComparator.class);
               job.setSortComparatorClass(ComboKey2Comparator.class);
               job.setNumReduceTasks(2);

               // Report the job outcome through the process exit code.
               System.exit(job.waitForCompletion(true) ? 0 : 1);
           }
       }

hive
------------------
   构建在hadoop之上、用于处理结构化数据的数据仓库。
   hive不是:  关系数据库
              OLTP(联机事务处理)系统
              也不适合实时查询和行级更新。

hive特点
----------
   hive存储数据结构(schema)在数据库中,处理的数据进入hdfs.
   OLAP
   HQL / HiveQL

hive安装
-------------
   1.下载hive2.1-tar.gz
   2.tar开
       $>tar -xzvf hive-2.1.0.tar.gz -C /soft   //tar开
       $>cd /soft                              //进入安装目录,符号连接要建在/soft下
       $>ln -s hive-2.1.0 hive                   //符号连接

   3.配置环境变量
       [/etc/profile]
       HIVE_HOME=/soft/hive
       PATH=$PATH:$HIVE_HOME/bin

执行 :source /etc/profile

4.验证hive安装成功
$>hive --version

   5.配置hive,使用win7的mysql存放hive的元数据.
       a)复制mysql驱动程序到hive的lib目录下。
           ...
       b)配置hive-site.xml
           复制hive-default.xml.template为hive-site.xml
           修改连接信息为mysql链接地址，将${system:...字样替换成具体路径。

 [hive/conf/hive-site.xml]
            <property>
                <name>javax.jdo.option.ConnectionPassword</name>
                <value>root</value>
                <description>password to use against metastore database</description>
            </property>
            <property>
                <name>javax.jdo.option.ConnectionUserName</name>
                <value>root</value>
                <description>Username to use against metastore database</description>
            </property>
            <property>
                <name>javax.jdo.option.ConnectionURL</name>
                <value>jdbc:mysql://192.168.231.1:3306/hive2</value>
            </property>
            <property>
                <name>javax.jdo.option.ConnectionDriverName</name>
                <value>com.mysql.jdbc.Driver</value>
                <description>Driver class name for a JDBC metastore</description>
            </property>

6)在mysql中创建存放hive信息的数据库
mysql>create database hive2 ;

       7)初始化hive的元数据(表结构)到mysql中。
           $>cd /soft/hive/bin
           $>schematool -dbType mysql -initSchema

hive命令行操作
------------------------
1.创建hive的数据库

$>hive --version //在shell中执行,查看版本
$>hive --help //在shell中执行,查看帮助

       $hive>create database mydb2 ;       //
       $hive>show databases ;
       $hive>use mydb2 ;
       $hive>create table mydb2.t(id int,name string,age int);
       $hive>drop table t ;
       $hive>drop table mydb2.t ;
       $hive>select * from mydb2.t ;       //查看指定库的表
       $hive>exit ;                       //退出

$>hive //hive --service cli
$>hive //hive --service cli

通过远程jdbc方式连接到hive数据仓库
--------------------------------
1.启动hiveserver2服务器，监听端口10000
$>hive --service hiveserver2 &

   2.通过beeline命令行连接到hiveserver2
       $>beeline                                           //进入beeline命令行(等价于hive --service beeline)
       $beeline>!help                                       //查看帮助
       $beeline>!quit                                       //退出
       $beeline>!connect jdbc:hive2://localhost:10000/mydb2 //连接到hive数据仓库
       $beeline>show databases ;
       $beeline>use mydb2 ;
       $beeline>show tables;                               //显示表
       使用Hive-jdbc驱动程序采用jdbc方式访问远程数据仓库
----------------------------------------------------
   1.创建java模块
   2.引入maven
   3.添加hive-jdbc依赖

  <?xml version="1.0" encoding="UTF-8"?>
        <project xmlns="http://maven.apache.org/POM/4.0.0"
                 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
            <modelVersion>4.0.0</modelVersion>

            <groupId>com.it18zhang</groupId>
            <artifactId>HiveDemo</artifactId>
            <version>1.0-SNAPSHOT</version>

            <dependencies>
                <dependency>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-jdbc</artifactId>
                    <version>2.1.0</version>
                </dependency>
            </dependencies>
        </project>

4.App
package com.it18zhang.hivedemo;

       import java.sql.Connection;
       import java.sql.DriverManager;
       import java.sql.ResultSet;
       import java.sql.Statement;

       /**
        * Connects to a Hive data warehouse through JDBC and prints the id and name
        * of every row of table t. The warehouse must have the hiveserver2 service running.
        */
       public class App {
           /**
            * @param args unused
            * @throws Exception if the driver cannot be loaded or a JDBC call fails
            */
           public static void main(String[] args) throws  Exception {
               Class.forName("org.apache.hive.jdbc.HiveDriver");
               Connection conn = DriverManager.getConnection("jdbc:hive2://192.168.231.201:10000/mydb2");
               // Nested try/finally: every resource is released even when the query or
               // the iteration throws, closing in reverse order of creation.
               try {
                   Statement st = conn.createStatement();
                   try {
                       ResultSet rs = st.executeQuery("select id , name ,age from t");
                       try {
                           while(rs.next()){
                               System.out.println(rs.getInt(1) + "," + rs.getString(2)) ;
                           }
                       } finally {
                           rs.close();
                       }
                   } finally {
                       st.close();
                   }
               } finally {
                   conn.close();
               }
           }
       }

hive中表
-------------------
   1.managed table
       托管表。
       删除表时，数据也删除了。

   2.external table
       外部表。
       删除表时，数据不删。

hive命令
----------------
   //创建表,external 外部表
   $hive>CREATE external TABLE IF NOT EXISTS t2(id int,name string,age int)
   COMMENT 'xx' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE ;

   //查看表结构
   $hive>desc t2 ;
   $hive>desc formatted t2 ;

   //加载数据到hive表
   $hive>load data local inpath '/home/centos/customers.txt' into table t2 ;   //local上传文件
   $hive>load data inpath '/user/centos/customers.txt' [overwrite] into table t2 ;   //移动文件

   //复制表
   mysql>create table tt as select * from users ;       //携带数据和表结构
   mysql>create table tt like users ;           //不带数据，只有表结构

hive>create table tt as select * from users ;
hive>create table tt like users ;

//count()查询要转成mr
$hive>select count(*) from t2 ;
$hive>select id,name from t2 ;

//
   $hive>select * from t2 order by id desc ;               //MR

   //启用/禁用表
   $hive>ALTER TABLE t2 ENABLE NO_DROP;   //不允许删除
   $hive>ALTER TABLE t2 DISABLE NO_DROP;   //允许删除

//分区表,优化手段之一，从目录的层面控制搜索数据的范围。
//创建分区表.
$hive>CREATE TABLE t3(id int,name string,age int) PARTITIONED BY (Year INT, Month INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;

//显示表的分区信息
$hive>SHOW PARTITIONS t3;

   //添加分区,创建目录
   $hive>alter table t3 add partition (year=2014, month=12);

   //删除分区
   hive>ALTER TABLE t3 DROP IF EXISTS PARTITION (year=2014, month=11);

   //分区结构
   hive>/user/hive/warehouse/mydb2.db/t3/year=2014/month=11
   hive>/user/hive/warehouse/mydb2.db/t3/year=2014/month=12

//加载数据到分区表
hive>load data local inpath '/home/centos/customers.txt' into table t3 partition(year=2014,month=11);

   //创建桶表
   $hive>CREATE TABLE t4(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;

   //加载数据不会进行分桶操作
   $hive>load data local inpath '/home/centos/customers.txt' into table t4 ;

   //查询t3表数据插入到t4中。
   $hive>insert into t4 select id,name,age from t3 ;

   //桶表的数量如何设置?
   //评估数据量，保证每个桶的数据量约为block大小的2倍。
   //连接查询
   $hive>CREATE TABLE customers(id int,name string,age int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
   $hive>CREATE TABLE orders(id int,orderno string,price float,cid int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;

   //加载数据到表
   //内连接查询
   hive>select a.*,b.* from customers a , orders b where a.id = b.cid ;
   //左外连接
   hive>select a.*,b.* from customers a left outer join orders b on a.id = b.cid ;
   //右外连接
   hive>select a.*,b.* from customers a right outer join orders b on a.id = b.cid ;
   //全外连接
   hive>select a.*,b.* from customers a full outer join orders b on a.id = b.cid ;

   //explode,炸裂,表生成函数。
   //使用hive实现单词统计
   //1.建表
   $hive>CREATE TABLE doc(line string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;

大数据MR模型以及代码实现

如何在低代码平台中引用 JavaScript ？

探究职业发展的关键：能力模型解读

高效率使用windows

如何使用 JavaScript 获取当前页面帧率 FPS

工程款拖欠，农民工怎么了？就得一直忍着委屈求全吗？

HarmonyOS 实现下拉刷新，上拉加载更多

语音信号处理中的“窗函数”

智能决策新时代：可视化大屏是否能够超越传统白板？

解密Prompt系列28. LLM Agent之金融领域摸索：FinMem & FinAgent

分享几个.NET开源的AI和LLM相关项目框架

大數據MR模型以及代碼實現

kubernetes分佈式安裝部署-簡介

redis簡介-安裝部署-命令說明

UWSGI的作用

大數據—zookeeper介紹-配置-安裝-使用命令

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結