Writing Data from MapReduce to a MySQL Database


  The DBInputFormat and DBOutputFormat components introduced in Hadoop 0.19 finally make it easy to import and export data between Hadoop and many relational databases, which in turn makes it much simpler to fold relational data into a data processing pipeline. To move data between Hadoop and MySQL, both Hadoop and MySQL obviously need to be installed on the machine first.
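
In essence the integration comes down to three calls in the job driver: DBConfiguration.configureDB() registers the JDBC connection, DBInputFormat.setInput() maps a table (or query) to the mapper input, and DBOutputFormat.setOutput() maps the reducer output to a table. A condensed sketch of that wiring follows; the connection details here are placeholders, and DBInputWritable/DBOutputWritable are the record classes defined in section 4, where the complete driver also appears.

Configuration conf = new Configuration();
// Register the JDBC driver class, connection URL, user and password for the job
DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost:3306/test", "user", "password");
Job job = Job.getInstance(conf);
// Map rows of the studentinfo table onto DBInputWritable records for the mapper
DBInputFormat.setInput(job, DBInputWritable.class,
        "studentinfo", null, null, new String[] { "id", "name" });
// Write each reduce-side DBOutputWritable into the output table's name/count columns
DBOutputFormat.setOutput(job, "output", new String[] { "name", "count" });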

1. Software Versions

  • jdk 1.7.0_79-b15
  • hadoop 2.7.1
  • mysql 5.0.22

2. Environment Setup

  1. Copy mysql-connector-java-5.1.39-bin.jar into the $HADOOP_HOME/lib directory.
  2. Create the MySQL database and tables:
mysql> create database test;
mysql> use test;
mysql> create table studentinfo ( id integer, name varchar(32) );
mysql> insert into studentinfo values(1,'archana');
mysql> insert into studentinfo values(2,'XYZ');
mysql> insert into studentinfo values(3,'archana');
mysql> create table output ( name varchar(32), count integer );
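
Before wiring this into a MapReduce job, it can save time to confirm that the connector jar and the connection details actually work. A minimal stand-alone check along these lines should do (ConnectionCheck is just an illustrative class name; the host, user and password must match your environment):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class ConnectionCheck
{
    public static void main(String[] args) throws Exception
    {
        Class.forName("com.mysql.jdbc.Driver");   // fails fast if the connector jar is missing
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://192.168.191.1:3306/test", "root", "root");
             Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery("SELECT COUNT(*) FROM studentinfo"))
        {
            rs.next();
            System.out.println("studentinfo rows: " + rs.getInt(1));
        }
    }
}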

3. Project Structure

Project directory structure (screenshot not reproduced here).

To read data from the database and write results back, we have to create classes that define the records to be fetched from, and written back to, the database. In this project those classes are DBInputWritable.java and DBOutputWritable.java.

4. Key Code

DBInputWritable.java

package com.etc;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

public class DBInputWritable implements Writable, DBWritable
{
    private int id;
    private String name;

    public void readFields(DataInput in) throws IOException {   }  // unused: DBInputFormat reads via the ResultSet overload below

    // The ResultSet carries the current row returned by the SQL statement
    public void readFields(ResultSet rs) throws SQLException
    {
        id = rs.getInt(1);
        name = rs.getString(2);
    }

    public void write(DataOutput out) throws IOException {  }  // unused in this job

    public void write(PreparedStatement ps) throws SQLException
    {
        ps.setInt(1, id);
        ps.setString(2, name);
    }

    public int getId()
    {
        return id;
    }

    public String getName()
    {
        return name;
    }
}


DBOutputWritable.java

package com.etc;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

public class DBOutputWritable implements Writable, DBWritable
{
    private String name;
    private int count;

    public DBOutputWritable(String name, int count)
    {
        this.name = name;
        this.count = count;
    }

    public void readFields(DataInput in) throws IOException {   }

    public void readFields(ResultSet rs) throws SQLException
    {
        name = rs.getString(1);
        count = rs.getInt(2);
    }

    public void write(DataOutput out) throws IOException {    }  // unused: DBOutputFormat writes via the PreparedStatement overload below

    public void write(PreparedStatement ps) throws SQLException
    {
        ps.setString(1, name);
        ps.setInt(2, count);
    }
}

Main

package com.etc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;

public class Main
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        DBConfiguration.configureDB(conf,
                "com.mysql.jdbc.Driver",   // driver class
                "jdbc:mysql://192.168.191.1:3306/test", // db url
                "root",    // user name
                "root"); //password

        Job job = Job.getInstance(conf);
        job.setJarByClass(Main.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(DBOutputWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setInputFormatClass(DBInputFormat.class);
        job.setOutputFormatClass(DBOutputFormat.class);

        DBInputFormat.setInput(
                job,
                DBInputWritable.class,
                "studentinfo",   //input table name
                null,   // optional WHERE conditions
                null,   // optional ORDER BY column
                new String[] { "id", "name" }  // table columns
        );

        DBOutputFormat.setOutput(
                job,
                "output",    // output table name
                new String[] { "name", "count" }   //table columns
        );

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
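
DBInputFormat also offers a query-based setInput overload, which is handy when the input is a join or an aggregate rather than a single table. A hedged sketch of how the driver above could be adapted follows; the SQL strings are illustrative and not part of the original project:

        // Alternative: feed the mapper from an arbitrary SELECT instead of a table name.
        // The count query must return the total number of rows so Hadoop can plan the input splits.
        DBInputFormat.setInput(
                job,
                DBInputWritable.class,
                "SELECT id, name FROM studentinfo ORDER BY id",  // input query
                "SELECT COUNT(id) FROM studentinfo"              // row count query
        );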

Map

package com.etc;

import java.io.IOException;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;

public class Map extends Mapper<LongWritable, DBInputWritable, Text, IntWritable>
{
    private IntWritable one = new IntWritable(1);

    // Tokenize the name column on spaces and emit (token, 1), word-count style
    protected void map(LongWritable id, DBInputWritable value, Context ctx)
    {
        try
        {
            String[] keys = value.getName().split(" ");

            for(String key : keys)
            {
                ctx.write(new Text(key),one);
            }
        } catch(IOException e)
        {
            e.printStackTrace();
        } catch(InterruptedException e)
        {
            e.printStackTrace();
        }
    }
}

Reduce

package com.etc;

import java.io.IOException;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;

public class Reduce extends Reducer<Text, IntWritable, DBOutputWritable, NullWritable>
{
    // Sum the counts for each name token and write one row into the output table
    protected void reduce(Text key, Iterable<IntWritable> values, Context ctx)
    {
        int sum = 0;

        for(IntWritable value : values)
        {
            sum += value.get();
        }

        try
        {
            ctx.write(new DBOutputWritable(key.toString(), sum), NullWritable.get());
        } catch(IOException e)
        {
            e.printStackTrace();
        } catch(InterruptedException e)
        {
            e.printStackTrace();
        }
    }
}

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.etc</groupId>
    <artifactId>MapReduce2MySql</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>MapReduce2MySql</name>
    <!-- FIXME change it to the project's website -->
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.1</version>
            <scope>provided</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.1</version>
            <scope>provided</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>
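
Note that the pom above does not declare the MySQL JDBC driver; the setup in section 2 relies on the jar being copied into $HADOOP_HOME/lib instead. If you would rather let Maven manage it, a dependency along these lines should work (the version matches the jar used above; treat it as a suggestion rather than part of the original project):

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.39</version>
        </dependency>

Keep in mind that a plain dependency only puts the driver on the compile classpath; at run time it still has to be shaded into the job jar or placed on the cluster's classpath.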

5. Running the Jar

$ hadoop jar MapReduce2MySql-1.0-SNAPSHOT.jar com.etc.Main
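
If the connector jar was not copied into $HADOOP_HOME/lib as described in section 2, job submission will typically fail with a ClassNotFoundException for com.mysql.jdbc.Driver, because the client computes the input splits by querying MySQL. One workaround (the path below is a placeholder) is to put the jar on the client classpath before submitting:

$ export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/path/to/mysql-connector-java-5.1.39-bin.jar
$ hadoop jar MapReduce2MySql-1.0-SNAPSHOT.jar com.etc.Main

On a multi-node cluster the map and reduce tasks also need the driver, so the jar still has to be distributed to the nodes (for example via $HADOOP_HOME/lib as above) or bundled into the job jar.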

6. MySQL Query Result

mysql> select * from output;
+---------+-------+
| name    | count |
+---------+-------+
| archana |     2 |
| XYZ     |     1 |
+---------+-------+

END