Hadoop types
Hadoop's types all live in the org.apache.hadoop.io package. The table below shows how Java types map to Hadoop types.
Java type      | Hadoop type                           | Notes
---------------|---------------------------------------|-----------------------------------------------
long           | org.apache.hadoop.io.LongWritable     |
int            | org.apache.hadoop.io.IntWritable      |
byte           | org.apache.hadoop.io.ByteWritable     |
boolean        | org.apache.hadoop.io.BooleanWritable  |
double         | org.apache.hadoop.io.DoubleWritable   |
float          | org.apache.hadoop.io.FloatWritable    |
String         | org.apache.hadoop.io.Text             |
null           | org.apache.hadoop.io.NullWritable     | obtain the instance via NullWritable.get()
Set, Map, List | org.apache.hadoop.io.ArrayWritable    |
byte[]         | org.apache.hadoop.io.BytesWritable    | used to store binary data such as audio/video
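To make the mapping concrete, the wrappers are used like this (a minimal sketch, not taken from the original text): each Writable wraps a plain Java value behind set/get accessors, Text wraps a String, and NullWritable is a stateless singleton obtained through NullWritable.get().

package mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
    public static void main(String[] args) {
        IntWritable count = new IntWritable();
        count.set(42);                 // wrap a Java int
        int plain = count.get();       // unwrap back to a Java int

        Text word = new Text("hello"); // wraps a String (stored as UTF-8)
        String s = word.toString();

        NullWritable nothing = NullWritable.get(); // singleton; there is no value to set
        System.out.println(plain + " " + s + " " + nothing);
    }
}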
WordCount example
1: Write the code
package mapreduce;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The file hello on HDFS contains:
 *   tiger pig
 *   pig cat dog
 *   dog bird cat
 *   tiger house
 *   bus bike bus car
 * @author think
 */
public class WordCount {

    public static void main(String[] args) throws Exception {
        String inPath = args[0];
        Path outPath = new Path(args[1]);

        // 1: HDFS configuration; get the FileSystem object
        Configuration conf = new Configuration();
        URI uri = new URI("/"); // URI uri = new URI("hdfs://192.168.79.128:9000/");
        FileSystem fileSystem = FileSystem.get(uri, conf);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        // 2: Job object
        String jobName = WordCount.class.getName();
        Job job = Job.getInstance(conf, jobName);
        job.setJarByClass(WordCount.class);

        // 3: input path
        FileInputFormat.setInputPaths(job, inPath);

        // 4: InputFormat subclass; optional, the default is TextInputFormat
        job.setInputFormatClass(TextInputFormat.class);

        // 5: mapper class and its output <k2,v2> types
        job.setMapperClass(MapTask.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 6: reducer class and its output <k3,v3> types
        job.setReducerClass(ReduceTask.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 7: output path
        FileOutputFormat.setOutputPath(job, outPath);

        // 8: OutputFormat subclass
        job.setOutputFormatClass(TextOutputFormat.class);

        // 9: submit to YARN and wait for completion
        job.waitForCompletion(true);
    }

    /**
     * Map task.
     * The four type parameters LongWritable, Text, Text, LongWritable are, in order,
     * the map task's input pair <k1,v1> and output pair <k2,v2>.
     * @author think
     */
    public static class MapTask extends Mapper<LongWritable, Text, Text, LongWritable> {

        Logger logger = LoggerFactory.getLogger(WordCount.class);
        Text k2 = new Text();
        LongWritable v2 = new LongWritable();

        /**
         * Override the map method.
         * Context is an inner class of Mapper.
         */
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // 1: key is the byte offset of the line within the file, value is the line itself
            String content = value.toString();
            System.out.println("content: " + key.get() + ", " + content);
            logger.info("content: " + key.get() + ", " + content);
            String[] arrs = content.split(" "); // the sample input is space-separated
            for (String word : arrs) {
                k2.set(word);
                v2.set(1);
                context.write(k2, v2);
                logger.info("map: " + k2.toString() + "," + v2);
            }
        }
    }

    /**
     * Reduce task.
     * The four type parameters Text, LongWritable, Text, LongWritable are, in order,
     * the reduce task's input pair <k2,v2s> and output pair <k3,v3>.
     * @author think
     */
    public static class ReduceTask extends Reducer<Text, LongWritable, Text, LongWritable> {

        LongWritable v3 = new LongWritable();

        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            System.out.println("k2: " + k2.toString());
            long sum = 0;
            for (LongWritable v2 : v2s) {
                System.out.println("v2: " + v2);
                sum += v2.get();
            }
            v3.set(sum);
            context.write(k2, v3);
            System.out.println("k3,v3: " + k2.toString() + "," + v3);
        }
    }
}
2: Package the code and upload it to Linux
Right-click the Java class -> Export -> JAR File to export the jar. The figure below shows the two settings that need attention.
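If you prefer the command line to Eclipse, the jar can also be built roughly like this (the directory layout here is an assumption; hadoop classpath prints the Hadoop client classpath):
mkdir -p classes
javac -cp $(hadoop classpath) -d classes mapreduce/WordCount.java
jar cvf wordCount.jar -C classes .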
3: On Linux, create a file named word, then upload it to HDFS
hadoop fs -put ./word /word
hadoop fs -text /word/word        # the word file now sits under the /word directory
hadoop fs -cp /word/word /word/word2   # optionally copy it a few more times to create extra input files
4: Run hadoop jar, then check the result
hadoop jar wordCount.jar /word /out
A result file such as /out/part-r-00000 is generated automatically in the /out directory. View its contents with:
hadoop fs -text /out/part-r-00000
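For the sample input shown in the class comment above (and assuming the default single reducer, whose output is sorted by key), the result file should look like this; TextOutputFormat separates key and value with a tab:
bike	1
bird	1
bus	2
car	1
cat	2
dog	2
house	1
pig	2
tiger	2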
5: At http://shb01:8088, view the map and reduce task output for the cluster
What mainly deserves attention is the information under Counters, such as data-local reads.
6: To view the logs, add the following to yarn-site.xml:
<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
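With log aggregation enabled, the logs of a finished job can also be fetched from the command line; the application id is printed when the job is submitted and is shown in the web UI:
yarn logs -applicationId <application id>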
ArrayWritable type
Hadoop uses ArrayWritable to handle collections; you must write your own class that extends it. The ArrayWritable class has a Writable[] array field: you must set values into it before use, and Hadoop iterates over that field when reading and writing the instance.
Below is an example that uses ArrayWritable to tally traffic.
There is at least one input file; files of this kind imitate a mobile-phone traffic log.
Each line has the following structure:
Field 1: 1363157993044 is a timestamp.
Field 2: 13610002000 is a phone number.
Field 6 is the upstream packet count, field 7 the downstream packet count, field 8 the total upstream traffic, field 9 the total downstream traffic, and field 10 the status code (200 means success).
1363157993044 13610002000 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 視頻網站 15 12 1527 2106 200
Suppose there are many lines and each phone number appears multiple times. We need to aggregate, per phone number, the upstream/downstream packet counts and the total upstream/downstream traffic. The code follows.
package mapreduce;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FlowCount {

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        String inputPaths = args[0];
        Path outPath = new Path(args[1]);

        // 1: get the FileSystem object to work with HDFS data
        Configuration conf = new Configuration();
        URI uri = new URI("hdfs://192.168.79.139:9000/");
        FileSystem fileSystem = FileSystem.get(uri, conf);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        // 2: get the Job object
        Job job = Job.getInstance(conf, FlowCount.class.getName());
        job.setJarByClass(FlowCount.class);

        // 3: input path
        FileInputFormat.setInputPaths(job, inputPaths);

        // 4: InputFormat subclass
        job.setInputFormatClass(TextInputFormat.class);

        // 5: mapper class and its output types
        job.setMapperClass(MapTask.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowWritable.class);

        // 6: reducer class and its output types
        job.setReducerClass(ReduceTask.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // 7: OutputFormat subclass
        job.setOutputFormatClass(TextOutputFormat.class);

        // 8: output path
        FileOutputFormat.setOutputPath(job, outPath);

        // 9: submit to YARN and wait for completion
        job.waitForCompletion(true);
    }

    /**
     * Map task.
     * The four type parameters LongWritable, Text, Text, FlowWritable correspond to the
     * map input <k1,v1> = <byte offset of the line, line content> and the map output
     * <k2,v2> = <phone number, FlowWritable holding up/down packet counts and up/down traffic>.
     * @author think
     */
    public static class MapTask extends Mapper<LongWritable, Text, Text, FlowWritable> {

        Logger logger = LoggerFactory.getLogger(MapTask.class);
        Text k2 = new Text();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, FlowWritable>.Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split("\t");
            k2.set(values[1]); // field 2: the phone number
            FlowWritable flow = new FlowWritable();
            flow.set(values[5], values[6], values[7], values[8]); // fields 6-9
            context.write(k2, flow);
            logger.info("MapTask[" + k2.toString() + ":" + flow + "]");
        }
    }

    /**
     * Reduce task.
     * The four type parameters Text, FlowWritable, Text, Text correspond to the reduce
     * input <k2,v2s> = <phone number, FlowWritables holding packet and traffic values>
     * and the reduce output <k3,v3> = <phone number, traffic summary>.
     * @author think
     */
    public static class ReduceTask extends Reducer<Text, FlowWritable, Text, Text> {

        Logger logger = LoggerFactory.getLogger(ReduceTask.class);
        Text k3 = new Text();
        Text v3 = new Text();

        @Override
        protected void reduce(Text k2, Iterable<FlowWritable> v2s,
                Reducer<Text, FlowWritable, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // running sums of fields 6-9: up packets, down packets, up traffic, down traffic
            long six = 0;
            long seven = 0;
            long eight = 0;
            long nine = 0;
            for (FlowWritable v2 : v2s) {
                long[] flowArrs = v2.getLongArrs();
                six += flowArrs[0];
                seven += flowArrs[1];
                eight += flowArrs[2];
                nine += flowArrs[3];
            }
            k3.set(k2);
            String flowString = "up package[" + six + "];down package[" + seven
                    + "];up flow[" + eight + "];down flow[" + nine + "]";
            v3.set(flowString);
            context.write(k3, v3);
        }
    }

    /**
     * FlowWritable stores the packet and traffic values from the file, i.e. fields 6-9.
     * @author think
     */
    public static class FlowWritable extends ArrayWritable {

        // Must call super in the constructor to declare the element type
        public FlowWritable() {
            super(LongWritable.class);
        }

        /**
         * Assign the four values to ArrayWritable's values field.
         * @param six
         * @param seven
         * @param eight
         * @param nine
         */
        public void set(String six, String seven, String eight, String nine) {
            Writable[] values = new Writable[4];
            // System.out.println("-" + six + "-" + seven + "-" + eight + "-" + nine);
            values[0] = new LongWritable(Long.valueOf(six));
            values[1] = new LongWritable(Long.valueOf(seven));
            values[2] = new LongWritable(Long.valueOf(eight));
            values[3] = new LongWritable(Long.valueOf(nine));
            super.set(values);
        }

        /**
         * Read the values back out of ArrayWritable's values field.
         * @return the four values as a long[]
         */
        public long[] getLongArrs() {
            // toArray() allocates a LongWritable[] here because the element type is LongWritable
            LongWritable[] values = (LongWritable[]) super.toArray();
            if (null != values) {
                long[] valueArrs = new long[values.length];
                for (int i = 0; i < values.length; i++) {
                    valueArrs[i] = values[i].get();
                }
                return valueArrs;
            } else {
                return null;
            }
        }
    }
}
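As a quick sanity check, FlowWritable can be exercised on its own, since set and getLongArrs form a simple round trip. The following minimal sketch (the demo class name is an assumption, not part of the original job) feeds in fields 6-9 of the sample log line:

package mapreduce;

public class FlowWritableDemo {
    public static void main(String[] args) {
        FlowCount.FlowWritable flow = new FlowCount.FlowWritable();
        flow.set("15", "12", "1527", "2106"); // fields 6-9 of the sample log line
        for (long v : flow.getLongArrs()) {
            System.out.println(v); // prints 15, 12, 1527, 2106 in turn
        }
    }
}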