MapReduce Notes 4: Hadoop Types and an MR Example Operating on HDFS Data

Hadoop Types

All of Hadoop's serializable types live in the org.apache.hadoop.io package. The table below shows the correspondence between Java and Hadoop types:

Java type       Hadoop type                             Notes
long            org.apache.hadoop.io.LongWritable
int             org.apache.hadoop.io.IntWritable
byte            org.apache.hadoop.io.ByteWritable
boolean         org.apache.hadoop.io.BooleanWritable
double          org.apache.hadoop.io.DoubleWritable
float           org.apache.hadoop.io.FloatWritable
String          org.apache.hadoop.io.Text
null            org.apache.hadoop.io.NullWritable       obtain the singleton via NullWritable.get()
Set/Map/List    org.apache.hadoop.io.ArrayWritable
byte[]          org.apache.hadoop.io.BytesWritable      for binary data such as audio or video

 
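These wrapper types exist because Hadoop serializes keys and values through the Writable interface (write/readFields) rather than Java serialization. The following standalone sketch is not part of the original notes (the class name WritableDemo is made up); it round-trips a Text/IntWritable pair through a byte buffer:

package mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {

	public static void main(String[] args) throws IOException {
		// Serialize a Text/IntWritable pair into a byte buffer.
		ByteArrayOutputStream buffer = new ByteArrayOutputStream();
		DataOutputStream out = new DataOutputStream(buffer);
		new Text("tiger").write(out);
		new IntWritable(2).write(out);

		// Deserialize them back from the same bytes.
		DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()));
		Text word = new Text();
		IntWritable count = new IntWritable();
		word.readFields(in);
		count.readFields(in);
		System.out.println(word + " -> " + count.get()); // prints: tiger -> 2
	}
}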

WordCount Example

1: Write the code

package mapreduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The hello file on HDFS contains:
tiger pig
pig cat dog
dog bird cat
tiger house
bus bike bus car

 * @author think
 *
 */
public class WordCount {

	public static void main(String[] args) throws Exception {
		String inPath = args[0];
		Path outPath = new Path(args[1]);

		// 1: HDFS configuration; get a FileSystem object
		Configuration conf = new Configuration();
		URI uri = new URI("/");// URI uri = new URI("hdfs://192.168.79.128:9000/");
		FileSystem fileSystem = FileSystem.get(uri, conf);

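		// A MapReduce job fails at submission if its output path already exists, so remove it first.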
		if (fileSystem.exists(outPath)) {
			fileSystem.delete(outPath, true);
		}

		// 2: create the Job object
		String jobName = WordCount.class.getName();
		Job job = Job.getInstance(conf, jobName);
		job.setJarByClass(WordCount.class);

		// 3: input path
		FileInputFormat.setInputPaths(job, inPath);

		// 4: specify the InputFormat subclass; optional, the default is TextInputFormat
		job.setInputFormatClass(TextInputFormat.class);

		// 5: specify the mapper class and its output <k2,v2> types
		job.setMapperClass(MapTask.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);

		// 6: specify the reducer class and its output <k3,v3> types
		job.setReducerClass(ReduceTask.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		// 7: output path
		FileOutputFormat.setOutputPath(job, outPath);

		// 8: specify the OutputFormat subclass
		job.setOutputFormatClass(TextOutputFormat.class);

		// 9: submit to YARN and wait for completion
		job.waitForCompletion(true);
	}
	
	/**
	 * Map task.
	 * @author think
	 * The four type parameters LongWritable, Text, Text, LongWritable stand, in order, for the
	 * map task's input key/value pair <k1,v1> and its output key/value pair <k2,v2>.
	 */
	public static class MapTask extends Mapper<LongWritable, Text, Text, LongWritable>
	{
		Logger logger = LoggerFactory.getLogger(WordCount.class);
		
		Text k2 = new Text();

		LongWritable v2 = new LongWritable();
		
		/**
		 * Override the map method.
		 * Context is an inner class of Mapper.
		 */
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			// 1: key is the byte offset of this line in the file; value is the line content
			String content = value.toString();
			System.out.println("line: " + key.get() + ", " + content);
			logger.info("line: " + key.get() + ", " + content);
			
			String[] arrs = content.split("\\s+"); // the sample data is whitespace-separated, so split on whitespace rather than ","
			for(String word : arrs)
			{
				k2.set(word);
				v2.set(1);
				context.write(k2, v2);
				logger.info("map:" + k2.toString() + "," + v2);
			}
		}
	}
	
	/**
	 * Reduce task.
	 * @author think
	 * The four type parameters Text, LongWritable, Text, LongWritable stand, in order, for the
	 * reduce task's input <k2,v2s> and its output key/value pair <k3,v3>.
	 */
	public static class ReduceTask extends Reducer<Text, LongWritable, Text, LongWritable>
	{
		LongWritable v3 = new LongWritable();
		
		@Override
		protected void reduce(Text k2, Iterable<LongWritable> v2s,
				Reducer<Text, LongWritable, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			System.out.println("k2:" + k2.toString());
			long sum = 0;
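			// All the 1s emitted by the mappers for this word arrive grouped under one key; sum them.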
			for(LongWritable v2 : v2s)
			{
				System.out.println("v2:" + v2);
				sum += v2.get();
			}
			v3.set(sum);
			context.write(k2, v3);
			System.out.println("k3,v3:" + k2.toString() + "," + v3);
		}
	}
	
	
}

2: Package the jar and upload it to Linux

In Eclipse, right-click the Java class -> Export -> JAR File to export the jar.

[Figures: two settings to note in the JAR export dialog]





3: On Linux, create a file named word, then upload it to HDFS

hadoop fs -put ./word /word

hadoop fs -text /word/word      (the word file now sits in the /word directory)

hadoop fs -cp /word/word /word/word2      (optionally make a few copies to get more input files)

 

4: Run hadoop jar, then check the result

hadoop jar wordCount.jar /word  /out

A result file such as /out/part-r-00000 is generated automatically in the /out directory; view its contents with:

hadoop fs -text /out/part-r-00000
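For reference, if the job ran over the five-line hello sample shown in the WordCount javadoc (splitting on whitespace), the output would look like this; TextOutputFormat separates key and value with a tab:

bike	1
bird	1
bus	2
car	1
cat	2
dog	2
house	1
pig	2
tiger	2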



5: View the output of the map and reduce tasks on the cluster at http://shb01:8088

The information under Counters deserves particular attention, e.g. the data-local read statistics.



6: To view logs, add the following to yarn-site.xml

<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
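With log aggregation enabled (restart YARN for the change to take effect), the aggregated container logs can also be fetched from the command line. The application id below is a made-up example; use the id the ResourceManager assigned to your job, as shown at http://shb01:8088:

yarn logs -applicationId application_1408654356123_0001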

 

The ArrayWritable Type

Hadoop handles collections through ArrayWritable. You must write your own subclass: ArrayWritable keeps a Writable[] values array internally, and the subclass's no-argument constructor pins down the element type so Hadoop can deserialize it. Before use you must populate that array; Hadoop serializes it by iterating over the elements.

Below is an example that uses ArrayWritable to tally mobile traffic.

There is at least one input file; such files imitate a mobile-phone traffic log.

Each line has the following structure:

Field 1: 1363157993044 is a timestamp.

Field 2: 13610002000 is the phone number.

Field 6 is the upstream packet count, field 7 the downstream packet count, field 8 the total upstream traffic, field 9 the total downstream traffic, and field 10 the status code (200 means success).

1363157993044      13610002000  94-71-AC-CD-E6-18:CMCC-EASY     120.196.100.99        iface.qiyi.com  視頻網站         15     12         1527         2106         200

 

Suppose there are many such lines and every phone number appears several times; we need to aggregate, per phone number, its upstream/downstream packet counts and its total upstream/downstream traffic. The code follows.

package mapreduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FlowCount {

	/**
	 * @param args
	 * @throws Exception
	 */
	public static void main(String[] args) throws Exception {
		String inputPaths = args[0];
		Path outPath = new Path(args[1]);
		
		// 1: get a FileSystem object to operate on HDFS data
		Configuration conf = new Configuration();
		URI uri = new URI("hdfs://192.168.79.139:9000/");
		FileSystem fileSystem = FileSystem.get(uri, conf);
		if(fileSystem.exists(outPath))
		{
			fileSystem.delete(outPath, true);
		}
		
		// 2: get the Job object
		Job job = Job.getInstance(conf, FlowCount.class.getName());
		job.setJarByClass(FlowCount.class);
		
		// 3: input path
		FileInputFormat.setInputPaths(job, inputPaths);
		
		// 4: specify the InputFormat subclass
		job.setInputFormatClass(TextInputFormat.class);
		
		// 5: specify the mapper class and its output types
		job.setMapperClass(MapTask.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowWritable.class);
		
		// 6: specify the reducer class and its output types
		job.setReducerClass(ReduceTask.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		// 7: specify the OutputFormat subclass
		job.setOutputFormatClass(TextOutputFormat.class);
		
		// 8: output path
		FileOutputFormat.setOutputPath(job, outPath);
		
		// 9: submit to YARN and wait for completion
		job.waitForCompletion(true);
	}

	/**
	 * Map task.
	 * The four type parameters LongWritable, Text, Text, FlowWritable correspond to the map input
	 * <k1,v1> (the byte offset of each line, the line content) and the map output <k2,v2>
	 * (phone number, a FlowWritable holding the up/down packet counts and up/down traffic).
	 * @author think
	 *
	 */
	public static class MapTask extends Mapper<LongWritable, Text, Text, FlowWritable>
	{
		Logger logger = LoggerFactory.getLogger(MapTask.class);
		
		Text k2 = new Text();
		
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, FlowWritable>.Context context)
				throws IOException, InterruptedException {
			
			String[] values = value.toString().split("\t");
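			// Index 1 is the phone number; indices 5-8 are fields 6-9 (up/down packets, up/down traffic).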
			k2.set(values[1]);

			FlowWritable flow = new FlowWritable();
			flow.set(values[5], values[6], values[7], values[8]);
			
			context.write(k2, flow);
			logger.info("MapTask[" + k2.toString() + ":" + flow + "]");
		}
	}
	
	/**
	 * Reduce task.
	 * The four type parameters Text, FlowWritable, Text, Text correspond to the reduce input
	 * <k2,v2s> (phone number, the FlowWritable instances for that number) and the reduce
	 * output <k3,v3> (phone number, traffic summary).
	 * @author think
	 *
	 */
	public static class ReduceTask extends Reducer<Text, FlowWritable, Text, Text>
	{
		Logger logger = LoggerFactory.getLogger(ReduceTask.class);
		
		Text k3 = new Text();
		
		Text v3 = new Text();
		
		@Override
		protected void reduce(Text k2, Iterable<FlowWritable> v2s,
				Reducer<Text, FlowWritable, Text, Text>.Context context)
				throws IOException, InterruptedException {
			long six = 0;
			long seven = 0;
			long eight = 0;
			long nine = 0;
			for(FlowWritable v2 : v2s)
			{
				long[] flowArrs = v2.getLongArrs();
				six += flowArrs[0];
				seven += flowArrs[1];
				eight += flowArrs[2];
				nine += flowArrs[3];
			}
			
			k3.set(k2);
			String flowString = "up packets[" + six + "];down packets[" + seven + "];up traffic[" + eight + "];down traffic[" + nine +"]";
			v3.set(flowString);
			context.write(k3, v3);
			
		}
	}
	
	/**
	 * FlowWritable stores the packet and traffic fields from the log file (fields 6-9).
	 * @author think
	 *
	 */
	public static class FlowWritable extends ArrayWritable
	{
		// the element type must be declared by calling super(...) in the constructor
		public FlowWritable() {
			super(LongWritable.class);
		}
		
		/**
		 * Assign the four field values to the values array inside ArrayWritable.
		 * @param six
		 * @param seven
		 * @param eight
		 * @param nine
		 */
		public void set(String six, String seven, String eight, String nine)
		{
			Writable[] values = new Writable[4];
			//System.out.println("-" + six + "-" + seven + "-" + eight + "-" + nine);
			values[0] = new LongWritable(Long.valueOf(six));
			values[1] = new LongWritable(Long.valueOf(seven));
			values[2] = new LongWritable(Long.valueOf(eight));
			values[3] = new LongWritable(Long.valueOf(nine));
			super.set(values);
		}
		
		/**
		 * Read the values back out of the ArrayWritable's values array.
		 * @return
		 */
		public long[] getLongArrs()
		{
			LongWritable[] values = (LongWritable[])super.toArray();
			if(null != values)
			{
				long[] valueArrs = new long[values.length];
				for(int i = 0; i < values.length; i++)
				{
					valueArrs[i] = values[i].get();
				}
				return valueArrs;
			}
			else
			{
				return null;
			}
		}
		
	}
}
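As with WordCount, export the class as a jar and run it against the log file on HDFS. The jar name and paths below are illustrative only:

hadoop jar flowCount.jar /flow /out

hadoop fs -text /out/part-r-00000

Each result line should then hold a phone number followed by the "up packets/down packets/up traffic/down traffic" summary written by the reducer.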




