MapReduce Notes 4: Hadoop Types and an MR Example Operating on HDFS Data

Hadoop Types

All of Hadoop's serializable types live in the org.apache.hadoop.io package. The table below shows the correspondence between Java and Hadoop types:

Java type       Hadoop type                             Notes
long            org.apache.hadoop.io.LongWritable
int             org.apache.hadoop.io.IntWritable
byte            org.apache.hadoop.io.ByteWritable
boolean         org.apache.hadoop.io.BooleanWritable
double          org.apache.hadoop.io.DoubleWritable
float           org.apache.hadoop.io.FloatWritable
String          org.apache.hadoop.io.Text
null            org.apache.hadoop.io.NullWritable       obtain the singleton via NullWritable.get()
Set/Map/List    org.apache.hadoop.io.ArrayWritable
byte[]          org.apache.hadoop.io.BytesWritable      for binary data such as audio or video

 
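These wrapper types exist because Hadoop serializes keys and values through the Writable interface (write/readFields) rather than Java serialization. The following standalone sketch is not part of the original notes (the class name WritableDemo is made up); it round-trips a Text/IntWritable pair through a byte buffer:

package mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {

	public static void main(String[] args) throws IOException {
		// Serialize a Text/IntWritable pair into a byte buffer.
		ByteArrayOutputStream buffer = new ByteArrayOutputStream();
		DataOutputStream out = new DataOutputStream(buffer);
		new Text("tiger").write(out);
		new IntWritable(2).write(out);

		// Deserialize them back from the same bytes.
		DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()));
		Text word = new Text();
		IntWritable count = new IntWritable();
		word.readFields(in);
		count.readFields(in);
		System.out.println(word + " -> " + count.get()); // prints: tiger -> 2
	}
}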

WordCount Example

1: Write the code

package mapreduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The hello file on HDFS contains:
tiger pig
pig cat dog
dog bird cat
tiger house
bus bike bus car

 * @author think
 *
 */
public class WordCount {

	public static void main(String[] args) throws Exception {
		String inPath = args[0];
		Path outPath = new Path(args[1]);

		// 1: HDFS configuration; get a FileSystem object
		Configuration conf = new Configuration();
		URI uri = new URI("/");// URI uri = new URI("hdfs://192.168.79.128:9000/");
		FileSystem fileSystem = FileSystem.get(uri, conf);

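		// A MapReduce job fails at submission if its output path already exists, so remove it first.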
		if (fileSystem.exists(outPath)) {
			fileSystem.delete(outPath, true);
		}

		// 2: create the Job object
		String jobName = WordCount.class.getName();
		Job job = Job.getInstance(conf, jobName);
		job.setJarByClass(WordCount.class);

		// 3: input path
		FileInputFormat.setInputPaths(job, inPath);

		// 4: specify the InputFormat subclass; optional, the default is TextInputFormat
		job.setInputFormatClass(TextInputFormat.class);

		// 5: specify the mapper class and its output <k2,v2> types
		job.setMapperClass(MapTask.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);

		// 6: specify the reducer class and its output <k3,v3> types
		job.setReducerClass(ReduceTask.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		// 7: output path
		FileOutputFormat.setOutputPath(job, outPath);

		// 8: specify the OutputFormat subclass
		job.setOutputFormatClass(TextOutputFormat.class);

		// 9: submit to YARN and wait for completion
		job.waitForCompletion(true);
	}
	
	/**
	 * Map task.
	 * @author think
	 * The four type parameters LongWritable, Text, Text, LongWritable stand, in order, for the
	 * map task's input key/value pair <k1,v1> and its output key/value pair <k2,v2>.
	 */
	public static class MapTask extends Mapper<LongWritable, Text, Text, LongWritable>
	{
		Logger logger = LoggerFactory.getLogger(WordCount.class);
		
		Text k2 = new Text();

		LongWritable v2 = new LongWritable();
		
		/**
		 * Override the map method.
		 * Context is an inner class of Mapper.
		 */
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			// 1: key is the byte offset of this line in the file; value is the line content
			String content = value.toString();
			System.out.println("line: " + key.get() + ", " + content);
			logger.info("line: " + key.get() + ", " + content);
			
			String[] arrs = content.split("\\s+"); // the sample data is whitespace-separated, so split on whitespace rather than ","
			for(String word : arrs)
			{
				k2.set(word);
				v2.set(1);
				context.write(k2, v2);
				logger.info("map:" + k2.toString() + "," + v2);
			}
		}
	}
	
	/**
	 * Reduce task.
	 * @author think
	 * The four type parameters Text, LongWritable, Text, LongWritable stand, in order, for the
	 * reduce task's input <k2,v2s> and its output key/value pair <k3,v3>.
	 */
	public static class ReduceTask extends Reducer<Text, LongWritable, Text, LongWritable>
	{
		LongWritable v3 = new LongWritable();
		
		@Override
		protected void reduce(Text k2, Iterable<LongWritable> v2s,
				Reducer<Text, LongWritable, Text, LongWritable>.Context context)
				throws IOException, InterruptedException {
			System.out.println("k2:" + k2.toString());
			long sum = 0;
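			// All the 1s emitted by the mappers for this word arrive grouped under one key; sum them.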
			for(LongWritable v2 : v2s)
			{
				System.out.println("v2:" + v2);
				sum += v2.get();
			}
			v3.set(sum);
			context.write(k2, v3);
			System.out.println("k3,v3:" + k2.toString() + "," + v3);
		}
	}
	
	
}

2: Package the jar and upload it to Linux

In Eclipse, right-click the Java class -> Export -> JAR File to export the jar.

[Figures: two settings to note in the JAR export dialog]





3: On Linux, create a file named word, then upload it to HDFS

hadoop fs -put ./word /word

hadoop fs -text /word/word      (the word file now sits in the /word directory)

hadoop fs -cp /word/word /word/word2      (optionally make a few copies to get more input files)

 

4: Run hadoop jar, then check the result

hadoop jar wordCount.jar /word  /out

A result file such as /out/part-r-00000 is generated automatically in the /out directory; view its contents with:

hadoop fs -text /out/part-r-00000
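For reference, if the job ran over the five-line hello sample shown in the WordCount javadoc (splitting on whitespace), the output would look like this; TextOutputFormat separates key and value with a tab:

bike	1
bird	1
bus	2
car	1
cat	2
dog	2
house	1
pig	2
tiger	2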



5: View the output of the map and reduce tasks on the cluster at http://shb01:8088

The information under Counters deserves particular attention, e.g. the data-local read statistics.



6: To view logs, add the following to yarn-site.xml

<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
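With log aggregation enabled (restart YARN for the change to take effect), the aggregated container logs can also be fetched from the command line. The application id below is a made-up example; use the id the ResourceManager assigned to your job, as shown at http://shb01:8088:

yarn logs -applicationId application_1408654356123_0001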

 

The ArrayWritable Type

Hadoop handles collections through ArrayWritable. You must write your own subclass: ArrayWritable keeps a Writable[] values array internally, and the subclass's no-argument constructor pins down the element type so Hadoop can deserialize it. Before use you must populate that array; Hadoop serializes it by iterating over the elements.

Below is an example that uses ArrayWritable to tally mobile traffic.

There is at least one input file; such files imitate a mobile-phone traffic log.

Each line has the following structure:

Field 1: 1363157993044 is a timestamp.

Field 2: 13610002000 is the phone number.

Field 6 is the upstream packet count, field 7 the downstream packet count, field 8 the total upstream traffic, field 9 the total downstream traffic, and field 10 the status code (200 means success).

1363157993044      13610002000  94-71-AC-CD-E6-18:CMCC-EASY     120.196.100.99        iface.qiyi.com  視頻網站         15     12         1527         2106         200

 

Suppose there are many such lines and every phone number appears several times; we need to aggregate, per phone number, its upstream/downstream packet counts and its total upstream/downstream traffic. The code follows.

package mapreduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FlowCount {

	/**
	 * @param args
	 * @throws Exception
	 */
	public static void main(String[] args) throws Exception {
		String inputPaths = args[0];
		Path outPath = new Path(args[1]);
		
		// 1: get a FileSystem object to operate on HDFS data
		Configuration conf = new Configuration();
		URI uri = new URI("hdfs://192.168.79.139:9000/");
		FileSystem fileSystem = FileSystem.get(uri, conf);
		if(fileSystem.exists(outPath))
		{
			fileSystem.delete(outPath, true);
		}
		
		// 2: get the Job object
		Job job = Job.getInstance(conf, FlowCount.class.getName());
		job.setJarByClass(FlowCount.class);
		
		// 3: input path
		FileInputFormat.setInputPaths(job, inputPaths);
		
		// 4: specify the InputFormat subclass
		job.setInputFormatClass(TextInputFormat.class);
		
		// 5: specify the mapper class and its output types
		job.setMapperClass(MapTask.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowWritable.class);
		
		// 6: specify the reducer class and its output types
		job.setReducerClass(ReduceTask.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		// 7: specify the OutputFormat subclass
		job.setOutputFormatClass(TextOutputFormat.class);
		
		// 8: output path
		FileOutputFormat.setOutputPath(job, outPath);
		
		// 9: submit to YARN and wait for completion
		job.waitForCompletion(true);
	}

	/**
	 * Map task.
	 * The four type parameters LongWritable, Text, Text, FlowWritable correspond to the map input
	 * <k1,v1> (the byte offset of each line, the line content) and the map output <k2,v2>
	 * (phone number, a FlowWritable holding the up/down packet counts and up/down traffic).
	 * @author think
	 *
	 */
	public static class MapTask extends Mapper<LongWritable, Text, Text, FlowWritable>
	{
		Logger logger = LoggerFactory.getLogger(MapTask.class);
		
		Text k2 = new Text();
		
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, FlowWritable>.Context context)
				throws IOException, InterruptedException {
			
			String[] values = value.toString().split("\t");
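			// Index 1 is the phone number; indices 5-8 are fields 6-9 (up/down packets, up/down traffic).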
			k2.set(values[1]);

			FlowWritable flow = new FlowWritable();
			flow.set(values[5], values[6], values[7], values[8]);
			
			context.write(k2, flow);
			logger.info("MapTask[" + k2.toString() + ":" + flow + "]");
		}
	}
	
	/**
	 * Reduce task.
	 * The four type parameters Text, FlowWritable, Text, Text correspond to the reduce input
	 * <k2,v2s> (phone number, the FlowWritable instances for that number) and the reduce
	 * output <k3,v3> (phone number, traffic summary).
	 * @author think
	 *
	 */
	public static class ReduceTask extends Reducer<Text, FlowWritable, Text, Text>
	{
		Logger logger = LoggerFactory.getLogger(ReduceTask.class);
		
		Text k3 = new Text();
		
		Text v3 = new Text();
		
		@Override
		protected void reduce(Text k2, Iterable<FlowWritable> v2s,
				Reducer<Text, FlowWritable, Text, Text>.Context context)
				throws IOException, InterruptedException {
			long six = 0;
			long seven = 0;
			long eight = 0;
			long nine = 0;
			for(FlowWritable v2 : v2s)
			{
				long[] flowArrs = v2.getLongArrs();
				six += flowArrs[0];
				seven += flowArrs[1];
				eight += flowArrs[2];
				nine += flowArrs[3];
			}
			
			k3.set(k2);
			String flowString = "up packets[" + six + "];down packets[" + seven + "];up traffic[" + eight + "];down traffic[" + nine +"]";
			v3.set(flowString);
			context.write(k3, v3);
			
		}
	}
	
	/**
	 * FlowWritable stores the packet and traffic fields from the log file (fields 6-9).
	 * @author think
	 *
	 */
	public static class FlowWritable extends ArrayWritable
	{
		// the element type must be declared by calling super(...) in the constructor
		public FlowWritable() {
			super(LongWritable.class);
		}
		
		/**
		 * Assign the four field values to the values array inside ArrayWritable.
		 * @param six
		 * @param seven
		 * @param eight
		 * @param nine
		 */
		public void set(String six, String seven, String eight, String nine)
		{
			Writable[] values = new Writable[4];
			//System.out.println("-" + six + "-" + seven + "-" + eight + "-" + nine);
			values[0] = new LongWritable(Long.valueOf(six));
			values[1] = new LongWritable(Long.valueOf(seven));
			values[2] = new LongWritable(Long.valueOf(eight));
			values[3] = new LongWritable(Long.valueOf(nine));
			super.set(values);
		}
		
		/**
		 * Read the values back out of the ArrayWritable's values array.
		 * @return
		 */
		public long[] getLongArrs()
		{
			LongWritable[] values = (LongWritable[])super.toArray();
			if(null != values)
			{
				long[] valueArrs = new long[values.length];
				for(int i = 0; i < values.length; i++)
				{
					valueArrs[i] = values[i].get();
				}
				return valueArrs;
			}
			else
			{
				return null;
			}
		}
		
	}
}
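As with WordCount, export the class as a jar and run it against the log file on HDFS. The jar name and paths below are illustrative only:

hadoop jar flowCount.jar /flow /out

hadoop fs -text /out/part-r-00000

Each result line should then hold a phone number followed by the "up packets/down packets/up traffic/down traffic" summary written by the reducer.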




