Remotely operating HDFS and submitting MR jobs from a client with the Java API (source code and exception handling)

Two classes: an HDFS file operation class and a wordcount word-counting class, both adapted from examples found online. Here is the code:

package mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.io.IOUtils;
/**
 * file operation on HDFS
 * @author liuxingjiaofu
 *
 */
public class HDFS_File {
	//read the file from HDFS
	public void ReadFile(Configuration conf, String FileName){
	  try{
			FileSystem hdfs = FileSystem.get(conf);
			FSDataInputStream dis = hdfs.open(new Path(FileName));
			IOUtils.copyBytes(dis, System.out, 4096, false); 
		     dis.close();
		}catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}
	//copy the file from HDFS to local
	public void GetFile(Configuration conf, String srcFile, String dstFile){
		try {
			  FileSystem hdfs = FileSystem.get(conf);
			  Path srcPath = new Path(srcFile);
			  Path dstPath = new Path(dstFile);
			  hdfs.copyToLocalFile(true, srcPath, dstPath); // delSrc = true: the source is removed from HDFS after copying
		}catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}
	//copy the local file to HDFS
	public void PutFile(Configuration conf, String srcFile, String dstFile){
	try {
		  FileSystem hdfs = FileSystem.get(conf);
		  Path srcPath = new Path(srcFile);
		  Path dstPath = new Path(dstFile);
		  hdfs.copyFromLocalFile(srcPath, dstPath);
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}
	//create the new file
	public FSDataOutputStream CreateFile(Configuration conf, String FileName){
	try {	
		  FileSystem hdfs = FileSystem.get(conf);
		  Path path = new Path(FileName);
		  FSDataOutputStream outputStream = hdfs.create(path);
		  return outputStream;
		} catch (IOException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
		}
		return null;
	}
	//rename the file name
	public boolean ReNameFile(Configuration conf, String srcName, String dstName){
	try {
			// use the conf passed in so the remote fs.default.name is honored
			FileSystem hdfs = FileSystem.get(conf);
			Path fromPath = new Path(srcName);
			Path toPath = new Path(dstName);
			boolean isRenamed = hdfs.rename(fromPath, toPath);
			return isRenamed;
		}catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return false;
	}
	//delete the file
	// type = true: recursive delete (required for directories)
	// type = false: delete a single file only
	public boolean DelFile(Configuration conf, String FileName, boolean type){
		try {
			  FileSystem hdfs = FileSystem.get(conf);
			  Path path = new Path(FileName);
			  boolean isDeleted = hdfs.delete(path, type);
			  return isDeleted;
		}catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return false;
	}
	//Get HDFS file last modification time
	public long GetFileModTime(Configuration conf, String FileName){
	try{
			  FileSystem hdfs = FileSystem.get(conf);
			  Path path = new Path(FileName);
			  FileStatus fileStatus = hdfs.getFileStatus(path);
			  long modificationTime = fileStatus.getModificationTime();
			  return modificationTime;
		}catch(IOException e){
			e.printStackTrace();
		}
		return 0;
	}
	//check if a file  exists in HDFS
	public boolean CheckFileExist(Configuration conf, String FileName){
	try{			
			  FileSystem hdfs = FileSystem.get(conf);
			  Path path = new Path(FileName);
			  boolean isExists = hdfs.exists(path);
			  return isExists;
		}catch(IOException e){
			e.printStackTrace();
		}
		return false;
	}
	//Get the locations of a file in the HDFS cluster
	public List<String []> GetFileBlockHost(Configuration conf, String FileName){
		try{
			  List<String []> list = new ArrayList<String []>();
			  FileSystem hdfs = FileSystem.get(conf);
			  Path path = new Path(FileName);
			  FileStatus fileStatus = hdfs.getFileStatus(path);
	
			  BlockLocation[] blkLocations = hdfs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
			  
			  int blkCount = blkLocations.length;
			  for (int i=0; i < blkCount; i++) {
			    String[] hosts = blkLocations[i].getHosts();
			    list.add(hosts);
			   }
			  return list;
			}catch(IOException e){
				e.printStackTrace();
			}
			return null;
	}
	//Get a list of all the datanode host names in the HDFS cluster
	// note: this may fail if the client lacks the authorization for this operation
	public String[] GetAllNodeName(Configuration conf){
		try{
			  FileSystem fs = FileSystem.get(conf);
			  DistributedFileSystem hdfs = (DistributedFileSystem) fs;
			  DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();
			  String[] names = new String[dataNodeStats.length];
			  for (int i = 0; i < dataNodeStats.length; i++) {
			      names[i] = dataNodeStats[i].getHostName();
			  }
			  return names;
		}catch(IOException e){
			System.out.println("error!!!!");
			e.printStackTrace();
		}
		return null;
	}
}
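
For reference, here is a minimal sketch of how this class might be driven from a remote client. The host name node1 and the demo paths are assumptions for illustration, mirroring the setup used later in this post:

package mapreduce;

import org.apache.hadoop.conf.Configuration;

// Minimal client-side usage sketch for HDFS_File; adjust fs.default.name to your cluster.
public class HDFS_File_Demo {
	public static void main(String[] args) {
		Configuration conf = new Configuration();
		// must match the value in conf/core-site.xml on the cluster
		conf.set("fs.default.name", "hdfs://node1");

		HDFS_File file = new HDFS_File();
		// hypothetical paths, purely for illustration
		file.PutFile(conf, "/home/hadooper/testmp/testtext", "demo_in");
		System.out.println("exists: " + file.CheckFileExist(conf, "demo_in"));
		System.out.println("modified at: " + file.GetFileModTime(conf, "demo_in"));
		file.ReadFile(conf, "demo_in");       // dump the file contents to stdout
		file.DelFile(conf, "demo_in", true);  // recursive delete
	}
}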

The wordcount class:

package mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class mywordcount {
	public static  class wordcountMapper extends
		Mapper<LongWritable, Text, Text, IntWritable>{
		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();
		public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException{
			String line = value.toString();
			StringTokenizer itr = new StringTokenizer(line);
			while(itr.hasMoreTokens()){
				word.set(itr.nextToken());
				context.write(word, one);
			}
		}
	}
	public static  class wordcountReducer extends
		Reducer<Text, IntWritable, Text, IntWritable>{
		public void reduce(Text key, Iterable<IntWritable>values, Context context)throws IOException, InterruptedException{
			int sum = 0;
			for (IntWritable str : values){
				sum += str.get();
			}
			context.write(key, new IntWritable(sum));
		}
	}
	/**
	 * Takes 2 args: the local file to count words in and the local directory where the result will be saved
	 * @param args e.g. /home/hadooper/testmp/testtext /home/hadooper/testmp/testresult
	 * @throws Exception
	 */
	public static  void main(String args[])throws Exception{
		// First define two temporary HDFS paths; a random suffix could be added to the names to make collisions unlikely.
		String dstFile = "temp_src";
		String srcFile = "temp_dst";
		// Create the HDFS file operation object.
		HDFS_File file = new HDFS_File();
		
		Configuration conf = new Configuration();
		// Required: fs.default.name must match the value in conf/core-site.xml on the cluster
		conf.set("fs.default.name","hdfs://node1");
		conf.set("mapred.job.tracker","node1:54311");

		// Upload the local file (or directory) to HDFS
		file.PutFile(conf, args[0], dstFile);
		
		System.out.println("up ok");
		Job job = new Job(conf, "mywordcount");		
		job.setJarByClass(mywordcount.class);
		
		job.setInputFormatClass(TextInputFormat.class);
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		job.setMapperClass(wordcountMapper.class);
		job.setReducerClass(wordcountReducer.class);
		job.setCombinerClass(wordcountReducer.class);
		// Note: the input and output paths here must be files or directories on HDFS
		FileInputFormat.setInputPaths(job, new Path(dstFile));
		FileOutputFormat.setOutputPath(job, new Path(srcFile));
		// Run the job and wait for completion
		job.waitForCompletion(true);
		// Retrieve the result from HDFS and save it locally
		file.GetFile(conf, srcFile, args[1]);
		System.out.println("down the result ok!");
		// Delete the temporary files/directories on HDFS
		file.DelFile(conf, dstFile, true);
		file.DelFile(conf, srcFile, true);
		System.out.println("delete file on hdfs ok!");
	}
}


Along the way, I ran into several errors:

1. HDFS version mismatch -- Call to node1/172.*.*.*:8020 failed on local exception: java.io.EOFException

main() {…… 
  Configuration conf = new Configuration();
  conf.set("fs.default.name","hdfs://node1");//與conf/core-site裏的值對應,必須
  HDFS_File file = new HDFS_File();
  //print all the node names
  String[] host_name = file.GetAllNodeName(conf); 
……}
public String[] GetAllNodeName(Configuration conf){
  try{
    // Configuration config = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     DistributedFileSystem hdfs = (DistributedFileSystem) fs;
     DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();
     String[] names = new String[dataNodeStats.length];
     for (int i = 0; i < dataNodeStats.length; i++) {
         names[i] = dataNodeStats[i].getHostName();
     }
     return names;
  }catch(IOException e){
   System.out.println("eeeeeeeeeeeeeeeeeeeerror!!!!");
   e.printStackTrace();
  }
  return null;
 }
The exception thrown:
eeeeeeeeeeeeeeeeeeeerror!!!!
java.io.IOException: Call to node1/172.10.39.250:8020 failed on local exception: java.io.EOFException
 at org.apache.hadoop.ipc.Client.wrapException(Client.java:775)
 at org.apache.hadoop.ipc.Client.call(Client.java:743)
 at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:220)
 at $Proxy0.getProtocolVersion(Unknown Source)
 at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:359)
 at org.apache.hadoop.hdfs.DFSClient.createRPCNamenode(DFSClient.java:112)
 at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:213)
 at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:176)
 at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:82)
 at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:1378)
 at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:66)
 at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:1390)
 at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:196)
 at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:95)
 at mapreduce.HDFS_File.GetAllNodeName(HDFS_File.java:151)
 at mapreduce.File_Operation.main(File_Operation.java:15)
Caused by: java.io.EOFException
 at java.io.DataInputStream.readInt(DataInputStream.java:392)
 at org.apache.hadoop.ipc.Client$Connection.receiveResponse(Client.java:501)
 at org.apache.hadoop.ipc.Client$Connection.run(Client.java:446)
Exception in thread "main" java.lang.NullPointerException
 at mapreduce.File_Operation.main(File_Operation.java:16)
Cause: a version mismatch. Make sure the Hadoop jars used by the Java client are the same version as the jars on the Hadoop cluster.
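
A quick way to confirm which Hadoop version the client is actually compiled against is to print it via org.apache.hadoop.util.VersionInfo and compare it with the output of "hadoop version" on the cluster. A minimal sketch (the class name is made up):

package mapreduce;

import org.apache.hadoop.util.VersionInfo;

// Prints the version of the hadoop-core jar on the client classpath,
// to be compared with "hadoop version" run on the cluster nodes.
public class ClientVersionCheck {
	public static void main(String[] args) {
		System.out.println("client Hadoop version: " + VersionInfo.getVersion());
		System.out.println("built from revision:   " + VersionInfo.getRevision());
	}
}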
2. HDFS permission problem

org.apache.hadoop.security.AccessControlException: org.apache.hadoop.security.AccessControlException: Permission denied: user=hadooper, access=WRITE, inode="/user":root:supergroup:drwxr-xr-x

Possible solutions:
(1) Add this entry to conf/hdfs-site.xml:
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
(2) Open up write permission on the HDFS directory being written to, for example: $ hadoop fs -chmod 777 /user/
I went with the second option; a programmatic equivalent is sketched below.
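
A minimal sketch of that programmatic equivalent, assuming it is run as a user that is allowed to change permissions on /user (for example the HDFS superuser); the class name is made up:

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

// Programmatic equivalent of: hadoop fs -chmod 777 /user/
public class OpenUserDir {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		conf.set("fs.default.name", "hdfs://node1");
		FileSystem hdfs = FileSystem.get(conf);
		// 0777: read/write/execute for owner, group and others
		hdfs.setPermission(new Path("/user"), new FsPermission((short) 0777));
		hdfs.close();
	}
}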

3. Native library warning -- 2011-12-20 17:00:32 org.apache.hadoop.util.NativeCodeLoader <clinit>
WARN: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable

Whether the native library is used can be configured in Hadoop's core-site.xml:
<property>
  <name>hadoop.native.lib</name>
  <value>true</value>
  <description>Should native hadoop libraries, if present, be used.</description>
</property>

Hadoop enables the native library by default.
In addition, the location of the native library can be set through an environment variable:
export JAVA_LIBRARY_PATH=/path/to/hadoop-native-libs
Sometimes the native library that ships with Hadoop cannot be used on your platform; in that case you need to compile it yourself. From the $HADOOP_HOME directory, run:
ant compile-native
After the build finishes, the corresponding files can be found under $HADOOP_HOME/build/native; point the library path at them or move them to the default directory.
I tried this, but the bundled library was 64-bit while my machine is 32-bit, and without the source I could not recompile it. So I had to test the program piece by piece to find which call triggers the warning; in my case it was:
  try {
     FileSystem hdfs = FileSystem.get(conf);
     Path srcPath = new Path(srcFile);
     Path dstPath = new Path(dstFile);
     hdfs.copyToLocalFile(true,srcPath, dstPath);// narrowed the warning down to this call
  }catch (IOException e) {
At this point there was nothing more I could do. Why does this happen at all, though, isn't Java supposed to be cross-platform?
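
To at least confirm on the client whether the native library was picked up, org.apache.hadoop.util.NativeCodeLoader can be queried directly. A small diagnostic sketch, not part of the original program:

package mapreduce;

import org.apache.hadoop.util.NativeCodeLoader;

// Reports whether libhadoop was found on java.library.path;
// false means Hadoop silently falls back to the builtin-java implementations.
public class NativeLibCheck {
	public static void main(String[] args) {
		System.out.println("native hadoop library loaded: " + NativeCodeLoader.isNativeCodeLoaded());
		System.out.println("java.library.path = " + System.getProperty("java.library.path"));
	}
}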

4. MR -- missing jars

ClassNotFoundException: org.codehaus.jackson.map.JsonMappingException
NoClassDefFoundError: org/apache/commons/httpclient/HttpMethod

Add the missing jars to the Java project:

jackson-core-asl-1.5.2.jar
jackson-mapper-asl-1.5.2.jar

commons-httpclient-3.0.1.jar

I am not in the habit of adding every jar to the project; it is too easy to end up with far more than needed, wasting time and space.
With that, my first MapReduce job completed. Not bad!

5. The remote JobTracker was down, yet the job still ran "successfully". It turned out the mapred.job.tracker property had not been set, so the job ran in local mode by default. The correct value can be found in mapred-site.xml on the namenode:

 conf.set("mapred.job.tracker","node1:54311");

With that configured, the job can initialize, but then the mapper class cannot be found:

INFO: Task Id : attempt_201112221123_0010_m_000000_0, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: mapreduce.mywordcount$wordcountMapper
 at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:996)
 at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:212)
 at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:611)
 at org.apache.hadoop.mapred.MapTask.run(MapTask.java:325)
 at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
 at java.security.AccessController.doPrivileged(Native Method)
 at javax.security.auth.Subject.doAs(Subject.java:396)
 at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1127)
 at org.apache.hadoop.mapred.Child.main(Child.java:264)

Packaging the program as a jar and running it directly on the cluster's jobtracker node works fine and gives correct results, but running it from the client produces the error above; I have not solved this yet.
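
A workaround that is commonly suggested for this kind of client-side submission (an assumption on my part, not something verified on this cluster) is to package the job classes into a jar on the client and set mapred.jar explicitly, so the jar itself is shipped to the tasktrackers instead of relying on the client classpath. A sketch reusing the mapper and reducer from mywordcount above, with a hypothetical jar path:

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Same driver as mywordcount.main(), plus an explicit mapred.jar so the job jar
// is uploaded from the client; the jar path below is hypothetical.
public class mywordcount_remote {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		conf.set("fs.default.name", "hdfs://node1");
		conf.set("mapred.job.tracker", "node1:54311");
		// "mapred.jar" is the property JobConf.setJar() writes; the tasktrackers should then
		// load mapreduce.mywordcount$wordcountMapper from this jar instead of the client classpath.
		conf.set("mapred.jar", "/home/hadooper/mywordcount.jar");

		Job job = new Job(conf, "mywordcount");
		job.setJarByClass(mywordcount.class);
		job.setMapperClass(mywordcount.wordcountMapper.class);
		job.setCombinerClass(mywordcount.wordcountReducer.class);
		job.setReducerClass(mywordcount.wordcountReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.setInputPaths(job, new Path("temp_src"));
		FileOutputFormat.setOutputPath(job, new Path("temp_dst"));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}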

Summary

1. To operate on HDFS files remotely and to submit MR jobs remotely, the following two settings are mandatory (I have not found any others so far):

conf.set("fs.default.name","hdfs://node1");//與conf/core-site.xml裏的值對應,必須 
conf.set("mapred.job.tracker","node1:54311");//mapred-site.xml

2. Analyze problems patiently and work through them.