Using the Hadoop HDFS Java API on Linux

0 Preface

It took about two days to finally get the Java API working on Linux, so here are my notes for future reference. The environment is as follows:

Hadoop: 2.5.1
Linux: Ubuntu Kylin
Eclipse: Luna

1 Steps

First, download Eclipse; Luna is used here.
It turns out Luna ships with Maven built in, which is very convenient. Create a Maven project and change pom.xml to the following content:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>maven</groupId>
  <artifactId>maven</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>maven</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-minicluster</artifactId>
      <version>2.5.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.5.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-assemblies</artifactId>
      <version>2.5.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-maven-plugins</artifactId>
      <version>2.5.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.5.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.5.1</version>
    </dependency>
  </dependencies>
</project>
Then wait while Eclipse/Maven downloads the dependencies automatically. Once that is done, the next step is to configure the JVM run arguments, because the native libraries are needed at runtime. My Hadoop installation lives in /home/hadoop-master/hadoop-2.5.1, so the VM argument is:
-Djava.library.path=/home/hadoop-master/hadoop-2.5.1/lib/native
Hadoop 2.5.1 already ships with precompiled native libraries, so there is no need to build them yourself (one more reason to prefer a recent release; compiling them by hand is tedious). At this point everything is ready.
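To confirm that the native library is actually picked up at runtime, a quick check can be done with Hadoop's NativeCodeLoader. This is only a minimal sketch of mine, run with the same -Djava.library.path VM argument; the class name NativeLibCheck is arbitrary:
import org.apache.hadoop.util.NativeCodeLoader;

public class NativeLibCheck {
	public static void main(String[] args) {
		// Prints true when libhadoop.so was found on java.library.path and loaded
		System.out.println("Native hadoop library loaded: "
				+ NativeCodeLoader.isNativeCodeLoaded());
	}
}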

2 Test Code

The proof of the pudding is in the eating, so here is a small program to try it out.
package maven.maven;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;

public class HadoopFSOperations {
	
	private static Configuration conf = new Configuration();
	private static final String HADOOP_URL="hdfs://192.168.190.129:9000";
	
	private static FileSystem fs;
	
	private static DistributedFileSystem hdfs;
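	// These handles are initialized once in the static block below; the cast to
	// DistributedFileSystem exposes HDFS-specific calls such as getDataNodeStats().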
	
	static {
		try {
			FileSystem.setDefaultUri(conf, HADOOP_URL);
			fs = FileSystem.get(conf);
			hdfs = (DistributedFileSystem)fs;
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	/**
	 * List the host names of all the DataNodes in the cluster
	 */
	public void listDataNodeInfo() {		
		try {
			DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();
			String[] names = new String[dataNodeStats.length];
			System.out.println("List of all the datanode in the HDFS cluster:");
			
			for (int i=0;i<names.length;i++) {
				names[i] = dataNodeStats[i].getHostName();
				System.out.println(names[i]);
			}
			System.out.println(hdfs.getUri().toString());
 		} catch (Exception e) {
 			e.printStackTrace();
 		}
	}
	
	/**
	 * Check whether a file exists on HDFS
	 */
	public void checkFileExist() {
		try {
			Path a= hdfs.getHomeDirectory();
			System.out.println("main path:"+a.toString());
			
			Path f = new Path("/user/xxx/input01/");
			boolean exist = fs.exists(f);
			System.out.println("Whether exist of this file:"+exist);
			
			// delete the file
//			if (exist) {
//				boolean isDeleted = hdfs.delete(f, false);
//				if(isDeleted) {
//					System.out.println("Delete success");
//				}				
//			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Create a file on HDFS and write some content into it
	 */
	public void createFile() {
		try {
			Path f = new Path("/user/xxx/input02/file01");
			System.out.println("Create and Write :"+f.getName()+" to hdfs");
			
			FSDataOutputStream os = fs.create(f, true);
			Writer out = new OutputStreamWriter(os, "utf-8"); // write as UTF-8 so the content is not garbled
			out.write("你好 good job");
			out.close();
			os.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	
	/**
	 * Copy a local file into HDFS<br>
	 * Make sure the file encoding is UTF-8 on both ends, local -> HDFS
	 */
	public void copyFileToHDFS() {
		try {
			Path f = new Path("/user/xxx/input02/file01");
			// local source file; adjust this path to a file on your machine
			File file = new File("/home/hadoop-master/hadoopTest/temporary.txt");
			
			FileInputStream is = new FileInputStream(file);
			InputStreamReader isr = new InputStreamReader(is, "utf-8");
			BufferedReader br = new BufferedReader(isr);
			
			FSDataOutputStream os = fs.create(f, true);
			Writer out = new OutputStreamWriter(os, "utf-8");
			
			String str = "";
			while((str=br.readLine()) != null) {
				out.write(str+"\n");
			}
			br.close();
			isr.close();
			is.close();
			out.close();
			os.close();
			System.out.println("Write content of file "+file.getName()+" to hdfs file "+f.getName()+" success");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Print the hosts that store the blocks of a file
	 */
	public void getLocation() {
		try {
			Path f = new Path("/user/xxx/input02/file01");
			FileStatus fileStatus = fs.getFileStatus(f);
			
			BlockLocation[] blkLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
			for (BlockLocation currentLocation : blkLocations) {
				String[] hosts = currentLocation.getHosts();
				for (String host : hosts) {
					System.out.println(host);
				}
			}
			
			// get the last modification time
			long modifyTime = fileStatus.getModificationTime();
			Date d = new Date(modifyTime);
			System.out.println(d);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Read and print the content of a file in HDFS
	 */
	public void readFileFromHdfs() {
		try {
			Path f = new Path("/user/xxx/input02/file01");
			
			FSDataInputStream dis = fs.open(f);
			InputStreamReader isr = new InputStreamReader(dis, "utf-8");
			BufferedReader br = new BufferedReader(isr);
			String str = "";
			while ((str = br.readLine()) !=null) {
				System.out.println(str);
			}
			br.close();
			isr.close();
			dis.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Recursively list all files and directories under the given path
	 * @param path the HDFS path to start from
	 * @throws IOException 
	 * @throws IllegalArgumentException 
	 * @throws FileNotFoundException 
	 */
	public void listFileStatus(String path) throws FileNotFoundException, IllegalArgumentException, IOException {
		FileStatus fileStatus[]=fs.listStatus(new Path(path));
		int listlength=fileStatus.length;
		for (int i=0 ;i<listlength ;i++){
			if (fileStatus[i].isDirectory() == false) {
				System.out.println("filename:"
						+ fileStatus[i].getPath().getName() + "\tsize:"
						+ fileStatus[i].getLen());
			} else {
				String newpath = fileStatus[i].getPath().toString();
				listFileStatus(newpath);
			}
		}
	}
	
	public static void main(String[] args) {
		HadoopFSOperations a = new HadoopFSOperations();
		a.listDataNodeInfo();
//		a.checkFileExist();
//		a.createFile();
//		a.copyFileToHDFS();
//		a.getLocation();
//		a.readFileFromHdfs();
		try {
			a.listFileStatus(HADOOP_URL+"/user");
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IllegalArgumentException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

Because my Hadoop cluster runs on 192.168.190.129, I use private static final String HADOOP_URL="hdfs://192.168.190.129:9000"; adjust this to your own NameNode address. Run the program and you should see output like the following:
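Alternatively, instead of FileSystem.setDefaultUri as in the test class above, the address can be supplied through the standard fs.defaultFS property, or passed straight to FileSystem.get. A minimal sketch of this (same placeholder address as above; the class name ConnectSketch is mine):
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class ConnectSketch {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Same effect as FileSystem.setDefaultUri: declare the default filesystem
		conf.set("fs.defaultFS", "hdfs://192.168.190.129:9000");
		FileSystem fs = FileSystem.get(conf);
		System.out.println(fs.getUri());

		// Or name the cluster explicitly, ignoring the configured default
		FileSystem fs2 = FileSystem.get(URI.create("hdfs://192.168.190.129:9000"), conf);
		System.out.println(fs2.getUri());

		fs.close();
	}
}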

List of all the datanode in the HDFS cluster:
hadoopslaver0
hadoopslaver2
hadoopslaver1
hdfs://192.168.190.129:9000
filename:TrustCom2015_CFP.pdf	size:290401
filename:jd.PNG	size:16647

You can see the three DataNodes hadoopslaver0, 1 and 2, as well as the files that were placed under /user beforehand. The little experiment works.

3 Summary

With the steps above, the Java API can be set up on Linux as well. The first step of a long journey, at last.
