SequenceFile
SequenceFile是Hadoop提供的一種二進制文件格式,可以將許多小文件序列化到一個大文件中:文件名稱作爲key,文件內容作爲value。優點是支持壓縮格式(CompressionType.BLOCK和CompressionType.RECORD),且反序列化速度快;缺點是上傳後的小文件不易定位,檢索時需要遍歷所有的小文件。
以下是使用sequenceFile上傳小文件到hdfs然後再讀取的代碼實例
package mapreduce;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class SequenceFileExample {

    /**
     * Packs a directory of small local files into a single SequenceFile on HDFS
     * (key = file name, value = file content) and then reads every record back.
     *
     * @param args unused
     * @throws Exception if the HDFS connection or any file I/O fails
     */
    public static void main(String[] args) throws Exception {
        // 1: obtain a FileSystem handle for the remote HDFS cluster
        Configuration conf = new Configuration();
        URI uri = new URI("hdfs://192.168.79.139:9000");
        FileSystem fileSystem = FileSystem.get(uri, conf);
        try {
            // 2: SequenceFile write - upload local files from Windows to HDFS
            sequenceFileWrite(conf, fileSystem);
            // 3: SequenceFile read
            // NOTE: the FileSystem must stay open until after the read; closing
            // it inside sequenceFileWrite (as the original did) makes this call
            // fail with "Filesystem closed".
            sequenceFileRead(conf, fileSystem);
        } finally {
            fileSystem.close();
        }
    }

    /**
     * Reads every (key, value) record from /sequence.seq and prints it.
     *
     * @param conf       Hadoop configuration
     * @param fileSystem open HDFS handle (not closed by this method)
     * @throws IOException on read failure
     */
    @SuppressWarnings("deprecation")
    private static void sequenceFileRead(Configuration conf,
            FileSystem fileSystem) throws IOException {
        SequenceFile.Reader reader =
                new SequenceFile.Reader(fileSystem, new Path("/sequence.seq"), conf);
        try {
            Text key = new Text();
            Text value = new Text();
            // next() refills key/value in place and returns false at end of file
            while (reader.next(key, value)) {
                System.out.println(key.toString() + " " + value.toString());
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }

    /**
     * Appends every file under F:/sequencefile (recursively) to /sequence.seq,
     * using the file name as key and its UTF-8 decoded content as value.
     *
     * @param conf       Hadoop configuration
     * @param fileSystem open HDFS handle (not closed by this method)
     * @throws IOException on write failure
     */
    @SuppressWarnings("deprecation")
    private static void sequenceFileWrite(Configuration conf,
            FileSystem fileSystem) throws IOException {
        /*
         * This Writer constructor is deprecated but still works. The third
         * argument is the target file name in HDFS; the fourth and fifth are
         * the key and value classes (Text here).
         */
        SequenceFile.Writer writer = new SequenceFile.Writer(fileSystem, conf,
                new Path("/sequence.seq"), Text.class, Text.class);
        try {
            // F:/sequencefile holds thousands of small files; the trailing
            // 'true' recurses into subdirectories, null = no extension filter.
            Collection<File> listFiles =
                    FileUtils.listFiles(new File("F:/sequencefile"), null, true);
            for (File file : listFiles) {
                Text key = new Text(file.getName());
                // read with an explicit charset instead of the deprecated
                // platform-default overload
                Text value = new Text(FileUtils.readFileToString(file, "UTF-8"));
                writer.append(key, value);
            }
        } finally {
            IOUtils.closeStream(writer);
        }
        System.out.println("ok");
    }
}
Hdfs的sequence.seq文件就是我通過sequenceFile上傳的文件
SequenceFile壓縮
SequenceFile支持兩種壓縮格式:CompressionType.BLOCK和CompressionType.RECORD。下面是我寫的例子,不過這個例子我沒有實際運行驗證。
package mapreduce;
import java.io.File;
import java.net.URI;
import java.util.Collection;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.zookeeper.common.IOUtils;
public class SequenceFileZipExample {

    /**
     * Writes a Gzip-compressed SequenceFile (/sequence.block) to HDFS from all
     * files under F:/sequencefile, using BLOCK compression.
     *
     * @param args unused
     * @throws Exception if the HDFS connection or any file I/O fails
     */
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        // 1: obtain a FileSystem handle for the remote HDFS cluster
        Configuration conf = new Configuration();
        URI uri = new URI("hdfs://192.168.79.139:9000");
        FileSystem fileSystem = FileSystem.get(uri, conf);
        // 2: SequenceFile write with compression
        Path path = new Path("/sequence.block");
        FSDataOutputStream out = fileSystem.create(path);
        // type may be CompressionType.RECORD or CompressionType.BLOCK
        CompressionType type = CompressionType.BLOCK;
        SequenceFile.Writer createWriter = SequenceFile.createWriter(
                conf, out, Text.class, Text.class, type, new GzipCodec());
        try {
            Collection<File> listFiles =
                    FileUtils.listFiles(new File("F:/sequencefile"), null, true);
            for (File file : listFiles) {
                Text key = new Text(file.getName());
                // explicit charset instead of the deprecated platform-default
                // overload
                Text value = new Text(FileUtils.readFileToString(file, "UTF-8"));
                createWriter.append(key, value);
            }
        } finally {
            // closing the writer also flushes the underlying output stream;
            // the original never closed the FileSystem at all
            IOUtils.closeStream(createWriter);
            fileSystem.close();
        }
        System.out.println("ok");
    }
}
MapFile
MapFile是排序後的SequenceFile,採用data+index的目錄結構。index是文件的數據索引,記錄部分record的key值以及該record在文件中的偏移位置;因此MapFile的檢索效率高,可以快速定位record的位置,但同時也會消耗額外的空間來存儲index數據。
另外需要注意兩點:
1.mapfile並不會記錄所有record的key值,因爲那樣數據量就太大了。默認是每128條record存儲一個key,這個間隔可以通過MapFile.Writer#setIndexInterval(interval)修改。
2.mapfile的keyclass必須實現WritableComparable,這樣就可以進行比較。
以下是使用mapFile讀寫文件的例子
package mapreduce;
import java.io.File;
import java.net.URI;
import java.util.Collection;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
public class MapFileExample {

    /**
     * Writes all files under F:/sequencefile into a MapFile (/mapfile.map,
     * a data + index pair on HDFS), then reads every record back and prints it.
     *
     * @param args unused
     * @throws Exception if the HDFS connection or any file I/O fails
     */
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        // 1: obtain a FileSystem handle for the remote HDFS cluster
        Configuration conf = new Configuration();
        URI uri = new URI("hdfs://192.168.79.139:9000");
        FileSystem fs = FileSystem.get(uri, conf);
        System.out.println(fs);
        try {
            // 2: MapFile write - key = file name, value = file content
            // (original had a stray ';;' after this constructor call)
            MapFile.Writer writer = new MapFile.Writer(conf, fs, "/mapfile.map",
                    Text.class, Text.class);
            try {
                Collection<File> listFiles =
                        FileUtils.listFiles(new File("F:/sequencefile"), null, true);
                for (File file : listFiles) {
                    Text key = new Text(file.getName());
                    // explicit charset instead of the deprecated
                    // platform-default overload
                    Text value = new Text(FileUtils.readFileToString(file, "UTF-8"));
                    writer.append(key, value);
                }
            } finally {
                IOUtils.closeStream(writer);
            }
            // 3: MapFile read - iterate every record in key order
            MapFile.Reader reader = new MapFile.Reader(fs, "/mapfile.map", conf);
            try {
                Text key = new Text();
                Text value = new Text();
                while (reader.next(key, value)) {
                    System.out.println(key.toString() + " " + value.toString());
                }
            } finally {
                IOUtils.closeStream(reader);
            }
        } finally {
            IOUtils.closeStream(fs);
        }
    }
}