Converting text into vectors with Hadoop 2.3 + Mahout + Lucene

Download the Reuters dataset from http://www.daviddlewis.com/resources/testcollections/reuters21578/

References: http://www.shellsec.com/tech/63646.html and http://blog.chinaunix.net/uid-20761674-id-3535501.html


The jobs that are run (a minimal local-run sketch follows the list):

  1. Convert the text files into a SequenceFile
  2. Tokenization (DocumentProcessor::DocumentTokenizer)
  3. Word counting (DictionaryVectorizer::WordCount)
  4. Generate partial vectors (DictionaryVectorizer::MakePartialVectors)
  5. Merge the partial vectors (PartialVectorMerger)
  6. Compute the weight of each component of the document vectors with TF-IDF (VectorTfIdf Document Frequency Count)
  7. Prune the vector space (Prune Vectors), i.e. dimensionality reduction
  8. Merge the pruned partial vectors (PrunerPartialVectorMerger)
  9. Generate partial vectors again (MakePartialVectors)
  10. Merge the partial vectors again (PartialVectorMerge)
  11. Clustering (Cluster Iterator running)
  12. Classification (Cluster Classification Driver running)
  13. Output the clustering results (Representative Points Driver running)
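
Steps 1 and 2-10 correspond to the two Mahout drivers invoked later in this post (SequenceFilesFromDirectory and SparseVectorsFromSequenceFiles); steps 11-13 come from a separate clustering run that is not covered here. A minimal local-run sketch, assuming the Reuters articles have already been extracted to datafile/reuters-extracted and using datafile/reuters-vectors as an example output path:

import org.apache.mahout.text.SequenceFilesFromDirectory;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;

public class LocalVectorizationSketch {
    public static void main(String[] unused) throws Exception {
        // Step 1: plain-text directory -> SequenceFile (key = document id, value = content)
        SequenceFilesFromDirectory.main(new String[] {
                "-c", "UTF-8", "-i", "datafile/reuters-extracted/", "-o", "datafile/reuters-seqfiles"});

        // Steps 2-10: tokenize, count words, build TF and TF-IDF vectors, prune and merge
        SparseVectorsFromSequenceFiles.main(new String[] {
                "-i", "datafile/reuters-seqfiles", "-o", "datafile/reuters-vectors", "-wt", "tfidf"});
    }
}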


The generated vectorization output has the following directory structure (a sketch for reading these files follows the list):

  • df-count directory: document frequency information for the corpus
  • tf-vectors directory: document vectors weighted by TF
  • tfidf-vectors directory: document vectors weighted by TF-IDF
  • tokenized-documents directory: the tokenized documents
  • wordcount directory: global term occurrence counts
  • dictionary.file-0: the vocabulary (dictionary) of the corpus
  • frequency.file-0: the frequency information corresponding to the dictionary
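
All of these outputs are Hadoop SequenceFiles, so they can be inspected with a plain SequenceFile.Reader (mahout-math must be on the classpath for VectorWritable). A minimal sketch, assuming the output directory is reuters-vectors and the usual part-r-00000 file name; dictionary.file-0 holds Text/IntWritable pairs and tfidf-vectors holds Text/VectorWritable pairs:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.VectorWritable;

public class DumpVectors {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Dump the dictionary: term -> integer index used in the vectors.
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(new Path("reuters-vectors/dictionary.file-0")))) {
            Text term = new Text();
            IntWritable index = new IntWritable();
            while (reader.next(term, index)) {
                System.out.println(term + " -> " + index.get());
            }
        }

        // Dump the TF-IDF vectors: document name -> sparse vector.
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(new Path("reuters-vectors/tfidf-vectors/part-r-00000")))) {
            Text docId = new Text();
            VectorWritable vector = new VectorWritable();
            while (reader.next(docId, vector)) {
                System.out.println(docId + " : " + vector.get());
            }
        }
    }
}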

Running this in Eclipse requires the following Lucene dependencies:

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-benchmark</artifactId>
<version>4.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.6.1</version>
</dependency>

Extract the Reuters data

// uses java.io.File and org.apache.lucene.benchmark.utils.ExtractReuters (from lucene-benchmark)
public static void extractReuters() {
    File inputFolder = new File("datafile/reuters");            // directory with the unpacked *.sgm files
    File outputFolder = new File("datafile/reuters-extracted"); // one plain-text file per article is written here
    ExtractReuters extractor = new ExtractReuters(inputFolder, outputFolder);
    extractor.extract();
}

Convert the extracted Reuters data into a SequenceFile
public static void transformToSequenceFile() {
    Configuration config = BasicConfig.config();
    HdfsUtils hdfs = new HdfsUtils(BasicConfig.HDFS, config);

    // Local Mahout/Lucene jars that are pushed onto the Hadoop classpath via the distributed cache.
    String[] mahoutJars = {
        "/home/training/git/socialrecommendation/datafile/mahout-math-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/lucene-analyzers-common-4.6.1.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-integration-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT-job.jar",
    };

    try {
        hdfs.addJarToDistributedCache(Arrays.asList(mahoutJars), config);
    } catch (IOException e1) {
        e1.printStackTrace();
    }

    // Arguments for running on the cluster.
    String[] args = {"-c", "UTF-8", "-i", BasicConfig.HDFS + "/user/hdfs/userCF/reutersExtracted", "-o",
        BasicConfig.HDFS + "/user/hdfs/userCF/reutersSeqfiles"};

    // Arguments for running locally:
    /*String[] args = {"-c", "UTF-8", "-i", "datafile/reuters-extracted/", "-o",
        "datafile/reuters-seqfiles"};*/

    try {
        /*SequenceFilesFromDirectory.main(args);*/   // unmodified main(), local run only
        SequenceFilesFromDirectory job = new SequenceFilesFromDirectory();
        job.main(args, config);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
To make this job run on the Hadoop cluster (rather than only locally), SequenceFilesFromDirectory needs a few changes.

Change line 64, which reads

public static void main(String[] args) throws Exception {
    ToolRunner.run(new SequenceFilesFromDirectory(), args);
  }

to

  private static Configuration conf;

  public static void main(String[] args, Configuration config) throws Exception {
    conf = config;
    ToolRunner.run(new SequenceFilesFromDirectory(), args);
  }


Change line 84: HadoopUtil.delete(getConf(), output);

to: HadoopUtil.delete(conf, output);

Change line 89: //runSequential(getConf(), getInputPath(), output, options);

to: runSequential(conf, getInputPath(), output, options);

Change line 153:

     Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
      SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
      SequenceFileOutputFormat.class, "SequenceFilesFromDirectory", conf);

to

    Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
      SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
      SequenceFileOutputFormat.class, "SequenceFilesFromDirectory");

These changes are needed so that the Configuration can be passed into the job; the unmodified method apparently only supports local runs.

In addition, add the following method to AbstractJob.java:

  protected Job prepareJob(Path inputPath,
                           Path outputPath,
                           Class<? extends InputFormat> inputFormat,
                           Class<? extends Mapper> mapper,
                           Class<? extends Writable> mapperKey,
                           Class<? extends Writable> mapperValue,
                           Class<? extends OutputFormat> outputFormat,
                           String jobname, Configuration conf) throws IOException {

    Job job = HadoopUtil.prepareJob(inputPath, outputPath,
        inputFormat, mapper, mapperKey, mapperValue, outputFormat, conf);

    String name =
        jobname != null ? jobname : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);

    job.setJobName(name);
    return job;
  }


Vectorize the SequenceFile

public static void transformToVector(Long l) {
    Configuration config = BasicConfig.config();
    HdfsUtils hdfs = new HdfsUtils(BasicConfig.HDFS, config);

    // Local Mahout/Lucene jars pushed onto the Hadoop classpath via the distributed cache.
    String[] mahoutJars = {
        "/home/training/git/socialrecommendation/datafile/mahout-math-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/lucene-analyzers-common-4.6.1.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-integration-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT-job.jar",
    };

    try {
        hdfs.addJarToDistributedCache(Arrays.asList(mahoutJars), config);
    } catch (IOException e1) {
        e1.printStackTrace();
    }

    // -a: analyzer class, -chunk: dictionary chunk size in MB, -md: minimum document frequency,
    // -x: maximum document frequency percentage, -wt: weighting scheme (tfidf), -ml: minimum
    // log-likelihood ratio for n-grams, -ng: maximum n-gram size, -seq: sequential-access vectors.
    String[] args = {"-a", "org.apache.lucene.analysis.core.WhitespaceAnalyzer",
        "-chunk", "200", "-o", BasicConfig.HDFS + "/user/hdfs/userCF/" + l + "/reutersVectorsBigram",
        "-i", BasicConfig.HDFS + "/user/hdfs/userCF/reutersSeqfiles/", "-md", "3",
        "-x", "90", "-wt", "tfidf", "-ml", "50", "-ng", "2",
        "-seq"};

    try {
        SparseVectorsFromSequenceFiles job = new SparseVectorsFromSequenceFiles();
        job.main(args, config);
    } catch (Exception e) {
        e.printStackTrace();
    }
}


Similarly, in the SparseVectorsFromSequenceFiles class, change line 54 from

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
  }

to

  private static Configuration conf;

  public static void main(String[] args, Configuration config) throws Exception {
    conf = config;
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
  }

And at line 253, remove: Configuration conf = getConf();

Again, these changes are only so that the Configuration can be passed into the job; the unmodified method apparently only supports local runs.


In the HighDFWordsPruner class, change line 82:

    DistributedCache.setCacheFiles(new URI[]{dictionaryFilePath.toUri()}, conf);

to

    DistributedCache.addCacheFileAsFirstOne(dictionaryFilePath.toUri(), conf);

In the DistributedCache class, add the following method:

  /**
   * Add a file to be localized to the conf.  Intended
   * to be used by user code.
   * @param uri The uri of the cache to be localized
   * @param conf Configuration to add the cache to
   * @deprecated Use {@link Job#addCacheFile(URI)} instead
   */
  @Deprecated
  public static void addCacheFileAsFirstOne(URI uri, Configuration conf) {
    String files = conf.get(MRJobConfig.CACHE_FILES);
    conf.set(MRJobConfig.CACHE_FILES, files == null ? uri.toString() :  uri.toString()+ ","
             + files);
  }

The set method would wipe out the jar paths previously placed on the classpath (it overwrites the whole cache-file list), so an add-style method is used instead. The dictionary also has to be added as the first path, because the code that later reads the cache only takes the first entry; if it is not first, the job fails. The sketch below illustrates the difference.
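
A small, self-contained sketch of that behaviour (it assumes the patched DistributedCache above is on the classpath; the jar and dictionary paths are made-up examples):

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;

public class CacheFilesDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        URI dictionary = new Path("/user/hdfs/userCF/dictionary.file-0").toUri();

        // Pretend a jar has already been registered, as addJarToDistributedCache does.
        conf.set(MRJobConfig.CACHE_FILES, "/user/hadoop/lib/mahout/mahout-math-1.0-SNAPSHOT.jar");

        // setCacheFiles replaces the whole list: the jar entry above is lost.
        DistributedCache.setCacheFiles(new URI[]{dictionary}, conf);
        System.out.println(conf.get(MRJobConfig.CACHE_FILES));   // only the dictionary

        // The patched method prepends instead: the jar is kept and the dictionary
        // becomes the first entry, which is what the pruning mapper expects.
        conf.set(MRJobConfig.CACHE_FILES, "/user/hadoop/lib/mahout/mahout-math-1.0-SNAPSHOT.jar");
        DistributedCache.addCacheFileAsFirstOne(dictionary, conf);
        System.out.println(conf.get(MRJobConfig.CACHE_FILES));   // dictionary first, then the jar
    }
}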


For reference, this is the YARN configuration used above:

package com.hp.recommendation.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class BasicConfig {

    public static final String HDFS = "hdfs://c0004649.itcs.hp.com:9000";

    public static final String YARN_RESOURCE = "c0004650.itcs.hp.com";

    public static Configuration config() {
        Configuration conf = new YarnConfiguration();
        conf.set("fs.defaultFS", BasicConfig.HDFS);
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.address", BasicConfig.YARN_RESOURCE + ":8032");
        conf.set("yarn.resourcemanager.scheduler.address", BasicConfig.YARN_RESOURCE + ":8030");
        // Third-party jars could also be added like this, but those jars would have to be
        // copied to every node of the cluster, which is cumbersome:
        //conf.set("mapreduce.application.classpath", "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,/opt/mount/learn/mahout-1.0-lib/*");
        return conf;
    }
}

And this is the utility class used to work with files on HDFS:

package com.hp.recommendation.util;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;

import com.hp.recommendation.model.WriteDataToHDFSModel;

/**
 * 
 * @author shijie
 * ref:http://blog.fens.me/hadoop-mahout-mapreduce-itemcf/
 */
public class HdfsUtils {
    private static final String HDFS = "hdfs://c0004649.itcs.hp.com:9000";

    public HdfsUtils(Configuration conf) {
        this(HDFS, conf);
    }

    public HdfsUtils(String hdfs, Configuration conf) {
        this.hdfsPath = hdfs;
        this.conf = conf;
    }


    private String hdfsPath;
    private Configuration conf;
    public static void getConf() {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS);
        FileSystem hdfs;
        try {
            hdfs = FileSystem.get(conf);
            FileStatus[] fs = hdfs.listStatus(new Path("/"));
            for (int i = 0; i < fs.length; i++) {
                System.out.println(fs[i].toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void mkdirs(String folder) throws IOException {
        Path path = new Path(folder);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        if (!fs.exists(path)) {
            fs.mkdirs(path);
            System.out.println("Create: " + folder);
        }
        fs.close();
    }

    public void rmr(String folder) throws IOException {
        Path path = new Path(folder);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        fs.deleteOnExit(path);
        System.out.println("Delete: " + folder);
        fs.close();
    }

    public void ls(String folder) throws IOException {
        Path path = new Path(folder);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        FileStatus[] list = fs.listStatus(path);
        System.out.println("ls: " + folder);
        System.out.println("==========================================================");
        for (FileStatus f : list) {
            System.out.printf("name: %s, folder: %s, size: %d\n", f.getPath(), f.isDir(), f.getLen());
        }
        System.out.println("==========================================================");
        fs.close();
    }

    public void createFile(String file, String content) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        byte[] buff = content.getBytes();
        FSDataOutputStream os = null;
        try {
            os = fs.create(new Path(file));
            os.write(buff, 0, buff.length);
            System.out.println("Create: " + file);
        } finally {
            if (os != null)
                os.close();
        }
        fs.close();
    }






    public void copyFile(String local, String remote) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        fs.copyFromLocalFile(new Path(local), new Path(remote));
        System.out.println("copy from: " + local + " to " + remote);
        fs.close();
    }

    public void download(String remote, String local) throws IOException {
        Path path = new Path(remote);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        fs.copyToLocalFile(path, new Path(local));
        System.out.println("download: from" + remote + " to " + local);
        fs.close();
    }
    public void cat(String remoteFile) throws IOException {
        Path path = new Path(remoteFile);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        FSDataInputStream fsdis = null;
        System.out.println("cat: " + remoteFile);
        try {  
            fsdis =fs.open(path);
            IOUtils.copyBytes(fsdis, System.out, 4096, false);  
          } finally {  
            IOUtils.closeStream(fsdis);
            fs.close();
          }
    }

    public String getFile(String remoteFile) throws IOException {

        Path path = new Path(remoteFile);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        FSDataInputStream fsdis = null;
        System.out.println("cat: " + remoteFile);
        BufferedInputStream buffer = null;
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        String str = null;

        try {
            fsdis =fs.open(path);
            buffer = new BufferedInputStream(fsdis);
            int BUFFER_SIZE = 4096;
            byte[] data = new byte[BUFFER_SIZE];
            int count = -1;
            while((count = buffer.read(data, 0, BUFFER_SIZE)) != -1){
            outStream.write(data, 0, count);
            }
            data = null;
            str = new String(outStream.toByteArray());
        } finally {
            if (buffer != null) {
                buffer.close();
            }
            outStream.close();
            fs.close();
        }
        return str;
    }


    


    public void writeFileToHDFS(String source, String dest) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);

        // Get the filename out of the file path
        String filename = source.substring(source.lastIndexOf('/') + 1, source.length());

        // Create the destination path including the filename.
        if (dest.charAt(dest.length() - 1) != '/') {
            dest = dest + "/" + filename;
        } else {
            dest = dest + filename;
        }

        // Check if the file already exists
        Path path = new Path(dest);
        if (fs.exists(path)) {
            System.out.println("File " + dest + " already exists");
            return;
        }

        // Create a new file and write data to it.
        FSDataOutputStream out = fs.create(path);
        InputStream in = new BufferedInputStream(new FileInputStream(new File(source)));

        byte[] b = new byte[1024];
        int numBytes = 0;
        while ((numBytes = in.read(b)) > 0) {
            out.write(b, 0, numBytes);
        }

        // Close all the file descriptors
        in.close();
        out.close();
        fs.close();
    }


    public WriteDataToHDFSModel getFileSystemAndFSDataOutputStream(String file) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);

        // Check if the file already exists
        Path path = new Path(file);
        WriteDataToHDFSModel model = new WriteDataToHDFSModel();
        if (fs.exists(path)) {
            System.out.println("File " + file + " already exists");
            FSDataOutputStream out = fs.append(path);
            model.setOut(out);
        } else {
            // Create a new file and write data to it.
            FSDataOutputStream out = fs.create(path);
            model.setOut(out);
        }

        model.setFs(fs);
        return model;
    }

    public void writeStringToFile(String content, String file) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);

        // Check if the file already exists
        Path path = new Path(file);
        if (fs.exists(path)) {
            System.out.println("File " + file + " already exists");
            FSDataOutputStream out = fs.append(path);
            BufferedOutputStream buffer = new BufferedOutputStream(out);
            buffer.write(content.getBytes());
            buffer.flush();
            // Close all the file descriptors
            buffer.close();
            out.close();
        } else {
            // Create a new file and write data to it.
            FSDataOutputStream out = fs.create(path);
            BufferedOutputStream buffer = new BufferedOutputStream(out);
            buffer.write(content.getBytes());
            buffer.flush();
            // Close all the file descriptors
            buffer.close();
            out.close();
        }

        fs.close();
    }

    public static void addJarToDistributedCache(Class classToAdd, Configuration conf) throws IOException {
        // Retrieve the jar file that contains classToAdd
        String jar = classToAdd.getProtectionDomain().getCodeSource().getLocation().getPath();
        System.out.println("jar=" + jar);
        File jarFile = new File(jar);

        // Declare new HDFS location
        Path hdfsJar = new Path("/user/hadoop/lib/mahout/" + jarFile.getName());

        // Mount HDFS
        FileSystem hdfs = FileSystem.get(conf);

        // Copy (override) jar file to HDFS
        hdfs.copyFromLocalFile(false, true, new Path(jar), hdfsJar);

        // Add jar to the distributed classpath
        DistributedCache.addFileToClassPath(hdfsJar, conf);
    }

    // Add jars to the classpath by their local paths
    public static void addJarToDistributedCache(List<String> jarPaths, Configuration conf) throws IOException {
        // Mount HDFS
        FileSystem hdfs = FileSystem.get(conf);
        for (String jar : jarPaths) {
            File jarFile = new File(jar);

            // Declare new HDFS location
            Path hdfsJar = new Path("/user/hadoop/lib/mahout/" + jarFile.getName());

            // Copy the jar file to HDFS if it is not already there
            if (!hdfs.exists(hdfsJar)) {
                hdfs.copyFromLocalFile(false, true, new Path(jar), hdfsJar);
            }

            // Add jar to the distributed classpath
            DistributedCache.addFileToClassPath(hdfsJar, conf);
        }
    }

}
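
For completeness, a small sketch of how these utilities are used together (the local jar path and HDFS paths are just examples; the real ones appear in the code earlier in this post):

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;

import com.hp.recommendation.util.BasicConfig;
import com.hp.recommendation.util.HdfsUtils;

public class HdfsUtilsExample {
    public static void main(String[] args) throws Exception {
        Configuration config = BasicConfig.config();
        HdfsUtils hdfs = new HdfsUtils(BasicConfig.HDFS, config);

        // Ship a locally built Mahout jar to HDFS and put it on the job classpath.
        HdfsUtils.addJarToDistributedCache(
                Arrays.asList("/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT.jar"),
                config);

        // Upload the extracted Reuters articles and inspect the target directory.
        hdfs.mkdirs("/user/hdfs/userCF");
        hdfs.copyFile("datafile/reuters-extracted", "/user/hdfs/userCF/reutersExtracted");
        hdfs.ls("/user/hdfs/userCF");
    }
}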



