MapRunnable設計一例


package org.apache.nutch.fetcher;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/**
 * Hadoop {@link MapRunnable} that probes hosts for WAP content.
 *
 * <p>For every input key (a host name) two candidate addresses are requested
 * over HTTP, {@code wap.<host>} and {@code www.<host>/wap}. A candidate is
 * written to the output, keyed by the candidate string with its
 * {@code Content-Type} as the value, when the response is {@code 200 OK} and
 * the content type contains {@code text/vnd.wap.wml} or
 * {@code application/xhtml+xml}.
 *
 * <p>Probing is done by up to {@value #MAX_THREADS} worker threads per map
 * task; {@link #run} does not return until all of them have finished.
 */
public class HostCheck extends Configured implements MapRunnable<Text, Text, Text, Text> {

  public static final Log LOG = LogFactory.getLog(HostCheck.class);

  /** Upper bound on concurrently running probe threads. */
  private static final int MAX_THREADS = 100;

  /** How long to wait between checks for a finished worker, in milliseconds. */
  private static final long POLL_INTERVAL_MS = 1000L;

  /** Connection and socket-read timeout, in milliseconds. */
  private static final int TIMEOUT_MS = 10000;

  OutputCollector<Text, Text> output;

  /** HTTP client shared by all probe threads. */
  static HttpClient httpClient;

  static {
    // The client is shared by many worker threads, so it needs a connection
    // manager that is designed for concurrent use. Its pool limits are raised
    // so that every worker can hold a connection at the same time.
    MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
    manager.getParams().setConnectionTimeout(TIMEOUT_MS);
    manager.getParams().setDefaultMaxConnectionsPerHost(MAX_THREADS);
    manager.getParams().setMaxTotalConnections(MAX_THREADS);
    httpClient = new HttpClient(manager);
  }

  /**
   * Reads every record from {@code input} and probes its key in a worker
   * thread, keeping at most {@value #MAX_THREADS} workers alive at once.
   *
   * <p>When all slots are busy the method waits for one to free up and then
   * starts the worker for the record it has already read, so no record is
   * skipped. After the input is exhausted it waits for the remaining workers,
   * so that all results are collected before the method returns.
   *
   * @param input    source of (host, value) records; only the key is used
   * @param output   collector that receives (candidate address, content type)
   * @param reporter pinged while waiting so the framework sees progress
   * @throws IOException if reading the input fails or the wait is interrupted
   */
  @Override
  public void run(RecordReader<Text, Text> input,
      OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    this.output = output;
    Text host = new Text();
    Text tmp = new Text();
    HashSet<Thread> active = new HashSet<Thread>();
    try {
      while (input.next(host, tmp)) {
        awaitFreeSlot(active, reporter);
        // host is reused by the reader, so hand the worker its own String copy.
        Thread worker = new Thread(new FetchThread(host.toString(), output));
        active.add(worker);
        worker.start();
      }
      // Drain: results must be collected before this method returns.
      for (Thread worker : active) {
        while (worker.isAlive()) {
          worker.join(POLL_INTERVAL_MS);
          reporter.progress();
        }
      }
    } catch (InterruptedException e) {
      // Restore the flag and surface the interruption to the framework.
      Thread.currentThread().interrupt();
      throw new IOException("Interrupted while waiting for fetch threads", e);
    }
  }

  /** Blocks until fewer than {@link #MAX_THREADS} workers in {@code active} are alive. */
  private static void awaitFreeSlot(HashSet<Thread> active, Reporter reporter)
      throws InterruptedException {
    while (active.size() >= MAX_THREADS) {
      Iterator<Thread> itr = active.iterator();
      while (itr.hasNext()) {
        if (!itr.next().isAlive()) {
          itr.remove();
        }
      }
      if (active.size() >= MAX_THREADS) {
        reporter.progress();
        Thread.sleep(POLL_INTERVAL_MS);
      }
    }
  }

  /**
   * Issues a GET request for {@code host} and returns the response's
   * {@code Content-Type} header value.
   *
   * <p>The request is sent with a {@code Nokia-N73/1.0} user agent and an
   * {@code Accept: text/vnd.wap.wml} header. If {@code host} carries no scheme,
   * {@code http://} is prepended for the request only; the log line still shows
   * the string as passed in.
   *
   * @param host address to request, with or without a scheme
   * @return the content type, or {@code null} if {@code host} is {@code null},
   *         the request fails, the status is not {@code 200 OK}, or the
   *         response has no {@code Content-Type} header
   */
  public static String checkHost(String host) {
    if (host == null) {
      return null;
    }
    // NOTE(review): a bare host name is presumably parsed by GetMethod as a
    // relative URI, leaving the client without a target host; confirm against
    // the Commons HttpClient 3.x documentation.
    String uri = host.indexOf("://") >= 0 ? host : "http://" + host;

    GetMethod getMethod = null;
    String type = null;
    try {
      getMethod = new GetMethod(uri);
      getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, Integer.valueOf(TIMEOUT_MS));
      getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
          new DefaultHttpMethodRetryHandler());
      getMethod.setRequestHeader("User-Agent", "Nokia-N73/1.0");
      getMethod.setRequestHeader("Accept", "text/vnd.wap.wml");

      int statusCode = httpClient.executeMethod(getMethod);
      if (statusCode == HttpStatus.SC_OK) {
        Header contentType = getMethod.getResponseHeader("Content-Type");
        if (contentType != null) {
          type = contentType.getValue();
          LOG.info(type + "\thost: " + host);
        }
      }
    } catch (Exception e) {
      // Best effort: an unreachable or invalid host simply yields no result.
      if (LOG.isDebugEnabled()) {
        LOG.debug("Host check failed for " + host, e);
      }
    } finally {
      if (getMethod != null) {
        getMethod.releaseConnection();
      }
    }
    return type;
  }

  /**
   * Worker that probes the two WAP candidate addresses of one host and
   * collects each one that answers with a WAP content type.
   */
  public static class FetchThread implements Runnable {

    String url;
    OutputCollector<Text, Text> output;

    /**
     * @param url    host name to probe; surrounding whitespace is trimmed
     * @param output collector shared with the other workers
     */
    public FetchThread(String url, OutputCollector<Text, Text> output) {
      this.url = url;
      this.output = output;
    }

    /**
     * Probes {@code wap.<host>} and {@code www.<host>/wap}. The two probes are
     * independent: a failed first probe does not skip the second.
     */
    @Override
    public void run() {
      if (url == null) {
        return;
      }
      String host = url.trim();
      probe("wap." + host);
      probe("www." + host + "/wap");
    }

    /** Checks one candidate and collects it if it serves a WAP content type. */
    private void probe(String candidate) {
      try {
        String type = HostCheck.checkHost(candidate);
        if (type == null || !isWapType(type)) {
          return;
        }
        // The collector is shared by all workers and is not known to be
        // thread-safe, so writes are serialized on it.
        synchronized (output) {
          output.collect(new Text(candidate), new Text(type));
        }
      } catch (Exception e) {
        LOG.warn("Failed to record result for " + candidate, e);
      }
    }

    /** Returns whether the content type names WML or XHTML content. */
    private static boolean isWapType(String type) {
      return type.indexOf("text/vnd.wap.wml") >= 0
          || type.indexOf("application/xhtml+xml") >= 0;
    }
  }

  /** No per-job configuration is needed. */
  @Override
  public void configure(JobConf job) {
  }

  /**
   * Configures and runs the host-check job.
   *
   * @param args {@code args[0]} is the input path (sequence file input),
   *             {@code args[1]} is the output path (sequence file of
   *             {@code Text} keys and values)
   */
  public void check(String[] args) {
    if (args == null || args.length < 2) {
      System.err.println("Usage: HostCheck <input path> <output path>");
      return;
    }
    Configuration conf = NutchConfiguration.create();
    JobConf job = new NutchJob(conf);
    job.setJobName("hostcheck ");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapRunnerClass(HostCheck.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    try {
      JobClient.runJob(job);
    } catch (IOException e) {
      LOG.error("hostcheck job failed", e);
    }
  }

  /** Command-line entry point; see {@link #check(String[])} for the arguments. */
  public static void main(String[] args) {
    new HostCheck().check(args);
  }

}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章