1. 輸入數據
122.245.205.218^A
1450572279.254^A
hadoop-master.volitation.com^A
/BEIfeng.gif?
u_nu=1&
u_sd=DFBFABA3-9F0B-451F-B47C-782EDBFB5D90&
c_time=1450572272695&
ver=1&
en=e_l&
pl=website&
sdk=js&
b_rst=1440*900&
u_ud=DE9CBECE-D062-4486-A3A6-DFB2A04A3D28&
b_iev=Mozilla%2F5.0%20(Windows%20NT%206.1)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F31.0.1650.63%20Safari%2F537.36&
l=zh-CN
2. Mapper
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * ETL mapper that flattens one raw nginx access-log line into a single
 * comma-separated record emitted as the output key (value is NullWritable).
 *
 * Input line layout (fields separated by the literal two-character sequence "^A"):
 *   ip ^A serverTime ^A nginxHost ^A requestUrl
 * where requestUrl carries the event payload as URL-encoded query parameters,
 * e.g. "/BEIfeng.gif?u_nu=1&en=e_l&...".
 *
 * Output record: ip,serverTime,host,decodedValue1,decodedValue2,...
 * (parameter values only, in the order they appear in the query string).
 */
public class EtlMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** Regex matching the nginx field separator — the literal text "^A". */
    private static final String FIELD_SEPARATOR = "\\^A";

    /** Reused output key object (standard Hadoop object-reuse pattern). */
    private final Text keyOut = new Text();

    /**
     * Parses one log line and writes the flattened record.
     * Malformed lines (fewer than 4 fields) and value-less query parameters
     * are skipped instead of crashing the task with an
     * ArrayIndexOutOfBoundsException, as the original code would.
     *
     * IOException / InterruptedException are propagated to the framework
     * (Mapper.map declares them) rather than swallowed — silently dropping
     * records on a failed write would corrupt the ETL output.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(FIELD_SEPARATOR);
        if (fields.length < 4) {
            return; // malformed line — not enough fields, skip it
        }

        StringBuilder record = new StringBuilder();
        record.append(fields[0]).append(',');  // ip
        record.append(fields[1]).append(',');  // server time
        record.append(fields[2]).append(',');  // nginx host name

        // Split URL into path and query string; limit 2 keeps any '?' in values intact.
        String[] urlParts = fields[3].split("\\?", 2);
        if (urlParts.length == 2 && !urlParts[1].isEmpty()) {
            for (String pair : urlParts[1].split("&")) {
                String[] kv = pair.split("=", 2);
                if (kv.length < 2) {
                    continue; // parameter without a value — nothing to emit
                }
                try {
                    record.append(URLDecoder.decode(kv[1], "utf-8")).append(',');
                } catch (UnsupportedEncodingException e) {
                    // UTF-8 is mandated by the JVM spec; this is unreachable in practice.
                    throw new IllegalStateException("UTF-8 charset not supported", e);
                }
            }
        }

        record.deleteCharAt(record.length() - 1); // drop trailing comma
        keyOut.set(record.toString());
        context.write(keyOut, NullWritable.get());
    }
}
3. EtlExec
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and submits the map-only log-ETL job.
 *
 * Reads the raw access log from HDFS, runs {@link EtlMapper}, and writes the
 * flattened comma-separated records to the output directory. Exits with the
 * conventional status code: 0 on success, 1 on failure.
 */
public class EtlExec {

    /** HDFS location of the raw log file. */
    private static final String INPUT_PATH = "hdfs://master:8020/datas/dsLogFiles.txt";
    /** HDFS directory for the job output (must not already exist). */
    private static final String OUTPUT_PATH = "hdfs://master:8020/outDatas/dsLogFiles";

    public static void main(String[] args) throws IOException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);          // job instance
        job.setJarByClass(EtlExec.class);         // jar containing the job classes
        job.setMapperClass(EtlMapper.class);      // mapper implementation
        job.setMapOutputKeyClass(Text.class);     // map output key type (K2)
        job.setMapOutputValueClass(NullWritable.class); // map output value type (V2)
        // NOTE(review): no reducer class is set, so the default identity reducer
        // runs and sorts the records; if pass-through output is intended,
        // job.setNumReduceTasks(0) would skip the shuffle — confirm before changing.
        FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
        boolean success = job.waitForCompletion(true);
        // BUG FIX: the original exited with 1 on success and 0 on failure,
        // inverting the Unix convention and breaking any script/scheduler
        // that checks this process's exit status.
        System.exit(success ? 0 : 1);
    }
}
4. 輸出文件
122.245.205.218,
1450572279.254,
hadoop-master.volitation.com,
1,
DFBFABA3-9F0B-451F-B47C-782EDBFB5D90,
1450572272695,
1,
e_l,
website,
js,
1440*900,
DE9CBECE-D062-4486-A3A6-DFB2A04A3D28,
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36,
zh-CN