# ===== mapper.awk =====
#!/bin/awk -f
# Mapper: tally occurrences of each comma-separated name in field 2.
# Input : whitespace-separated records; field 2 looks like "name1,name2,...".
# Output: one "name count" pair per distinct name (iteration order of
#         for-in is unspecified, same as the original).
BEGIN {
    RS = "\n";   # record separator (awk default, kept explicit)
    FS = " ";    # input field separator
    OFS = " ";   # output field separator
}
{
    # Split the second field on commas; split() returns the element count,
    # so iterate by index and bump a counter for every name seen.
    n = split($2, names, ",");
    for (i = 1; i <= n; i++) {
        tally[names[i]]++;
    }
}
END {
    for (name in tally) {
        print name, tally[name];
    }
}
# ===== reducer.awk =====
#!/bin/awk -f
# Reducer: sum the per-name partial counts emitted by the mapper and print
# "count name", decoding '|' back into spaces inside the name.
BEGIN {
    RS = "\n";   # record separator (awk default, kept explicit)
    FS = " ";    # input field separator
    OFS = " ";   # output field separator
}
{
    # $1 = name key, $2 = partial count coming from one mapper.
    totals[$1] += $2;
}
END {
    for (key in totals) {
        total = totals[key];
        # '|' presumably encodes embedded spaces upstream — restore them
        # before printing (TODO confirm against the producer of the input).
        gsub(/\|/, " ", key);
        print total, key;
    }
}
# ===== hdpsubmit.sh =====
# Submit the streaming job.
# Required env: HADOOP_HOME, MAPRED_SCRIPT_DIR, yestoday (date partition of the logs).
# Fail fast with a clear message instead of running hadoop with empty paths.
: "${HADOOP_HOME:?HADOOP_HOME must be set}"
: "${MAPRED_SCRIPT_DIR:?MAPRED_SCRIPT_DIR must be set}"
: "${yestoday:?yestoday (log date partition) must be set}"

# Notes on the fixes below:
# * Every expansion is quoted so paths with spaces/globs cannot break the
#   command line.
# * The -input pattern is quoted so the LOCAL shell does not expand the '*'
#   against the local filesystem — Hadoop must receive the pattern and expand
#   it against HDFS itself.
# * mapper/reducer are shipped to the task nodes via -file, so they must be
#   referenced by basename (they land in each task's working directory);
#   the previous absolute $MAPRED_SCRIPT_DIR paths do not exist on the nodes.
"$HADOOP_HOME/bin/hadoop" jar "$HADOOP_HOME/contrib/streaming/hadoop-0.20.1.12-fb-streaming.jar" \
  -D mapred.job.name="log_$yestoday" \
  -D mapred.ignore.badcompress=true \
  -D mapred.compress.map.output=true \
  -D mapred.map.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
  -D mapred.output.compress=true \
  -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
  -D mapred.linerecordreader.maxlength=4096000 \
  -numReduceTasks 24 \
  -mapper mapper.awk \
  -reducer reducer.awk \
  -file "$MAPRED_SCRIPT_DIR/mapper.awk" \
  -file "$MAPRED_SCRIPT_DIR/reducer.awk" \
  -input "/home/hdp-zhushou/update_log/$yestoday/*" \
  -output output_update_log/