最近學習HADOOP,寫mapreduce,最簡單的單詞統計。
以前刷題,英語不好,很煩。
現在統計一下poj上單詞。
首先,抓取了編號1001到4000的英文題目,對數據進行清洗,把不需要的數字、中文和各種奇怪的符號都去掉。
然後直接跑mapreduce
統計出這3000道題才有不到800個不同的單詞,先看一下結果。
each 19
are 21
by 21
input 23
that 24
line 26
be 33
will 33
The 41
number 42
and 58
is 60
in 61
to 63
a 65
of 129
the 226
統計結果還是可以被參考的。
爬蟲(不太會,low)
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import org.junit.Test;
/**
 * Small crawler that downloads problem pages, extracts the English
 * description sections and appends their words to a local text file so that
 * they can later be fed into a word-count job.
 */
public class Pc {

    /** Opening tag that marks an English description section on a page. */
    private static final String SECTION_OPEN = "<div class=\"ptx\" lang=\"en-US\">";

    /** Closing tag that ends a description section. */
    private static final String SECTION_CLOSE = "</div>";

    /** File that all extracted words are appended to. */
    private static final String OUTPUT_FILE = "d:/poj11.txt";

    /**
     * Downloads the page at {@code urlString} and returns its content with
     * every line terminated by {@code "\n"}.
     *
     * @param urlString absolute HTTP URL of the page to download
     * @return the page content, or {@code null} if the URL is invalid or the
     *         download fails (the stack trace is printed in that case)
     */
    public static String getHtml(String urlString) {
        try {
            URL url = new URL(urlString);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            StringBuilder html = new StringBuilder();
            // try-with-resources closes the reader (and the underlying
            // stream) even when reading fails half-way through.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line).append("\n");
                }
            }
            return html.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Appends {@code str} to the output file.
     *
     * @param str text to append
     * @throws Exception if the file cannot be opened or written
     */
    public static void xiewenjian(String str) throws Exception {
        // The stream is opened in append mode and must be closed after every
        // write; otherwise thousands of file handles stay open during a crawl.
        try (FileOutputStream out = new FileOutputStream(OUTPUT_FILE, true)) {
            out.write(str.getBytes());
        }
    }

    /**
     * Reduces an HTML fragment to plain words separated by single spaces.
     *
     * <p>Tags and dots become spaces, any run of whitespace (including line
     * breaks) becomes one space, and every remaining character that is not an
     * ASCII letter or a space is dropped.
     *
     * @param raw HTML fragment taken from a description section
     * @return the cleaned text without leading or trailing spaces
     */
    static String cleanText(String raw) {
        // "<[^>]*>" matches one tag at a time. A greedy "<.+>" would also
        // swallow the text between the first and the last tag on a line.
        String text = raw.replaceAll("<[^>]*>", " ");
        text = text.replace('.', ' ');
        // Turn line breaks and tabs into spaces BEFORE deleting non-letters,
        // so that words on adjacent lines are not glued together.
        text = text.replaceAll("\\s+", " ");
        text = text.replaceAll("[^ a-zA-Z]", "");
        text = text.replaceAll(" +", " ");
        return text.trim();
    }

    /**
     * Downloads {@code url} once and appends the cleaned text of every
     * description section found at or after offset {@code k} to the output
     * file.
     *
     * @param url address of the page
     * @param k   offset in the page at which the search for sections starts
     * @throws Exception if writing to the output file fails
     */
    public void zhuaqu(String url, int k) throws Exception {
        String html = getHtml(url);
        if (html == null) {
            // Download failed; skip this page instead of aborting the crawl.
            return;
        }
        int from = k;
        while (true) {
            int open = html.indexOf(SECTION_OPEN, from);
            if (open == -1) {
                return;
            }
            int start = open + SECTION_OPEN.length();
            int end = html.indexOf(SECTION_CLOSE, start);
            if (end == -1) {
                // Unterminated section: nothing reliable left to extract.
                return;
            }
            String words = cleanText(html.substring(start, end));
            if (!words.isEmpty()) {
                // Leading space keeps the words of consecutive sections apart.
                xiewenjian(" " + words);
            }
            from = end;
        }
    }

    /**
     * Crawls problems 1001 to 4000 and appends their words to the output file.
     *
     * @param args unused
     * @throws Exception if writing to the output file fails
     */
    public static void main(String[] args) throws Exception {
        Pc crawler = new Pc();
        String baseUrl = "http://poj.org/problem?id=";
        for (int id = 1001; id <= 4000; id++) {
            String url = baseUrl + id;
            System.out.println(url);
            crawler.zhuaqu(url, 0);
        }
    }
}
統計
package cn.ky.mapreduce.sortwc;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * MapReduce job that counts how often each whitespace-separated word occurs
 * in the input and writes one {@code word<TAB>count} line per distinct word.
 */
public class Count {

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *             output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: Count <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Count.class);
        job.setMapperClass(SMap.class);
        // Summing is associative and the reducer's input and output types are
        // identical, so it can double as a combiner to shrink shuffle data.
        job.setCombinerClass(SReduce.class);
        job.setReducerClass(SReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Propagate job failure to the caller through the exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Emits {@code (word, 1)} for every non-empty token of an input line. */
    public static class SMap extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count written for every token; reused to avoid allocations. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Reusable output key. */
        private final Text outKey = new Text();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Split on any whitespace so tabs and repeated spaces do not
            // produce tokens that contain whitespace.
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    outKey.set(token);
                    context.write(outKey, ONE);
                }
            }
        }
    }

    /** Sums the counts received for a word and emits {@code (word, total)}. */
    public static class SReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable partial : values) {
                total += partial.get();
            }
            context.write(key, new IntWritable(total));
        }
    }
}
package cn.ky.mapreduce.sortwc;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * MapReduce job that reads {@code word<TAB>count} lines and writes them back
 * ordered by {@link Infbean#compareTo(Infbean)}; the ordering itself is done
 * by the framework's sort of the map output keys.
 */
public class Sort {

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *             output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: Sort <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Sort.class);
        job.setMapperClass(SMap.class);
        job.setReducerClass(SReduce.class);
        job.setMapOutputKeyClass(Infbean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Infbean.class);
        job.setOutputValueClass(NullWritable.class);
        // Keys are only sorted within one reducer; a single reducer is
        // required for the output to be globally ordered.
        job.setNumReduceTasks(1);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Propagate job failure to the caller through the exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Parses a {@code word<TAB>count} line into an {@link Infbean} key. */
    public static class SMap extends Mapper<LongWritable, Text, Infbean, NullWritable> {

        /** Reusable output key. */
        private final Infbean outKey = new Infbean();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Infbean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length < 2) {
                // Blank or incomplete line: nothing to sort.
                return;
            }
            outKey.set(fields[0], Integer.parseInt(fields[1].trim()));
            context.write(outKey, NullWritable.get());
        }
    }

    /** Writes the already-sorted keys to the output unchanged. */
    public static class SReduce extends Reducer<Infbean, NullWritable, Infbean, NullWritable> {

        @Override
        protected void reduce(Infbean key, Iterable<NullWritable> values,
                Reducer<Infbean, NullWritable, Infbean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // Emit once per value so that records whose keys compare as equal
            // are not collapsed into a single output line.
            for (NullWritable ignored : values) {
                context.write(key, NullWritable.get());
            }
        }
    }
}
自定義類型
package cn.ky.mapreduce.sortwc;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Writable key holding a word together with its occurrence count.
 *
 * <p>Instances order by ascending count and, for equal counts, by word.
 */
public class Infbean implements WritableComparable<Infbean> {

    private String word;
    private int count;

    /**
     * Sets both fields at once.
     *
     * @param word  the word
     * @param count number of occurrences of {@code word}
     */
    public void set(String word, int count) {
        this.word = word;
        this.count = count;
    }

    /** Reads the fields in the order {@link #write(DataOutput)} wrote them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.word = in.readUTF();
        this.count = in.readInt();
    }

    /** Writes the word followed by the count. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(word);
        out.writeInt(count);
    }

    /**
     * Orders by ascending count, breaking ties by word.
     *
     * <p>Returns 0 only when both count and word are equal, so the ordering
     * satisfies the {@link Comparable} contract and is consistent with
     * {@link #equals(Object)}.
     */
    @Override
    public int compareTo(Infbean o) {
        int byCount = Integer.compare(this.count, o.count);
        if (byCount != 0) {
            return byCount;
        }
        if (this.word == null) {
            // A missing word sorts before any present word.
            return o.word == null ? 0 : -1;
        }
        if (o.word == null) {
            return 1;
        }
        return this.word.compareTo(o.word);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Infbean)) {
            return false;
        }
        Infbean other = (Infbean) obj;
        boolean sameWord = (word == null) ? other.word == null : word.equals(other.word);
        return count == other.count && sameWord;
    }

    @Override
    public int hashCode() {
        // Value-based hash so that equal keys always hash alike.
        return 31 * (word == null ? 0 : word.hashCode()) + count;
    }

    /** Returns {@code word<TAB>count}, the form written to the job output. */
    @Override
    public String toString() {
        return this.word + "\t" + this.count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }
}
zip 1
zotz 1
writing 1
zac 1
yoxkin 1
AACEDGG 1
AT 1
An 1
yax 1
xul 1
x 1
cumhu 1
court 1
work 1
world 1
currently 1
wont 1
curve 1
Arrange 1
Another 1
counting 1
At 1
whose 1
who 1
whitespace 1
whilescanfsdsnEOF 1
whilescanfsdsn 1
while 1
counted 1
Case 1
when 1
wheather 1
whats 1
B 1
Businesses 1
water 1
Cartesian 1
wants 1
could 1
unambiguous 1
Consider 1
DAABEC 1
using 1
DNA 1
uppercase 1
upper 1
up 1
until 1
unsortedness 1
unsorted 1
unlucky 1
uayet 1
tzec 1
Dit 1
type 1
During 1
twentyfive 1
Exactly 1
F 1
trailing 1
trade 1
total 1
tonight 1
Figure 1
correspond 1
Have 1
From 1
tied 1
Further 1
through 1
Generate 1
thought 1
though 1
Ginos 1
thirteen 1
third 1
H 1
Hes 1
coordinates 1
coordinate 1
convert 1
task 1
However 1
Hut 1
Hyphens 1
th 1
I 1
tens 1
IONU 1
taxing 1
control 1
supplies 1
systems 1
sweep 1
surrounding 1
surprising 1
suppressed 1
constraints 1
starting 1
Imaging 1
Inc 1
stores 1
stopped 1
stock 1
Insignificant 1
step 1
statements 1
state 1
starts 1
debt 1
square 1
Instead 1
It 1
J 1
decided 1
sold 1
specifies 1
spaces 1
K 1
sortedwhile 1
sortedness 1
L 1
somehow 1
some 1
solve 1
solution 1
defined 1
denominations 1
signals 1
singlevalue 1
Leading 1
since 1
simple 1
concentration 1
encoding 1
outside 1
rather 1
check 1
occurrence 1
making 1
computer 1
session 1
automatically 1
run 1
cataloguing 1
boundary 1
minimum 1
civilization 1
request 1
look 1
likely 1
legal 1
Problems 1
justice 1
Some 1
back 1
sharper 1
believe 1
University 1
get 1
him 1
much 1
floating 1
Month 1
extra 1
mean 1
equal 1
seems 1
easier 1
removed 1
composed 1
records 1
alternating 1
Ruritania 1
lately 1
lot 1
knotted 1
sample 1
assume 1
punctuation 1
once 1
sabbatical 1
based 1
differences 1
occupy 1
Two 1
blank 1
alphabetical 1
holly 1
W 1
national 1
having 1
OUTPUT 1
fourth 1
rest 1
Where 1
Years 1
financial 1
match 1
few 1
Philately 1
excluding 1
phrase 1
actual 1
persons 1
endofline 1
equally 1
magnitude 1
performs 1
duplicate 1
made 1
divided 1
live 1
did 1
after 1
description 1
real 1
Rn 1
done 1
arranged 1
read 1
overhang 1
encounted 1
known 1
quality 1
kankin 1
local 1
job 1
purchasing 1
attempts 1
service 1
involving 1
Note 1
into 1
constraint 1
instance 1
THE 1
individual 1
discovered 1
including 1
NumberOfTheDay 1
immediately 1
problems 1
illustrated 1
problem 1
printed 1
second 1
hotel 1
O 1
brute 1
precision 1
her 1
necessarily 1
graduated 1
call 1
possible 1
capitalization 1
result 1
g 1
When 1
forms 1
separate 1
caused 1
eroding 1
floor 1
plural 1
finds 1
about 1
filled 1
achieve 1
field 1
P 1
fact 1
ends 1
experience 1
measured 1
examples 1
sell 1
every 1
phone 1
equivalent 1
requests 1
enough 1
added 1
comments 1
research 1
common 1
either 1
edge 1
edgedetected 1
compiling 1
mac 1
company 1
encodes 1
does 1
locations 1
discovery 1
Property 1
computed 1
allocates 1
dialed 1
Locations 1
detected 1
dont 1
lexicographical 1
remaining 1
reads 1
scanf 1
area 1
per 1
left 1
she 1
Ruritanian 1
shrinking 1
ascending 1
save 1
asks 1
exists 1
koyab 1
outputs 1
S 1
displayed 1
keypad 1
Satellite 1
purposes 1
religious 1
judged 1
orginal 1
Semicircle 1
NameOfTheDay 1
items 1
Service 1
issued 1
do 1
available 1
Sometimes 1
investigating 1
compute 1
inversion 1
Successive 1
Notice 1
Mapper 1
balances 1
Number 1
inside 1
River 1
information 1
That 1
processes 1
algorithm 1
indicated 1
Thus 1
includes 1
row 1
beexactly 1
To 1
begin 1
dial 1
believed 1
U 1
bisects 1
detection 1
born 1
rounded 1
hyphen 1
pax 1
V 1
shown 1
hoping 1
particular 1
hold 1
never 1
hired 1
past 1
highest 1
preceded 1
build 1
Mississippi 1
calculate 1
nearly 1
nd 1
nearest 1
collectors 1
grab 1
endoffile 1
calling 1
Waterloo 1
name 1
chen 1
generally 1
We 1
card 1
portfolio 1
cards 1
count 1
catalog 1
according 1
force 1
muan 1
responsible 1
Postal 1
ceh 1
YEAR 1
floatingpoint 1
column 1
centered 1
ZWQM 1
financing 1
please 1
finally 1
mol 1
files 1
except 1
figure 1
above 1
fewest 1
actually 1
fail 1
respect 1
absolute 1
entries 1
expressed 1
Pizza 1
doing 1
allocations 1
design 1
several 1
reverse 1
postage 1
group 1
greater 2
going 2
give 2
pop 2
point 2
follows 2
following 2
seven 2
respectively 2
pizza 2
message 2
mental 2
eznab 2
mem 2
erosion 2
mapping 2
sets 2
person 2
manik 2
make 2
period 2
edges 2
eb 2
duplicates 2
see 2
lost 2
due 2
dollar 2
penny 2
dialing 2
life 2
described 2
letter 2
lengths 2
pair 2
learned 2
large 2
lamat 2
know 2
other 2
kan 2
ix 2
inversions 2
old 2
intellectual 2
ok 2
series 2
indicating 2
indicates 2
included 2
nonnegative 2
sequences 2
none 2
hyphens 2
denomination 2
right 2
house 2
new 2
necessary 2
had 2
sign 2
smaller 2
list 2
software 2
muluk 2
spell 2
start 2
such 2
take 2
their 2
them 2
then 2
times 2
very 2
write 2
where 2
width 2
within 2
word 2
would 2
zeros 2
One 2
OF 2
between 2
ben 2
corresponding 2
being 2
RLE 2
Output 2
After 2
Q 2
both 2
As 2
T 2
TUTGLOP 2
G 2
consisting 2
Louisiana 2
businesses 2
They 2
considering 2
GINO 2
Your 2
Use 2
Ya 2
computation 2
compressed 2
Year 2
caban 2
bank 2
columns 2
Input 2
again 2
END 2
axis 2
average 2
allocation 2
alone 2
cib 2
C 2
D 2
cimi 2
characters 2
ahau 2
also 2
chuen 2
another 2
circle 2
chicchan 2
appear 2
cases 2
Dont 2
appears 2
Help 2
canac 2
E 3
giving 3
closing 3
twelve 3
exact 3
even 3
emotional 3
c 3
way 3
but 3
determine 3
time 3
consist 3
decimal 3
how 3
ik 3
contains 3
d 3
your 3
cycle 3
process 3
property 3
physical 3
below 3
peaks 3
may 3
No 3
akbal 3
been 3
out 3
all 3
string 3
integers 3
Since 3
these 3
balance 3
account 3
measure 3
M 3
There 3
issue 3
These 3
occur 3
professor 3
single 3
last 3
space 3
long 3
m 3
names 3
must 3
Y 3
answer 3
miles 3
still 3
month 3
current 3
used 4
like 4
want 4
test 4
least 4
called 4
they 4
Fred 4
exactly 4
imix 4
combination 4
length 4
many 4
i 4
its 4
people 4
periods 4
money 4
All 4
part 4
program 4
were 4
p 4
pixel 4
needs 4
customer 4
only 4
Larry 4
Maya 4
containing 4
X 4
consists 4
pixels 4
denoted 4
stamp 5
any 5
most 5
e 5
directory 5
letters 5
file 5
end 5
tie 5
N 5
semicircle 5
customers 5
land 5
best 5
occurs 5
he 5
can 5
RPS 5
R 5
four 6
standard 6
format 6
If 6
calendar 6
contain 6
there 6
than 6
beginning 6
Z 6
strings 6
memorable 6
images 6
set 6
In 6
digits 6
dates 6
months 6
sorted 6
Tzolkin 6
two 6
if 7
pairs 7
it 7
data 7
This 7
positive 7
You 7
followed 7
maximum 7
A 7
Haab 7
values 7
which 7
day 8
triple 8
cycles 8
sequence 8
Each 8
on 8
more 8
types 8
value 8
no 8
map 8
this 8
date 9
different 9
order 9
next 9
three 9
case 9
integer 9
was 9
lines 10
example 10
same 10
form 10
has 10
one 10
numbers 11
should 11
not 11
an 11
peak 11
n 11
stamps 11
print 11
have 11
his 12
first 12
with 13
image 13
you 13
or 13
given 13
from 14
For 14
telephone 14
year 14
days 15
for 15
at 16
output 16
as 18
each 19
are 21
by 21
input 23
that 24
line 26
be 33
will 33
The 41
number 42
and 58
is 60
in 61
to 63
a 65
of 129
the 226