3000道POJ英文題中高頻詞彙--HADOOP單詞統計

最近學習HADOOP,寫mapreduce,最簡單的單詞統計。

以前刷題,英語不好,很煩。

現在統計一下poj上單詞。

首先,抓取了編號1001--4000的英文題目,對數據進行清洗,把不需要的數字、中文、各種奇怪的符號都去掉。

然後直接跑mapreduce

統計出這3000道題纔有不到800個不同的單詞,先看一下結果。

each	19
are	21
by	21
input	23
that	24
line	26
be	33
will	33
The	41
number	42
and	58
is	60
in	61
to	63
a	65
of	129
the	226


統計結果還是可以被參考的。

爬蟲(不太會,low)

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import org.junit.Test;

public class Pc {

	/**
	 * Downloads the page at the given URL and returns its HTML as a string,
	 * one source line per "\n"-terminated line, or null on any I/O error.
	 */
	public static String getHtml(String urlString) {
		try {
			StringBuilder html = new StringBuilder();
			URL url = new URL(urlString);
			HttpURLConnection conn = (HttpURLConnection) url.openConnection();
			// try-with-resources closes the reader (and underlying stream)
			// even if readLine throws; the original closed it only on success.
			try (BufferedReader br = new BufferedReader(
					new InputStreamReader(conn.getInputStream()))) {
				String temp;
				while ((temp = br.readLine()) != null) {
					html.append(temp).append("\n");
				}
			}
			return html.toString();
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Appends the given text to d:/poj11.txt.
	 * Removed the original's @Test annotation: JUnit cannot run a static
	 * method that takes a parameter, so the annotation was dead weight.
	 */
	public static void xiewenjian(String str) throws Exception {
		// try-with-resources: the original never closed this stream (leak).
		try (FileOutputStream out = new FileOutputStream("d:/poj11.txt", true)) {
			out.write(str.getBytes());
		}
	}

	/**
	 * Reduces a problem-statement fragment to plain English words:
	 * dots become spaces, HTML tags are stripped, every character that is
	 * not a letter or a space is dropped, and runs of spaces collapse to one.
	 */
	public static String clean(String str) {
		String s = str.replaceAll("\\.", " ");
		// The original used greedy "<.+>", which removed everything between
		// the first '<' and the last '>' on a line, deleting real words
		// between adjacent tags. "<[^>]*>" matches exactly one tag.
		s = s.replaceAll("<[^>]*>", " ");
		// Drop non-letters BEFORE collapsing spaces, so removed tokens
		// (e.g. numbers) cannot leave double spaces behind as before.
		s = s.replaceAll("[^ a-zA-Z]", "");
		return s.replaceAll(" +", " ");
	}

	/**
	 * Extracts every problem statement (the div with class "ptx") from the
	 * page at {@code url}, starting the search at offset {@code k}, cleans
	 * it and appends it to the output file. The original recursed and called
	 * getHtml again on every recursion, re-downloading the page once per
	 * statement; this version fetches the page a single time and loops.
	 */
	public void zhuaqu(String url, int k) throws Exception {
		String s = getHtml(url);
		if (s == null) {
			return; // download failed; the original would have thrown an NPE
		}
		String head = "<div class=\"ptx\" lang=\"en-US\">";
		String tail = "</div>";
		int from = k;
		while (true) {
			int h1 = s.indexOf(head, from);
			if (h1 == -1) {
				return;
			}
			int x = h1 + head.length();
			int h2 = s.indexOf(tail, x);
			if (h2 == -1) {
				return; // unterminated div; nothing more to extract
			}
			xiewenjian(" " + clean(s.substring(x, h2)));
			from = h2;
		}
	}

	/** Crawls POJ problems 1001..4000 and appends their cleaned text. */
	public static void main(String[] args) throws Exception {
		Pc p = new Pc();
		String base = "http://poj.org/problem?id=";
		for (int i = 1001; i <= 4000; i++) {
			String url = base + i;
			System.out.println(url); // progress indicator
			p.zhuaqu(url, 0);
		}
	}

}


統計

package cn.ky.mapreduce.sortwc;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Count {
	
	public static void main(String[] args) throws Exception {
		
		Configuration conf = new Configuration();
		
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(Count.class);
		job.setMapperClass(SMap.class);
		job.setReducerClass(SReduce.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.waitForCompletion(true);
	}
	
	public static class SMap extends Mapper<LongWritable, Text, Text, IntWritable>{

		Text k=new Text();
		Infbean v=new Infbean();
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			String line=value.toString();
			String[] words=line.split(" ");
			for(String word:words) {
				if(word.equals("")==false) {
					k.set(word);
					context.write(k, new IntWritable(1));
				}
				
			}
		}
	}
	
	public static class SReduce extends Reducer<Text, IntWritable, Text, IntWritable>{
		
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
			int sum=0;
			for(IntWritable bean:values) {
				sum+=bean.get();
			}
			context.write(key, new IntWritable(sum));
		}
		
		
	}
	
	
	
}


排序
package cn.ky.mapreduce.sortwc;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class Sort {

	/**
	 * Second job: reads the (word \t count) output of the Count job and
	 * re-emits it ordered by Infbean's compareTo during the shuffle sort.
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Sort.class);
		job.setMapperClass(SMap.class);
		job.setReducerClass(SReduce.class);

		job.setMapOutputKeyClass(Infbean.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Infbean.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Propagate job success/failure as the process exit code; the
		// original discarded waitForCompletion's return value.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	/** Parses "word \t count" lines into Infbean map-output keys. */
	public static class SMap extends Mapper<LongWritable, Text, Infbean, NullWritable> {

		// Reused across map() calls; context.write serializes it immediately.
		private final Infbean v = new Infbean();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] str = value.toString().split("\t");
			// Skip blank/malformed lines instead of killing the whole job
			// with ArrayIndexOutOfBoundsException as the original would.
			if (str.length < 2) {
				return;
			}
			v.set(str[0], Integer.parseInt(str[1].trim()));
			context.write(v, NullWritable.get());
		}
	}

	/** Identity reducer: the framework has already sorted the keys. */
	public static class SReduce extends Reducer<Infbean, NullWritable, Infbean, NullWritable> {

		@Override
		protected void reduce(Infbean key, Iterable<NullWritable> values,
				Context context) throws IOException, InterruptedException {
			context.write(key, NullWritable.get());
		}
	}

}

自定義類型

package cn.ky.mapreduce.sortwc;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

/**
 * Hadoop key type carrying a (word, count) pair; sorted ascending by count.
 * Hadoop instantiates it reflectively via the implicit no-arg constructor.
 */
public class Infbean implements WritableComparable<Infbean> {

	private String word;
	private int count;

	/** Sets both fields at once (used by the Sort mapper). */
	public void set(String word, int count) {
		this.word = word;
		this.count = count;
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		// Must read fields in the exact order write() emits them.
		this.word = in.readUTF();
		this.count = in.readInt();
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(word);
		out.writeInt(count);
	}

	/**
	 * Ascending by count, ties broken alphabetically by word.
	 * The original returned only 1 or -1 and answered -1 for equal counts,
	 * violating the Comparable contract (sgn(a.compareTo(b)) must equal
	 * -sgn(b.compareTo(a))), which can make sort/merge results unstable.
	 * Count's output has one line per distinct word, so (word, count)
	 * pairs are unique and the tie-break does not collapse real records.
	 */
	@Override
	public int compareTo(Infbean o) {
		int c = Integer.compare(this.count, o.count);
		return c != 0 ? c : this.word.compareTo(o.word);
	}

	@Override
	public String toString() {
		return this.word + "\t" + this.count;
	}

	public String getWord() {
		return word;
	}

	public void setWord(String word) {
		this.word = word;
	}

	public int getCount() {
		return count;
	}

	public void setCount(int count) {
		this.count = count;
	}

}


最後全部統計結果
zip	1
zotz	1
writing	1
zac	1
yoxkin	1
AACEDGG	1
AT	1
An	1
yax	1
xul	1
x	1
cumhu	1
court	1
work	1
world	1
currently	1
wont	1
curve	1
Arrange	1
Another	1
counting	1
At	1
whose	1
who	1
whitespace	1
whilescanfsdsnEOF	1
whilescanfsdsn	1
while	1
counted	1
Case	1
when	1
wheather	1
whats	1
B	1
Businesses	1
water	1
Cartesian	1
wants	1
could	1
unambiguous	1
Consider	1
DAABEC	1
using	1
DNA	1
uppercase	1
upper	1
up	1
until	1
unsortedness	1
unsorted	1
unlucky	1
uayet	1
tzec	1
Dit	1
type	1
During	1
twentyfive	1
Exactly	1
F	1
trailing	1
trade	1
total	1
tonight	1
Figure	1
correspond	1
Have	1
From	1
tied	1
Further	1
through	1
Generate	1
thought	1
though	1
Ginos	1
thirteen	1
third	1
H	1
Hes	1
coordinates	1
coordinate	1
convert	1
task	1
However	1
Hut	1
Hyphens	1
th	1
I	1
tens	1
IONU	1
taxing	1
control	1
supplies	1
systems	1
sweep	1
surrounding	1
surprising	1
suppressed	1
constraints	1
starting	1
Imaging	1
Inc	1
stores	1
stopped	1
stock	1
Insignificant	1
step	1
statements	1
state	1
starts	1
debt	1
square	1
Instead	1
It	1
J	1
decided	1
sold	1
specifies	1
spaces	1
K	1
sortedwhile	1
sortedness	1
L	1
somehow	1
some	1
solve	1
solution	1
defined	1
denominations	1
signals	1
singlevalue	1
Leading	1
since	1
simple	1
concentration	1
encoding	1
outside	1
rather	1
check	1
occurrence	1
making	1
computer	1
session	1
automatically	1
run	1
cataloguing	1
boundary	1
minimum	1
civilization	1
request	1
look	1
likely	1
legal	1
Problems	1
justice	1
Some	1
back	1
sharper	1
believe	1
University	1
get	1
him	1
much	1
floating	1
Month	1
extra	1
mean	1
equal	1
seems	1
easier	1
removed	1
composed	1
records	1
alternating	1
Ruritania	1
lately	1
lot	1
knotted	1
sample	1
assume	1
punctuation	1
once	1
sabbatical	1
based	1
differences	1
occupy	1
Two	1
blank	1
alphabetical	1
holly	1
W	1
national	1
having	1
OUTPUT	1
fourth	1
rest	1
Where	1
Years	1
financial	1
match	1
few	1
Philately	1
excluding	1
phrase	1
actual	1
persons	1
endofline	1
equally	1
magnitude	1
performs	1
duplicate	1
made	1
divided	1
live	1
did	1
after	1
description	1
real	1
Rn	1
done	1
arranged	1
read	1
overhang	1
encounted	1
known	1
quality	1
kankin	1
local	1
job	1
purchasing	1
attempts	1
service	1
involving	1
Note	1
into	1
constraint	1
instance	1
THE	1
individual	1
discovered	1
including	1
NumberOfTheDay	1
immediately	1
problems	1
illustrated	1
problem	1
printed	1
second	1
hotel	1
O	1
brute	1
precision	1
her	1
necessarily	1
graduated	1
call	1
possible	1
capitalization	1
result	1
g	1
When	1
forms	1
separate	1
caused	1
eroding	1
floor	1
plural	1
finds	1
about	1
filled	1
achieve	1
field	1
P	1
fact	1
ends	1
experience	1
measured	1
examples	1
sell	1
every	1
phone	1
equivalent	1
requests	1
enough	1
added	1
comments	1
research	1
common	1
either	1
edge	1
edgedetected	1
compiling	1
mac	1
company	1
encodes	1
does	1
locations	1
discovery	1
Property	1
computed	1
allocates	1
dialed	1
Locations	1
detected	1
dont	1
lexicographical	1
remaining	1
reads	1
scanf	1
area	1
per	1
left	1
she	1
Ruritanian	1
shrinking	1
ascending	1
save	1
asks	1
exists	1
koyab	1
outputs	1
S	1
displayed	1
keypad	1
Satellite	1
purposes	1
religious	1
judged	1
orginal	1
Semicircle	1
NameOfTheDay	1
items	1
Service	1
issued	1
do	1
available	1
Sometimes	1
investigating	1
compute	1
inversion	1
Successive	1
Notice	1
Mapper	1
balances	1
Number	1
inside	1
River	1
information	1
That	1
processes	1
algorithm	1
indicated	1
Thus	1
includes	1
row	1
beexactly	1
To	1
begin	1
dial	1
believed	1
U	1
bisects	1
detection	1
born	1
rounded	1
hyphen	1
pax	1
V	1
shown	1
hoping	1
particular	1
hold	1
never	1
hired	1
past	1
highest	1
preceded	1
build	1
Mississippi	1
calculate	1
nearly	1
nd	1
nearest	1
collectors	1
grab	1
endoffile	1
calling	1
Waterloo	1
name	1
chen	1
generally	1
We	1
card	1
portfolio	1
cards	1
count	1
catalog	1
according	1
force	1
muan	1
responsible	1
Postal	1
ceh	1
YEAR	1
floatingpoint	1
column	1
centered	1
ZWQM	1
financing	1
please	1
finally	1
mol	1
files	1
except	1
figure	1
above	1
fewest	1
actually	1
fail	1
respect	1
absolute	1
entries	1
expressed	1
Pizza	1
doing	1
allocations	1
design	1
several	1
reverse	1
postage	1
group	1
greater	2
going	2
give	2
pop	2
point	2
follows	2
following	2
seven	2
respectively	2
pizza	2
message	2
mental	2
eznab	2
mem	2
erosion	2
mapping	2
sets	2
person	2
manik	2
make	2
period	2
edges	2
eb	2
duplicates	2
see	2
lost	2
due	2
dollar	2
penny	2
dialing	2
life	2
described	2
letter	2
lengths	2
pair	2
learned	2
large	2
lamat	2
know	2
other	2
kan	2
ix	2
inversions	2
old	2
intellectual	2
ok	2
series	2
indicating	2
indicates	2
included	2
nonnegative	2
sequences	2
none	2
hyphens	2
denomination	2
right	2
house	2
new	2
necessary	2
had	2
sign	2
smaller	2
list	2
software	2
muluk	2
spell	2
start	2
such	2
take	2
their	2
them	2
then	2
times	2
very	2
write	2
where	2
width	2
within	2
word	2
would	2
zeros	2
One	2
OF	2
between	2
ben	2
corresponding	2
being	2
RLE	2
Output	2
After	2
Q	2
both	2
As	2
T	2
TUTGLOP	2
G	2
consisting	2
Louisiana	2
businesses	2
They	2
considering	2
GINO	2
Your	2
Use	2
Ya	2
computation	2
compressed	2
Year	2
caban	2
bank	2
columns	2
Input	2
again	2
END	2
axis	2
average	2
allocation	2
alone	2
cib	2
C	2
D	2
cimi	2
characters	2
ahau	2
also	2
chuen	2
another	2
circle	2
chicchan	2
appear	2
cases	2
Dont	2
appears	2
Help	2
canac	2
E	3
giving	3
closing	3
twelve	3
exact	3
even	3
emotional	3
c	3
way	3
but	3
determine	3
time	3
consist	3
decimal	3
how	3
ik	3
contains	3
d	3
your	3
cycle	3
process	3
property	3
physical	3
below	3
peaks	3
may	3
No	3
akbal	3
been	3
out	3
all	3
string	3
integers	3
Since	3
these	3
balance	3
account	3
measure	3
M	3
There	3
issue	3
These	3
occur	3
professor	3
single	3
last	3
space	3
long	3
m	3
names	3
must	3
Y	3
answer	3
miles	3
still	3
month	3
current	3
used	4
like	4
want	4
test	4
least	4
called	4
they	4
Fred	4
exactly	4
imix	4
combination	4
length	4
many	4
i	4
its	4
people	4
periods	4
money	4
All	4
part	4
program	4
were	4
p	4
pixel	4
needs	4
customer	4
only	4
Larry	4
Maya	4
containing	4
X	4
consists	4
pixels	4
denoted	4
stamp	5
any	5
most	5
e	5
directory	5
letters	5
file	5
end	5
tie	5
N	5
semicircle	5
customers	5
land	5
best	5
occurs	5
he	5
can	5
RPS	5
R	5
four	6
standard	6
format	6
If	6
calendar	6
contain	6
there	6
than	6
beginning	6
Z	6
strings	6
memorable	6
images	6
set	6
In	6
digits	6
dates	6
months	6
sorted	6
Tzolkin	6
two	6
if	7
pairs	7
it	7
data	7
This	7
positive	7
You	7
followed	7
maximum	7
A	7
Haab	7
values	7
which	7
day	8
triple	8
cycles	8
sequence	8
Each	8
on	8
more	8
types	8
value	8
no	8
map	8
this	8
date	9
different	9
order	9
next	9
three	9
case	9
integer	9
was	9
lines	10
example	10
same	10
form	10
has	10
one	10
numbers	11
should	11
not	11
an	11
peak	11
n	11
stamps	11
print	11
have	11
his	12
first	12
with	13
image	13
you	13
or	13
given	13
from	14
For	14
telephone	14
year	14
days	15
for	15
at	16
output	16
as	18
each	19
are	21
by	21
input	23
that	24
line	26
be	33
will	33
The	41
number	42
and	58
is	60
in	61
to	63
a	65
of	129
the	226


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章