1、下載數據包
wget http://archive.apache.org/dist/hbase/hbase-0.90.4/hbase-0.90.4.tar.gz
wget http://archive.apache.org/dist/hadoop/common/hadoop-0.20.2/hadoop-0.20.2.tar.gz
wget http://mirrors.ustc.edu.cn/apache/nutch/2.2/apache-nutch-2.2-src.tar.gz
wget http://mirror.bit.edu.cn/apache/zookeeper/zookeeper-3.4.5/zookeeper-3.4.5.tar.gz
wget http://mirror.bit.edu.cn/apache/gora/0.3/apache-gora-0.3-src.tar.gz
2、解壓:
tar -zxvf hbase-0.90.4.tar.gz
tar -zxvf hadoop-0.20.2.tar.gz
tar -zxvf apache-nutch-2.2-src.tar.gz
tar -zxvf zookeeper-3.4.5.tar.gz
tar -zxvf apache-gora-0.3-src.tar.gz
3、安裝所需插件
sudo apt-get install maven2
4、安裝hadoop
1)、cd $HADOOP_HOME
2)、mkdir data //用於指定hadoop的hadoop.tmp.dir目錄
3)、cd $HADOOP_HOME/conf
4)、vim hadoop-env.sh
//將JAVA_HOME修改爲自己的JAVA_HOME路徑
export JAVA_HOME=/usr/lib/jvm/java-7-oracle
5)、vim core-site.xml
在configuration標籤中添加如下信息
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://nutch1:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/data/projects/hadoop-0.20.2/data</value>
</property>
</configuration>
6)、vim hdfs-site.xml
在configuration標籤中添加如下信息
<configuration>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>dfs.datanode.max.xcievers</name>
<value>4096</value>
</property>
</configuration>
7)、vim mapred-site.xml
在configuration標籤中添加如下信息
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>nutch1:9001</value>
</property>
</configuration>
8)、vim masters
//設置master
nutch1
9)、vim slaves
//設置slaves
nutch1
nutch2
10)、將hadoop項目通過scp拷貝到其他服務器上
scp -r $HADOOP_HOME hadoop@nutch2:/data/projects
11)、格式化hadoop文件系統
cd $HADOOP_HOME/bin
./hadoop namenode -format
12)、啓動hadoop
./start-all.sh
13)、檢查hadoop運行情況
使用JAVA中的jps命令查詢
Master
hadoop@nutch1:/$ jps
21832 SecondaryNameNode
22031 TaskTracker
25971 Jps
21695 DataNode
21914 JobTracker
21575 NameNode
Slave
hadoop@nutch2:/$ jps
29939 DataNode
30044 TaskTracker
704 Jps
5、配置Zookeeper
1)、創建zoo.cfg配置文件
cd $ZOOKEEPER_HOME/conf
cp zoo_sample.cfg zoo.cfg
2)、配置zoo.cfg配置文件
//修改dataDir
dataDir=$ZOOKEEPER_HOME/data
//添加dataLogDir
dataLogDir=$ZOOKEEPER_HOME/data
//添加服務器信息
server.1=10.68.237.26:2888:3888
server.2=10.68.237.27:2888:3888
3)、創建id文件
cd $ZOOKEEPER_HOME
mkdir data
mkdir log
cd data
vim myid
//在myid文件中添加zoo.cfg中對應的服務器id
//如10.68.237.26服務器中的myid文件。添加信息1
1
4)、通過scp拷貝項目到其他的服務器
scp -r $ZOOKEEPER_HOME hadoop@nutch2:/data/projects/
5)、修改其他服務器上的myid文件
ssh nutch2
cd $ZOOKEEPER_HOME/data
vim myid
//修改myid內容爲2
2
6)、分別啓動兩臺服務器的zookeeper服務
nutch1
$ZOOKEEPER_HOME/bin/zkServer.sh start
nutch2
$ZOOKEEPER_HOME/bin/zkServer.sh start
7)、檢查zookeeper服務狀態
nutch1
hadoop@nutch1:/data/projects/zookeeper-3.4.5/bin$ ./zkServer.sh status
JMX enabled by default
Using config: /data/projects/zookeeper-3.4.5/bin/../conf/zoo.cfg
Mode: follower
nutch2
hadoop@nutch2:/data/projects/zookeeper-3.4.5/bin$ ./zkServer.sh status
JMX enabled by default
Using config: /data/projects/zookeeper-3.4.5/bin/../conf/zoo.cfg
Mode: leader
6、安裝hbase
1)、cd $HBASE_HOME/conf
2)、配置hbase-env.sh
vim hbase-env.sh
//將JAVA_HOME修改爲自己的JAVA_HOME路徑
export JAVA_HOME=/usr/lib/jvm/java-7-oracle
//修改HBASE_MANAGES_ZK爲false,這邊我們用上面搭建好的zookeeper集羣
export HBASE_MANAGES_ZK=false
3)、修改hbase-site.xml,在configuration中添加如下信息
<configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://nutch1:9000/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>nutch2</value>
</property>
<property>
<name>hbase.zookeeper.session.timeout</name>
<value>60000</value>
</property>
<property>
<name>hbase.zookeeper.property.clientPort</name>
<value>2181</value>
</property>
<property>
<name>hbase.master</name>
<value>nutch1</value>
</property>
<property>
<name>hbase.regionserver.lease.period</name>
<value>60000</value>
</property>
<property>
<name>hbase.rpc.timeout</name>
<value>60000</value>
</property>
<property>
<name>hbase.master.maxclockskew</name>
<value>180000</value>
</property>
</configuration>
4)、修改regionservers
nutch2
5)、將hbase中的hadoop jar包版本與hadoop集羣版本一致
cd $HBASE_HOME/lib
rm hadoop-*
cp $HADOOP_HOME/hadoop-0.20.2-core.jar $HBASE_HOME/lib
6)、通過scp將hbase工程拷貝到其他服務器上
scp -r $HBASE_HOME hadoop@nutch2:/data/projects
7)、啓動hbase
$HBASE_HOME/bin/start-hbase.sh
8)、通過jps命令檢查服務是否正常運行
Masters
hadoop@nutch1:/data/projects/hbase-0.90.4/lib$ jps
26394 Jps
21832 SecondaryNameNode
22031 TaskTracker
21695 DataNode
24953 HMaster
21914 JobTracker
24791 QuorumPeerMain
21575 NameNode
Slaves
hadoop@nutch2:~$ jps
29939 DataNode
30044 TaskTracker
32270 QuorumPeerMain
1126 Jps
32493 HRegionServer
7、安裝nutch
1)、修改$NUTCH_HOME/ivy/ivy.xml
將被註釋掉的<dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default"/>去除註釋
2)、修改gora.properties文件
vim $NUTCH_HOME/runtime/local/conf/gora.properties
修改以下內容
#gora.datastore.default=org.apache.gora.mock.store.MockDataStore
修改爲
gora.datastore.default=org.apache.gora.hbase.store.HBaseStore
3)、修改nutch-site.xml配置
//在configuration標籤中添加如下內容
<configuration>
<property>
<name>http.agent.name</name>
<value>test-nutch</value>
</property>
<property>
<name>http.robots.agents</name>
<value>test-nutch,*</value>
</property>
<property>
<name>generate.batch.id</name>
<value>1</value>
</property>
<property>
<name>http.agent.name.check</name>
<value>true</value>
</property>
<property>
<name>distributed.search.test.port</name>
<value>60000</value>
<description>TCP port used during junit testing.</description>
</property>
<property>
<name>http.accept.language</name>
<value>ja-jp,en-us,en-gb,en;q=0.7,*;q=0.3</value>
<description>Value of the “Accept-Language” request header field.
This
allows selecting non-English language as default one to retrieve.
It
is a useful setting for search engines build for certain national
group.
</description>
</property>
<property>
<name>parser.character.encoding.default</name>
<value>utf-8</value>
<description>The character encoding to fall back to when no other
information
is available
</description>
</property>
<property>
<name>storage.data.store.class</name>
<value>org.apache.gora.hbase.store.HBaseStore</value>
<description>The Gora DataStore class for storing and retrieving data.
Currently the following stores are available: ….
</description>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/data/zqhadoop/data</value>
<description>此處設置hadoop根目錄</description>
</property>
<property>
<name>plugin.folders</name>
<value>/home/zqgame/apache-nutch/runtime/local/plugins</value>
<description>Directories where nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
as is. If relative, it is searched for on the classpath.</description>
</property>
</configuration>
4)、將$HBASE_HOME/conf中的hbase-site.xml拷貝到$NUTCH_HOME/runtime/local/conf目錄下覆蓋
5)、export JAVA_HOME
export JAVA_HOME=/usr/lib/jvm/java-7-oracle
6)、編譯nutch
cd $NUTCH_HOME
ant
7)、替換nutch中對應的hadoop、hbase 的jar包版本
將hadoop-core-0.20.2.jar,hbase-0.90.4.jar 拷貝到$NUTCH_HOME/runtime/local/lib目錄下
8)、創建攔截的url文件
//在$NUTCH_HOME/runtime/local/目錄中創建一個目錄urls,且創建一個存放url路徑的文件
mkdir urls
cd urls
vim seed.txt
//輸入想要的url
9)、設置正則攔截規則
vim $NUTCH_HOME/runtime/local/conf/regex-urlfilter.txt
//修改
# accept anything else
+.
//爲以下內容
# accept anything else
+^http://([a-z0-9]*\.)*nutch.apache.org/
10)、inject任務url到nutch中
bin/nutch inject urls/seed.txt
11)、crawl你的任務
bin/nutch crawl urls -depth 3 -topN 5
12)、執行成功之後可以在hbase看到一個webpage的表,且日誌不會報錯