這是我實現的第一個版本的爬蟲,侷限比較大,用的算法思想也比較簡單。不過通過爬蟲1號,我對爬蟲和搜索引擎的實現機制有了更多的認識,收穫還是挺大的^_^。這只是塗鴉之作,我不知道別人是如何實現的,就按自己的想法用 Java 寫了一個,大家有興趣可以看看。
這是工程目錄:
下面是具體代碼:
package com.rgy.reptile;
import com.rgy.utils.PageUtils;
/**
 * Application entry point: seeds the crawler with a start URL and kicks off
 * the crawl via {@link com.rgy.utils.PageUtils#searchUrl(String)}.
 */
public class Entry {
    public static void main(String[] args) {
        final String seedUrl = "http://www.youku.com";
        // Record the seed as already visited and make it the root of the
        // parent stack before handing control to the crawler.
        PageUtils.history_list.add(seedUrl);
        PageUtils.parent_stack.push(seedUrl);
        PageUtils.searchUrl(seedUrl);
    }
}
package com.rgy.entity;
import java.util.ArrayList;
/**
 * Mutable holder for the data scraped from one page: its URL, the
 * {@code <title>} text, the keywords meta tag, and the outgoing links.
 */
public class PageInfo {
    // Page address; empty string until populated.
    private String url = "";
    // Text of the <title> tag; empty string until populated.
    private String title = "";
    // Content of the keywords <meta> tag; empty string until populated.
    private String keywords = "";
    // Outgoing links collected from the page; null when none were set.
    private ArrayList<String> href_list = null;

    public PageInfo() {
        // All fields are initialized at declaration; nothing more to do here.
    }

    public String getUrl() {
        return url;
    }

    public String getTitle() {
        return title;
    }

    public String getKeywords() {
        return keywords;
    }

    public ArrayList<String> getHref_list() {
        return href_list;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }

    public void setHref_list(ArrayList<String> href_list) {
        this.href_list = href_list;
    }
}
package com.rgy.utils;
import java.util.ArrayList;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rgy.entity.PageInfo;
/**
 * Crawler utilities: fetches a page with jsoup, extracts its metadata and
 * links, and performs a depth-first walk of the link graph.
 */
public class PageUtils {

    // Extensions of downloadable binaries we refuse to crawl. Dots are
    // escaped: the original ".*.exe" pattern rejected ANY url ending in
    // "exe"/"zip"/... (e.g. ".../gazip"), not just real file extensions.
    // Compiled once instead of on every call.
    private static final Pattern BINARY_LINK_PATTERN =
            Pattern.compile(".*\\.exe|.*\\.apk|.*\\.zip|.*\\.rar|.*\\.pdf|.*\\.doc");

    // Keyword categories we are interested in; compiled once.
    private static final Pattern KEYWORDS_PATTERN =
            Pattern.compile(".*青春.*|.*搞笑.*|.*微電影.*|.*短片.*|.*迷你劇.*|.*喜劇.*");

    /**
     * Fetches {@code url} and returns its title, meta keywords and the list
     * of distinct crawlable links.
     *
     * <p>Contract relied on by {@link #searchUrl(String)}: the returned
     * info's href_list is {@code null} (not empty) when the fetch failed or
     * no usable link was found — null triggers backtracking.
     *
     * @param url page to fetch; a single trailing '/' is stripped first
     * @return a populated PageInfo; never null
     */
    public static PageInfo getPageInfo(String url) {
        PageInfo info = new PageInfo();
        // Normalize so "http://a.com/" and "http://a.com" dedupe to one entry.
        if (url.endsWith("/")) {
            url = url.substring(0, url.length() - 1);
        }
        info.setUrl(url);
        try {
            Document doc = Jsoup.connect(url).timeout(30000).get();
            info.setTitle(doc.title());
            String keywords = doc.getElementsByTag("meta").select("[name=keywords]").attr("content");
            info.setKeywords(keywords);
            Elements links = doc.getElementsByTag("a");
            ArrayList<String> hrefList = new ArrayList<String>();
            for (Element link : links) {
                String linkHref = link.attr("href");
                if (linkHref.endsWith("/")) {
                    linkHref = linkHref.substring(0, linkHref.length() - 1);
                }
                // Keep only crawlable links not already collected.
                if (linkIsAvailable(linkHref) && !hrefList.contains(linkHref)) {
                    hrefList.add(linkHref);
                }
            }
            // Set the list only when non-empty, preserving the
            // null-means-dead-end contract used by searchUrl.
            if (!hrefList.isEmpty()) {
                info.setHref_list(hrefList);
            }
        } catch (Exception ex) {
            // Best-effort crawl: a failed fetch leaves href_list null so the
            // caller backtracks instead of aborting the whole run.
            ex.printStackTrace();
        }
        return info;
    }

    /**
     * Returns true when {@code url} is an http link that does not point at a
     * downloadable binary (exe/apk/zip/rar/pdf/doc).
     */
    public static boolean linkIsAvailable(String url) {
        if (url.startsWith("http://")) {
            return !BINARY_LINK_PATTERN.matcher(url).matches();
        }
        // Relative links, anchors, javascript: etc. are not crawlable here.
        return false;
    }

    /**
     * Returns true when the page's keywords contain any category of interest.
     */
    public static boolean keywordsIsAvailable(String keywords) {
        return KEYWORDS_PATTERN.matcher(keywords).matches();
    }

    // URLs already visited.
    public static ArrayList<String> history_list = new ArrayList<String>();
    // Path of parent nodes from the seed down to the current page,
    // used to backtrack when a page yields no new links.
    public static Stack<String> parent_stack = new Stack<String>();

    /**
     * Depth-first crawl starting at {@code url}: visits every unvisited link
     * on the page, backtracking through {@code parent_stack} at dead ends.
     *
     * <p>NOTE(review): recursion depth grows with the crawl; a very deep walk
     * can overflow the stack. An explicit work-list would fix this, but is
     * left as-is to keep the traversal order identical.
     */
    public static void searchUrl(String url) {
        PageInfo info = getPageInfo(url);
        String keywords = info.getKeywords();
        int hlist_size = history_list.size();
        // Progress trace: how many URLs visited, and the most recent one.
        // (Assumes history_list was seeded by the caller before the first call.)
        System.out.println(hlist_size + "-->" + history_list.get(hlist_size - 1));
        ArrayList<String> href_list = info.getHref_list();
        if (href_list == null) { // dead end: backtrack to the parent node
            parent_stack.pop();
            if (!parent_stack.empty()) {
                searchUrl(parent_stack.peek());
            } else { // stack exhausted: the whole reachable graph was visited
                System.out.println("Yir,爬蟲1號已完成任務!!!");
            }
        } else { // node has links: descend into each unvisited one
            int size = href_list.size();
            for (int i = 0; i < size; i++) {
                String strUrl = href_list.get(i);
                if (history_list.contains(strUrl)) { // already visited
                    continue;
                }
                history_list.add(strUrl);
                parent_stack.push(strUrl);
                searchUrl(strUrl);
            }
        }
    }

    /**
     * Debug helper: prints every crawlable link found on {@code url}.
     * Guards against the null list returned for link-less or failed pages
     * (the original dereferenced it unconditionally and could NPE).
     */
    public static void hrefShow(String url) {
        PageInfo info = getPageInfo(url);
        ArrayList<String> href_list = info.getHref_list();
        if (href_list == null) {
            return; // nothing to show
        }
        for (int i = 0; i < href_list.size(); i++) {
            System.out.println(href_list.get(i));
        }
    }
}
有興趣的童鞋可以到這裏下載工程代碼:
http://download.csdn.net/detail/u011700203/8410597