spidesample 機器人例子(Java)

一個簡單的機器人例子(Java)

import java.awt.*;

import java.net.*;

import java.io.*;

import java.lang.*;

import java.util.*;

class node{     private Object data;     private node next;     private node prev;     public node(Object o){  

data = o;  prev = next = null;

    }     public String toString(){

 if(next!=null)return data.toString() + "/n"+ next.toString();  return data.toString();

    }     public node getNext(){return next;}

    public void setNext(node n){next = n;}

    public node getPrev(){return prev;}

    public void setPrev(node n){prev = n;}

    public Object getData(){return data;}

}

class linkedlist{

    node head;

    node tail;

    public linkedlist(){

 tail = head = null;

    }     public String toString(){

 if(head==null)return "Empty list";

 return head.toString();

    }     public void insert(Object o){

 if(tail==null){

     head = tail = new node(o);

 }else{

     node nn = new node(o);

     tail.setNext(nn);

     tail=nn;

 }     }     public boolean contains(Object o){

 for(node n = head;n!=null;n=n.getNext()){

     if(o.equals(n.getData()))return true;

 }  return false;

    }     public Object pop(){

 if(head==null)return null;

 Object ret = head.getData();

 head = head.getNext();

 if(head==null)tail = null;

 return ret;

    }     public boolean isEmpty(){

 return head==null;

    } }

class list{

 protected node tail;

 protected node ptr;

 private boolean stop;

 public list(){

  ptr=tail=null;

  stop=false;

 }  public boolean isEmpty(){return tail==null;}

 public void reset(){

  stop=false;

  ptr=tail;

 }

 public String toString(){

  if(tail==null)return "Empty list";

  String ret="";

  for(node n = tail.getNext();n!=tail;n=n.getNext())ret+=n.getData

 

().toString()+"/n";

  ret+=tail.getData().toString();

  return ret;

 }

 public Object get(){

  if(ptr==null)return null;

  ptr = ptr.getNext();

  if(ptr==tail.getNext()){

   if(stop)return null;

   stop=true;

   return tail.getNext().getData();

  }

  return ptr.getData();

 }

 public void insert(Object o, boolean attail){

  node nn = new node(o);

  if(tail==null){

   nn.setNext(nn);

   nn.setPrev(nn);

   ptr=tail=nn;

   return;

  }

  if(attail){

  tail.getNext().setPrev(nn);

   nn.setNext(tail.getNext());

   tail.setNext(nn);

   nn.setPrev(tail);

   tail=nn;

  }else{

   nn.setNext(tail.getNext());

   nn.setPrev(tail);

   tail.setNext(nn);

   nn.getNext().setPrev(nn);

  }  }

 public void insert(Object o){}

}

// stack: LIFO adapter over the circular list — inserts at the head,
// so list.get()/queue-style pop see the newest element first.
class stack extends list{
 public stack(){super();}
 // push: insert at the head (attail == false).
 public void insert(Object o){insert(o, false);}
}
// queue: FIFO adapter over the circular list — inserts at the tail,
// pops from the head (tail.getNext()).
class queue extends list{
 public queue(){super();}
 // enqueue: insert at the tail (attail == true).
 public void insert(Object o){insert(o, true);}
 // Returns the head element's string form without removing it; "" when empty.
 public String peek(){
   if(tail==null)return "";
   return tail.getNext().getData().toString();
 }
 // Removes and returns the head element, or null when the queue is empty.
 public Object pop(){
  if(tail==null)return null;
  Object ret = tail.getNext().getData();
  if(tail.getNext()==tail){
    // last element removed: the circle collapses to the empty list
    tail=ptr=null;
  }else{
    // keep the enumeration cursor valid if it pointed at the removed node
    if(tail.getNext()==ptr)ptr=ptr.getNext();
    tail.setNext(tail.getNext().getNext());
    // NOTE(review): the new head's prev link is not re-pointed at tail here.
    // prev links appear to be read only inside list.insert, so this looks
    // benign in practice — verify before relying on prev traversal.
  }
  return ret;
 } }

class hashtable{

    private Vector table;

    private int size;

    public hashtable(){

 size = 991;  table = new Vector();  for(int i=0;i<size;i++){      table.add(new linkedlist());  }     }     public void insert(Object o){  int index = o.hashCode();

 index = index % size;

 if(index<0)index+=size;

 linkedlist ol = (linkedlist)table.get(index);

 ol.insert(o);

    }     public boolean contains(Object o){

 int index = o.hashCode();

 index = index % size;

 if(index<0)index+=size;

 return ((linkedlist)(table.get(index))).contains(o);

    }     public String toString(){

 String ret ="";

 for(int i=0;i<size;i++){

     if(!((linkedlist)(table.get(i))).isEmpty()){

  ret+="/n";

  ret+=table.get(i).toString();

     }

 }  return ret;

    } }

class spider implements Runnable{

 public queue todo;  public stack done;  public stack errors;  public stack omittions;  private hashtable allsites;  private String last="";     int maxsites;     int visitedsites;     int TIMEOUT;     String base;     String []badEndings2 = {"ps", "gz"};     String []badEndings3 = {"pdf", "txt", "zip", "jpg", "mpg", "gif", "mov", "tut", "req", "abs", "swf", "tex", "dvi", "bin", "exe", "rpm"};     String []badEndings4 = {"jpeg", "mpeg"};

    public spider(String starturl, int max, String b){  TIMEOUT = 5000;

 base = b;

 allsites = new hashtable();

 todo = new queue();

 done = new stack();

 errors = new stack();

 omittions = new stack();

 try{      URL u = new URL(starturl);

     todo.insert(u);

 }catch(Exception e){      System.out.println(e);

     errors.insert("bad starting url "+starturl+", "+e.toString());  }  maxsites = max;

 visitedsites = 0;

    }

    /*      * how many millisec to wait for each page      */     public void setTimer(int amount){  TIMEOUT = amount;     }

    /*      * strips the '#' anchor off a url      */     private URL stripRef(URL u){

 try{

     return new URL(u.getProtocol(), u.getHost(), u.getPort(), u.getFile());

 }catch(Exception e){return u;}

    }

    /*      * adds a url for future processing      */     public void addSite(URL toadd){

 if(null!=toadd.getRef())toadd = stripRef(toadd);

 if(!allsites.contains(toadd)){

     allsites.insert(toadd);

     if(!toadd.toString().startsWith(base)){

  omittions.insert("foreign URL: "+toadd.toString());

  return;

     }      if(!toadd.toString().startsWith("http") && !toadd.toString().startsWith("HTTP")){

  omittions.insert("ignoring URL: "+toadd.toString());

  return;

     }

     String s = toadd.getFile();

     String last="";

     String []comp={};

     if(s.charAt(s.length()-3)=='.'){

  last = s.substring(s.length()-2);

  comp = badEndings2;

     }else if(s.charAt(s.length()-4)=='.'){

  last = s.substring(s.length()-3);

  comp = badEndings3;

     }else if(s.charAt(s.length()-5)=='.'){

  last = s.substring(s.length()-4);

  comp = badEndings4;

     }      for(int i=0;i<comp.length;i++){

  if(last.equalsIgnoreCase(comp[i])){//loop through all bad extensions       omittions.insert("ignoring URL: "+toadd.toString());

      return;

  }      }           todo.insert(toadd);

 }     }

    /*      * true if there are pending urls and the maximum hasn't been reached      */     public boolean hasMore(){

 return !todo.isEmpty() && visitedsites<maxsites;

    }

    /*      * returns the next site, works like enumeration, will return new values each time      */     private URL getNextSite(){

 last = todo.peek();

 visitedsites++;

 return (URL)todo.pop();

    }

   /*     * Just to see what we are doing now...     */     public String getCurrent(){

 return last;

    }

    /*      * process the next site      */     public void doNextSite(){

 URL current = getNextSite();

 if(current==null)return;

 try{      //System.err.println("Processing #"+visitedsites+": "+current);      parse(current);

     done.insert(current);

 }  catch(Exception e){

     errors.insert("Bad site: "+current.toString()+", "+e.toString());  }     }

    public void run(){

 while(hasMore())doNextSite();

    }

    /*      * to print out the internal data structures      */     public String toString(){return getCompleted()+getErrors();}     private String getErrors(){

 if(errors.isEmpty())return "No errors/n";

 else return "Errors:/n"+errors.toString()+"/nEnd of errors/n";

    }

    private String getCompleted(){

 return "Completed Sites:/n"+done.toString()+"/nEnd of completed sites/n";     }

    /*      *  Parses a web page at (site) and adds all the urls it sees      */     private void parse(URL site) throws Exception{  String source=getText(site);

 String title=getTitle(source);

 if(title.indexOf("404")!=-1 ||     title.indexOf("Error")!=-1 ||     title.indexOf("Not Found")!=-1){      throw new Exception (("404, Not Found: "+site));  }  int loc, beg;

 boolean hasLT=false;

 boolean hasSp=false;

 boolean hasF=false;

 boolean hasR=false;  boolean hasA=false;

 boolean hasM=false;  boolean hasE=false;

 for(loc=0;loc<source.length();loc++){

     char c = source.charAt(loc);

     if(!hasLT){

  hasLT = (c=='<');

     }

     //search for "<a "

     else if(hasLT && !hasA && !hasF){

  if(c=='a' || c=='A')hasA=true;

  else if(c=='f' || c=='F')hasF=true;

  else hasLT=false;

     }else if(hasLT && hasA && !hasF && !hasSp){

  if(c==' ' || c=='/t' || c=='/n')hasSp=true;

  else hasLT = hasA = false;

     }

     //search for "<frame "      else if(hasLT && hasF && !hasA && !hasR){

  if(c=='r' || c=='R')hasR=true;

  else hasLT = hasF = false;

     }else if(hasLT && hasF && hasR && !hasA){

  if(c=='a' || c=='A')hasA=true;

  else hasLT = hasF = hasR = false;

     }else if(hasLT && hasF && hasR && hasA && !hasM){   if(c=='m' || c=='M')hasM=true;

  else hasLT = hasF = hasR = hasA = false;

     }else if(hasLT && hasF && hasR && hasA && hasM && !hasE){   if(c=='e' || c=='E')hasE=true;

  else hasLT = hasF = hasR = hasA = hasM = false;

     }else if(hasLT && hasF && hasR && hasA && hasM && hasE && !hasSp){   if(c==' ' || c=='/t' || c=='/n')hasSp=true;

  else hasLT = hasF = hasR = hasA = hasM = hasE = false;

     }           //found "<frame "      else if(hasLT && hasF && hasR && hasA && hasM && hasE && hasSp){

  hasLT = hasF = hasR = hasA = hasM = hasE = hasSp = false;

  beg = loc;

  loc = source.indexOf(">", loc);

  if(loc==-1){

      errors.insert("malformed frame at "+site.toString());       loc = beg;

  }   else{

      try{

   parseFrame(site, source.substring(beg, loc));

      }       catch(Exception e){

   errors.insert("while parsing "+site.toString()+", error parsing frame: "+e.toString());

      }   }      }

     //found "<a "      else if(hasLT && hasA && hasSp && !hasF){

  hasLT = hasA = hasSp = false;

  beg = loc;

  loc = source.indexOf(">", loc);

  if(loc==-1){       errors.insert("malformed linked at "+site.toString());

      loc = beg;

  }   else{       try{

   parseLink(site, source.substring(beg, loc));

      }       catch(Exception e){

   errors.insert("while parsing "+site.toString()+", error parsing link: "+e.toString());

      }   }      }  }     }         /*      * parses a frame      */     private void parseFrame(URL at_page, String s) throws Exception{  int beg=s.indexOf("src");

 if(beg==-1)beg=s.indexOf("SRC");

 if(beg==-1)return;//doesn't have a src, ignore

 beg = s.indexOf("=", beg);

 if(beg==-1)throw new Exception("while parsing "+at_page.toString()+", bad frame, missing /'=/' after src: "+s);

 int start = beg;

 for(;beg<s.length();beg++){

     if(s.charAt(beg)=='/'')break;

     if(s.charAt(beg)=='/"')break;  }  int end=beg+1;

 for(;end<s.length();end++){

     if(s.charAt(beg)==s.charAt(end))break;  }  beg++;

 if(beg>=end){//missing quotes... just take the first token after "src="      for(beg=start+1;beg<s.length() && (s.charAt(beg)==' ');beg++){}      for(end=beg+1;end<s.length() && (s.charAt(beg)!=' ') && (s.charAt(beg)!='>');end++){}

 }

 if(beg>=end){

     errors.insert("while parsing "+at_page.toString()+", bad frame: "+s);      return;

 }

 String linkto=s.substring(beg,end);

 if(linkto.startsWith("mailto:")||linkto.startsWith("Mailto:"))return;

 if(linkto.startsWith("javascript:")||linkto.startsWith("Javascript:"))

 

return;

 if(linkto.startsWith("news:")||linkto.startsWith("Javascript:"))return;

 try{

     addSite(new URL(at_page, linkto));

     return;

 }catch(Exception e1){}  try{

     addSite(new URL(linkto));

     return;

 }catch(Exception e2){}  try{

     URL cp = new URL(at_page.toString()+"/index.html");

     System.out.println("attemping to use "+cp);

     addSite(new URL(cp, linkto));

     return;

 }catch(Exception e3){}

 errors.insert("while parsing "+at_page.toString()+", bad frame:

 

"+linkto+", formed from: "+s);

    }

    /*      * given a link at a URL, will parse it and add it to the list of sites to do      */     private void parseLink(URL at_page, String s) throws Exception{

 //System.out.println("parsing link "+s);

 int beg=s.indexOf("href");

 if(beg==-1)beg=s.indexOf("HREF");

 if(beg==-1)return;//doesn't have a href, must be an anchor

 beg = s.indexOf("=", beg);

 if(beg==-1)throw new Exception("while parsing "+at_page.toString()+", bad

 

link, missing /'=/' after href: "+s);

 int start = beg;

 for(;beg<s.length();beg++){

     if(s.charAt(beg)=='/'')break;

     if(s.charAt(beg)=='/"')break;  }  int end=beg+1;

 for(;end<s.length();end++){

     if(s.charAt(beg)==s.charAt(end))break;  }

 beg++;

 if(beg>=end){//missing quotes... just take the first token after "href="      for(beg=start+1;beg<s.length() && (s.charAt(beg)==' ');beg++){}      for(end=beg+1;end<s.length() && (s.charAt(beg)!=' ') && (s.charAt(beg)!='>');end++){}

 }

 if(beg>=end){

     errors.insert("while parsing "+at_page.toString()+", bad href: "+s);      return;  }

 String linkto=s.substring(beg,end);

 

if(linkto.startsWith("mailto:")||linkto.startsWith("Mailto:"))return;  

if(linkto.startsWith("javascript:")||linkto.startsWith("Javascript:"))return;

 if(linkto.startsWith("news:")||linkto.startsWith("Javascript:"))return;

 try{

     addSite(new URL(at_page, linkto));      return;  }catch(Exception e1){}

 try{

     addSite(new URL(linkto));

     return;

 }catch(Exception e2){}  try{

    

addSite(new URL(new URL(at_page.toString()+"/index.html"), linkto));      return;

 }catch(Exception e3){}

 errors.insert("while parsing "+at_page.toString()+", bad link: "+linkto+", formed from: "+s);

    }

    /*      * gets the title of a web page with content s      */     private String getTitle(String s){  try{      int beg=s.indexOf("<title>");

     if(beg==-1)beg=s.indexOf("<TITLE>");

     int end=s.indexOf("</title>");

     if(end==-1)end=s.indexOf("</TITLE>");

     return s.substring(beg,end);

 }  catch(Exception e){return "";}     }

    /*      * gets the text of a web page, times out after 10s      */     private String getText(URL site) throws Exception     {  urlReader u = new urlReader(site);

 Thread t = new Thread(u);

 t.setDaemon(true);

 t.start();

 t.join(TIMEOUT);

 String ret = u.poll();

 if(ret==null){

  throw new Exception("connection timed out");

 }else if(ret.equals("Not html")){

  throw new Exception("Not an HTML document");

 }  return ret;     }

    /*      * returns how many sites have been visited so far      */     public int Visited(){return visitedsites;} }

/**
 * Fetches the text of a URL on its own thread so the spider can enforce
 * a timeout via Thread.join. poll() returns null until the fetch finishes
 * (or fails); afterwards it returns the page text, or the sentinel
 * "Not html" when the content type is not textual.
 */
class urlReader implements Runnable{
    URL site;   // the page to fetch
    String s;   // result: null while running/failed, "Not html", or the page text

    public urlReader(URL u){
        site = u;
        s=null;
    }

    /** Downloads the page and publishes the result through poll(). */
    public void run(){
        try{
            URLConnection u = site.openConnection();
            String type = u.getContentType();
            if(type.indexOf("text")==-1 &&
               type.indexOf("txt")==-1 &&
               type.indexOf("HTM")==-1 &&
               type.indexOf("htm")==-1){
                //System.err.println("bad content type "+type+" at site "+site);
                System.out.println("bad content type "+type+" at site "+site);
                // BUG FIX: the original assigned "Not html" to a dead local and
                // returned, leaving s null — so spider.getText reported a timeout
                // instead of "Not an HTML document". Publish the sentinel in s.
                s = "Not html";
                return;
            }
            InputStream in = u.getInputStream();
            BufferedInputStream bufIn = new BufferedInputStream(in);
            // PERF FIX: StringBuilder instead of String += once per byte (O(n^2)).
            StringBuilder ret = new StringBuilder();
            int data;
            while(true){
                data = bufIn.read();
                // Check for EOF
                if (data == -1) break;
                ret.append((char) data);
            }
            s = ret.toString();
        }catch(Exception e){s=null;}   // any failure reads as "no result yet"
    }

    /** Non-blocking: the fetched text, or null if not finished or failed. */
    public String poll(){return s;}
}

public class spidergui extends Frame{

 private spider s;

 private Color txtColor;

 private Color errColor;

 private Color topColor;

 private Color numColor;

 private Color curColor;

 public spidergui(spider spi, String title){

  super(title);

  curColor = new Color(40, 40, 200);

  txtColor = new Color(0, 0, 0);

  errColor = new Color(255, 0, 0);

  topColor = new Color(40, 40, 100);

  numColor = new Color(50, 150, 50);

  s=spi;

  setBounds(0, 0, 800, 600);

  show();

  toFront();

  repaint();

 }  public void endShow(){

  System.out.println(s);

  hide();

  dispose();

 }  public void paint(Graphics g){

  super.paint(g);

  s.todo.reset();

  s.done.reset();

  s.errors.reset();

  s.omittions.reset();

  String txt;

  Object o;

  g.setColor(curColor);

  g.setFont(new Font("arial", Font.PLAIN, 18));

  String cur = s.getCurrent();

  if(cur.length()>80)g.drawString(

   cur.substring(0, 40)+

   " . . . "+

   cur.substring(cur.length()-30, cur.length()),

  50, 50);

  else g.drawString(cur, 50, 50);

  g.setColor(numColor);

  g.setFont(new Font("arial", Font.BOLD, 24));

  g.drawString(Integer.toString(s.Visited()), 350, 80);

  g.setFont(new Font("arial", Font.PLAIN, 14));

  g.setColor(topColor);

  g.drawString("To Do:", 100, 80);

  g.drawString("Completed:", 500, 80);

  g.drawString("Ignored:", 500, 250);

  g.drawString("Errors:", 100, 420);

  g.setColor(txtColor);

  g.setFont(new Font("arial", Font.PLAIN, 12));

  for(int i=0;i<23 && (o=s.todo.get())!=null;i++){

 txt = Integer.toString(i+1) + ": "+o.toString();

 if(txt.length()>65)g.drawString(

   txt.substring(0, 38) +

   " . . . " +

   txt.substring(txt.length()-18, txt.length()),

  20, 100+13*i);

 else g.drawString(txt, 20, 100+13*i);

  }   for(int i=0;i<10 && (o=s.done.get())!=null;i++){

 txt = Integer.toString(i+1) + ": "+o.toString();

 if(txt.length()>60)g.drawString(txt.substring(0, 57)+"...", 400,

 

100+13*i);

 else g.drawString(txt, 400, 100+13*i);

  }   for(int i=0;i<10 && (o=s.omittions.get())!=null;i++){

 txt = Integer.toString(i+1) + ": "+o.toString();

 if(txt.length()>60)g.drawString(txt.substring(0, 57)+"...", 400,

 

 270+13*i);

 else g.drawString(txt, 400, 270+13*i);

  }   g.setColor(errColor);

  for(int i=0;i<10 && (o=s.errors.get())!=null;i++){

 txt = Integer.toString(i+1) + ": "+o.toString();

 g.drawString(txt, 20, 440+13*i);

  }

 }  public void run(){

  repaint();

  while(s.hasMore()){

 repaint();

 s.doNextSite();

  }

  repaint();

 }

 public static void main(String []args){

 int max = 5;

 String site="";

 String base="";

 int time=0;

 for(int i=0;i<args.length;i++){

     if(args[i].startsWith("-max=")){

  max=Integer.parseInt(args[i].substring(5,args[i].length()));

     }

     else if(args[i].startsWith("-time=")){

  time=Integer.parseInt(args[i].substring(6,args[i].length()));

     }

     else if(args[i].startsWith("-init=")){

  site=args[i].substring(6,args[i].length());

     }

     else if(args[i].startsWith("-base=")){

  base=args[i].substring(6,args[i].length());

     }      else if(args[i].startsWith("-help")||args[i].startsWith("-?")){

  System.out.println("additional command line switches:");

  System.out.println("-max=N      : to limit to N sites, default 5");

  System.out.println("-init=URL   : to set the initial site, REQUIRED");

  System.out.println("-base=URL   : only follow url's that start with

 

this");

  System.out.println("              default /"/" (matches all URLs)");

  System.out.println("-time=N     : how many millisec to wait for each

 

page");

  System.out.println("              default 5000 (5 seconds)");

  System.exit(0);

     }      else System.err.println("unrecognized switch: "+args[i]+",

 

continuing");

 }

 if(site==""){

     System.err.println("No initial site parameter!");

    

System.err.println("Use -init=<site> switch to set, or -help for more info.");

     System.exit(1);

 }

 spider spi=new spider(site, max, base);

 if(time>0)spi.setTimer(time);

  spidergui s = new spidergui(spi, "Spider: "+site);

  s.run();

  System.out.println(spi);

 }

}

互聯網是一個龐大的非結構化的數據庫,將數據有效的檢索並組織呈現出來有着巨大的應用前景,尤其是類似RSS的以XML爲基礎的結構化的數據越來越多,內容的組織方式越來越靈活,檢索組織並呈現會有着越來越廣泛的應用範圍,同時在時效性和可讀性上也會有越來越高的要求。這一切的基礎是爬蟲,信息的來源入口。一個高效,靈活可擴展的爬蟲對以上應用都有着無可替代的重要意義。

要設計一個爬蟲,首先需要考慮的效率。對於網絡而言,基於TCP/IP的通信編程有幾種方法。

第一種是單線程阻塞,這是最簡單也最容易實現的一種,一個例子:在Shell中通過curl、pcregrep等一系列命令可以直接實現一個簡單的爬蟲,但同時它的效率問題也顯而易見:由於是阻塞方式讀取,dns解析,建立連接,寫入請求,讀取結果這些步驟上都會產生時間的延遲,從而無法有效的利用服務器的全部資源。

第二種是多線程阻塞。建立多個阻塞的線程,分別請求不同的url。相對於第一種方法,它可以更有效的利用機器的資源,特別是網絡資源,因爲無數線程在同時工作,所以網絡會比較充分的利用,但同時對機器CPU資源的消耗也是比較大,在用戶級多線程間的頻繁切換對於性能的影響已經值得我們考慮。

第三種是單線程非阻塞。這是目前使用的比較多的一種做法,無論在client還是server都有着廣泛的應用。在一個線程內打開多個非阻塞的連接,通過poll/epoll/select對連接狀態進行判斷,在第一時間響應請求,不但充分利用了網絡資源,同時也將本機CPU資源的消耗降至最低。這種方法需要對dns請求,連接,讀寫操作都採用異步非阻塞操作,其中第一種比較複雜,可以採用adns作爲解決方案,後面三個操作相對簡單可以直接在程序內實現。

效率問題解決後就需要考慮具體的設計問題了。

url肯定需要一個單獨的類進行處理,包括顯示,分析url,得到主機,端口,文件數據。

然後需要對url進行排重,需要一個比較大的url Hash表。

如果還要對網頁內容進行排重,則還需要一個Document Hash表。

爬過的url需要記錄下來,由於量比較大,我們將它寫到磁盤上,所以還需要一個FIFO的類(記作urlsDisk)

現在需要爬的url同樣需要一個FIFO類來處理,重新開始時,url會從定時從爬過的url FIFO裏取出來,寫到這個FIFO裏。正在運行的爬蟲需要從這個FIFO裏讀數據出來,加入到主機類的url列表裏。當然,也會從前一個FIFO裏直接讀url出來,不過優先級應該比這個裏面出來的url低,畢竟是已經爬過的。

爬蟲一般是對多個網站進行爬取,但在同時站點內dns的請求可以只做一次,這就需要將主機名獨立於url,單獨有一個類進行處理。

主機名解析完成後需要有一個解析完成的IP類與之應用,用於connect的時候使用。

HTML文檔的解析類也要有一個,用來分析網頁,取出裏面的url,加入到urlsDisk

再加上一些字符串,調度類,一個簡單的爬蟲基本上就完成了。

以上基本上是Larbin的設計思路,Larbin在具體實現上還有一些特殊的處理,例如帶了一個webserver,以及對特殊文件的處理。Larbin有一點設計不太好,就是慢的訪問會越來越多,佔用大量的連接,需要改進,另外如果對於大規模的爬蟲,這僅僅實現了抓取的部分,要分佈式的擴展還需要增加url的集中管理與調度以及前臺spider的分佈式算法。

 

 

發佈了5 篇原創文章 · 獲贊 3 · 訪問量 7萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章