用 HtmlUnit 和 HttpClient 抓取施華洛世奇網站的圖片和動畫

try
    {
     mainPage = webClient.getPage(url);
    } catch (Exception e)
    {
     log.error(e.getMessage(), e);
    }
    if (mainPage != null)
    {
     HtmlElement paginationContainer = mainPage
       .getElementById("paginationContainer");
     if (paginationContainer != null)
     {
      url = url + "/all-1";
      try
      {
       mainPage = webClient.getPage(url);
      } catch (Exception e)
      {
       log.error(e.getMessage(), e);
      }
     }
HtmlElement categories = mainPage.getElementById("categories");
     if (categories == null)
     {
      HtmlElement products = mainPage.getElementById("products");
      if(products!=null)
      {
       List<HtmlElement> productDivList = products.getElementsByTagName("div");
       if(productDivList!=null&&productDivList.size()>0)
       {
        for(HtmlElement proDiv:productDivList)
        {
         List<HtmlElement> subList = proDiv.getElementsByTagName("div");
         if(subList!=null)
         {
          for(HtmlElement dt:subList)
          {
           String classt = dt.getAttribute("class");
           if (classt != null && classt.equals("productImg"))
           {
            List<HtmlElement> subAList = proDiv.getElementsByTagName("a");
            if(subAList!=null&&subAList.size()>0)
            {
             HtmlElement ae=subAList.get(0);
             String href = ae.getAttribute("href");
             thirdPageUrls.add(href);
            }
            break;
           }
          }
         }
        }
       }
      }
     }
     else
     {
      List<HtmlElement> divList = categories.getElementsByTagName("div");
         if (divList != null && divList.size() > 0)
         {
       for (HtmlElement div : divList)
       {
        List<HtmlElement> tempDivs = div
          .getElementsByTagName("div");
        if (tempDivs != null && tempDivs.size() > 0)
        {
         for (HtmlElement div1 : tempDivs)
         {
          List<HtmlElement> aList = div1
            .getElementsByTagName("a");
          HtmlAnchor a = (HtmlAnchor) aList
            .get(0);
          String link = a.getHrefAttribute();
          secondPageUrls.add(link);

         }

        }
       }
      }
     }
    

    }
   }
  }
  // Progress report emitted after the loop above: logs how many
  // second-layer URLs have been collected so far.
  // NOTE(review): everything is logged at ERROR level, including plain
  // progress messages - confirm whether that is intentional.
  log.error("第二層抓取結束..........");
  log.error("目前抓取到的第二層URL個數爲:"+secondPageUrls.size());

 

int count=0;
  
  for (String url : secondPageUrls)
  {
   count++;
   log.error("正在抓取第二層的第"+count+"個URL:"+url);
   HtmlPage mainPage = null;
   try
   {
    mainPage = webClient.getPage(url);
   } catch (Exception e)
   {
    log.error(e.getMessage(), e);
   }
   log.error("抓取URL完成:"+url+",正在分析URL"+url+"+結果的URL");
   if (mainPage != null)
   {
    HtmlElement paginationContainer = mainPage
      .getElementById("paginationContainer");
    if (paginationContainer != null)
    {
     url = url + "/all-1";
    }
    HtmlElement products = mainPage.getElementById("products");
    if (products != null)
    {
     List<HtmlElement> list = products
       .getElementsByTagName("div");
     ;
     if (list == null || list.size() == 0)
     {
      continue;
     }
     for (HtmlElement h : list)
     {
      String cls = h.getAttribute("class");
      if (cls == null || !cls.equals("productName"))
      {
       continue;
      }
      List<HtmlElement> links = h.getElementsByTagName("a");
      if (links != null && links.size() > 0)
      {
       HtmlAnchor htmlAnchor = (HtmlAnchor) links.get(0);
       String linkStr = htmlAnchor.getHrefAttribute();
       thirdPageUrls.add(linkStr);
       log.error(linkStr);
      }

     }
    }

   }
  }
  log.error("第二層抓取結束..........");
  secondPageUrls.clear();
  secondPageUrls = null;
  
   count=0;
  log.error("目前抓取到的第三層URL個數爲:"+thirdPageUrls.size());
  String urlPrix="
http://www.swarovski-crystallized.com/jewelry/us/";
 for (String url : thirdPageUrls)
  {
   count++;
   log.error("正在抓取第三層的第"+count+"個URL:"+url);
   HtmlPage mainPage = null;
   try
   {
    mainPage = webClient.getPage(url);
   } catch (Exception e)
   {
    log.error(e.getMessage(), e);
   }
   if (mainPage != null)
   {
    log.error("抓取URL完成:"+url+",正在分析URL"+url+"+結果");
    int indexC=url.indexOf(urlPrix);
    int indexD=url.indexOf("?");
    String dirStr=url.substring(indexC+urlPrix.length(), indexD);
    String regEx = "/";
//    Pattern p = Pattern.compile(regEx);
//    Matcher m = p.matcher(dirStr);
    

//哦哦哦,建立文件夾準備把抓到數據放在裏面
    dirStr=replece( regEx,"\\\\",dirStr);
    dirStr = "D:\\swaroski\\"+dirStr;
    File   file   =   new   File(dirStr);  
    if(file.isDirectory())
    {
     dirStr=dirStr+"\\"+count;
     file   =   new   File(dirStr); 
    }

 file.mkdirs();

Product product=new Product();
    product.setLocalDir(dirStr);
    
    product.setPageUrl(url);
    HtmlElement rightCol = mainPage.getElementById("rightCol");
    String title = null;
    String description = null;
    String packingUnit = null;
    if (rightCol != null)
    {
     HtmlElement headlineDiv = rightCol
       .getElementById("headline");
     if (headlineDiv != null)

……(此處省略部分代碼)……

log.error("完成:"+url+",分析結果");
     try
     {
      swaroSkiDAO.addProduct(product);
     } catch (Exception e)
     {
      log.error(e.getMessage(), e);
     }
     log.error("完成保存結果");
     
     for(String downloadUrl:resourceUrlList)
     {
      int index6=downloadUrl.lastIndexOf("/");
      String fileName=downloadUrl.substring(index6+1);
      String dirStr2=dirStr+"\\"+fileName;
         File storeFile = new File(fileName);
         if(storeFile.exists())
         {
          continue;
         }
      SaveFileThread runable=new SaveFileThread(dirStr2,downloadUrl,sem);
      pools.submit(runable);
      log.error("開始提交下載文件:"+downloadUrl);
      try
      {
       Thread.sleep(2500);
      } catch (InterruptedException e)
      {
      }

}

 

 

// This is the worker that downloads the images and animations.

/**
 * Runnable that fetches one resource over HTTP and writes the response body
 * to a local file.
 *
 * NOTE(review): the published snippet is cut off after the catch block, so
 * the end of run() and of the class is not visible here.
 */
class SaveFileThread implements Runnable
 {
  // Path of the local file the response body is written to.
  private String fileName;
  // URL of the resource to download.
  private String downloadUrl;
  // Stored by the constructor; not used in the visible part of run().
  private Semaphore sem;
          /**
           * @param fileName    path of the local file to write
           * @param downloadUrl URL of the resource to fetch
           * @param sem         semaphore supplied by the caller (its use is
           *                    not visible in this snippet)
           */
          public SaveFileThread(String fileName,String downloadUrl,Semaphore sem)
          {
           this.fileName=fileName;
           this.downloadUrl=downloadUrl;
           this.sem=sem;
          }
  /**
   * Executes a GET request for downloadUrl and writes the whole response
   * body to fileName. Any exception is logged and not rethrown.
   */
  public void run()
  {
            HttpClient client = new HttpClient(); 
           GetMethod get = new GetMethod(downloadUrl); 
           FileOutputStream output=null;
           try
     {
        client.executeMethod(get);
         File storeFile = new File(fileName); 
             // The complete body is read into memory and written in one call.
             output = new FileOutputStream(storeFile); 
             output.write(get.getResponseBody());
             output.flush();
     } catch (Exception e)
     {
      log.error(e.getMessage(), e);
     }
     // NOTE(review): no close() of output and no release of the GET
     // connection is visible in this snippet - confirm that the omitted
     // remainder handles cleanup (e.g. in a finally block).

發佈了17 篇原創文章 · 獲贊 0 · 訪問量 2941
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章