Java版爬取wiki【Atlassian Confluence】信息的工具

import lombok.Data;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.springframework.util.StreamUtils;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by  on 2019/10/24<br >
 *
 *
 */
public class GetWikiDataToDoc {

   public static final String ROOT_WIKI = "wiki地址";
   public static final String PAGES_VIEWPAGE_ACTION_PAGE_ID = "/pages/viewpage.action?pageId=";
   public static final String Root_Path = "d:\\test\\";
   public static final String DOLOGIN_PARAM = "/dologin.action?os_username=賬號&os_password=密碼&login=%E7%99%BB%E5%BD%95&os_destination=";
   public static final String DOWNLOAD_ATTACHMENTS1 = "/download/attachments_test/";


   public static ThreadLocal<String> d=new ThreadLocal<>();

   public static final String VIEW_PAGE = ROOT_WIKI + "/pages/viewpage.action?pageId=";

   public static final String ExportPAGE = ROOT_WIKI + "/exportword?pageId=";

   // 只是用來判斷是否登錄成功
   public  static final  String username="你的帳號";
   public static final  CloseableHttpClient httpclient = HttpClients.createDefault();

   public static final boolean IS_DOWNLOAD_ATTACHMENTS=true;

   @Data
   public static class Menu{

      public Menu(String name,String id,String url){
         this.name = name == null ? "" : name.trim().replace("\"", "");
         this.id=id;
         this.url=url;
      }
      String name;
      String id;
      String url;
      List<Menu> childrens=new ArrayList<>();
      List<String> attachments=new ArrayList<>();


      public void addChildren(Menu m){
         childrens.add(m);
      }


   }
   public static void main(String[] args) {

      Map <String,String > map=new HashMap<>();
      map.put("根目錄id", "XXXX");
      try {
         map.entrySet().forEach(e->{
            getWikiByRootID(e.getKey(),e.getValue());
         });
      }  catch (Throwable throwable) {
         throwable.printStackTrace();
      }



   }
   public static void getWikiByRootID(String id,String groupname) {
      Menu root=new Menu(groupname,id, "/pages/viewpage.action?pageId="+id);
      getSession(ROOT_WIKI, httpclient);
      lgoin(ROOT_WIKI+ DOLOGIN_PARAM, httpclient);
      getChildrenIds(root.url,root,0,Root_Path);
   }

   private static List<String> getAttachments(Document doc,String id,String name){
      List<String> ls=new ArrayList<>();
      if(!IS_DOWNLOAD_ATTACHMENTS)
         return ls;
       if(doc!=null){
       //    doc.getElementsByTag("a").select()
          Elements childrens= doc.select("a[href^=/download/attachments/]");
          childrens.forEach(e->{
             ls.add(e.attr("href"));
             // 下載指定類型附件
             if(e.text().contains(".ppt")) {
                try {
                   // 暫時不調整下載文件路徑
                   downloadFile(ROOT_WIKI + e.attr("href"), e.text().trim().replace("\"", ""),
                         DOWNLOAD_ATTACHMENTS1 + name.trim() + "_" + id);
                   System.out.println("下載附件:" + e.text().trim().replace("\"", "") + " 成功");
                } catch (Throwable throwable) {
                   throwable.printStackTrace();
                }
             }
          });
       }
      return ls;


   }


   public static void getChildrenIds(String url,Menu parent,int level,String path) {
      if (level >= 7)
         return;
      Document doc = getHtml(ROOT_WIKI + url, httpclient);
      if(doc==null)
         return ;
      // 解決url重寫問題
      if(url.contains("display")){
         parent.setId(doc.selectFirst("input[name=treePageId]").val());
      }
      try {
         getAttachments(doc,parent.id,parent.name);
         downloadFile(ExportPAGE + parent.id, parent.name + ".doc", path + parent.name + "\\");
      } catch (Throwable throwable) {
         throwable.printStackTrace();
      }
      if(doc.getElementById("page-children")!=null) {
         Elements childrens = doc.getElementById("page-children").getElementsByTag("a");
         if (childrens != null && childrens.size() > 0) {
            childrens.forEach(e -> {
               if(!e.text().contains("廢棄")) {
                  String name = e.text();
                  if (e.text().contains(" ")) {
                     name = e.text().substring(e.text().indexOf(" "));
                  }
                  String url2=e.attr("href");
                  Menu chi = new Menu(name, url2.replace(PAGES_VIEWPAGE_ACTION_PAGE_ID, ""), url2);
                  parent.addChildren(chi);
                  getChildrenIds(e.attr("href"), chi, level + 1, path + parent.name + "\\");
               }
            });
         }
      }
   }




   private static Map<String, String> getStringStringMap() {
      Map<String,String> map=new HashMap<>();
      map.put("accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
      map.put("accept-language","zh-CN,zh;q=0.9");
      map.put("connection","keep-alive");
      //map.put("content-length","87");
      map.put("content-type","application/x-www-form-urlencoded");
      map.put("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
      return map;
   }





   private static void getSession(String url, CloseableHttpClient httpclient) {
      HttpGet httpget = setUrlHeader(url);
      try (CloseableHttpResponse response = httpclient.execute(httpget)) {
         for(Header obj : response.getAllHeaders()){
            if("Set-Cookie".equals(obj.getName())){
               if(d.get()==null||"".equals(d.get()))
               d.set(obj.getValue());
            }

         }
      }catch(Throwable e){
         e.printStackTrace();
      }
   }

   private static Document getHtml(String url, CloseableHttpClient httpclient) {
      HttpGet httpget = setUrlHeader(url);
      try (CloseableHttpResponse response = httpclient.execute(httpget)) {
         HttpEntity entity = response.getEntity();
         return Jsoup.parse(entity.getContent(),"UTF-8",ROOT_WIKI);
      }catch(Throwable e){
         e.printStackTrace();
      }
      return null;
   }

   private static HttpGet setUrlHeader(String url) {
      HttpGet httpget = new HttpGet(url);
      getStringStringMap().entrySet().forEach(e->{
         httpget.addHeader(e.getKey(),e.getValue());
      });
      if( d.get()!=null&&!"".equals(d.get())){
         httpget.addHeader("Cookie", d.get());
      }
      httpget.setConfig(RequestConfig.custom() //
            .setConnectionRequestTimeout(10000) //
            .setConnectTimeout(10000) //
            .setSocketTimeout(10000) //
            .build());
      return httpget;
   }

   private static void lgoin(String url, CloseableHttpClient httpclient) {
      HttpGet httpget = setUrlHeader(url);
      try (CloseableHttpResponse response = httpclient.execute(httpget)) {
         HttpEntity entity = response.getEntity();
         for(Header obj : response.getAllHeaders()){
            //System.out.println("url = [" + obj.getName() + "]"+obj.getValue());
            if("X-AUSERNAME".equals(obj.getName())){
               if(username.equals(obj.getValue())){
                  System.out.println(username+" 登錄成功");
               }else{
                  throw new Exception(username+" 登錄失敗");
               }
            }
         }
      }catch(Throwable e){
         e.printStackTrace();
      }
   }


   public static void downloadFile(String url,String fileName,String filePach) throws Throwable {
      File desc = new File(filePach + File.separator + fileName);
      File folder = desc.getParentFile();
      if (desc.exists()) {
         return;
      }
      folder.mkdirs();
      HttpGet httpget = setUrlHeader(url);
      try (CloseableHttpResponse response = httpclient.execute(httpget)) {
         org.apache.http.HttpEntity entity = response.getEntity();

         try (InputStream is = entity.getContent(); //
               OutputStream os = new FileOutputStream(desc)) {
            StreamUtils.copy(is, os);
         }
         System.out.println("下載:" + fileName + ".doc 成功");
      } catch (Throwable e) {
         e.printStackTrace();
      }
   }

}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章