import lombok.Data;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.springframework.util.StreamUtils;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
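/**
 * Crawls an Atlassian Confluence wiki page tree starting from a root page id,
 * saves each page through the wiki's Word export and, when enabled, downloads
 * the ".ppt" attachments linked from the pages.
 *
 * Created on 2019/10/24.
 */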
public class GetWikiDataToDoc {
/** Base URL of the wiki (placeholder value); every request URL in this class is built from it. */
public static final String ROOT_WIKI = "wiki地址";

/** Relative "view page" path; the page id is appended to it. */
public static final String PAGES_VIEWPAGE_ACTION_PAGE_ID = "/pages/viewpage.action?pageId=";

/** Absolute "view page" URL prefix; the page id is appended to it. */
public static final String VIEW_PAGE = ROOT_WIKI + PAGES_VIEWPAGE_ACTION_PAGE_ID;

/** Absolute URL prefix of the Word export; the page id is appended to it. */
public static final String ExportPAGE = ROOT_WIKI + "/exportword?pageId=";

/**
 * Relative login path. The account and password placeholders are part of the
 * query string and must be replaced with real credentials.
 */
public static final String DOLOGIN_PARAM = "/dologin.action?os_username=賬號&os_password=密碼&login=%E7%99%BB%E5%BD%95&os_destination=";

/** Account name; only used to judge whether the login succeeded. */
public static final String username = "你的帳號";

/** Local directory under which the exported page tree is written. */
public static final String Root_Path = "d:\\test\\";

/** Local directory prefix used for downloaded attachments. */
public static final String DOWNLOAD_ATTACHMENTS1 = "/download/attachments_test/";

/** When {@code false}, {@code getAttachments} returns immediately without scanning the page. */
public static final boolean IS_DOWNLOAD_ATTACHMENTS = true;

/** Per-thread cookie value captured by {@code getSession} and replayed by {@code setUrlHeader}. */
public static ThreadLocal<String> d = new ThreadLocal<>();

/** HTTP client shared by every request issued from this class. */
public static final CloseableHttpClient httpclient = HttpClients.createDefault();
@Data
public static class Menu {
    /** Page title, trimmed and with double quotes removed; never {@code null}. */
    String name;
    /** Page id, or the raw link when the link did not carry an id. */
    String id;
    /** Link to the page, relative to the wiki root. */
    String url;
    /** Child pages in the order they were added. */
    List<Menu> childrens = new ArrayList<>();
    /** Attachment links of the page. */
    List<String> attachments = new ArrayList<>();

    /**
     * Creates a node of the page tree.
     *
     * @param name page title; {@code null} becomes the empty string, otherwise the
     *             value is trimmed and every double quote is removed
     * @param id   page id
     * @param url  link to the page
     */
    public Menu(String name, String id, String url) {
        if (name == null) {
            this.name = "";
        } else {
            this.name = name.trim().replace("\"", "");
        }
        this.id = id;
        this.url = url;
    }

    /**
     * Appends a child page to this node.
     *
     * @param m the child to append
     */
    public void addChildren(Menu m) {
        childrens.add(m);
    }
}
/**
 * Entry point: crawls every configured root page.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Key: id of a root page. Value: name given to the root node of that tree.
    Map<String, String> roots = new HashMap<>();
    roots.put("根目錄id", "XXXX");
    try {
        for (Map.Entry<String, String> root : roots.entrySet()) {
            getWikiByRootID(root.getKey(), root.getValue());
        }
    } catch (Throwable failure) {
        // Any failure stops the remaining roots and is only reported.
        failure.printStackTrace();
    }
}
/**
 * Opens a session, logs in and then crawls the page tree below one root page,
 * writing the result under {@code Root_Path}.
 *
 * @param id        id of the root page
 * @param groupname name given to the root node (and therefore to its folder)
 */
public static void getWikiByRootID(String id, String groupname) {
    String rootUrl = PAGES_VIEWPAGE_ACTION_PAGE_ID + id;
    Menu rootMenu = new Menu(groupname, id, rootUrl);

    getSession(ROOT_WIKI, httpclient);
    lgoin(ROOT_WIKI + DOLOGIN_PARAM, httpclient);

    getChildrenIds(rootMenu.url, rootMenu, 0, Root_Path);
}
/**
 * Collects the attachment links of a page and downloads those whose link text
 * contains ".ppt".
 *
 * @param doc  parsed page; may be {@code null}
 * @param id   page id, used as suffix of the attachment folder name
 * @param name page name, used as prefix of the attachment folder name
 * @return the {@code href} of every attachment link found; empty when attachment
 *         handling is disabled or {@code doc} is {@code null}
 */
private static List<String> getAttachments(Document doc, String id, String name) {
    List<String> hrefs = new ArrayList<>();
    if (!IS_DOWNLOAD_ATTACHMENTS || doc == null) {
        return hrefs;
    }

    Elements links = doc.select("a[href^=/download/attachments/]");
    for (int i = 0; i < links.size(); i++) {
        String href = links.get(i).attr("href");
        String label = links.get(i).text();
        hrefs.add(href);

        // Only attachments of the selected type are downloaded.
        if (!label.contains(".ppt")) {
            continue;
        }
        String fileName = label.trim().replace("\"", "");
        try {
            // The download location is deliberately left as is for now.
            downloadFile(ROOT_WIKI + href, fileName, DOWNLOAD_ATTACHMENTS1 + name.trim() + "_" + id);
            System.out.println("下載附件:" + fileName + " 成功");
        } catch (Throwable throwable) {
            // A failing attachment must not stop the remaining ones.
            throwable.printStackTrace();
        }
    }
    return hrefs;
}
/**
 * Exports one page and recursively walks its child pages.
 *
 * <p>The page is fetched, its attachments are handled, its Word export is saved
 * to {@code path + parent.name}, and every link inside the element with id
 * "page-children" is visited in turn. Recursion stops at depth 7.
 *
 * @param url    link of the page, relative to the wiki root
 * @param parent tree node of the page; receives the resolved id and the children
 * @param level  current depth, 0 for the root page
 * @param path   directory in which the folder of this page is created
 */
public static void getChildrenIds(String url, Menu parent, int level, String path) {
    final int maxDepth = 7;
    if (level >= maxDepth) {
        return;
    }
    Document doc = getHtml(ROOT_WIKI + url, httpclient);
    if (doc == null) {
        return;
    }

    // Rewritten ("display") URLs carry no page id, so it is read from the page itself.
    boolean idResolved = true;
    if (url.contains("display")) {
        Elements pageIdInputs = doc.select("input[name=treePageId]");
        if (pageIdInputs.isEmpty()) {
            // Without this guard a page lacking the input aborted the whole crawl with an NPE.
            idResolved = false;
            System.err.println("No treePageId found, export skipped: " + url);
        } else {
            parent.setId(pageIdInputs.first().val());
        }
    }

    if (idResolved) {
        try {
            getAttachments(doc, parent.id, parent.name);
            downloadFile(ExportPAGE + parent.id, parent.name + ".doc", path + parent.name + "\\");
        } catch (Throwable throwable) {
            // A failing export must not prevent the children from being visited.
            throwable.printStackTrace();
        }
    }

    if (doc.getElementById("page-children") == null) {
        return;
    }
    Elements childLinks = doc.getElementById("page-children").getElementsByTag("a");
    for (int i = 0; i < childLinks.size(); i++) {
        String title = childLinks.get(i).text();
        String href = childLinks.get(i).attr("href");
        // Pages marked as obsolete in their title are skipped, as are links without a target.
        if (title.contains("廢棄") || href.isEmpty()) {
            continue;
        }
        // Everything before the first space is dropped (presumably a numbering prefix);
        // the Menu constructor trims the remaining leading space.
        String name = title;
        int firstSpace = title.indexOf(" ");
        if (firstSpace >= 0) {
            name = title.substring(firstSpace);
        }
        Menu child = new Menu(name, href.replace(PAGES_VIEWPAGE_ACTION_PAGE_ID, ""), href);
        parent.addChildren(child);
        getChildrenIds(href, child, level + 1, path + parent.name + "\\");
    }
}
/**
 * Builds the default request headers sent with every request.
 *
 * @return a new, mutable map from header name to header value
 */
private static Map<String, String> getStringStringMap() {
    // No content-length entry: that header is intentionally not sent.
    final String[][] defaults = {
        {"accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"},
        {"accept-language", "zh-CN,zh;q=0.9"},
        {"connection", "keep-alive"},
        {"content-type", "application/x-www-form-urlencoded"},
        {"user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"},
    };
    Map<String, String> headers = new HashMap<>();
    for (String[] pair : defaults) {
        headers.put(pair[0], pair[1]);
    }
    return headers;
}
/**
 * Requests {@code url} and remembers the first cookie the server hands out, so
 * that {@code setUrlHeader} can replay it on later requests of this thread.
 *
 * <p>A cookie that is already stored is never overwritten. Only the leading
 * {@code name=value} pair of the {@code Set-Cookie} value is kept, because
 * attributes such as {@code Path} or {@code HttpOnly} are not part of a
 * {@code Cookie} request header. Failures are reported and otherwise ignored.
 *
 * @param url        address to request
 * @param httpclient client used to execute the request
 */
private static void getSession(String url, CloseableHttpClient httpclient) {
    HttpGet httpget = setUrlHeader(url);
    try (CloseableHttpResponse response = httpclient.execute(httpget)) {
        String stored = d.get();
        if (stored != null && !stored.isEmpty()) {
            return;
        }
        for (Header header : response.getAllHeaders()) {
            // Header field names are case-insensitive.
            if (!"Set-Cookie".equalsIgnoreCase(header.getName())) {
                continue;
            }
            String value = header.getValue();
            if (value == null) {
                continue;
            }
            int attributesStart = value.indexOf(';');
            String pair = (attributesStart < 0 ? value : value.substring(0, attributesStart)).trim();
            if (!pair.isEmpty()) {
                d.set(pair);
                return;
            }
        }
    } catch (Exception e) {
        // Best effort: without a cookie the following requests are simply sent without one.
        e.printStackTrace();
    }
}
/**
 * Fetches a page and parses its body as UTF-8 HTML with {@code ROOT_WIKI} as base URI.
 *
 * @param url        absolute address of the page
 * @param httpclient client used to execute the request
 * @return the parsed document, or {@code null} when fetching or parsing failed
 */
private static Document getHtml(String url, CloseableHttpClient httpclient) {
    Document page = null;
    HttpGet request = setUrlHeader(url);
    try (CloseableHttpResponse response = httpclient.execute(request)) {
        page = Jsoup.parse(response.getEntity().getContent(), "UTF-8", ROOT_WIKI);
    } catch (Throwable e) {
        // Reported only; the caller sees the failure as a null result.
        e.printStackTrace();
    }
    return page;
}
/**
 * Creates a GET request for {@code url} carrying the default headers, the stored
 * cookie (if any) and 10 second connect, socket and connection-request timeouts.
 *
 * @param url address to request
 * @return the configured request
 */
private static HttpGet setUrlHeader(String url) {
    HttpGet request = new HttpGet(url);
    for (Map.Entry<String, String> header : getStringStringMap().entrySet()) {
        request.addHeader(header.getKey(), header.getValue());
    }

    String cookie = d.get();
    if (cookie != null && !cookie.isEmpty()) {
        request.addHeader("Cookie", cookie);
    }

    final int timeoutMillis = 10000;
    RequestConfig timeouts = RequestConfig.custom()
            .setConnectionRequestTimeout(timeoutMillis)
            .setConnectTimeout(timeoutMillis)
            .setSocketTimeout(timeoutMillis)
            .build();
    request.setConfig(timeouts);
    return request;
}
/**
 * Requests the login URL and verifies the outcome through the
 * {@code X-AUSERNAME} response header, which is compared with {@code username}.
 *
 * <p>A transport failure is reported and ignored, and so is a response that
 * carries no {@code X-AUSERNAME} header, because the outcome cannot be judged
 * then. A header naming a different user means the login was rejected and is
 * raised to the caller. Previously that error was thrown inside the very try
 * block whose catch swallowed it, so the crawl went on unauthenticated.
 *
 * @param url        full login address including the credentials
 * @param httpclient client used to execute the request
 * @throws IllegalStateException when the server reports a user other than {@code username}
 */
private static void lgoin(String url, CloseableHttpClient httpclient) {
    HttpGet httpget = setUrlHeader(url);
    String reportedUser = null;
    try (CloseableHttpResponse response = httpclient.execute(httpget)) {
        for (Header header : response.getAllHeaders()) {
            // Header field names are case-insensitive.
            if ("X-AUSERNAME".equalsIgnoreCase(header.getName())) {
                reportedUser = header.getValue();
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        return;
    }

    if (reportedUser == null) {
        System.err.println("X-AUSERNAME header missing, login result unknown");
        return;
    }
    if (!username.equals(reportedUser)) {
        throw new IllegalStateException(username + " 登錄失敗");
    }
    System.out.println(username + " 登錄成功");
}
/**
 * Downloads {@code url} into the file {@code filePach/fileName}.
 *
 * <p>Nothing is done when the target file already exists. Missing parent
 * directories are created. The body is stored only for a 2xx response, so that
 * error pages are not saved as documents. A file left incomplete by a failed
 * transfer is removed, since it would otherwise make every later call skip the
 * download. Failures are reported on the console and not propagated.
 *
 * @param url      absolute address of the resource
 * @param fileName name of the target file
 * @param filePach directory of the target file
 * @throws Throwable declared for compatibility with existing callers
 */
public static void downloadFile(String url, String fileName, String filePach) throws Throwable {
    File desc = new File(filePach + File.separator + fileName);
    if (desc.exists()) {
        return;
    }
    File folder = desc.getParentFile();
    if (folder != null) {
        folder.mkdirs();
    }

    HttpGet httpget = setUrlHeader(url);
    boolean completed = false;
    try (CloseableHttpResponse response = httpclient.execute(httpget)) {
        int status = response.getStatusLine().getStatusCode();
        HttpEntity entity = response.getEntity();
        if (status < 200 || status >= 300 || entity == null) {
            System.err.println("下載:" + fileName + " 失敗 (HTTP " + status + ")");
            return;
        }
        try (InputStream is = entity.getContent();
             OutputStream os = new FileOutputStream(desc)) {
            StreamUtils.copy(is, os);
        }
        completed = true;
        // fileName already carries its extension, so none is appended here.
        System.out.println("下載:" + fileName + " 成功");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (!completed && desc.exists() && !desc.delete()) {
            System.err.println("Could not remove incomplete file: " + desc);
        }
    }
}
}
// Java版爬wiki【Atlassian Confluence】信息的
// 發表評論
// 所有評論
// 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.