package net.aykj.util;
import java.io.File;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.servlet.ServletContext;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import net.aykj.pojo.Annex;
import net.aykj.pojo.Article;
import net.aykj.pojo.Rule;
import net.aykj.service.AnnexService;
import net.aykj.service.ArticleService;
import net.aykj.service.RuleService;
import sun.security.x509.GeneralName;
/**
* Crawler (spider) worker thread.
* Updated by Bingyong.Wang: when the detail page does not show a time, the time is taken from the list page.
*/
@SuppressWarnings({"unchecked", "unused"})
public class SpiderThreadUtil extends Thread {
// Cooperative stop flag. It is written through clear()/setStop() (typically from a thread other
// than the crawler itself) and polled by the crawl loops, so it is volatile to guarantee that
// the write becomes visible to the running crawler thread.
private volatile boolean stop = false;
// Looks up the rules that belong to rulesId.
private RuleService ruleService = null;
// Used to check for existing articles and to save new ones.
private ArticleService articleService = null;
// Used to save attachments found in article bodies.
private AnnexService annexService = null;
// Holds the progress counters, stored as attributes named rulesName + "_xxxCount".
private ServletContext servletContext = null;
private Integer rulesId = null;
private String rulesName = null;

/**
 * Creates a crawler thread for one rule set.
 *
 * @param ruleService    service used to load the rules of the rule set
 * @param articleService service used to look up and store articles
 * @param annexService   service used to store attachments
 * @param servletContext context in which the progress counters are published
 * @param rulesName      name of the rule set; also the prefix of the counter attribute names
 * @param rulesId        id of the rule set to crawl
 */
public SpiderThreadUtil(RuleService ruleService, ArticleService articleService, AnnexService annexService, ServletContext servletContext, String rulesName, Integer rulesId) {
    this.ruleService = ruleService;
    this.articleService = articleService;
    this.annexService = annexService;
    this.servletContext = servletContext;
    this.rulesId = rulesId;
    this.rulesName = rulesName;
}
/**
 * Thread entry point. Loads the rules of the rule set, collects the detail-page links of every
 * rule, publishes the total number of links in the servlet context and then crawls the articles.
 * Any failure is printed and ends the run; nothing is rethrown.
 */
@Override
public void run() {
    try {
        String totalCount = rulesName + "_totalCount";
        // number of articles collected in this run
        String getCount = rulesName + "_getCount";
        // number of articles that already existed
        String gotCount = rulesName + "_gotCount";
        String errorCount = rulesName + "_errorCount";
        List<Rule> ruleList = ruleService.queryRuleListByRulesId(rulesId, true);
        if (ruleList == null || ruleList.isEmpty()) {
            return;
        }
        List<Rule> newRuleList = new ArrayList<Rule>();
        int total = 0;
        for (Rule rule : ruleList) {
            // Detail-page links of this rule. When the detail page carries no time, each entry
            // is "<link>VT<time>" with the time taken from the list page.
            List<String> viewUrlList = queryViewUrlList(rule, errorCount);
            if (viewUrlList == null) {
                // A rule without a list URL yields no links; it must not abort the other rules.
                viewUrlList = new ArrayList<String>();
            }
            System.out.println("翻轉的詳情連接======" + viewUrlList);
            total = total + viewUrlList.size();
            System.out.println("-----------------------"+rule.getNewsClassId()+"欄目,查到" + total + "篇---------------------");
            // links that still have to be crawled for this rule
            rule.setViewUrlList(viewUrlList);
            newRuleList.add(rule);
            // Stop requested: isStop() has already removed the counters, so leave immediately
            // instead of publishing the total again and starting to crawl.
            if (isStop(totalCount, getCount, gotCount, errorCount)) {
                return;
            }
        }
        // publish the total number of articles to crawl
        servletContext.setAttribute(totalCount, total);
        // crawl the articles
        catchArticles(newRuleList, totalCount, getCount, gotCount, errorCount);
        System.out.println("----------------------- 數據採集完成,共採集到" + total + "篇 ----------------------");
    } catch (InterruptedException e) {
        // keep the interrupt status visible to whoever owns this thread
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Crawls every detail-page link stored on the given rules.
 * An entry is either a plain link or "<link>VT<time>" when the time was taken from the list page;
 * entries with a time go to catchArticleByViewUrlAndCreatetime, all others to catchArticle.
 *
 * @param newRuleList rules whose view-URL lists have been filled in
 * @param totalCount  name of the total counter attribute
 * @param getCount    name of the "collected" counter attribute
 * @param gotCount    name of the "already existed" counter attribute
 * @param errorCount  name of the error counter attribute
 * @throws Exception whatever the per-article crawl throws
 */
private void catchArticles(List<Rule> newRuleList, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
    final String separator = "VT";
    for (Rule rule : newRuleList) {
        List<String> viewUrlList = (List<String>) rule.getViewUrlList();
        if (viewUrlList == null) {
            continue;
        }
        for (String viewUrl : viewUrlList) {
            // Stop requested: leave the whole crawl, not only the current rule.
            if (isStop(totalCount, getCount, gotCount, errorCount)) {
                return;
            }
            if (viewUrl == null) {
                continue;
            }
            // Both values are derived from the current entry only, so nothing leaks over from
            // the previous iteration.
            String link = viewUrl;
            String createtime = null;
            // The time is appended after the link, so the last separator is the one that counts
            // (a link may itself contain "VT").
            // NOTE(review): a plain link without a time that itself contains "VT" is still split
            // here; the entry format cannot tell the two cases apart.
            int separatorAt = viewUrl.lastIndexOf(separator);
            if (separatorAt > 0) {
                link = viewUrl.substring(0, separatorAt);
                createtime = viewUrl.substring(separatorAt + separator.length());
            }
            // strip the href="..." wrapper that the link regex may have captured
            link = link.replaceAll("href=\"|\"", "");
            System.out.println(link);
            if (GeneralUtil.isNotNull(link) && GeneralUtil.isNotNull(createtime)) {
                catchArticleByViewUrlAndCreatetime(rule, link, createtime, totalCount, getCount, gotCount, errorCount);
            } else {
                catchArticle(rule, link, totalCount, getCount, gotCount, errorCount);
            }
        }
    }
}
/**
 * Crawls one article whose creation time is taken from the detail page itself.
 * Downloads the page, extracts the fields with the rule's regular expressions, skips articles
 * that already exist, localises the images, saves the article and, when the body links a PDF
 * under /static/upload, stores that PDF as an attachment.
 *
 * @param rule       rule describing how to extract the fields
 * @param viewUrl    URL of the detail page
 * @param totalCount name of the total counter attribute (not used here)
 * @param getCount   name of the "collected" counter attribute
 * @param gotCount   name of the "already existed" counter attribute
 * @param errorCount name of the error counter attribute
 * @throws Exception on download, parse or persistence failures
 */
private void catchArticle(Rule rule, String viewUrl, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
    String encode = rule.getEncode();
    encode = encode == null ? "UTF-8" : encode;
    String content = HttpUtil.get(viewUrl, encode);
    if ("HTTP/1.1 404 Not Found".equals(content)) {
        System.out.println("詳細頁鏈接:"+viewUrl+",訪問404,跳過");
        this.addErrorCount(errorCount);
        return;
    }
    Integer subsiteId = rule.getSubsiteId();
    Integer newsClassId = rule.getNewsClassId();
    // Every extraction bumps the error counter when its pattern finds nothing, so all six calls
    // are always made, in this order.
    List<String> titleList = this.extractStrByPattern(content, rule.getTitleRegex(), errorCount, true, rule.getTitleFilterRegex());
    List<String> authorList = this.extractStrByPattern(content, rule.getAuthorRegex(), errorCount, true, rule.getAuthorFilterRegex());
    List<String> createtimeList = this.extractStrByPattern(content, rule.getCreatetimeRegex(), errorCount, true, rule.getCreatetimeFilterRegex());
    List<String> sourceList = this.extractStrByPattern(content, rule.getSourceRegex(), errorCount, true, rule.getSourceFilterRegex());
    List<String> hitsList = this.extractStrByPattern(content, rule.getHitsRegex(), errorCount, true, rule.getHitsFilterRegex());
    List<String> contentList = this.extractStrByPattern(content, rule.getContentRegex(), errorCount, false, rule.getContentFilterRegex());

    String title = titleList != null && titleList.size() > 0 ? titleList.get(0).trim() : null;
    // no title means nothing usable was extracted: count it as an error and skip
    if (!GeneralUtil.isNotNull(title)) {
        System.out.println("詳細頁鏈接:"+viewUrl+",採集到的標題爲空");
        this.addErrorCount(errorCount);
        return;
    }

    // Creation time text. It may be missing altogether, so it is null-checked before use.
    String createtimeStr = createtimeList != null && createtimeList.size() > 0 ? createtimeList.get(0) : null;
    if (createtimeStr != null && createtimeStr.contains("發佈時間")) {
        int start = createtimeStr.indexOf("發佈時間");
        // The time ends where the author label begins; without that label take the rest.
        int end = createtimeStr.indexOf("作者", start);
        if (end < 0) {
            end = createtimeStr.length();
        }
        createtimeStr = createtimeStr.substring(start, end).replace("發佈時間:", "").trim();
    }

    // Article source.
    String source = sourceList != null && sourceList.size() > 0 ? sourceList.get(0) : "原創";
    if (source.contains("新聞來源")) {
        // The previous slice used the same index as start and end, so it always produced an
        // empty string and fell back to the default (or threw when the label had no colon).
        // The fallback is kept, without the exception.
        // NOTE(review): the text after the label was probably meant to be extracted - confirm.
        source = "原創";
    }

    // Author.
    String author = authorList != null && authorList.size() > 0 ? authorList.get(0).trim() : "管理員";
    if (author.contains("作者")) {
        // Same situation as for the source: the previous slice was always empty, so a labelled
        // author always ended up as the default.
        // NOTE(review): confirm whether the name after the label should be extracted instead.
        author = "管理員";
    }

    String hits = hitsList != null && hitsList.size() > 0 ? hitsList.get(0).trim() : "0";
    // NOTE(review): when no content matched, articleContent is null and the image step below
    // turns it into the literal text "null" - confirm the desired handling.
    String articleContent = contentList != null && contentList.size() > 0 ? contentList.get(0) : null;

    // Turn the extracted time text into a date.
    Date createtime = null;
    if (createtimeStr != null) {
        createtimeStr = createtimeStr.replaceAll("\\r.*\\n", "").trim();
        String timeFormat = rule.getTimeFormat();
        if (timeFormat != null && !"".equals(timeFormat)) {
            createtime = new SimpleDateFormat(timeFormat).parse(createtimeStr);
        }
    }
    if (createtime == null) {
        // no time text or no format configured: use the crawl time rather than saving null
        createtime = new Date();
    }

    if (articleExist(title, subsiteId, newsClassId)) {
        this.addGotCount(gotCount);
        return;
    }

    // download the images and point the article at the local copies
    articleContent = catchImageByViewUrl(articleContent, viewUrl);
    Integer aid = addArticle(title, author, createtime, source, articleContent, subsiteId, newsClassId, hits);

    // first PDF linked under /static/upload, if any, is stored as an attachment of the article
    Matcher matcher = Pattern.compile("/static/upload.*?pdf", Pattern.DOTALL).matcher(articleContent);
    String imgSrc = "";
    if (matcher.find()) {
        imgSrc = matcher.group();
    }
    if (GeneralUtil.isNotNull(imgSrc)) {
        String annexPath = downloadFile("http://www.ynsap.org.cn/"+imgSrc);
        Annex annex = new Annex();
        annex.setPath(annexPath);
        annex.setName(imgSrc.substring(imgSrc.lastIndexOf("/")+1));
        annex.setExt("pdf");
        annex.setType("annex");
        annex.setObj("article");
        annex.setCreatetime(createtime);
        annex.setObjId(aid);
        annexService.save(annex);
    }
    this.addGetCount(getCount);
}
/**
 * Crawls one article whose creation time was already taken from the list page.
 * Works like catchArticle, except that the time comes from createtimeTemp and the source is cut
 * out between the "來源:" label and the following "查看" marker.
 *
 * @param rule           rule describing how to extract the fields
 * @param viewUrl        URL of the detail page
 * @param createtimeTemp creation time text taken from the list page
 * @param totalCount     name of the total counter attribute (not used here)
 * @param getCount       name of the "collected" counter attribute
 * @param gotCount       name of the "already existed" counter attribute
 * @param errorCount     name of the error counter attribute
 * @throws Exception on download, parse or persistence failures
 * Bingyong.Wang, 2019-12-05
 */
private void catchArticleByViewUrlAndCreatetime(Rule rule, String viewUrl, String createtimeTemp, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
    String encode = rule.getEncode();
    encode = encode == null ? "UTF-8" : encode;
    String content = HttpUtil.get(viewUrl, encode);
    if ("HTTP/1.1 404 Not Found".equals(content)) {
        System.out.println("詳細頁鏈接:"+viewUrl+",訪問404,跳過");
        this.addErrorCount(errorCount);
        return;
    }
    Integer subsiteId = rule.getSubsiteId();
    Integer newsClassId = rule.getNewsClassId();
    // Every extraction bumps the error counter when its pattern finds nothing, so all six calls
    // are always made, in this order. The detail-page time is extracted only for that side
    // effect; the time actually used is createtimeTemp.
    List<String> titleList = this.extractStrByPattern(content, rule.getTitleRegex(), errorCount, true, rule.getTitleFilterRegex());
    List<String> authorList = this.extractStrByPattern(content, rule.getAuthorRegex(), errorCount, true, rule.getAuthorFilterRegex());
    this.extractStrByPattern(content, rule.getCreatetimeRegex(), errorCount, true, rule.getCreatetimeFilterRegex());
    List<String> sourceList = this.extractStrByPattern(content, rule.getSourceRegex(), errorCount, true, rule.getSourceFilterRegex());
    List<String> hitsList = this.extractStrByPattern(content, rule.getHitsRegex(), errorCount, true, rule.getHitsFilterRegex());
    List<String> contentList = this.extractStrByPattern(content, rule.getContentRegex(), errorCount, false, rule.getContentFilterRegex());

    String title = titleList != null && titleList.size() > 0 ? titleList.get(0).trim() : null;
    // no title means nothing usable was extracted: count it as an error and skip
    if (!GeneralUtil.isNotNull(title)) {
        System.out.println("詳細頁鏈接:"+viewUrl+",採集到的標題爲空");
        this.addErrorCount(errorCount);
        return;
    }

    // Article source: the text between "來源:" and the following "查看".
    String source = sourceList != null && sourceList.size() > 0 ? sourceList.get(0) : "原創";
    if (source.contains("來源")) {
        int start = source.indexOf("來源:");
        if (start < 0) {
            // label without colon: nothing to slice, use the default instead of throwing
            source = "原創";
        } else {
            int end = source.indexOf("查看", start);
            if (end < 0) {
                // no trailing marker: take the rest of the text
                end = source.length();
            }
            String extracted = source.substring(start, end).replace("來源:", "").replace("\r\n\t\t\t\t", "").trim();
            // As before, only a two-character source name is accepted.
            // NOTE(review): the reason for the exact length of 2 is not visible here - confirm.
            source = extracted.length() == 2 ? extracted : "原創";
        }
    }

    // Author.
    String author = authorList != null && authorList.size() > 0 ? authorList.get(0).trim() : "管理員";
    if (author.contains("作者")) {
        // The previous slice used the same index as start and end, so it was always empty and
        // a labelled author always ended up as the default.
        // NOTE(review): confirm whether the name after the label should be extracted instead.
        author = "管理員";
    }

    String hits = hitsList != null && hitsList.size() > 0 ? hitsList.get(0).trim() : "0";
    // NOTE(review): when no content matched, articleContent is null and the image step below
    // turns it into the literal text "null" - confirm the desired handling.
    String articleContent = contentList != null && contentList.size() > 0 ? contentList.get(0) : null;

    // Turn the list-page time text into a date.
    Date createtime = null;
    if (!GeneralUtil.isNull(createtimeTemp)) {
        createtimeTemp = createtimeTemp.replaceAll("\\r.*\\n", "").trim();
        String timeFormat = rule.getTimeFormat();
        if (timeFormat != null && !"".equals(timeFormat)) {
            createtime = new SimpleDateFormat(timeFormat).parse(createtimeTemp);
        }
    }
    if (createtime == null) {
        // no time text or no format configured: use the crawl time rather than saving null
        createtime = new Date();
    }

    if (articleExist(title, subsiteId, newsClassId)) {
        this.addGotCount(gotCount);
        return;
    }

    // download the images and point the article at the local copies
    articleContent = catchImageByViewUrl(articleContent, viewUrl);
    Integer aid = addArticle(title, author, createtime, source, articleContent, subsiteId, newsClassId, hits);

    // first PDF linked under /static/upload, if any, is stored as an attachment of the article
    Matcher matcher = Pattern.compile("/static/upload.*?pdf", Pattern.DOTALL).matcher(articleContent);
    String imgSrc = "";
    if (matcher.find()) {
        imgSrc = matcher.group();
    }
    if (GeneralUtil.isNotNull(imgSrc)) {
        String annexPath = downloadFile("http://www.ynsap.org.cn/"+imgSrc);
        Annex annex = new Annex();
        annex.setPath(annexPath);
        annex.setName(imgSrc.substring(imgSrc.lastIndexOf("/")+1));
        annex.setExt("pdf");
        annex.setType("annex");
        annex.setObj("article");
        annex.setCreatetime(createtime);
        annex.setObjId(aid);
        annexService.save(annex);
    }
    this.addGetCount(getCount);
}
/**
 * Downloads every image referenced by the article body and rewrites the body so that it points
 * at the local copies. Relative image paths are resolved against the given host.
 *
 * @param articleContent article HTML fragment
 * @param host           prefix put in front of image paths that do not start with "http://"
 * @return the fragment with image sources replaced by local paths
 * @throws ParserException      if the fragment cannot be parsed
 * @throws InterruptedException if the download pause is interrupted
 */
private String catchImage(String articleContent, String host) throws ParserException, InterruptedException {
    final String openTag = "<html>";
    final String closeTag = "</html>";
    // wrap the fragment so the parser sees a complete document
    String wrapped = openTag + articleContent + closeTag;
    SimpleNodeIterator images = new Parser(wrapped).extractAllNodesThatMatch(new TagNameFilter("img")).elements();
    while (images.hasMoreNodes()) {
        Tag imgTag = (Tag) images.nextNode();
        String src = imgTag.getAttribute("src");
        if (src == null) {
            continue;
        }
        String remote = src.replace("\\", "/");
        if (remote.startsWith("/")) {
            remote = host + remote;
        } else if (!remote.startsWith("http://")) {
            remote = host + "/" + remote;
        }
        String localPath = downloadFile(remote);
        wrapped = wrapped.replace(src, localPath);
    }
    // strip the wrapper again
    return wrapped.substring(openTag.length(), wrapped.length() - closeTag.length());
}
/**
 * Downloads every image referenced by the article body and rewrites the body so that it points
 * at the local copies. Image paths are resolved against the URL of the detail page, so
 * root-relative ("/a.png"), directory-relative ("a.png", "../a.png"), protocol-relative
 * ("//host/a.png") and absolute http/https sources all end up as the correct absolute URL.
 *
 * @param articleContent article HTML fragment
 * @param viewUrl        URL of the detail page the fragment was taken from
 * @return the fragment with image sources replaced by local paths
 * @throws ParserException      if the fragment cannot be parsed
 * @throws InterruptedException if the download pause is interrupted
 * Bingyong.Wang, 2019-08-08
 */
private String catchImageByViewUrl(String articleContent, String viewUrl) throws ParserException, InterruptedException {
    // wrap the fragment so the parser sees a complete document
    String html = "<html>" + articleContent + "</html>";
    Parser parser = new Parser(html);
    SimpleNodeIterator nodeList = parser.extractAllNodesThatMatch(new TagNameFilter("img")).elements();
    while (nodeList.hasMoreNodes()) {
        Tag tag = (Tag) nodeList.nextNode();
        String src = tag.getAttribute("src");
        // An empty src must be skipped: replacing "" would insert the new path between every
        // character of the document.
        if (src == null || src.trim().length() == 0) {
            continue;
        }
        String normalizedSrc = src.replace("\\", "/");
        String canDownloadSrc;
        try {
            canDownloadSrc = new java.net.URL(new java.net.URL(viewUrl), normalizedSrc).toExternalForm();
        } catch (java.net.MalformedURLException e) {
            // Page URL or src is not something java.net.URL understands: fall back to the
            // plain "page directory + src" concatenation.
            if (normalizedSrc.startsWith("http://")) {
                canDownloadSrc = normalizedSrc;
            } else {
                canDownloadSrc = viewUrl.substring(0, viewUrl.lastIndexOf("/") + 1) + normalizedSrc;
            }
        }
        String newSrc = downloadFile(canDownloadSrc);
        html = html.replace(src, newSrc);
    }
    // strip the wrapper again
    html = html.substring("<html>".length());
    html = html.substring(0, html.length() - "</html>".length());
    return html;
}
/**
 * Downloads a remote file into the application's temp directory under a timestamp-based name.
 * The file extension is taken from the last path segment of the URL only (query string and
 * fragment are ignored) and is dropped unless it is purely alphanumeric, so nothing from the
 * remote URL other than a plain extension can reach the local file name.
 *
 * @param src absolute URL of the file to download
 * @return web path of the local copy, "/temp/<file name>"
 * @throws InterruptedException if the pause before the download is interrupted
 */
private String downloadFile(String src) throws InterruptedException {
    // The pause throttles requests and keeps consecutive timestamp-based names distinct.
    Thread.sleep(500);
    String path = src;
    int queryAt = path.indexOf('?');
    if (queryAt >= 0) {
        path = path.substring(0, queryAt);
    }
    int fragmentAt = path.indexOf('#');
    if (fragmentAt >= 0) {
        path = path.substring(0, fragmentAt);
    }
    String lastSegment = path.substring(path.lastIndexOf('/') + 1);
    int dotAt = lastSegment.lastIndexOf('.');
    String ext = dotAt >= 0 ? lastSegment.substring(dotAt + 1) : "";
    if (!ext.matches("[A-Za-z0-9]{1,10}")) {
        ext = "";
    }
    String fileName = String.valueOf(System.currentTimeMillis());
    if (ext.length() > 0) {
        fileName = fileName + "." + ext;
    }
    String localFile = net.aykj.listener.InitialListener.basePath + "temp/" + fileName;
    HttpUtil.downloadFile(src, localFile);
    return "/temp/" + fileName;
}
/**
 * Tells whether an article with the given title already exists in the given sub-site and
 * news class, according to articleService.queryArticleCountByTitle.
 *
 * @param title       article title
 * @param subsiteId   sub-site id
 * @param newsClassId news class id
 * @return true when the service reports at least one such article
 */
private boolean articleExist(String title, Integer subsiteId, Integer newsClassId) {
    Long count = articleService.queryArticleCountByTitle(subsiteId, newsClassId, title);
    // a missing count is treated as "not found" instead of failing on unboxing
    return count != null && count > 0;
}
/**
 * Saves an article under the given news class. The article is stored as audited (audit = 1).
 *
 * @param title          article title
 * @param author         article author
 * @param createtime     creation time
 * @param source         article source
 * @param articleContent article body
 * @param subsiteId      sub-site id (not used when saving)
 * @param newsClassId    news class the article is attached to
 * @param hits           hit count as scraped text; anything that is not a plain integer counts as 0
 * @return the value returned by articleService.saveArticle (used by callers as the article id)
 * @throws Exception if saving fails
 */
private Integer addArticle(String title, String author, Date createtime, String source, String articleContent,
        Integer subsiteId, Integer newsClassId, String hits) throws Exception {
    Article article = new Article();
    article.setTitle(title);
    article.setAuthor(author);
    article.setCreatetime(createtime);
    article.setSource(source);
    article.setContent(articleContent);
    article.setAudit(1);
    // The hit count is scraped text; a stray label or separator must not abort the crawl.
    int hitCount = 0;
    if (hits != null && hits.trim().length() > 0) {
        try {
            hitCount = Integer.parseInt(hits.trim());
        } catch (NumberFormatException ignored) {
            hitCount = 0;
        }
    }
    article.setHits(hitCount);
    return articleService.saveArticle(article, null, new Integer[]{newsClassId});
}
/**
 * Collects the detail-page links of a rule from all of its list pages.
 * The rule's list URL may hold several comma-separated URLs. For every list page the links are
 * extracted with the view regex; when list-page times are extracted as well, the i-th time is
 * attached to the i-th link as "<link>VT<time>". The links of each page are added in reverse
 * page order so that inserting them one after another reproduces the page's order.
 *
 * @param rule       rule describing the list pages and the extraction patterns
 * @param errorCount name of the error counter attribute
 * @return the collected links; empty (never null) when the rule has no list URL
 * @throws Exception if downloading a list page fails
 */
private List<String> queryViewUrlList(Rule rule, String errorCount) throws Exception {
    List<String> viewUrlList = new ArrayList<String>();
    String listUrl = rule.getListUrl();
    if (listUrl == null) {
        return viewUrlList;
    }
    String encode = GeneralUtil.isNull(rule.getEncode()) ? "UTF-8" : rule.getEncode();
    String viewRegex = rule.getViewRegex();
    // regex and filter for the times shown on the list page
    String listCreatetimeRegex = rule.getListCreatetimeRegex();
    String listCreatetimeFilter = rule.getListCreatetimeFilter();
    String host = rule.getHost();
    for (String url : listUrl.split(",")) {
        String content = HttpUtil.get(url, encode);
        if ("HTTP/1.1 404 Not Found".equals(content)) {
            // report the list URL that actually failed
            System.out.println("鏈接:"+url+",訪問404,請檢查鏈接");
            continue;
        }
        // optionally narrow the page down to the list container first
        if (GeneralUtil.isNotNull(rule.getListContainerRegex())) {
            List<String> contentList = extractStrByPattern(host, content, rule.getListContainerRegex(), errorCount);
            if (contentList != null && contentList.size() > 0) {
                content = contentList.get(0);
            }
        }
        // some sites use relative detail links; the rule's prefix completes them
        String prefix = GeneralUtil.isNotNull(rule.getPrefix()) ? rule.getPrefix() : "";
        List<String> list = extractStrByPattern(prefix, content, viewRegex, errorCount);
        // times shown on the list page
        List<String> createtimeList = extractStrByPattern(content, listCreatetimeRegex, errorCount, true, listCreatetimeFilter);
        System.out.println(createtimeList);
        if (list == null) {
            // no view regex configured: nothing to collect from this page
            continue;
        }
        // Links of this page only; a fresh list per page keeps earlier pages from being added
        // a second time.
        List<String> pageUrls = new ArrayList<String>();
        if (createtimeList != null && GeneralUtil.isNotNull(createtimeList)) {
            for (int i = 0; i < list.size(); i++) {
                String entry = list.get(i);
                // attach the time only when the list page actually supplied one for this link
                if (i < createtimeList.size()) {
                    entry = entry + "VT" + createtimeList.get(i).replace("\r\n\t\t\t\t\t\t\t\t", "");
                }
                pageUrls.add(entry);
            }
        } else {
            pageUrls.addAll(list);
        }
        // reversed so that inserting in this order reproduces the page order
        Collections.reverse(pageUrls);
        viewUrlList.addAll(pageUrls);
    }
    return viewUrlList;
}
/** Extracts all matches of regex from content: no prefix, no HTML stripping, no extra filter. */
private List<String> extractStrByPattern(String content, String regex, String errorCount) {
    final String noPrefix = null;
    return extractStrByPattern(noPrefix, content, regex, errorCount);
}

/** Extracts all matches of regex from content and puts prefix in front of each one. */
private List<String> extractStrByPattern(String prefix, String content, String regex, String errorCount) {
    final boolean keepHtml = false;
    final String noFilter = null;
    return extractStrByPattern(prefix, content, regex, errorCount, keepHtml, noFilter);
}

/** Extracts all matches of regex from content, optionally stripping HTML from each match. */
private List<String> extractStrByPattern(String content, String regex, String errorCount, boolean filterHtml) {
    final String noPrefix = null;
    final String noFilter = null;
    return extractStrByPattern(noPrefix, content, regex, errorCount, filterHtml, noFilter);
}

/** Extracts all matches of regex from content, optionally stripping HTML and text matching filterRegex. */
private List<String> extractStrByPattern(String content, String regex, String errorCount, boolean filterHtml, String filterRegex) {
    final String noPrefix = null;
    return extractStrByPattern(noPrefix, content, regex, errorCount, filterHtml, filterRegex);
}
/**
 * Extracts every match of regex (compiled with DOTALL) from content.
 * Each match is optionally cleaned of a leading U+00FF/U+FFFF character, HTML tags and
 * entities, then of everything matching filterRegex, and finally gets prefix put in front.
 * When nothing matches, the error counter is increased.
 *
 * @param prefix      text put in front of every match; skipped when GeneralUtil.isNotNull rejects it
 * @param content     text to search
 * @param regex       pattern to look for
 * @param errorCount  name of the error counter attribute
 * @param filterHtml  whether tags and entities are removed from each match
 * @param filterRegex additional pattern whose matches are removed; skipped when GeneralUtil.isNotNull rejects it
 * @return the processed matches, or null when GeneralUtil.isNotNull rejects regex
 */
private List<String> extractStrByPattern(String prefix, String content, String regex, String errorCount, boolean filterHtml, String filterRegex) {
    if (!GeneralUtil.isNotNull(regex)) {
        return null;
    }
    List<String> matches = new ArrayList<String>();
    Matcher finder = Pattern.compile(regex, Pattern.DOTALL).matcher(content);
    while (finder.find()) {
        String hit = finder.group();
        if (filterHtml) {
            hit = hit.replaceAll("^[\u00ff\uffff]", "");
            hit = hit.replaceAll("<.*?>", "");
            hit = hit.replaceAll("&.*?;", "");
        }
        if (GeneralUtil.isNotNull(filterRegex)) {
            hit = hit.replaceAll(filterRegex, "");
        }
        matches.add(GeneralUtil.isNotNull(prefix) ? prefix + hit : hit);
    }
    // every find() adds exactly one element, so an empty list means nothing matched
    if (matches.isEmpty()) {
        addErrorCount(errorCount);
    }
    return matches;
}
/** Increases the "collected" counter stored under the given attribute name by one. */
private void addGetCount(String getCount) {
    bumpCounter(getCount);
}

/** Increases the "already existed" counter stored under the given attribute name by one. */
private void addGotCount(String gotCount) {
    bumpCounter(gotCount);
}

/** Increases the error counter stored under the given attribute name by one. */
private void addErrorCount(String errorCount) {
    bumpCounter(errorCount);
}

/** Adds one to the Integer servlet-context attribute with the given name; a missing attribute counts as 0. */
private void bumpCounter(String attributeName) {
    Object current = servletContext.getAttribute(attributeName);
    Integer next = current == null ? 0 : (Integer) current;
    next++;
    servletContext.setAttribute(attributeName, next);
}
/**
 * Checks the stop flag. When a stop was requested, the rule-set attribute and the four counter
 * attributes are removed from the servlet context.
 *
 * @return true when a stop was requested, false otherwise
 */
private boolean isStop(String totalCount, String getCount, String gotCount, String errorCount) {
    if (!stop) {
        return false;
    }
    String[] attributeNames = {rulesName, totalCount, getCount, gotCount, errorCount};
    for (String attributeName : attributeNames) {
        servletContext.removeAttribute(attributeName);
    }
    return true;
}
/**
 * Requests a stop and immediately removes the rule-set attribute and the four counter
 * attributes from the servlet context.
 */
public void clear(String totalCount, String getCount, String gotCount, String errorCount) {
    stop = true;
    // with the flag set, this call performs the attribute clean-up
    isStop(totalCount, getCount, gotCount, errorCount);
}

/** @return whether a stop has been requested */
public boolean isStop() {
    return this.stop;
}

/** @param stop true to request a stop, false to withdraw the request */
public void setStop(boolean stop) {
    this.stop = stop;
}
/**
 * Manual check: prints every "E_ReadNews.asp?NewsID=<digits>" occurrence found in F:\test.txt.
 */
public static void main(String[] args) throws ParseException {
    String text = FileUtil.readFileToString(new File("F:\\test.txt"), "UTF-8");
    Matcher finder = Pattern.compile("E_ReadNews.asp\\?NewsID=[0-9]*", Pattern.MULTILINE).matcher(text);
    for (boolean found = finder.find(); found; found = finder.find()) {
        System.out.println(finder.group());
    }
}
}
/**
 * Downloads a remote file to a local path.
 * Chinese characters in the URL are percent-encoded (UTF-8) before the request is sent. The
 * response body is written to localFile only when the server answers 200, so error pages are
 * not stored as if they were the requested file. Missing parent directories are created.
 *
 * @param remoteFile URL of the file to download
 * @param localFile  local path to write to; backslashes are treated as path separators
 * @return "200" on success, the HTTP status text for any other status, or null when the
 *         request or the write failed with an exception
 */
public static String downloadFile(String remoteFile, String localFile) {
    // Percent-encode Chinese characters in the URL.
    String zwRegex = "[\u4e00-\u9fa5]";
    Pattern pattern = Pattern.compile(zwRegex, Pattern.DOTALL);
    Matcher matcher = pattern.matcher(remoteFile);
    String newRemoteFile = remoteFile;
    while (matcher.find()) {
        String zw = matcher.group();
        try {
            String zwbm = URLEncoder.encode(zw, "utf-8");
            // literal replacement: neither side is meant to be read as a regular expression
            newRemoteFile = newRemoteFile.replace(zw, zwbm);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
    remoteFile = newRemoteFile;

    FileOutputStream output = null;
    String message = null;
    GetMethod get = null;
    try {
        HttpClient client = new HttpClient();
        get = new GetMethod(remoteFile);
        client.executeMethod(get);
        if (get.getStatusCode() != 200) {
            // nothing is written for a failed request
            message = get.getStatusText();
        } else {
            byte[] body = get.getResponseBody();
            localFile = localFile.replace("\\", "/");
            String dirStr = localFile.substring(0, localFile.lastIndexOf("/"));
            File dirFile = new File(dirStr);
            if (!dirFile.exists()) dirFile.mkdirs();
            File storeFile = new File(localFile);
            output = new FileOutputStream(storeFile);
            // an empty response may come back without a body
            if (body != null) {
                output.write(body);
            }
            message = get.getStatusCode() + "";
        }
    } catch (HttpException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the file and release the connection independently, so that a failing close
        // cannot leave the connection open.
        if (output != null) {
            try {
                output.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (get != null) get.releaseConnection();
    }
    return message;
}