一、配置mvn依賴(下面只列出httpclient;第二節的RegContent還用到了org.dom4j,其XPath查詢selectSingleNode需要jaxen,請一併加入pom.xml)
<!-- Apache HttpClient: supplies the org.apache.http classes that GetHHH uses to fetch pages. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.1.2</version>
</dependency>
二、代碼
1、獲取網頁內容
package com.chenanyi.fuli.Helper; import java.io.IOException; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; public class GetHHH { /** * 根據URL抓取網頁內容 此類要用到HttpClient組件 * @author 陳安一 * @param url * @return */ public static String getContentFormUrl(String url) { /* 實例化一個HttpClient客戶端 */ HttpClient client = new DefaultHttpClient(); HttpGet getHttp = new HttpGet(url); String content = null; HttpResponse response; try { /*獲得信息載體*/ response = client.execute(getHttp); HttpEntity entity = response.getEntity(); if (entity != null) { /* 轉化爲文本信息 */ content = EntityUtils.toString(entity); } }catch (Exception e) { e.printStackTrace(); } finally { client.getConnectionManager().shutdown(); } return content; } }
2、獲取頁面列表內所有標題的Url
package com.chenanyi.fuli.Helper; import java.util.ArrayList; import java.io.StringReader; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class RegContent { /** * @author 陳安一 * @功能 根據正則表達式匹配返回的網頁信息 * @param reg * @param info * @return List<String> */ public static List<String> GetCon(String reg,String info){ List<String> result=new ArrayList<String>(); Matcher m = Pattern.compile(reg).matcher(info); while (m.find()) { String r = m.group(); result.add(r); } return result; } public static String GetDiv(String info){ SAXReader reader = new SAXReader(); Document doc; try { doc = reader.read(new StringReader(info)); Node node = doc.selectSingleNode("//body/div/div/div"); System.out.println(node.getText()); } catch (DocumentException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally{ } return info; } public static String GetOneCon(String reg,String info){ String result=info; Matcher m = Pattern.compile(reg).matcher(info); while (m.find()) { result = m.group(); } return result; } /** * @author 陳安一 * @功能 根據GetCon方法返回的List列表對數據進行重組,返回一個URL * @param result * @return */ public static List<String> GetallURL(List<String> result){ for(int i=0;i<result.size();i++){ result.set(i, "http://www.laossee.com/"+result.get(i)+".html"); } return result; } }
3、將內容保存到電腦中
package com.chenanyi.fuli.Helper;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Appends text to files on the local disk.
 */
public class SaveTxt {

    /**
     * Appends {@code cont} to the file named by {@code title}.
     *
     * <p>The file is opened in append mode, so repeated calls with the same title accumulate
     * content. Missing parent directories are created first. I/O failures are printed as a
     * stack trace and otherwise ignored. Text is written with {@link FileWriter}'s default
     * character encoding.
     *
     * @author Chen Anyi
     * @param title path of the target file, for example {@code "noexists.txt"}
     * @param cont  text to append
     */
    public static void Sava(String title, String cont) {
        FileWriter fileWriter = null;
        try {
            // Opening a file inside a directory that does not exist fails, so create it up front.
            File parent = new File(title).getParentFile();
            if (parent != null && !parent.exists()) {
                parent.mkdirs();
            }
            fileWriter = new FileWriter(title, true);
            fileWriter.write(cont);
            fileWriter.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The writer is still null when the constructor itself failed; closing it blindly
            // would replace the reported I/O error with a NullPointerException.
            if (fileWriter != null) {
                try {
                    fileWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
4、圖片類的處理--下載圖片保存到本地
package com.chenanyi.fuli.Helper;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads a remote resource into a numbered {@code .jpg} file.
 */
public class Download {

    /**
     * Copies the bytes served at {@code url} into {@code <path>/<count>.jpg}.
     *
     * <p>The target directory is created when it does not exist. Failures (bad URL, connection
     * or file errors) are printed as a stack trace and otherwise ignored. Both streams are
     * closed whether or not the copy succeeds.
     *
     * @author Chen Anyi
     * @param url   address of the resource to download
     * @param path  directory that receives the file
     * @param count number used as the file name (the file is saved as {@code count + ".jpg"})
     */
    public static void down(String url, String path, int count) {
        InputStream is = null;
        OutputStream os = null;
        try {
            URL img_url = new URL(url);
            URLConnection con = img_url.openConnection();
            // Give up after 5 seconds if the connection cannot be established.
            con.setConnectTimeout(5 * 1000);
            is = con.getInputStream();

            File sf = new File(path);
            if (!sf.exists()) {
                sf.mkdirs();
            }
            // Resolve the file against the directory instead of gluing on "\\", which is only a
            // path separator on Windows.
            File target = new File(sf, count + ".jpg");
            os = new FileOutputStream(target);

            // Copy through a 1 KB buffer until the source is exhausted.
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } catch (MalformedURLException e1) {
            e1.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(os);
            closeAndReport(is);
        }
    }

    /** Closes {@code resource} if it was opened, printing (not propagating) any close failure. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
5、根據獲取的url處理返回的html代碼,提取小說或者圖片保存到文件中
(1)、獲取txt的
package com.chenanyi.fuli.NBHelp; import java.util.List; import com.chenanyi.fuli.Helper.GetHHH; import com.chenanyi.fuli.Helper.RegContent; import com.chenanyi.fuli.Helper.SaveTxt; public class GetTxT { /** * @author 陳安一 * @param cate 分類,16是RQ * @param pagecount 爬取得總頁數 */ public static void Gettxt(int cate,int pagecount) { for (int m = 1; m < pagecount; m++) { int count = 0; // article-list-id-16-page- 16是小說- RQ小說。 // 6是圖片---ZPTP String url = "http://www.laossee.com/article-list-id-"+cate+"-page-" + m + ".html"; String info = GetHHH.getContentFormUrl(url); String reg = "article-show-id-\\d{6}"; List<String> result = RegContent.GetallURL(RegContent.GetCon(reg, info)); for (int i = 0; i < result.size(); i++) { String cont = GetHHH.getContentFormUrl(result.get(i)); List<String> titles = RegContent.GetCon("<title>.*?</title>", cont); String reggg = "<br />.*?<br />"; List<String> Content = RegContent.GetCon(reggg, cont); String conts = ""; for (int f = 0; f < Content.size(); f++) { conts += Content.get(f); } conts = conts.replace("<br />", ""); for (int j = 0; j < titles.size(); j++) { count++; String title = RegContent.GetOneCon(">.*?<", titles.get(j)); title = title.replace("/", "").replace(" ", ""); title = "txt/" + title.substring(1, title.length() - 1) .replace('(', ' ').trim().replace(')', ' ') .trim().replace('(', ' ').trim() .replace(')', ' ').trim() + ".txt"; SaveTxt.Sava(title, conts); System.out.println("第" + m + "頁第" + count + "個" + title); } } } } }
(2)、下載圖片
package com.chenanyi.fuli.NBHelp; import java.util.ArrayList; import java.util.List; import com.chenanyi.fuli.Helper.Download; import com.chenanyi.fuli.Helper.GetHHH; import com.chenanyi.fuli.Helper.RegContent; public class Getimg { /** * @author 陳安一! * @param cate 類別,6是ZpTp * @param pagecount 爬取的頁數 * @return List<String> 圖片鏈接 */ public static void Getimg(int cate, int pagecount,String path) { int count=0; for (int m = 1; m <= pagecount; m++) { // article-list-id-16-page- 16是小說- RQ小說。 // 6是圖片---ZPTP String url = "http://www.laossee.com/article-list-id-" + cate + "-page-" + m + ".html"; String info = GetHHH.getContentFormUrl(url); String reg = "article-show-id-\\d{6}"; List<String> result = RegContent.GetallURL(RegContent.GetCon(reg, info)); for (int i = 0; i < result.size(); i++) { String cont = GetHHH.getContentFormUrl(result.get(i)); List<String> img_urls = RegContent.GetCon("<img src=\"(.*?)/>", cont); for (int j = 0; j < img_urls.size(); j++) { count++; String temp = img_urls.get(j).substring(10); int index = temp.indexOf("\""); temp = temp.substring(0, index); Download.down(temp, path,count); System.out.println(count+"\tOK"); } } } } /** * @author 陳安一! 
* @param cate 類別,6是ZpTp * @param pagecount 爬取的頁數 * @return List<String> 圖片鏈接 */ public static List<String> GetOnePageimg(int cate, int page) { List<String> img_url = new ArrayList<String>(); // article-list-id-16-page- 16是小說- RQ小說。 // 6是圖片---ZPTP String url = "http://www.laossee.com/article-list-id-" + cate + "-page-" + page + ".html"; String info = GetHHH.getContentFormUrl(url); String reg = "article-show-id-\\d{6}"; List<String> result = RegContent.GetallURL(RegContent.GetCon(reg, info)); for (int i = 0; i < result.size(); i++) { String cont = GetHHH.getContentFormUrl(result.get(i)); List<String> img_urls = RegContent.GetCon("<img src=\"(.*?)/>", cont); for (int j = 0; j < img_urls.size(); j++) { String temp = img_urls.get(j).substring(10); int index = temp.indexOf("\""); temp = temp.substring(0, index); System.out.println(temp); img_url.add(temp); } } return img_url; } }
6、運行!
package com.chenanyi.fuli.start; import com.chenanyi.fuli.NBHelp.Getimg; public class Start { /** * @author 陳安一 * @see 網絡爬蟲 * @功能 獲取***** 哈哈哈哈哈哈 * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub try { java.util.Scanner scanner = new java.util.Scanner(System.in); System.out .println("**************************************************************"); System.out .println("**************************************************************"); System.out.println("第一個參數,分類(6,7,13),小說(14,15,16)"); System.out .println("**************************************************************"); System.out.println("第二個參數,獲取的總頁碼數,總頁碼數>=1"); System.out .println("**************************************************************"); System.out.println("第三個參數,保存的地址 : 格式 f:\\\\image4\\\\"); System.out .println("**************************************************************"); System.out.println("請輸入第一個參數"); int value = scanner.nextInt(); System.out.println("請輸入第二個參數"); int value1 = scanner.nextInt(); System.out.println("請輸入第三個參數"); String line = scanner.next(); System.out.println("開始執行"); Getimg.Getimg(value, value1, line); System.out.println("執行完畢"); } catch (Exception e) { e.printStackTrace(); } } }
7、給我評論!