筆者之前沒有接觸過爬蟲,參考網上的例子,模仿著寫了第一個Java爬蟲,用來抓取http://www.mmjpg.com/網站的妹子圖片,先看結果:
第一次抓取到網上的圖片還是有點小欣慰的。好了,廢話不多說,說說實現的具體過程吧。
完成這個簡單的爬蟲程序主要是分爲以下幾個步驟:
1)獲取HttpClient對象,實現需要引入對應的jar包。可以到官網上下載,地址:點擊打開鏈接
2)執行HttpClient的execute方法,獲取HttpResponse對象的返回結果。代碼中使用的是CloseableHttpResponse對象,它是對HttpResponse接口的擴展(額外實現了Closeable,用完後需要關閉)。
3)將2中的對象轉換爲流式對象,並通過工具類將其轉換爲String類型。
4)完成了上面的步驟,後面的就是對字符串的拆分和寫入文本文件了。
4.1)先將字符串對象中的<ul></ul>部分截取出來,每一個<li></li>之間的內容就是一張圖片。
4.2)然後通過img標籤來獲取到每一張圖片的url地址。
4.3)準備好存儲圖片文件的路徑。
4.4)通過URL類new一個URL對象,傳入上面獲取到的url地址。
4.5)通過該URL對象打開輸入流(InputStream)來讀取該url下的內容,並爲目標文件創建輸出流(FileOutputStream);循環讀取時通過System.arraycopy函數複製每次實際讀到的字節,再將其寫入到文件中。
下面附上程序實現源碼:
主程序實現。
import java.io.InputStream;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
/**
 * Entry point of the crawler: walks the paginated listing at
 * http://www.mmjpg.com/home/&lt;page&gt;, downloads each page's HTML and hands it to a
 * {@link MeiZiHtmlParser} running on its own thread.
 */
public class SimpleSpider {
    /** Number of listing pages to crawl (pages are numbered from 1). */
    private static final int MEI_ZI_PAGE_COUNT = 67;

    /** Timeout, in milliseconds, applied to pool lease, connect and socket reads. */
    private static final int TIMEOUT_MILLIS = 6000;

    /** Pause, in milliseconds, before each page request to avoid hammering the site. */
    private static final long REQUEST_DELAY_MILLIS = 5000;

    public static void main(String[] args) {
        SimpleSpider spider = new SimpleSpider();
        //spider.getJianDanImages();
        spider.getMeiZiImages();
    }

    /**
     * Fetches every listing page in order and starts one parser thread per page.
     *
     * <p>A failure on a single page is reported and the crawl continues with the next page.
     * If the calling thread is interrupted while waiting between requests, the interrupt
     * flag is restored and the crawl stops.
     */
    private void getMeiZiImages() {
        RequestConfig globalConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.STANDARD)
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                // Without a socket timeout a stalled server would block the crawl forever.
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();
        System.out.println("Java爬蟲馬上開始抓取妹子圖片。。。");

        // try-with-resources releases the connection pool once all pages have been fetched.
        try (CloseableHttpClient httpClient =
                HttpClients.custom().setDefaultRequestConfig(globalConfig).build()) {
            for (int i = 1; i <= MEI_ZI_PAGE_COUNT; i++) {
                // Build a GET request for http://www.mmjpg.com/home/<page>
                HttpGet httpGet = new HttpGet("http://www.mmjpg.com/home/" + i);
                httpGet.addHeader("User-Agent",
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0");

                try {
                    Thread.sleep(REQUEST_DELAY_MILLIS);
                } catch (InterruptedException e) {
                    // Preserve the interrupt for whoever owns this thread and stop crawling.
                    Thread.currentThread().interrupt();
                    return;
                }

                // Closing the response returns the underlying connection to the pool.
                try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                    InputStream in = response.getEntity().getContent();
                    String html = Utils.convertStreamToString(in);
                    // Each page is parsed on its own thread.
                    new Thread(new MeiZiHtmlParser(html, i)).start();
                } catch (Exception e) {
                    // Best effort: a broken page must not abort the remaining pages.
                    e.printStackTrace();
                }
            }
        } catch (java.io.IOException e) {
            // Only reachable when closing the client fails.
            e.printStackTrace();
        }
    }
}
拆分String類型的html對象,確定圖片的位置以及圖片的url地址。
import java.util.ArrayList;
import java.util.List;
/**
 * Parses the HTML of one listing page, extracts the image URLs found inside the first
 * {@code <ul>...</ul>} section and starts a {@link MeiZiImageCreator} thread for every URL
 * that contains "mmjpg".
 */
public class MeiZiHtmlParser implements Runnable {
    /** Marker that precedes the URL inside an image tag. */
    private static final String IMG_SRC_PREFIX = "<img src=\"";

    private final String html;
    private final int page;

    /**
     * @param html full HTML text of the listing page
     * @param page page number, passed on to the image downloader
     */
    public MeiZiHtmlParser(String html, int page) {
        this.html = html;
        this.page = page;
    }

    /** Extracts the image URLs of this page and launches one download thread per image. */
    @Override
    public void run() {
        System.out.println("**************第"+page+"頁******************");
        for (String imageUrl : extractImageUrls(html)) {
            if (imageUrl.contains("mmjpg")) {
                new Thread(new MeiZiImageCreator(imageUrl, page)).start();
            }
        }
    }

    /**
     * Collects the {@code src} value of the first image tag in every {@code <li>} item of the
     * first {@code <ul>} section.
     *
     * @param html page markup; may be {@code null}
     * @return the URLs in document order; empty when the markup has no complete
     *         {@code <ul>...</ul>} section
     */
    private static List<String> extractImageUrls(String html) {
        List<String> urls = new ArrayList<>();
        if (html == null) {
            return urls;
        }
        int listStart = html.indexOf("<ul>");
        if (listStart < 0) {
            return urls;
        }
        // Search for the closing tag only after the opening one so the range is always valid.
        int listEnd = html.indexOf("</ul>", listStart);
        if (listEnd < 0) {
            return urls;
        }
        for (String item : html.substring(listStart, listEnd).split("</li>")) {
            int tagStart = item.indexOf(IMG_SRC_PREFIX);
            if (tagStart < 0) {
                continue;
            }
            int urlStart = tagStart + IMG_SRC_PREFIX.length();
            int urlEnd = item.indexOf('"', urlStart);
            // Skip unterminated or empty src attributes instead of throwing.
            if (urlEnd > urlStart) {
                urls.add(item.substring(urlStart, urlEnd));
            }
        }
        return urls;
    }
}
生成圖片,並寫入到文件中去。
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Downloads a single image and stores it as
 * {@code D:/meizitu/page_<page>/<page>--<file name from the URL>}.
 */
public class MeiZiImageCreator implements Runnable {
    /** Running number of successfully saved images; access only through {@link #nextCount()}. */
    private static int count = 1;

    private final String imageUrl;
    private final int page;
    private final String basePath;

    /**
     * @param imageUrl absolute URL of the image to download
     * @param page     listing page the image was found on; selects the target directory
     */
    public MeiZiImageCreator(String imageUrl, int page) {
        this.imageUrl = imageUrl;
        this.page = page;
        this.basePath = "D:/meizitu/page_" + page;
    }

    /**
     * Creates the page directory if needed, then copies the image bytes to the target file.
     * Failures are reported on standard error and do not propagate.
     */
    @Override
    public void run() {
        File dir = new File(basePath);
        // mkdirs() returns true only for the thread that actually created the directory,
        // which avoids the check-then-act race of exists() followed by mkdirs().
        if (dir.mkdirs()) {
            System.out.println("妹子圖片存放於"+basePath+"目錄下");
        }
        String imageName = imageUrl.substring(imageUrl.lastIndexOf("/") + 1);
        File file = new File(dir, page + "--" + imageName);

        // The input stream is opened first so a bad or unreachable URL leaves no empty file
        // behind; try-with-resources closes both streams on every path.
        try (InputStream in = new URL(imageUrl).openStream();
             OutputStream os = new FileOutputStream(file)) {
            byte[] buff = new byte[8192];
            int read;
            while ((read = in.read(buff)) != -1) {
                os.write(buff, 0, read);
            }
            System.out.println("第" + nextCount() + "張妹子:" + file.getAbsolutePath());
        } catch (IOException e) {
            // Covers MalformedURLException and FileNotFoundException as well.
            e.printStackTrace();
        }
    }

    /** Returns the next image number; synchronized because many download threads share it. */
    private static synchronized int nextCount() {
        return count++;
    }
}
工具類的實現,注意要給InputStreamReader傳入UTF-8參數,否則解析出來的網頁內容會出現亂碼。
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
/** Stream helpers used by the crawler. */
public class Utils {
    /**
     * Reads the whole stream as UTF-8 text and closes it.
     *
     * <p>Line terminators in the input are normalised: every line, including the last one,
     * is followed by a single {@code '\n'} in the result. If reading fails part-way, the
     * error is printed and the text read so far is returned.
     *
     * @param in stream to read; closed before this method returns
     * @return the decoded text, or an empty string when the stream has no content
     * @throws UnsupportedEncodingException if the UTF-8 charset is unavailable
     */
    public static String convertStreamToString(InputStream in) throws UnsupportedEncodingException {
        // Created outside the try block so an encoding failure still propagates to the caller.
        InputStreamReader decoder = new InputStreamReader(in, "UTF-8");
        StringBuilder sb = new StringBuilder();
        // Closing the reader also closes the decoder and the underlying stream.
        try (BufferedReader reader = new BufferedReader(decoder)) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (IOException e) {
            // Best effort: keep whatever was read before the failure.
            e.printStackTrace();
        }
        return sb.toString();
    }
}
說明:學習娛樂歸學習娛樂,但是所有的圖片版權歸原網站所有,請大家不要隨意傳播。