環境
- Google瀏覽器
- selenium
selenium可以模仿操作瀏覽器的過程。這裏使用selenium的原因主要是因爲使用JSoup暫時沒有找到百度圖片原圖鏈接。查看頁面源碼,可以發現圖片是動態加載出來的。
需要注意的是: chromedriver的版本要和chrome瀏覽器的版本匹配。下面的這條鏈接可以參考一下。
selenium之 chromedriver與chrome版本映射表(更新至v2.43)
【https://blog.csdn.net/qq_40374604/article/details/84430963】
下圖爲項目的結構。需要引入selenium jar包, 以及Chromedriver.exe
F12觀察頁面元素
這個過程就不多說了:選定一個圖片元素,把它各個屬性中的url從上到下全部試一遍,找出原圖鏈接(下面的程序使用的是data-objurl屬性)。
編寫程序
這個沒什麼好說的,只要找對了頁面元素並獲取,都沒什麼問題。
這裏用到了一個頁面滾動的操作。
((JavascriptExecutor) connWeb).executeScript("window.scrollBy(0, document.body.scrollHeight)");
可以參考下面的鏈接:
Selenium之Web頁面滾動條滾操作
【https://blog.csdn.net/jlminghui/article/details/50477283】
package com.nikolazhang.spider;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.Scanner;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.interactions.Actions;
import com.nikolazhang.util.FileDownload;
import com.nikolazhang.util.HttpRequestUtil;
/**
 * Command-line spider that opens a Baidu image search in Chrome through Selenium,
 * collects the {@code imgitem} elements of the result page and downloads the image
 * referenced by each element's {@code data-objurl} attribute.
 *
 * <p>Input is read from stdin as one line: {@code <target directory> <keyword> <count>}.
 */
public class SpiderPicFromBaidu {
    /** Running number of the last image handed to the downloader; used as the file name. */
    private static int cnt = 0;

    /**
     * Reads the three parameters from stdin, loads the search page, scrolls it,
     * downloads the images found and finally shuts the browser down.
     *
     * @param args unused; parameters are read interactively
     * @throws IOException kept for interface compatibility
     */
    public static void main(String[] args) throws IOException {
        Scanner scanner = new Scanner(System.in);
        System.out.println("請輸入參數[參數之間使用一個空格分割] , 回車執行!");
        // Tolerate leading/trailing blanks and repeated separators.
        String[] params = scanner.nextLine().trim().split("\\s+");
        if (params.length < 3) {
            System.out.println("參數不足!!!需要: 保存目錄 關鍵詞 下載數量");
            return;
        }
        String filepath = params[0];
        String keywords = params[1];
        int count;
        try {
            count = Integer.parseInt(params[2]);
        } catch (NumberFormatException e) {
            System.out.println("下載數量必須是整數!!!程序退出");
            return;
        }
        if (count <= 0) {
            System.out.println("下載數量必須大於零!!!程序退出");
            return;
        }

        String url = inputBaiduImageUrl(keywords);
        WebDriver connWeb = connWeb(url);
        try {
            // One scroll per five requested images so the page loads more results.
            // NOTE(review): the scrolls are issued back to back with no pause between
            // them, so later scrolls may run before new content has loaded; timing is
            // left unchanged here.
            for (int i = 1; i <= count / 5; i++) {
                ((JavascriptExecutor) connWeb).executeScript("window.scrollBy(0, document.body.scrollHeight)");
            }
            System.out.println("頁面元素加載中...請等待...!");
            try {
                // long arithmetic avoids int overflow for very large counts
                Thread.sleep(count * 10L);
            } catch (InterruptedException e1) {
                Thread.currentThread().interrupt();
                e1.printStackTrace();
            }

            Iterator<WebElement> imageUrl = null;
            try {
                imageUrl = getImageUrl(connWeb);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                e.printStackTrace();
            }

            if (imageUrl != null) {
                downloadImage(imageUrl, filepath);
            } else {
                System.out.println("圖片下載失敗!!!");
            }
        } finally {
            System.out.println("下載結束----退出瀏覽器!!!");
            // quit() ends the whole driver session; close() only closes the current window.
            connWeb.quit();
        }
    }

    /**
     * Starts Chrome through the chromedriver executable and opens the given page.
     *
     * @param url page to load
     * @return the driver bound to the opened browser
     */
    public static WebDriver connWeb(String url) {
        System.setProperty("webdriver.chrome.driver",
                "chromedriver.exe");
        WebDriver webDriver = new ChromeDriver();
        webDriver.get(url);
        System.out.println("*+*+*+*+* 已連接網站: 【"+webDriver.getTitle()+"】");
        return webDriver;
    }

    /**
     * Builds the Baidu image search URL for a keyword.
     *
     * @param text search keyword; it is URL-encoded as UTF-8 so that characters such
     *             as {@code &} or {@code #} cannot break the query string
     * @return the search URL
     */
    public static String inputBaiduImageUrl(String text) {
        return "https://image.baidu.com"
                + "/search/index"
                + "?tn=baiduimage"
                + "&word=" + HttpRequestUtil.urlEncode(text, "UTF-8");
    }

    /**
     * Collects all elements with class {@code imgitem} currently present in the page.
     *
     * @param webDriver driver showing the search result page
     * @return iterator over the matching elements
     * @throws InterruptedException declared for interface compatibility; not thrown here
     */
    public static Iterator<WebElement> getImageUrl(WebDriver webDriver) throws InterruptedException {
        List<WebElement> imgitem = webDriver.findElements(By.className("imgitem"));
        System.out.println("獲取到的相關的圖片數量: "+imgitem.size());
        return imgitem.iterator();
    }

    /**
     * Downloads the image behind every element into {@code filepath}, naming the files
     * {@code 1.ext}, {@code 2.ext}, ... in download order.
     *
     * @param imgItor  elements carrying {@code data-objurl} and {@code data-ext} attributes
     * @param filepath target directory; created when missing
     */
    public static void downloadImage(Iterator<WebElement> imgItor, String filepath) {
        File dir = new File(filepath);
        if (!dir.exists() && !dir.mkdirs()) {
            System.out.println("無法創建保存目錄: " + filepath);
            return;
        }
        while (imgItor.hasNext()) {
            WebElement nextImg = imgItor.next();
            String addrImg = nextImg.getAttribute("data-objurl");
            if (addrImg == null || addrImg.isEmpty()) {
                // element without an image address: nothing to download
                continue;
            }
            // httpRequestIO returns null when the connection fails
            InputStream imageStream = HttpRequestUtil.httpRequestIO(addrImg);
            if (imageStream == null) {
                System.out.println("圖片獲取失敗, 已跳過: " + addrImg);
                continue;
            }
            String ext = nextImg.getAttribute("data-ext");
            if (ext == null || ext.isEmpty()) {
                // falls back to jpg when the page does not provide an extension
                ext = "jpg";
            }
            String filename = ++cnt + "." + ext;
            // File(parent, child) inserts the separator the plain concatenation omitted.
            FileDownload.saveImageToDisk(imageStream, new File(dir, filename).getPath());
            System.out.println("下載第"+ cnt +"張圖片地址爲: "+addrImg);
        }
    }
}
- 文件下載公共類
package com.nikolazhang.util;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Utility for writing a downloaded stream to the local file system.
 */
public class FileDownload {
    /**
     * Copies everything readable from {@code inputStream} into the file at
     * {@code filepath}, then closes both the stream and the file.
     *
     * <p>I/O failures are reported with {@code printStackTrace()} and not rethrown.
     *
     * @param inputStream source of the bytes; when {@code null} nothing is written and
     *                    no file is created
     * @param filepath    path of the file to create or overwrite
     */
    public static void saveImageToDisk(InputStream inputStream, String filepath) {
        if (inputStream == null) {
            // Guard before opening the output file so a failed download leaves no empty file.
            System.err.println("下載失敗: 輸入流爲空, 未寫入文件 " + filepath);
            return;
        }
        byte[] data = new byte[1024];
        int len;
        FileOutputStream fileOutputStream = null;
        try {
            fileOutputStream = new FileOutputStream(filepath);
            while ((len = inputStream.read(data)) != -1) {
                fileOutputStream.write(data, 0, len);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            if (fileOutputStream != null) {
                try {
                    fileOutputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
Http請求公共類:
package com.nikolazhang.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
/**
 * HTTP request helpers built on {@link java.net.HttpURLConnection}.
 *
 * <p>All request methods are best-effort: a failure is reported with
 * {@code printStackTrace()} and signalled to the caller by an empty string
 * (or {@code null} for {@link #httpRequestIO(String)}) instead of an exception.
 */
public class HttpRequestUtil {
    static boolean proxySet = false;
    static String proxyHost = "127.0.0.1";
    static int proxyPort = 8087;

    /**
     * URL-encodes a string.
     *
     * @param source text to encode
     * @param encode name of the charset used for encoding
     * @return the encoded text, or {@code "0"} when the charset is not supported
     */
    public static String urlEncode(String source, String encode) {
        try {
            return java.net.URLEncoder.encode(source, encode);
        } catch (UnsupportedEncodingException e) {
            return "0";
        }
    }

    /**
     * URL-encodes a string using the GBK charset.
     *
     * @param source text to encode
     * @return the encoded text, or {@code "0"} when GBK is not supported
     */
    public static String urlEncodeGBK(String source) {
        return urlEncode(source, "GBK");
    }

    /**
     * Sends a GET request and returns the response body.
     *
     * @param req_url request address
     * @param fmt     charset name used to decode the response
     * @return the response lines concatenated without line terminators; whatever was
     *         read so far (possibly empty) when the request fails
     */
    public static String httpRequest(String req_url, String fmt) {
        StringBuilder buffer = new StringBuilder();
        HttpURLConnection httpUrlConn = null;
        BufferedReader bufferedReader = null;
        try {
            URL url = new URL(req_url);
            httpUrlConn = (HttpURLConnection) url.openConnection();
            httpUrlConn.setDoOutput(false);
            httpUrlConn.setDoInput(true);
            httpUrlConn.setUseCaches(false);
            httpUrlConn.setRequestMethod("GET");
            httpUrlConn.connect();
            // Convert the returned input stream into a string.
            bufferedReader = new BufferedReader(
                    new InputStreamReader(httpUrlConn.getInputStream(), fmt));
            String str;
            while ((str = bufferedReader.readLine()) != null) {
                buffer.append(str);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release resources on the failure path as well.
            closeResource(bufferedReader);
            if (httpUrlConn != null) {
                httpUrlConn.disconnect();
            }
        }
        return buffer.toString();
    }

    /**
     * Sends a GET request and returns the raw response stream.
     *
     * @param requestUrl request address
     * @return the response stream, which the caller must close; {@code null} when the
     *         request fails
     */
    public static InputStream httpRequestIO(String requestUrl) {
        try {
            URL url = new URL(requestUrl);
            HttpURLConnection httpUrlConn = (HttpURLConnection) url.openConnection();
            httpUrlConn.setDoInput(true);
            httpUrlConn.setRequestMethod("GET");
            httpUrlConn.connect();
            return httpUrlConn.getInputStream();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sends a GET request to the given URL.
     *
     * @param url
     *            address to request
     * @param param
     *            request parameters in the form name1=value1&name2=value2
     * @return the response lines concatenated without line terminators; whatever was
     *         read so far (possibly empty) when the request fails
     */
    public static String sendGet(String url, String param) {
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            String urlNameString = url + "?" + param;
            URL realUrl = new URL(urlNameString);
            URLConnection connection = realUrl.openConnection();
            // Common request headers.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            connection.connect();
            // NOTE(review): the response is decoded with the platform default charset.
            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeResource(in);
        }
        return result.toString();
    }

    /**
     * Sends a POST request to the given URL.
     *
     * @param url
     *            address to request
     * @param param
     *            request parameters in the form name1=value1&name2=value2
     * @param fmt
     *            charset name used to encode the request body
     * @param isproxy
     *            whether to route the request through the HTTP proxy at
     *            {@code proxyHost:proxyPort}
     * @return the response lines concatenated without line terminators; whatever was
     *         read so far (possibly empty) when the request fails
     */
    public static String sendPost(String url, String param, String fmt, boolean isproxy) {
        OutputStreamWriter out = null;
        BufferedReader in = null;
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            HttpURLConnection conn;
            if (isproxy) {
                Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort));
                conn = (HttpURLConnection) realUrl.openConnection(proxy);
            } else {
                conn = (HttpURLConnection) realUrl.openConnection();
            }
            // Both flags are required for sending a POST body and reading the reply.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            conn.setRequestMethod("POST");
            conn.connect();
            out = new OutputStreamWriter(conn.getOutputStream(), fmt);
            out.write(param);
            out.flush();
            // NOTE(review): the response is decoded with the platform default charset,
            // not with fmt.
            in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closed independently so a failure on one does not leak the other.
            closeResource(out);
            closeResource(in);
        }
        return result.toString();
    }

    /** Closes a resource if it is non-null, reporting (not propagating) any failure. */
    private static void closeResource(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point. It only declares example arguments for a proxied request and
     * performs no network call.
     */
    public static void main(String[] args) {
        // demo: proxied access
        String url = "http://api.adf.ly/api.php";
        String para = "key=youkeyid&youuid=uid&advert_type=int&domain=adf.ly&url=http://somewebsite.com";
    }
}
以上程序可以在GitHub上獲取,CSDN上的代碼一般不會更新:
【https://github.com/NikolaZhang/PickPic】
運行
完美~~