0. 效果 👇
- 數據庫 👇
- 手機圖片 👇
1. 項目搭建 (創建 springboot 項目,集成 jpa,lombok)
- 項目結構 👇
- 數據庫表結構 👇
- pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the crawler project described in this tutorial. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- The Spring Boot parent POM; dependencies below that carry no <version> rely on it. -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.6.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<!-- Coordinates of this project. -->
<groupId>mr.s</groupId>
<artifactId>crawlerjd</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>crawlerjd</name>
<description>crawler-jd</description>
<properties>
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- Spring Data JPA: provides JpaRepository, which ItemDao extends. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Development-time tooling only; optional so it is not passed on to dependants. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<!-- MySQL JDBC driver, needed at runtime for the datasource in application.properties. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Lombok: supplies the @Data annotation used on the Item entity. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- The three libraries below pin their versions explicitly. -->
<!-- Apache HttpClient: used by HttpUtils to fetch pages and images. -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<!-- Jsoup: HTML parser used by ItemTask. -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- NOTE(review): commons-lang3 is not imported by any class shown in this tutorial; confirm it is needed. -->
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.8.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Spring Boot's Maven plugin. -->
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
- application.properties
#DB Configuration
# MySQL datasource: local server on port 3306, schema "crawler".
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&serverTimezone=Asia/Shanghai&characterEncoding=utf-8&nullCatalogMeansCurrent=true
# NOTE(review): sample credentials committed in plain text; replace before using outside a local demo.
spring.datasource.username=root
spring.datasource.password=123
#Jpa Configuration
spring.jpa.database=MySQL
# Log the SQL statements issued by JPA.
spring.jpa.show-sql=true
spring.jpa.open-in-view=false
2. 代碼編寫
- pojo 下的 Item 類編寫
package mr.s.jd.pojo;
import lombok.Data;
import javax.persistence.*;
import java.util.Date;
/**
 * JPA entity mapped to the {@code jd_item} table; one row per crawled product variant.
 * Lombok's {@code @Data} generates the accessors, equals/hashCode and toString.
 */
@Entity
@Table(name = "jd_item")
@Data
public class Item {
// Primary key, generated by the database (IDENTITY strategy).
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
private Long id;
// Value of the "data-spu" attribute read from the search result page.
private Long spu;
// Value of the "data-sku" attribute; ItemTask uses it to check whether the item is already stored.
private Long sku;
// Product title taken from the detail page.
private String title;
// Price as returned by the price endpoint queried in ItemTask.
private Double price;
// File name of the downloaded product image, as returned by HttpUtils.doGetImage.
private String pic;
// Detail page address of the product.
private String url;
// Creation and last-update timestamps; ItemTask sets both to the same instant on insert.
private Date created;
private Date updated;
}
- dao 下的 ItemDao 接口編寫
package mr.s.jd.dao;
import mr.s.jd.pojo.Item;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link Item} entities keyed by their {@code Long} id.
 * Declares no methods of its own; everything used by the service layer is inherited
 * from {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}
- util 下的 HttpUtils 工具類編寫 (注意一下,圖片下載保存的地址)
package mr.s.jd.util;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
/**
 * Small HTTP helper built on Apache HttpClient with a pooled connection manager.
 *
 * <p>Both public methods are best-effort: any failure (bad URL, I/O error, non-200
 * status, missing body) is reported on stderr and signalled to the caller by an
 * empty string rather than an exception.
 */
@Component
public class HttpUtils {

    /**
     * Directory downloaded images are written to. It must already exist; change it
     * to a folder on your own machine.
     */
    private static final String DOWNLOAD_DIR = "C:\\Users\\xxx\\Desktop\\download\\";

    /** The only status code treated as success. */
    private static final int HTTP_OK = 200;

    // Connection pool manager shared by every request.
    private final PoolingHttpClientConnectionManager cm;

    // Built once and reused; the original rebuilt a client on every call.
    private final CloseableHttpClient httpClient;

    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool.
        this.cm.setMaxTotal(100);
        // Maximum number of connections per host.
        this.cm.setDefaultMaxPerRoute(10);
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Fetches a page with GET and returns its body decoded as UTF-8.
     *
     * @param url address of the page
     * @return the page content, or an empty string if the request failed
     */
    public String doGetHtml(String url) {
        HttpGet httpGet = newGet(url);
        if (httpGet == null) {
            return "";
        }
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");
        httpGet.setHeader("Referer", "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=b1a43153d64f4920a10f8ca31aa6fa6b");

        CloseableHttpResponse httpResponse = null;
        try {
            httpResponse = httpClient.execute(httpGet);
            if (httpResponse.getStatusLine().getStatusCode() == HTTP_OK
                    && httpResponse.getEntity() != null) {
                return EntityUtils.toString(httpResponse.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(httpResponse);
        }
        // Request failed: return an empty string.
        return "";
    }

    /**
     * Downloads an image with GET and stores it under {@link #DOWNLOAD_DIR} using a
     * random UUID as the file name.
     *
     * @param url address of the image
     * @return the generated file name, or an empty string if the download failed
     */
    public String doGetImage(String url) {
        HttpGet httpGet = newGet(url);
        if (httpGet == null) {
            return "";
        }

        CloseableHttpResponse httpResponse = null;
        try {
            httpResponse = httpClient.execute(httpGet);
            // A 200 response without a body cannot be saved; treat it as a failure.
            if (httpResponse.getStatusLine().getStatusCode() == HTTP_OK
                    && httpResponse.getEntity() != null) {
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                // try-with-resources guarantees the file handle is flushed and released.
                try (OutputStream outputStream = new FileOutputStream(new File(DOWNLOAD_DIR + picName))) {
                    httpResponse.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(httpResponse);
        }
        // Download failed: return an empty string.
        return "";
    }

    // Builds a configured GET request, or returns null when the URL is empty or malformed.
    private HttpGet newGet(String url) {
        if (url == null || url.isEmpty()) {
            return null;
        }
        try {
            HttpGet httpGet = new HttpGet(url);
            httpGet.setConfig(this.getConfig());
            return httpGet;
        } catch (IllegalArgumentException e) {
            // HttpGet rejects syntactically invalid URIs with an unchecked exception.
            e.printStackTrace();
            return null;
        }
    }

    // Returns the file extension (including the dot) of the last path segment, or "" if none.
    private static String extensionOf(String url) {
        String path = url;
        // Drop any query string or fragment so it cannot end up in the file name.
        int query = path.indexOf('?');
        if (query >= 0) {
            path = path.substring(0, query);
        }
        int fragment = path.indexOf('#');
        if (fragment >= 0) {
            path = path.substring(0, fragment);
        }
        int slash = path.lastIndexOf('/');
        int dot = path.lastIndexOf('.');
        // Only a dot after the last slash belongs to the file name.
        return dot > slash ? path.substring(dot) : "";
    }

    // Closes the response, reporting (but not propagating) any failure.
    private static void closeQuietly(CloseableHttpResponse httpResponse) {
        if (httpResponse != null) {
            try {
                httpResponse.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    // Timeouts applied to every request (values in milliseconds).
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(10000)
                .build();
    }
}
- service 下的 ItemService 服務接口編寫
package mr.s.jd.service;
import mr.s.jd.pojo.Item;
import java.util.List;
/**
 * Service-layer contract for storing and looking up {@link Item} records.
 */
public interface ItemService {

    /**
     * Persists the given item.
     *
     * @param item the item to store
     */
    void save(Item item);

    /**
     * Looks up items using the given item as the search criteria.
     *
     * @param item an item whose populated fields describe what to look for
     * @return the matching items
     */
    List<Item> findAll(Item item);
}
- service.impl 下的 ItemServiceImpl 服務實現類編寫
package mr.s.jd.service.impl;
import mr.s.jd.dao.ItemDao;
import mr.s.jd.pojo.Item;
import mr.s.jd.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.List;
/**
 * {@link ItemService} implementation that delegates to the {@link ItemDao} repository.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Stores the item through the repository inside a transaction.
     *
     * @param item the item to store
     */
    @Override
    @Transactional
    public void save(Item item) {
        this.itemDao.save(item);
    }

    /**
     * Runs a query-by-example built from the given item.
     *
     * @param item the probe used to build the example
     * @return the items returned by the repository for that example
     */
    @Override
    public List<Item> findAll(Item item) {
        return this.itemDao.findAll(Example.of(item));
    }
}
- 最重要的 task 下的 ItemTask 定時任務編寫
package mr.s.jd.task;
import com.fasterxml.jackson.databind.ObjectMapper;
import mr.s.jd.pojo.Item;
import mr.s.jd.service.ItemService;
import mr.s.jd.util.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import java.util.Date;
import java.util.List;
@Configuration //1.主要用於標記配置類,兼備Component的效果。
@EnableScheduling // 2.開啓定時任務
public class ItemTask {
@Autowired
private HttpUtils httpUtils;
@Autowired
private ItemService itemService;
private static final ObjectMapper MAPPER = new ObjectMapper();
// 當下載任務完成後,間隔多長時間進行下一次的任務,單位是毫秒
@Scheduled(fixedDelay = 100 * 1000)
public void itemTask() throws Exception{
// 聲明解析初始地址
String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=57&click=0&page=";
// 按照頁碼進行遍歷
for (int page = 1; page < 10; page = page + 2){
String html = httpUtils.doGetHtml(url + page);
// 解析頁面,獲取商品數據並存儲
this.parse(html);
}
System.out.println("手機數據抓取完成");
}
// 解析頁面,獲取商品數據並存儲
private void parse(String html) throws Exception{
// 解析頁面,獲取商品數據並存儲
Document doc = Jsoup.parse(html);
// 獲取 spu 信息
Elements spuElements = doc.select("div#J_goodsList > ul > li");
for (Element spuElement : spuElements){
// 獲取 spu
Long spu = Long.parseLong(spuElement.attr("data-spu"));
// 獲取 sku 信息
Elements skuElements = spuElement.select("li.ps-item");
for (Element skuElement : skuElements) {
// 獲取 sku
Long sku = Long.parseLong(skuElement.select("[data-sku]").attr("data-sku"));
// 根據 sku 查詢商品數據
Item item = new Item();
item.setSku(sku);
List<Item> itemList = itemService.findAll(item);
// 如果不存在,則進行保存操作
if (itemList.size() == 0){
// 設置 spu
item.setSpu(spu);
// 拼接商品詳情地址
String itemUrl = "https://item.jd.com/"+sku+".html";
item.setUrl(itemUrl);
// 獲取商品圖片
String picUrl = "https:" + skuElement.select("img[data-sku]").attr("data-lazy-img");
picUrl = picUrl.replace("/n9/", "/n1/");
String picName = httpUtils.doGetImage(picUrl);
item.setPic(picName);
// 獲取商品價格
String priceJson = httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble();
item.setPrice(price);
// 獲取商品標題
String itemInfo = httpUtils.doGetHtml(item.getUrl());
String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
item.setTitle(title);
item.setCreated(new Date());
item.setUpdated(item.getCreated());
// 保存商品數據
itemService.save(item);
}
}
}
}
}
- 超級重要的 CrawlerjdApplication 類的編寫!!!
package mr.s.jd;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Application entry point: boots the Spring context with scheduling enabled.
 */
// Enables scheduled tasks.
@EnableScheduling
@SpringBootApplication
public class CrawlerjdApplication {

    /**
     * Starts the Spring Boot application.
     *
     * @param args command-line arguments handed to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(CrawlerjdApplication.class);
        application.run(args);
    }
}