Maven 依賴(pom.xml)
<!-- Dependencies used by the Baidu ranking crawler shown below. -->
<!-- NOTE(review): the Spring Boot and MySQL artifacts declare no <version>; presumably a parent POM / BOM manages them - confirm. -->
<dependencies>
<!-- Spring Boot JDBC starter -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- Spring Boot test starter (test scope only) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- JsoupXpath framework, used for crawling -->
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>2.3.2</version>
</dependency>
<!-- Hutool helper/utility framework -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>4.5.1</version>
</dependency>
</dependencies>
Java 代碼
package com.hskj.tvdate.reptile;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.seimicrawler.xpath.JXDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import cn.hutool.core.date.DateUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
/**
 * Scheduled crawler for the Baidu ranking pages ("top.baidu.com/buzz").
 *
 * <p>For every category in {@link #MAP} the ranking page is downloaded and the titles and heat
 * indexes are extracted with XPath. For each title the matching intro endpoint from
 * {@link #KEY_VALUE} is then queried, and one row per title is inserted into the
 * {@code baidu_situation} table.
 *
 * <p>NOTE(review): the shared thread pool is never shut down in this class; confirm that this is
 * acceptable for the hosting application's lifecycle.
 *
 * @program: tvdate
 * @description: crawls the data of the Baidu ranking list
 * @author: hw
 * @create: 2020-01-29 17:18
 */
@Component
public class BaiDuSituation {
    private static final Logger log = LoggerFactory.getLogger(BaiDuSituation.class);

    /** Number of values {@link #addBaiduDateBase(Object...)} expects. */
    private static final int INSERT_ARG_COUNT = 9;

    /** Parameterized insert; the placeholder order matches the column list. */
    private static final String INSERT_SQL =
            "insert into baidu_situation(create_day,no,title,`type`,brief,`index`,url_baike,image_url,image_big)"
                    + "values(?,?,?,?,?,?,?,?,?)";

    @Autowired JdbcTemplate jdbcTemplate;

    /** Pool on which the per-title intro lookups and inserts are executed. */
    static ExecutorService executorService = Executors.newFixedThreadPool(30);

    /** Category name -> ranking page URL. */
    static final Map<String, Object> MAP =
            new HashMap<String, Object>() {
                {
                    put("電影", "http://top.baidu.com/buzz?b=26&c=1&fr=topcategory_c1");
                    put("電視劇", "http://top.baidu.com/buzz?b=4&c=2&fr=topcategory_c2");
                    put("綜藝", "http://top.baidu.com/buzz?b=19&c=3&fr=topcategory_c3");
                    put("動漫", "http://top.baidu.com/buzz?b=23&c=5&fr=topcategory_c5");
                    put("少兒", "http://top.baidu.com/buzz?b=1677&fr=topbuzz_b23_c5");
                    put("紀錄片", "http://top.baidu.com/buzz?b=1678&fr=topbuzz_b23_c5");
                }
            };

    /**
     * Category name -> intro endpoint prefix. The URL-encoded title has to be appended to the
     * value. Per the original author's note, the endpoint answers with unicode-escaped data that
     * needs decoding.
     */
    static final Map<String, Object> KEY_VALUE =
            new HashMap<String, Object>() {
                {
                    put("電影", "http://top.baidu.com/detail/intro?boardid=26&keyword=");
                    put("電視劇", "http://top.baidu.com/detail/intro?boardid=4&keyword=");
                    put("綜藝", "http://top.baidu.com/detail/intro?boardid=19&keyword=");
                    put("動漫", "http://top.baidu.com/detail/intro?boardid=23&keyword=");
                    put("少兒", "http://top.baidu.com/detail/intro?boardid=1677&keyword=");
                    put("紀錄片", "http://top.baidu.com/detail/intro?boardid=1678&keyword=");
                }
            };

    /**
     * Crawls every ranking page in {@link #MAP} and schedules one task per listed title on
     * {@link #executorService}; each task fetches the intro data and inserts a row.
     *
     * <p>Triggered by the cron expression {@code 00 00 12 * * ?}.
     *
     * @throws Exception if a ranking page cannot be downloaded or parsed; remaining categories
     *     are then not processed (unchanged from the previous behavior)
     */
    @Scheduled(cron = "00 00 12 * * ?")
    public void addBaiduData() throws Exception {
        String today = DateUtil.today();
        log.info("百度風雲榜爬取數據定時任務開始執行");
        for (Map.Entry<String, Object> entry : MAP.entrySet()) {
            String type = entry.getKey();
            String pageUrl = entry.getValue().toString();

            // Close the HTTP stream deterministically instead of relying on the parser or GC.
            Document document;
            try (InputStream in = new URL(pageUrl).openStream()) {
                document = Jsoup.parse(in, "GBK", pageUrl);
            }
            JXDocument page = JXDocument.create(document.toString());

            // Title text and heat index are selected separately and paired by position.
            List<Object> titles = page.sel("//td[@class='keyword']/a[1]/text()");
            List<Object> indexs = page.sel("//td[@class='last']/span/text()");

            for (int i = 0; i < titles.size(); i++) {
                final int rank = i;
                final Object titleName = titles.get(i);
                // The two selections may differ in length; a missing index is stored as NULL.
                final Object heatIndex = i < indexs.size() ? indexs.get(i) : null;
                // Submit the task itself; wrapping it in a Thread object adds nothing.
                executorService.execute(
                        () -> crawlAndStore(type, titleName, heatIndex, rank, today));
            }
        }
    }

    /**
     * Fetches the intro data for one title and inserts the resulting row.
     *
     * <p>Runs on a pool thread, so runtime failures (HTTP error, duplicate key, ...) are logged
     * here rather than left to kill the worker thread unreported.
     */
    private void crawlAndStore(
            String type, Object titleName, Object heatIndex, int rank, String today) {
        try {
            String introUrl = KEY_VALUE.get(type) + strToUrlDeCode(String.valueOf(titleName));
            Map<String, Object> intro = summaryJsonToMap(HttpUtil.get(introUrl));

            Object urlBaike = null;
            Object imageUrl = null;
            Object imageBig = null;
            Object brief = null;
            if (intro == null) {
                log.info("該影片沒有簡介:{}", titleName);
            } else {
                // Encyclopedia link
                urlBaike = intro.get("url");
                // Small cover image
                imageUrl = intro.get("image");
                // Large cover image
                imageBig = intro.get("orin_image");
                // Synopsis
                brief = intro.get("abstract");
            }
            addBaiduDateBase(
                    titleName, type, brief, heatIndex, urlBaike, imageUrl, imageBig, rank, today);
        } catch (RuntimeException e) {
            log.error("Failed to process ranking entry, type={}, title={}", type, titleName, e);
        }
    }

    /**
     * URL-encodes a string using the {@code gb18030} charset. (Despite the method name, this
     * encodes; it does not decode.)
     *
     * @param str text to encode, typically a Chinese title
     * @return the URL-encoded text, or an empty string if the charset is not supported
     */
    public static String strToUrlDeCode(String str) {
        try {
            return URLEncoder.encode(str, "gb18030");
        } catch (UnsupportedEncodingException e) {
            log.error("Charset gb18030 is not supported; cannot URL-encode '{}'", str, e);
            return "";
        }
    }

    /**
     * Parses the intro JSON and returns its {@code content} object as a map.
     *
     * @param json raw JSON returned by the intro endpoint
     * @return the {@code content} object (keys read by this class: {@code url} encyclopedia
     *     link, {@code image} small cover, {@code orin_image} large cover, {@code abstract}
     *     synopsis), or {@code null} if the JSON cannot be parsed
     */
    public static Map<String, Object> summaryJsonToMap(String json) {
        try {
            JSONObject root = JSONUtil.parseObj(json);
            return JSONUtil.parseObj(root.get("content"));
        } catch (Exception e) {
            // Best effort: the caller treats null as "no intro available".
            log.debug("Unable to parse intro JSON: {}", json, e);
            return null;
        }
    }

    /**
     * Inserts one row into {@code baidu_situation}.
     *
     * <p>The values are bound as statement parameters rather than concatenated into the SQL
     * text, so quotes or SQL fragments in scraped content cannot alter the statement. A
     * {@code null} value is stored as SQL {@code NULL}.
     *
     * @param obj positional values: [0] title, [1] type, [2] brief, [3] index, [4] url_baike,
     *     [5] image_url, [6] image_big, [7] no (rank), [8] create_day
     * @throws IllegalArgumentException if fewer than nine values are supplied
     */
    public void addBaiduDateBase(Object... obj) {
        if (obj == null || obj.length < INSERT_ARG_COUNT) {
            throw new IllegalArgumentException(
                    "Expected " + INSERT_ARG_COUNT + " values but got "
                            + (obj == null ? "null" : String.valueOf(obj.length)));
        }
        int update =
                jdbcTemplate.update(
                        INSERT_SQL,
                        toSqlValue(obj[8]),
                        toSqlValue(obj[7]),
                        toSqlValue(obj[0]),
                        toSqlValue(obj[1]),
                        toSqlValue(obj[2]),
                        toSqlValue(obj[3]),
                        toSqlValue(obj[4]),
                        toSqlValue(obj[5]),
                        toSqlValue(obj[6]));
        if (update < 1) {
            // Arrays.toString: passing the array directly would only log its first element.
            log.error("數據庫插入數據庫失敗,參數爲:{}", Arrays.toString(obj));
        }
    }

    /**
     * Converts a scraped value into something JDBC can bind: numbers are passed through, any
     * other non-null value is bound as its string form.
     */
    private static Object toSqlValue(Object value) {
        if (value == null || value instanceof Number) {
            return value;
        }
        return value.toString();
    }
}
數據庫建表語句(MySQL)
-- Table written by the crawler's insert statement: one row per ranking entry.
-- Column meanings (English rendering of the COMMENT texts below):
--   title       title name
--   type        category
--   no          rank
--   brief       synopsis
--   index       heat index
--   url_baike   encyclopedia link
--   image_url   cover image link
--   image_big   large cover image link
--   create_day  creation date (year-month-day), used for the index
--   remark      remark
--   create_time creation time
--   state       data validity (0 = valid, 1 = invalid)
-- The unique key `titleDay` on (title, type, create_day) is documented as a marker that
-- prevents duplicate crawling.
-- NOTE(review): utf8 / utf8_czech_ci is an unusual charset/collation choice for Chinese text - confirm whether utf8mb4 was intended.
-- NOTE(review): AUTO_INCREMENT=301 looks like residue from a schema dump - confirm before reuse.
CREATE TABLE `baidu_situation` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`title` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '標題名稱',
`type` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '類型',
`no` int(11) DEFAULT NULL COMMENT '排名',
`brief` varchar(2550) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '簡介',
`index` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '熱度指數',
`url_baike` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '百科鏈接',
`image_url` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '封面鏈接',
`image_big` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '大圖封面鏈接',
`create_day` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '創建的年月日 用於做索引',
`remark` varchar(255) CHARACTER SET utf8 DEFAULT NULL COMMENT '備註',
`create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '創建時間',
`state` int(11) NOT NULL DEFAULT '0' COMMENT '數據有效性(0-有效,1-無效)',
PRIMARY KEY (`id`),
UNIQUE KEY `titleDay` (`title`,`type`,`create_day`) USING BTREE COMMENT '唯一索引做標識防止重複爬取'
) ENGINE=InnoDB AUTO_INCREMENT=301 DEFAULT CHARSET=utf8 COLLATE=utf8_czech_ci;