java定時爬取百度風雲榜的數據

maven

 <!-- Maven dependencies used by the crawler: Spring JDBC, the MySQL driver, JsoupXpath and hutool. -->
 <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jdbc</artifactId>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>


        <!-- JsoupXpath: XPath queries over Jsoup documents, used for the crawler -->
        <dependency>
            <groupId>cn.wanghaomiao</groupId>
            <artifactId>JsoupXpath</artifactId>
            <version>2.3.2</version>
        </dependency>

        <!-- hutool: general-purpose utility library (date, HTTP and JSON helpers are used here) -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>4.5.1</version>
        </dependency>


    </dependencies>

代碼

package com.hskj.tvdate.reptile;

import cn.hutool.core.date.DateUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.seimicrawler.xpath.JXDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

/**
 * Scheduled crawler that reads the Baidu "top buzz" ranking pages and stores one row per ranked
 * title in the {@code baidu_situation} table.
 *
 * <p>For every category in {@link #MAP} the ranking page is downloaded and parsed; for every title
 * found, a task is submitted to {@link #executorService} that fetches the title's introduction
 * JSON (see {@link #KEY_VALUE}) and inserts the combined data through {@link JdbcTemplate}.
 *
 * @program: tvdate
 * @description: crawls the data of the Baidu top-buzz ranking
 * @author: hw
 * @create: 2020-01-29 17:18
 */
@Component
public class BaiDuSituation {
  @Autowired JdbcTemplate jdbcTemplate;

  /** Pool that runs the per-title "fetch introduction and insert" tasks. */
  static ExecutorService executorService = Executors.newFixedThreadPool(30);

  /** Category name -> URL of the ranking page for that category. */
  static final Map<String, Object> MAP =
      new HashMap<String, Object>() {
        {
          put("電影", "http://top.baidu.com/buzz?b=26&c=1&fr=topcategory_c1");
          put("電視劇", "http://top.baidu.com/buzz?b=4&c=2&fr=topcategory_c2");
          put("綜藝", "http://top.baidu.com/buzz?b=19&c=3&fr=topcategory_c3");
          put("動漫", "http://top.baidu.com/buzz?b=23&c=5&fr=topcategory_c5");
          put("少兒", "http://top.baidu.com/buzz?b=1677&fr=topbuzz_b23_c5");
          put("紀錄片", "http://top.baidu.com/buzz?b=1678&fr=topbuzz_b23_c5");
        }
      };

  /**
   * Category name -> prefix of the introduction URL for that category. The URL-encoded title must
   * be appended to the value. Per the original author's note, the response is unicode-escaped and
   * has to be decoded.
   */
  static final Map<String, Object> KEY_VALUE =
      new HashMap<String, Object>() {
        {
          put("電影", "http://top.baidu.com/detail/intro?boardid=26&keyword=");
          put("電視劇", "http://top.baidu.com/detail/intro?boardid=4&keyword=");
          put("綜藝", "http://top.baidu.com/detail/intro?boardid=19&keyword=");
          put("動漫", "http://top.baidu.com/detail/intro?boardid=23&keyword=");
          put("少兒", "http://top.baidu.com/detail/intro?boardid=1677&keyword=");
          put("紀錄片", "http://top.baidu.com/detail/intro?boardid=1678&keyword=");
        }
      };

  private static final Logger log = LoggerFactory.getLogger(BaiDuSituation.class);

  /** XPath selecting the text of the first link inside each {@code td.keyword} cell (the title). */
  private static final String TITLE_XPATH = "//td[@class='keyword']/a[1]/text()";

  /** XPath selecting the text of the span inside each {@code td.last} cell (the heat index). */
  private static final String INDEX_XPATH = "//td[@class='last']/span/text()";

  /** Parameterized insert; placeholders are bound in the column order listed here. */
  private static final String INSERT_SQL =
      "insert into baidu_situation"
          + "(create_day,no,title,`type`,brief,`index`,url_baike,image_url,image_big)"
          + " values(?,?,?,?,?,?,?,?,?)";

  /**
   * For each placeholder of {@link #INSERT_SQL}, the position of the matching value in the
   * argument array of {@link #addBaiduDateBase(Object...)}.
   */
  private static final int[] INSERT_ARG_ORDER = {8, 7, 0, 1, 2, 3, 4, 5, 6};

  /**
   * Crawls every category page and schedules one storage task per ranked title. Triggered by the
   * cron expression below (second 0, minute 0, hour 12 of every day).
   *
   * @throws Exception if a ranking page cannot be downloaded or parsed; the remaining categories
   *     are then not processed
   */
  @Scheduled(cron = "00 00 12 * * ?")
  public void addBaiduData() throws Exception {
    String today = DateUtil.today();
    log.info("百度風雲榜爬取數據定時任務開始執行");
    for (Map.Entry<String, Object> category : MAP.entrySet()) {
      String type = category.getKey();
      String pageUrl = category.getValue().toString();

      // Download the ranking page; the stream is closed as soon as Jsoup has consumed it.
      Document document;
      try (InputStream in = new URL(pageUrl).openStream()) {
        document = Jsoup.parse(in, "GBK", pageUrl);
      }
      JXDocument page = JXDocument.create(document.toString());
      List<Object> titles = page.sel(TITLE_XPATH);
      List<Object> indexs = page.sel(INDEX_XPATH);

      for (int i = 0; i < titles.size(); i++) {
        Object titleName = titles.get(i);
        // The two lists are matched by position; tolerate a page with fewer index cells.
        Object heatIndex = i < indexs.size() ? indexs.get(i) : null;
        int position = i;
        // Submit the task itself; wrapping it in a Thread adds nothing when a pool runs it.
        executorService.execute(() -> storeEntry(type, titleName, heatIndex, position, today));
      }
    }
  }

  /**
   * Fetches the introduction of one title and inserts the resulting row. Runs on a pool thread, so
   * runtime failures are logged here instead of being lost with the worker.
   *
   * @param type category name (key of {@link #MAP} and {@link #KEY_VALUE})
   * @param titleName title text taken from the ranking page
   * @param heatIndex heat index text taken from the ranking page, or {@code null} if absent
   * @param position zero-based position of the title on the ranking page
   * @param today creation day stored with the row
   */
  private void storeEntry(
      String type, Object titleName, Object heatIndex, int position, String today) {
    try {
      String introUrl = KEY_VALUE.get(type) + strToUrlDeCode(titleName.toString());
      Map<String, Object> intro = summaryJsonToMap(HttpUtil.get(introUrl));

      Object urlBaike = null;
      Object imageUrl = null;
      Object imageBig = null;
      Object brief = null;
      if (intro == null) {
        log.info("該影片沒有簡介:{}", titleName);
      } else {
        // Baike link
        urlBaike = intro.get("url");
        // small cover image
        imageUrl = intro.get("image");
        // large cover image
        imageBig = intro.get("orin_image");
        // introduction text
        brief = intro.get("abstract");
      }
      addBaiduDateBase(
          titleName, type, brief, heatIndex, urlBaike, imageUrl, imageBig, position, today);
    } catch (RuntimeException e) {
      log.error("Failed to store Baidu ranking entry, type={}, title={}", type, titleName, e);
    }
  }

  /**
   * URL-encodes a string using the {@code gb18030} charset.
   *
   * @param str text to encode (typically a Chinese title)
   * @return the URL-encoded text, or an empty string if the charset is not supported
   */
  public static String strToUrlDeCode(String str) {
    try {
      return URLEncoder.encode(str, "gb18030");
    } catch (UnsupportedEncodingException e) {
      log.error("Charset gb18030 is not supported, cannot URL-encode: {}", str, e);
      return "";
    }
  }

  /**
   * Parses the introduction JSON and returns its {@code content} object as a map.
   *
   * @param json introduction JSON text
   * @return the {@code content} object; keys read by this class are {@code url} (Baike link),
   *     {@code image} (small cover), {@code orin_image} (large cover) and {@code abstract}
   *     (introduction). Returns {@code null} when the JSON cannot be parsed.
   */
  public static Map<String, Object> summaryJsonToMap(String json) {
    try {
      JSONObject root = JSONUtil.parseObj(json);
      return JSONUtil.parseObj(root.get("content"));
    } catch (Exception e) {
      // Best effort: an unparsable response is reported to the caller as "no introduction".
      log.debug("Unable to parse introduction JSON", e);
      return null;
    }
  }

  /**
   * Inserts one row into {@code baidu_situation} using a parameterized statement, so scraped text
   * containing quotes cannot break or alter the SQL and missing values are stored as SQL NULL.
   *
   * @param obj exactly these values, in order: title, type, brief, index, url_baike, image_url,
   *     image_big, no, create_day
   * @throws IllegalArgumentException if fewer than nine values are supplied
   */
  public void addBaiduDateBase(Object... obj) {
    if (obj == null || obj.length < INSERT_ARG_ORDER.length) {
      throw new IllegalArgumentException(
          "addBaiduDateBase expects "
              + INSERT_ARG_ORDER.length
              + " values but got "
              + (obj == null ? 0 : obj.length));
    }

    Object[] params = new Object[INSERT_ARG_ORDER.length];
    for (int i = 0; i < INSERT_ARG_ORDER.length; i++) {
      Object value = obj[INSERT_ARG_ORDER[i]];
      // Bind numbers and nulls as they are; anything else is bound as its text form.
      params[i] = (value == null || value instanceof Number) ? value : value.toString();
    }

    int update = jdbcTemplate.update(INSERT_SQL, params);
    if (update < 1) {
      // Cast so the whole array is logged as one argument instead of being spread as varargs.
      log.error("數據庫插入數據庫失敗,參數爲:{}", (Object) obj);
    }
  }
}

 

數據庫

-- Table that receives one row per crawled ranking entry.
-- The crawler's INSERT fills create_day, no, title, type, brief, index, url_baike, image_url and
-- image_big; id, create_time and state fall back to their defaults, and remark is left NULL.
-- The unique key on (title, type, create_day) rejects a second insert of the same title in the
-- same category on the same day.
-- NOTE(review): AUTO_INCREMENT=301 looks like a leftover from a table dump - confirm before reuse.
CREATE TABLE `baidu_situation` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '標題名稱',
  `type` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '類型',
  `no` int(11) DEFAULT NULL COMMENT '排名',
  `brief` varchar(2550) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '簡介',
  `index` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '熱度指數',
  `url_baike` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '百科鏈接',
  `image_url` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '封面鏈接',
  `image_big` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '大圖封面鏈接',
  `create_day` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT '創建的年月日  用於做索引',
  `remark` varchar(255) CHARACTER SET utf8 DEFAULT NULL COMMENT '備註',
  `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '創建時間',
  `state` int(11) NOT NULL DEFAULT '0' COMMENT '數據有效性(0-有效,1-無效)',
  PRIMARY KEY (`id`),
  UNIQUE KEY `titleDay` (`title`,`type`,`create_day`) USING BTREE COMMENT '唯一索引做標識防止重複爬取'
) ENGINE=InnoDB AUTO_INCREMENT=301 DEFAULT CHARSET=utf8 COLLATE=utf8_czech_ci;

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章