案例說明
使用SpringBoot+Mybatis plus+Webmagic爬取51job的職位信息,並保存到mysql數據庫.
創建工程
引入maven依賴
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.2.5.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.hg</groupId>
<artifactId>spider-demo</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>spider-demo</name>
<description>爬蟲實戰</description>
<properties>
<java.version>1.8</java.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.1.0</version>
</dependency>
<!-- druid數據庫連接池 -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid-spring-boot-starter</artifactId>
<version>1.1.10</version>
</dependency>
<!-- mysql connector -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Mybatis-plus -->
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-boot-starter</artifactId>
<version>3.0.5</version>
</dependency>
<!--webmagic-->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>16.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
建表語句
創建數據庫spider,新建表job_info
CREATE TABLE `job_info` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主鍵id',
`company_name` varchar(100) DEFAULT NULL COMMENT '公司名稱',
`company_addr` varchar(200) DEFAULT NULL COMMENT '公司地址',
`job_name` varchar(100) DEFAULT NULL COMMENT '職位名稱',
`job_addr` varchar(50) DEFAULT NULL COMMENT '工作地點',
`salary` varchar(50) DEFAULT NULL COMMENT '薪資範圍',
`url` varchar(150) DEFAULT NULL COMMENT '招聘信息詳情頁',
`time` varchar(10) DEFAULT NULL COMMENT '職位最近發佈時間',
`job_detail` text COMMENT '職位詳情',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8 COMMENT='招聘信息';
加入配置文件
創建application.yml
spring:
application:
name: spider-service
jackson:
time-zone: GMT+8
date-format: yyyy-MM-dd HH:mm:ss
datasource:
driver-class-name: com.mysql.cj.jdbc.Driver
url: jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8&autoReconnect=true&useSSL=false
username: root
password: root
type: com.alibaba.druid.pool.DruidDataSource
druid:
initialSize: 10
minIdle: 10
maxActive: 50
maxWait: 60000
timeBetweenEvictionRunsMillis: 60000
minEvictableIdleTimeMillis: 300000
validationQuery: SELECT 1 FROM DUAL
testWhileIdle: true
testOnBorrow: false
testOnReturn: false
poolPreparedStatements: true
maxPoolPreparedStatementPerConnectionSize: 20
filters: stat,wall
connectionProperties: druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000
#mybatis
mybatis-plus:
mapper-locations: classpath:mapper/**/*.xml
typeAliasesPackage: com.hg.*.entity
global-config:
db-config:
id-type: auto
field-strategy: not_empty
table-underline: true
db-type: mysql
refresh: true
configuration:
map-underscore-to-camel-case: true
cache-enabled: false
logging:
level:
org.springframework.web: info
org.apache.http: info
us.codecraft.webmagic: info
編寫POJO
package com.hg.spider.entity;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
/**
 * Entity mapped to the {@code job_info} table (one scraped 51job posting).
 * All scraped fields are kept as strings, matching the varchar/text columns.
 * NOTE(review): {@code @Slf4j} generates an unused logger on this entity —
 * consider removing it.
 */
@Data
@TableName("job_info")
@Slf4j
public class JobInfo {
/** Auto-increment primary key (id-type: auto in application.yml). */
@TableId
private Long id;
/**
 * Company name.
 */
private String companyName;
/**
 * Company address.
 */
private String companyAddr;
/**
 * Job title.
 */
private String jobName;
/**
 * Job location.
 */
private String jobAddr;
/**
 * Full job description (detail-page text).
 */
private String jobDetail;
/**
 * Salary range, as displayed on the page.
 */
private String salary;
/**
 * Detail-page URL this record was scraped from (used for de-duplication).
 */
private String url;
/**
 * Posting date as displayed (e.g. "03-21"); stored as text, not a date.
 */
private String time;
}
編寫Dao
package com.hg.spider.dao;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.hg.spider.entity.JobInfo;
/**
 * MyBatis-Plus mapper for {@link JobInfo}; inherits generic CRUD
 * operations from {@link BaseMapper} — no custom SQL needed.
 *
 * @Author skh
 * @Date 2020/3/21 16:27
 * @Desc
 */
public interface JobInfoDao extends BaseMapper<JobInfo> {
}
編寫Service
package com.hg.spider.service;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.hg.spider.dao.JobInfoDao;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.webmagic.JobProcessor;
import com.hg.spider.webmagic.MysqlPipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import java.util.List;
/**
* @Author skh
* @Date 2020/3/21 12:10
* @Desc
*/
@Service
@Slf4j
public class JobInfoService extends ServiceImpl<JobInfoDao, JobInfo> {

    /**
     * Seed URL: 51job search results, page 1.
     * FIX: the original text contained "°reefrom=99" — an HTML-entity mangling
     * of "&degreefrom=99" ("&deg" was rendered as "°") — which corrupted the
     * query string; restored here.
     */
    String url = "https://search.51job.com/list/080200,000000,0000,26,9,99,%25E6%2588%25BF%25E4%25BA%25A7%25E7%25BB%258F%25E7%25BA%25AA%25E4%25BA%25BA,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    @Autowired
    private MysqlPipeline mysqlPipeline;
    @Autowired
    private JobProcessor jobProcessor;

    /**
     * Configures and launches the crawl.
     * FIX: uses the asynchronous {@code start()} instead of {@code run()},
     * which blocked the calling HTTP request thread for the entire crawl.
     */
    public void getJobInfo() {
        log.info("開始爬取數據");
        Spider.create(jobProcessor)
                .addUrl(url) // seed URL for the crawl
                // Bloom filter de-duplicates URLs (requires guava); sized for ~100k entries.
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(50) // worker thread count
                .addPipeline(mysqlPipeline) // persist results to MySQL
                .start();
    }

    /**
     * Looks up already-persisted records by their source URL
     * (used by JobProcessor to skip URLs crawled in earlier runs).
     *
     * @param url detail-page URL
     * @return matching rows; empty list when the URL has not been crawled
     */
    public List<JobInfo> selectJobInfoByUrl(String url) {
        QueryWrapper<JobInfo> wrapper = new QueryWrapper<>();
        wrapper.eq("url", url);
        return this.baseMapper.selectList(wrapper);
    }
}
編寫Controller
package com.hg.spider.controller;
import com.hg.spider.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
/**
* @Author skh
* @Date 2020/3/21 12:24
* @Desc
*/
@RestController
public class JobInfoController {

    // FIX: constructor injection instead of field injection — allows the
    // dependency to be final and the controller to be unit-tested without Spring.
    private final JobInfoService jobInfoService;

    public JobInfoController(JobInfoService jobInfoService) {
        this.jobInfoService = jobInfoService;
    }

    /**
     * Kicks off the 51job crawl.
     *
     * @return "success" immediately; the crawl itself runs in the background
     */
    @GetMapping("/getJobInfo")
    public String getJobInfo() {
        jobInfoService.getJobInfo();
        return "success";
    }
}
實現PageProcessor,定義頁面解析邏輯
package com.hg.spider.webmagic;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
* @Author skh
* @Date 2020/3/20 22:56
* @Desc 解析頁面
*/
@Component
@Slf4j
public class JobProcessor implements PageProcessor {
@Autowired
private JobInfoService jobInfoService;
/**
* 解析頁面
* @param page
*/
@Override
public void process(Page page) {
//解析列表頁
List<Selectable> nodes = page.getHtml().css("div#resultList div.el").nodes();
if (CollUtil.isEmpty(nodes)) {
//爲空表示這是招聘詳情頁,解析頁面,獲取招聘詳情信息,保存數據
try {
this.saveJobInfo(page);
} catch (Exception e) {
log.error("解析異常,異常原因:{}", e.getMessage(),e);
}
} else {
//不爲空表示這是列表頁,解析出詳情頁url,放到任務隊列中
for (Selectable node : nodes) {
//獲取url地址
String jobInfoUrl = node.css("p.t1 span a").links().toString();
if (StrUtil.isNotBlank(jobInfoUrl)) {
//判斷記錄是否已存在
List<JobInfo> jobInfoList = jobInfoService.selectJobInfoByUrl(jobInfoUrl);
if (CollUtil.isEmpty(jobInfoList)) {
//把url放到任務隊列中
page.addTargetRequest(jobInfoUrl);
} else {
log.info("記錄已存在,記錄url:{}",jobInfoUrl);
}
}
}
//獲取下一頁的url
List<String> all = page.getHtml().css("div.p_in li.bk a").links().all();
String bkUrl = all.get(all.size() - 1);
log.info("下一頁Url:{}", bkUrl);
if (StrUtil.containsAny(bkUrl, "11.html")) {
System.out.println("已查到10頁數據,無須無限爬取數據");
return;
}
page.addTargetRequest(bkUrl);
}
}
/**
* 解析job詳情頁
* @param page
*/
private void saveJobInfo(Page page) {
//解析頁面
Html html = page.getHtml();
String companyName = html.css("div.cn p.cname a", "text").get();
List<String> text = html.css("div.bmsg.inbox p.fp", "text").all();
String companyAddr = text.get(text.size() - 1);
String jobName = html.css("div.cn h1", "text").get();
String jobStr = html.css("p.msg.ltype", "text").get();
String[] s = StrUtil.split(jobStr, " ");
String jobAddr = s[0];
String time = "";
for (String s1 : s) {
if (StrUtil.containsAny(s1, "發佈")) {
time = StrUtil.removeAll(s1, "發佈");
break;
}
}
String jonDetail = html.css("div.bmsg.job_msg.inbox", "allText").get();
String url = page.getUrl().get();
String salary = html.css("div.in div.cn strong", "text").get();
JobInfo jobInfo = new JobInfo();
jobInfo.setJobName(jobName);
jobInfo.setJobAddr(jobAddr);
jobInfo.setJobDetail(jonDetail);
jobInfo.setSalary(salary);
jobInfo.setUrl(url);
jobInfo.setTime(time);
jobInfo.setCompanyName(companyName);
jobInfo.setCompanyAddr(companyAddr);
//把結果保存到resultItems,爲了持久化
page.putField("jobInfo", jobInfo);
}
//配置爬蟲信息
private Site site = Site.me()
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")
.setCharset("gbk")
.setTimeOut(10 * 1000)
.setRetryTimes(3)
.setRetrySleepTime(3000);
@Override
public Site getSite() {
return site;
}
}
實現PipeLine,保存到數據庫
package com.hg.spider.webmagic;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
* @Author skh
* @Date 2020/3/21 16:18
* @Desc
*/
@Component
@Slf4j
public class MysqlPipeline implements Pipeline {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Persists the {@link JobInfo} staged by {@code JobProcessor}, if any.
     * Listing pages stage nothing, so those invocations are skipped.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        JobInfo jobInfo = resultItems.get("jobInfo");
        if (jobInfo == null) {
            // Nothing was extracted for this page — skip persistence.
            return;
        }
        jobInfoService.save(jobInfo);
    }
}
測試
運行項目,瀏覽器輸入:
http://localhost:8080/getJobInfo
後臺就會開始爬取數據.
總結
以上只是一個簡單的使用WebMagic的爬蟲案例.可以作爲入門學習使用.