Java爬虫项目(岗位爬取并展示)WebMagic+MySQL+Echarts+IDEA

一:标题webmagic爬取51job(前程无忧)网的岗位招聘信息

1.项目框架如下

在这里插入图片描述

用idea创建一个maven项目,然后按照以下步骤创建项目,或者直接将我的包解压了,拖到你创建好的项目路径下

2.pom.xml添加依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

  <modelVersion>4.0.0</modelVersion>
  <packaging>war</packaging>

  <name>51job</name>
  <groupId>cn.com.scitc</groupId>
  <artifactId>51job</artifactId>
  <version>1.0-SNAPSHOT</version>

  <build>
    <plugins>
      <plugin>
        <groupId>org.mortbay.jetty</groupId>
        <artifactId>maven-jetty-plugin</artifactId>
        <version>6.1.7</version>
        <configuration>
          <connectors>
            <connector implementation="org.mortbay.jetty.nio.SelectChannelConnector">
              <port>8888</port>
              <maxIdleTime>30000</maxIdleTime>
            </connector>
          </connectors>
          <webAppSourceDirectory>${project.build.directory}/${pom.artifactId}-${pom.version}</webAppSourceDirectory>
          <contextPath>/</contextPath>
        </configuration>
      </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <source>8</source>
                <target>8</target>
            </configuration>
        </plugin>
    </plugins>
  </build>

  <dependencies>
      <dependency>
          <groupId>commons-codec</groupId>
          <artifactId>commons-codec</artifactId>
        <version>1.11</version>
      </dependency>

    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.9</version>
    </dependency>

    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpcore</artifactId>
      <version>4.4.11</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.12.1</version>
    </dependency>
    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis</artifactId>
      <version>3.5.1</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>8.0.16</version>
    </dependency>
  </dependencies>

</project>

3.根据项目结构图创建包和类

JobMain类

package cn.com.scitc;

import cn.com.scitc.client.URLHandle;
import cn.com.scitc.model.Jobs;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.HttpClientBuilder;

import java.io.IOException;
import java.util.*;

public class JobMain {

    public static void main(String[] args) {


        System.out.println("正在生成客户端...");
        HttpClient client = null;
        System.out.println("客户端生成完毕.");


        String[] city = {"重庆","西安"};


        String[] value = {
                "060000","200200"
        };


        int pagesize = 1;
        boolean splider = true;
        for (int num = 0; num <410; num ++) {
            while (splider) {
//                000000,0000,01,9,99 其中01是计算机的 打开51job网,搜索对应的之后看他的url地址栏变化
                String url = "https://search.51job.com/list/"+ value[num] +  ",000000,0000,01,9,99," + city[num] + ",2," + pagesize++ + ".html";

                System.err.println("正在爬取当前第" + pagesize + "页数据");
                System.err.println("正在爬取:" + city[num] + "城市" );
                System.out.println(url);

                List<Jobs> jobsList = null;

                System.out.println("正在生成客户端...");
                client = HttpClientBuilder.create().build();
                System.out.println("客户端生成完毕.");

                //开始解析
                try {
                    System.out.println("开始响应客户端...");
                    try {
                        Thread.sleep(200);
                        jobsList = URLHandle.urlParser(client, url);

                       if (jobsList.iterator().next().getJobName() == null) {
                           pagesize  = 1;
                           break;
                       }

                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    System.out.println("响应完成.");
                } catch (ParseException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }


                System.out.println("开始输出结果...");

                for (Jobs job : jobsList) {


                }
                System.out.println("整个结果输出完毕,程序结束.");
            }


        }

    }



}

JobParse类

package cn.com.scitc.client;

import cn.com.scitc.mapper.JobsMapper;
import cn.com.scitc.model.Jobs;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

public class JobParse {

    public static List<Jobs> getData(String entity){
        /**
         * 读取mybatis配置文件
         */
        String resource = "mybatis-config.xml";
        InputStream inputStream = null;
        try {
            inputStream = Resources.getResourceAsStream(resource);
        } catch (IOException e) {
            e.printStackTrace();
        }


        /**
         * 得到连接对象注册sqlsession
         */
        SqlSessionFactory sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream);

        SqlSession sqlSession = sqlSessionFactory.openSession();

        JobsMapper jobsMapper = sqlSession.getMapper(JobsMapper.class);


         List<Jobs> data = new ArrayList<Jobs>();
         Document doc = Jsoup.parse(entity);
         Elements elements = doc.select("div.el");
         Elements title =  elements.select("p.t1").select("span").select("a");
         Elements complany = elements.select("span.t2").select("a");
         Elements address = elements.select("span.t3");
         Elements salary = elements.select("span.t4");
         Elements datas = elements.select("span.t5");
         Elements SrcId = elements.select("p.t1").select("input.checkbox");

         Jobs jobs = new Jobs();

         if (title !=null || title.equals("")) {
             for (Element element : title) {
                 jobs.setJobName(element.text());
             }
         }

         if (complany !=null || complany.equals("")) {
             for (Element element : complany) {
                 jobs.setCompanyName(element.text());
             }
         }

         if (address !=null || address.equals("")) {
             for (Element element : address) {
                 jobs.setWorkAddr(element.text());
             }
         }

         if (salary !=null || salary.equals("")) {
             for (Element element : salary) {
                 jobs.setSalary(element.text());
             }
         }

         if (datas !=null || datas.equals("")) {
             for (Element element : datas) {
                 jobs.setPushDate(element.text());
             }
         }

         if (SrcId !=null || SrcId.equals("")) {
             for (Element element : SrcId) {
                 jobs.setJobKey(element.attr("value"));
             }
         }



        jobsMapper.insert(jobs);
        sqlSession.commit();

        data.add(jobs);
        return data;
    }
}

URLHandle类

package cn.com.scitc.client;

import cn.com.scitc.model.Jobs;
import cn.com.scitc.utils.HTTPUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class URLHandle {

    public static List<Jobs> urlParser(HttpClient client, String url) throws IOException {

        List<Jobs> data = new ArrayList<Jobs>();

        //获取响应资源
        HttpResponse response = HTTPUtils.getHtml(client,url);

        //获取响应状态码
        int statusCode = response.getStatusLine().getStatusCode();
        System.out.println(statusCode);
        if(statusCode == 200) {
            //页面编码
            String entity = EntityUtils.toString(response.getEntity(),"gbk");
            System.out.println("开始解析...");
            data = JobParse.getData(entity);
            System.out.println("URL解析完成.");
        } else {
            EntityUtils.consume(response.getEntity());//释放资源实体
        }
        System.out.println("返回数据.");
        return data;

    }
}

JobsMapper接口类

package cn.com.scitc.mapper;

import cn.com.scitc.model.Jobs;

import java.util.List;

public interface JobsMapper {
    void insert(Jobs jobs);

    List<Jobs> findAll();
}

Jobs类 //这里声明一下 在idea中alt+insert键 可以创建getter和setter

package cn.com.scitc.model;


public class Jobs {
   private Integer jobId;
   private String jobName; //岗位
   private String companyName;//公司名
   private String workAddr;//公司地址
   private String salary;//薪水
   private String pushDate;//发布日期
   private String jobKey;
   public Integer getJobId() {
      return jobId;
   }

   public void setJobId(Integer jobId) {
      this.jobId = jobId;
   }

   public String getJobName() {
      return jobName;
   }

   public void setJobName(String jobName) {
      this.jobName = jobName;
   }

   public String getCompanyName() {
      return companyName;
   }

   public void setCompanyName(String companyName) {
      this.companyName = companyName;
   }

   public String getWorkAddr() {
      return workAddr;
   }

   public void setWorkAddr(String workAddr) {
      this.workAddr = workAddr;
   }

   public String getSalary() {
      return salary;
   }

   public void setSalary(String salary) {
      this.salary = salary;
   }

   public String getPushDate() {
      return pushDate;
   }

   public void setPushDate(String pushDate) {
      this.pushDate = pushDate;
   }

   public String getJobKey() {
      return jobKey;
   }

   public void setJobKey(String jobKey) {
      this.jobKey = jobKey;
   }
}

HTTPUtils类

package cn.com.scitc.utils;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.message.BasicHttpResponse;

import java.io.IOException;

public class HTTPUtils {
    public static HttpResponse getHtml(HttpClient client, String url){
        //获取响应文件,即HTML,采用get方法获取响应数据
        HttpGet getMethod = new HttpGet(url);
        HttpResponse response = new BasicHttpResponse(HttpVersion.HTTP_1_1, HttpStatus.SC_OK, "OK");

        try {
            //通过client执行get方法
            response = client.execute(getMethod);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            //getMethod.abort();
        }

        return response;
    }
}

4.在resource资源文件夹中创建配置文件

jdbc.properties

driver=com.mysql.cj.jdbc.Driver
url=jdbc:mysql://localhost:3306/job51?serverTimezone=UTC&autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&zeroDateTimeBehavior=CONVERT_TO_NULL&useSSL=false
username=root
password=123456

JobsMapper.xml

<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.com.scitc.mapper.JobsMapper">
    <resultMap id="JobsMapperMap" type="cn.com.scitc.model.Jobs">
        <id column="job_id" property="jobId" jdbcType="INTEGER"/>
        <id column="job_name" property="jobName" jdbcType="VARCHAR"/>
        <id column="company_name" property="companyName" jdbcType="VARCHAR"/>
        <id column="work_addr" property="workAddr" jdbcType="VARCHAR"/>
        <id column="salary" property="salary" jdbcType="VARCHAR"/>
        <id column="push_date" property="pushDate" jdbcType="VARCHAR"/>
        <id column="job_key" property="jobKey" jdbcType="VARCHAR"/>

    </resultMap>

    <insert id="insert" keyColumn="jobId" useGeneratedKeys="true"  parameterType="cn.com.scitc.model.Jobs">
      insert into jobs (job_name,company_name,work_addr,salary,push_date,job_key) values (#{jobName},#{companyName},#{workAddr},#{salary},#{pushDate},#{jobKey} )
    </insert>


    <select id="findAll" resultMap="JobsMapperMap">
        SELECT * FROM jobs
    </select>
</mapper>

mybatis-config.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <properties resource="jdbc.properties"></properties>
    <environments default="development">
        <environment id="development">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="${driver}"/>
                <property name="url" value="${url}"/>
                <property name="username" value="${username}"/>
                <property name="password" value="${password}"/>
            </dataSource>
        </environment>
    </environments>
    <mappers>
        <mapper resource="JobsMapper.xml"/>
    </mappers>
</configuration>

5.数据库中操作如下

新建数据库job51

job51

在这里插入图片描述
执行sql语句,或者我的项目包里有一个51job.sql文件,直接拖进去就行

/*
 Navicat Premium Data Transfer

 Source Server         : localhost
 Source Server Type    : MySQL
 Source Server Version : 80016
 Source Host           : localhost:3306
 Source Schema         : job51

 Target Server Type    : MySQL
 Target Server Version : 80016
 File Encoding         : 65001

 Date: 08/07/2019 10:27:49
*/

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for jobs
-- ----------------------------
DROP TABLE IF EXISTS `jobs`;
CREATE TABLE `jobs` (
  `job_id` int(15) NOT NULL AUTO_INCREMENT,
  `job_name` text CHARACTER SET utf8 COLLATE utf8_general_ci,
  `company_name` text CHARACTER SET utf8 COLLATE utf8_general_ci,
  `work_addr` text CHARACTER SET utf8 COLLATE utf8_general_ci,
  `salary` text CHARACTER SET utf8 COLLATE utf8_general_ci,
  `push_date` text CHARACTER SET utf8 COLLATE utf8_general_ci,
  `job_key` text CHARACTER SET utf8 COLLATE utf8_general_ci,
  PRIMARY KEY (`job_id`)
) ENGINE=InnoDB AUTO_INCREMENT=2341 DEFAULT CHARSET=utf8;

SET FOREIGN_KEY_CHECKS = 1;

效果如图
在这里插入图片描述
运行JobMain类,效果如下
在这里插入图片描述
查看数据库中爬取到的数据
在这里插入图片描述

6.说明 自己点进去51job网查询,相关的信息,看地址栏里的url的变化,然后按照注释修改JobMain和jdbc.properties中的设置即可

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

附赠源码下载链接:
https://download.csdn.net/download/weixin_43701595/12457577

7.echarts调取数据库信息并展示正在做,将在下一篇文章中演示,等不及的小伙伴可以自行查阅资料并告诉我

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章