Java爬蟲項目(崗位爬取並展示)WebMagic+MySQL+Echarts+IDEA

一:爬取51job(前程無憂)網的崗位招聘信息(注意:下文代碼實際使用的是HttpClient+Jsoup抓取解析、MyBatis入庫,而非WebMagic)

1.項目框架如下

在這裏插入圖片描述

用idea創建一個maven項目,然後按照以下步驟創建項目,或者直接將我的包解壓了,拖到你創建好的項目路徑下

2.pom.xml添加依賴

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

  <modelVersion>4.0.0</modelVersion>
  <packaging>war</packaging>

  <name>51job</name>
  <groupId>cn.com.scitc</groupId>
  <artifactId>51job</artifactId>
  <version>1.0-SNAPSHOT</version>

  <build>
    <plugins>
      <!-- Embedded Jetty on port 8888, serving the exploded war directory under target/. -->
      <plugin>
        <groupId>org.mortbay.jetty</groupId>
        <artifactId>maven-jetty-plugin</artifactId>
        <version>6.1.7</version>
        <configuration>
          <connectors>
            <connector implementation="org.mortbay.jetty.nio.SelectChannelConnector">
              <port>8888</port>
              <maxIdleTime>30000</maxIdleTime>
            </connector>
          </connectors>
          <!-- ${project.*} replaces the deprecated ${pom.*} property prefix. -->
          <webAppSourceDirectory>${project.build.directory}/${project.artifactId}-${project.version}</webAppSourceDirectory>
          <contextPath>/</contextPath>
        </configuration>
      </plugin>
      <!-- Compile for Java 8; the plugin version is pinned so the build is reproducible. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.1</version>
        <configuration>
          <source>8</source>
          <target>8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <!-- HTTP access: Apache HttpClient and the libraries it builds on. -->
    <dependency>
      <groupId>commons-codec</groupId>
      <artifactId>commons-codec</artifactId>
      <version>1.11</version>
    </dependency>

    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.9</version>
    </dependency>

    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpcore</artifactId>
      <version>4.4.11</version>
    </dependency>

    <!-- Test-only dependency: keep it off the runtime classpath and out of the war. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>

    <!-- HTML parsing. -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.12.1</version>
    </dependency>

    <!-- Persistence: MyBatis on top of the MySQL JDBC driver. -->
    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis</artifactId>
      <version>3.5.1</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>8.0.16</version>
    </dependency>
  </dependencies>

</project>

3.根據項目結構圖創建包和類

JobMain類

package cn.com.scitc;

import cn.com.scitc.client.URLHandle;
import cn.com.scitc.model.Jobs;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.HttpClientBuilder;

import java.io.IOException;
import java.util.*;

public class JobMain {

    public static void main(String[] args) {


        System.out.println("正在生成客戶端...");
        HttpClient client = null;
        System.out.println("客戶端生成完畢.");


        String[] city = {"重慶","西安"};


        String[] value = {
                "060000","200200"
        };


        int pagesize = 1;
        boolean splider = true;
        for (int num = 0; num <410; num ++) {
            while (splider) {
//                000000,0000,01,9,99 其中01是計算機的 打開51job網,搜索對應的之後看他的url地址欄變化
                String url = "https://search.51job.com/list/"+ value[num] +  ",000000,0000,01,9,99," + city[num] + ",2," + pagesize++ + ".html";

                System.err.println("正在爬取當前第" + pagesize + "頁數據");
                System.err.println("正在爬取:" + city[num] + "城市" );
                System.out.println(url);

                List<Jobs> jobsList = null;

                System.out.println("正在生成客戶端...");
                client = HttpClientBuilder.create().build();
                System.out.println("客戶端生成完畢.");

                //開始解析
                try {
                    System.out.println("開始響應客戶端...");
                    try {
                        Thread.sleep(200);
                        jobsList = URLHandle.urlParser(client, url);

                       if (jobsList.iterator().next().getJobName() == null) {
                           pagesize  = 1;
                           break;
                       }

                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    System.out.println("響應完成.");
                } catch (ParseException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }


                System.out.println("開始輸出結果...");

                for (Jobs job : jobsList) {


                }
                System.out.println("整個結果輸出完畢,程序結束.");
            }


        }

    }



}

JobParse類

package cn.com.scitc.client;

import cn.com.scitc.mapper.JobsMapper;
import cn.com.scitc.model.Jobs;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Extracts job postings from a search result page and stores them through MyBatis.
 */
public class JobParse {

    /** Classpath location of the MyBatis configuration. */
    private static final String MYBATIS_CONFIG = "mybatis-config.xml";

    /** Built on first use and then shared, so the connection pool is created only once. */
    private static SqlSessionFactory sqlSessionFactory;

    /**
     * Parses every result row of the page into its own {@link Jobs} object and
     * inserts all of them in a single transaction.
     *
     * <p>The selectors assume one {@code div.el} element per posting, with the title link
     * under {@code p.t1 span a} and the remaining fields in {@code span.t2} .. {@code span.t5};
     * rows without a title link (such as a table header) are skipped.
     *
     * <p>When the page contains no posting, nothing is written to the database and the
     * returned list holds one blank {@code Jobs} whose job name is {@code null}. Existing
     * callers inspect the first element to recognise the end of the listing, so this
     * placeholder is kept.
     *
     * @param entity HTML source of the result page
     * @return the parsed postings, or a single blank placeholder if there were none
     * @throws IllegalStateException if the MyBatis configuration cannot be read
     */
    public static List<Jobs> getData(String entity){
        List<Jobs> data = new ArrayList<Jobs>();

        Document doc = Jsoup.parse(entity);
        for (Element row : doc.select("div.el")) {
            Element titleLink = row.select("p.t1").select("span").select("a").first();
            if (titleLink == null) {
                continue;
            }

            Jobs jobs = new Jobs();
            jobs.setJobName(titleLink.text());
            jobs.setCompanyName(firstText(row.select("span.t2").select("a")));
            jobs.setWorkAddr(firstText(row.select("span.t3")));
            jobs.setSalary(firstText(row.select("span.t4")));
            jobs.setPushDate(firstText(row.select("span.t5")));

            Element checkbox = row.select("p.t1").select("input.checkbox").first();
            if (checkbox != null) {
                jobs.setJobKey(checkbox.attr("value"));
            }
            data.add(jobs);
        }

        if (data.isEmpty()) {
            // Placeholder only; an all-null row is deliberately not inserted.
            data.add(new Jobs());
            return data;
        }

        // The session is closed even if an insert fails; uncommitted work is then discarded.
        try (SqlSession sqlSession = sessionFactory().openSession()) {
            JobsMapper jobsMapper = sqlSession.getMapper(JobsMapper.class);
            for (Jobs jobs : data) {
                jobsMapper.insert(jobs);
            }
            sqlSession.commit();
        }
        return data;
    }

    /** Returns the text of the first matched element, or {@code null} when nothing matched. */
    private static String firstText(Elements matches) {
        Element first = matches.first();
        return first == null ? null : first.text();
    }

    /** Lazily builds the shared session factory from the MyBatis configuration file. */
    private static synchronized SqlSessionFactory sessionFactory() {
        if (sqlSessionFactory == null) {
            try (InputStream inputStream = Resources.getResourceAsStream(MYBATIS_CONFIG)) {
                sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream);
            } catch (IOException e) {
                throw new IllegalStateException("Unable to load " + MYBATIS_CONFIG, e);
            }
        }
        return sqlSessionFactory;
    }
}

URLHandle類

package cn.com.scitc.client;

import cn.com.scitc.model.Jobs;
import cn.com.scitc.utils.HTTPUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Downloads one search result page and delegates its parsing to {@link JobParse}.
 */
public class URLHandle {

    /**
     * Fetches {@code url} and, when the server answers 200 with a body, decodes the
     * body as GBK and passes it to {@link JobParse#getData(String)}.
     *
     * @param client HTTP client used for the request
     * @param url    address of the result page
     * @return the parsed postings; an empty list when the status is not 200 or the
     *         response carries no body
     * @throws IOException if reading or releasing the response body fails
     */
    public static List<Jobs> urlParser(HttpClient client, String url) throws IOException {

        List<Jobs> data = new ArrayList<Jobs>();

        // Fetch the response.
        HttpResponse response = HTTPUtils.getHtml(client,url);

        // Inspect the status code.
        int statusCode = response.getStatusLine().getStatusCode();
        System.out.println(statusCode);

        // A response without an entity cannot be decoded: EntityUtils.toString rejects null.
        if (statusCode == 200 && response.getEntity() != null) {
            // The page is decoded as GBK.
            String entity = EntityUtils.toString(response.getEntity(),"gbk");
            System.out.println("開始解析...");
            data = JobParse.getData(entity);
            System.out.println("URL解析完成.");
        } else {
            // Release the entity; consume() is a no-op for a null entity.
            EntityUtils.consume(response.getEntity());
        }
        System.out.println("返回數據.");
        return data;

    }
}

JobsMapper接口類

package cn.com.scitc.mapper;

import cn.com.scitc.model.Jobs;

import java.util.List;

/**
 * MyBatis mapper for the {@code jobs} table. The SQL behind each method is
 * declared in {@code JobsMapper.xml} under the statement id of the same name.
 */
public interface JobsMapper {

    /**
     * Reads every row of the {@code jobs} table.
     *
     * @return all stored job postings
     */
    List<Jobs> findAll();

    /**
     * Stores one job posting as a new row.
     *
     * @param jobs the posting to insert
     */
    void insert(Jobs jobs);
}

Jobs類(提示:在IDEA中按Alt+Insert鍵可以自動生成getter和setter)

package cn.com.scitc.model;


/**
 * Plain bean describing one job posting, one property per column of the {@code jobs} table.
 */
public class Jobs {

    /** Row identifier. */
    private Integer jobId;

    /** Job title. */
    private String jobName;

    /** Company name. */
    private String companyName;

    /** Company address. */
    private String workAddr;

    /** Salary. */
    private String salary;

    /** Publication date. */
    private String pushDate;

    /** Key of the posting as found on the source page. */
    private String jobKey;

    // ---- identifiers ----

    public Integer getJobId() {
        return this.jobId;
    }

    public void setJobId(Integer jobId) {
        this.jobId = jobId;
    }

    public String getJobKey() {
        return this.jobKey;
    }

    public void setJobKey(String jobKey) {
        this.jobKey = jobKey;
    }

    // ---- posting details ----

    public String getJobName() {
        return this.jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getCompanyName() {
        return this.companyName;
    }

    public void setCompanyName(String companyName) {
        this.companyName = companyName;
    }

    public String getWorkAddr() {
        return this.workAddr;
    }

    public void setWorkAddr(String workAddr) {
        this.workAddr = workAddr;
    }

    public String getSalary() {
        return this.salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getPushDate() {
        return this.pushDate;
    }

    public void setPushDate(String pushDate) {
        this.pushDate = pushDate;
    }
}

HTTPUtils類

package cn.com.scitc.utils;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.message.BasicHttpResponse;

import java.io.IOException;

/**
 * Small helper around {@link HttpClient} for issuing GET requests.
 */
public class HTTPUtils {

    /**
     * Executes a GET request for {@code url}.
     *
     * <p>An I/O failure is logged and reported to the caller as a synthetic
     * "503 Service Unavailable" response without an entity. The original code
     * returned a synthetic "200 OK" here, which made a failed request look
     * successful to callers that test the status code.
     *
     * @param client HTTP client used to execute the request
     * @param url    address to fetch
     * @return the server response, or the synthetic 503 response if the request failed
     */
    public static HttpResponse getHtml(HttpClient client, String url){
        // Fetch the resource (the HTML page) with a GET request.
        HttpGet getMethod = new HttpGet(url);

        try {
            return client.execute(getMethod);
        } catch (IOException e) {
            e.printStackTrace();
            return new BasicHttpResponse(HttpVersion.HTTP_1_1,
                    HttpStatus.SC_SERVICE_UNAVAILABLE, "Service Unavailable");
        }
    }
}

4.在resource資源文件夾中創建配置文件

jdbc.properties

# JDBC settings; mybatis-config.xml reads them through its ${driver}, ${url}, ${username} and ${password} placeholders.
# Driver class from the mysql-connector-java dependency.
driver=com.mysql.cj.jdbc.Driver
# Connects to the job51 schema on localhost:3306 with UTC server time zone, UTF-8 encoding and SSL disabled.
url=jdbc:mysql://localhost:3306/job51?serverTimezone=UTC&autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&zeroDateTimeBehavior=CONVERT_TO_NULL&useSSL=false
# NOTE(review): sample credentials from the tutorial; replace them with your own account.
username=root
password=123456

JobsMapper.xml

<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.com.scitc.mapper.JobsMapper">
    <!-- Column-to-property mapping for the jobs table. Only the primary key is an <id>;
         all other columns are ordinary <result> mappings. -->
    <resultMap id="JobsMapperMap" type="cn.com.scitc.model.Jobs">
        <id column="job_id" property="jobId" jdbcType="INTEGER"/>
        <result column="job_name" property="jobName" jdbcType="VARCHAR"/>
        <result column="company_name" property="companyName" jdbcType="VARCHAR"/>
        <result column="work_addr" property="workAddr" jdbcType="VARCHAR"/>
        <result column="salary" property="salary" jdbcType="VARCHAR"/>
        <result column="push_date" property="pushDate" jdbcType="VARCHAR"/>
        <result column="job_key" property="jobKey" jdbcType="VARCHAR"/>
    </resultMap>

    <!-- Inserts one posting. keyProperty/keyColumn write the auto-generated job_id back
         into Jobs.jobId (the original named the non-existent column "jobId" and set no
         keyProperty, so the generated key was never returned). -->
    <insert id="insert" keyProperty="jobId" keyColumn="job_id" useGeneratedKeys="true" parameterType="cn.com.scitc.model.Jobs">
      insert into jobs (job_name,company_name,work_addr,salary,push_date,job_key) values (#{jobName},#{companyName},#{workAddr},#{salary},#{pushDate},#{jobKey} )
    </insert>

    <!-- Reads every posting; columns are listed explicitly so the mapping does not
         depend on the physical column order. -->
    <select id="findAll" resultMap="JobsMapperMap">
        SELECT job_id, job_name, company_name, work_addr, salary, push_date, job_key FROM jobs
    </select>
</mapper>

mybatis-config.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <!-- Loads jdbc.properties; its keys fill the ${...} placeholders below. -->
    <properties resource="jdbc.properties"></properties>
    <!-- Single environment: JDBC-managed transactions over a pooled data source. -->
    <environments default="development">
        <environment id="development">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="${driver}"/>
                <property name="url" value="${url}"/>
                <property name="username" value="${username}"/>
                <property name="password" value="${password}"/>
            </dataSource>
        </environment>
    </environments>
    <!-- Mapper XML files, resolved from the classpath. -->
    <mappers>
        <mapper resource="JobsMapper.xml"/>
    </mappers>
</configuration>

5.數據庫中操作如下

新建數據庫job51

job51

在這裏插入圖片描述
執行下面的sql語句;或者使用我項目包裏的51job.sql文件,直接拖進去執行即可

/*
 Navicat Premium Data Transfer

 Source Server         : localhost
 Source Server Type    : MySQL
 Source Server Version : 80016
 Source Host           : localhost:3306
 Source Schema         : job51

 Target Server Type    : MySQL
 Target Server Version : 80016
 File Encoding         : 65001

 Date: 08/07/2019 10:27:49
*/

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for jobs
-- ----------------------------
-- The table uses utf8mb4 so that it matches the session character set selected above
-- (the original 3-byte `utf8` columns cannot store 4-byte characters).
-- The integer display width and the AUTO_INCREMENT seed left over from the dump are dropped.
DROP TABLE IF EXISTS `jobs`;
CREATE TABLE `jobs` (
  `job_id` int NOT NULL AUTO_INCREMENT,
  `job_name` text,
  `company_name` text,
  `work_addr` text,
  `salary` text,
  `push_date` text,
  `job_key` text,
  PRIMARY KEY (`job_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

SET FOREIGN_KEY_CHECKS = 1;

效果如圖
在這裏插入圖片描述
運行JobMain類,效果如下
在這裏插入圖片描述
查看數據庫中爬取到的數據
在這裏插入圖片描述

6.說明:自己打開51job網站搜索相關信息,觀察地址欄裏url的變化,然後按照註釋修改JobMain和jdbc.properties中的設置即可

在這裏插入圖片描述
在這裏插入圖片描述
在這裏插入圖片描述

附贈源碼下載鏈接:
https://download.csdn.net/download/weixin_43701595/12457577

7.echarts調取數據庫信息並展示正在做,將在下一篇文章中演示,等不及的小夥伴可以自行查閱資料並告訴我

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章