java+jsoup 爬取智联招聘 简化版

1、获取列表

需要分析正确的接口url

请求参数:

start=180
&pageSize=90
&cityId=763
&salary=0,0
&workExperience=-1
&education=-1
&companyType=-1
&employmentType=-1
&jobWelfareTag=-1
&kw=Java%E5%BC%80%E5%8F%91  这里汉字需要转码
&kt=3   没有这个参数,请求失败。kt 疑似表示关键字类型(keyword type,如按职位名/公司名搜索)——具体含义待确认,望知道的朋友告知!!

 

public static void main(String[] args) {
        String url = "https://fe-api.zhaopin.com/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3";
        Connection connect = Jsoup.connect(url).timeout(30000);
        connect.header("authority", "fe-api.zhaopin.com");
        connect.header("accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        connect.header("path",
                "/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3&=0");
        connect.header("accept-encoding", "gzip, deflate, br");
        connect.header("accept-language", "zh-CN,zh;q=0.9");
        connect.header("cache-control", "no-cache");
        connect.header("upgrade-insecure-requests", "1");
        connect.header("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36")
                .ignoreContentType(true);
        connect.method(Method.GET);
        try {
            Response response = connect.execute();
            System.out.println(response.body());
        } catch (IOException e1) {
            e1.printStackTrace();
        }

    }

 

2、获得单个职位详情,在爬取中遇到html乱码问题

第一版:HTML乱码

public static void main(String[] args) {
        String url = "https://jobs.zhaopin.com/CC322742114J00246383604.htm";
        Connection connect = Jsoup.connect(url).timeout(30000);
        connect.header("Host", "jobs.zhaopin.com");
        connect.header("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        connect.header("path",
                "/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3&=0");
        connect.header("Accept-Encoding", "gzip, deflate, br");
        connect.header("Accept-Language", "zh-CN,zh;q=0.9");
        connect.header("Cache-Control", "no-cache");
        connect.header("Connection", "keep-alive");
        connect.header("upgrade-insecure-requests", "1");
        connect.header("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36")
                .ignoreContentType(true);
        connect.method(Method.GET);
        try {
            Response response = connect.execute();
            Document parse = response.parse();
            System.out.println(parse.toString());
            String html = parse.html();
            int index = html.indexOf("__INITIAL_STATE__=");不会使用正则
            String msglist = html.substring(index, html.indexOf("</script>", index)).replace("__INITIAL_STATE__=", "");
            System.out.println(msglist);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

    }

第二版:网上找的解决乱码方法 ,但是无效

public static void main(String[] args) throws IOException {
        String urlstr = "http://jobs.zhaopin.com/CC322742114J00246383604.htm";

        URL url = new URL(urlstr);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");
        connection.addRequestProperty("Host", "jobs.zhaopin.com");
        connection.addRequestProperty("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        connection.addRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
        connection.addRequestProperty("Cache-Control", "no-cache");
        connection.addRequestProperty("Connection", "keep-alive");
        connection.addRequestProperty("upgrade-insecure-requests", "1");
        connection.addRequestProperty("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36");
        try {
            /*第一种 字符乱码  无效*/
            Document parse = Jsoup.parse(connection.getInputStream(), "UTF-8", urlstr);

            String docStr = parse.toString();
            /*第二种 字符乱码  无效*/
            String str = new String(docStr.getBytes("ISO8859-1"), "UTF-8");
            parse = Jsoup.parse(str);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }

最终解决方法:把第一个版本中的 connect.header("Accept-Encoding", "gzip, deflate, br"); 注释掉,就不会出现乱码。原因:Jsoup 能自动解压 gzip/deflate,但不支持 Brotli(br);一旦声明接受 br,服务器就可能返回 Brotli 压缩的响应体,按文本解析自然是乱码。

为什么会出现第二版:因为 Jsoup.connect(url) 无法在请求阶段显式指定响应编码,才改用 HttpURLConnection 尝试自行控制解码(结果证明问题并不在编码,而在压缩)。

 

爬虫是有时效性的,以上有错误,望指出!!!

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章