java 爬取抖音用戶分享鏈接 html 數據

最近一個項目需要用到獲取抖音用戶數據的模塊。無奈抖音沒有提供相關的 API 給我們調用,只能從用戶分享鏈接入手,爬取 HTML 中相應的值。但是中途發現了一個頭疼的問題:抖音對粉絲數、關注數等數據是採用 WOFF 自定義字體編碼的形式展現出來的,爬取到的是特殊符號(字體編碼),並不是直接可用的數值。我在這個問題上糾結了很久,網上也找了一大堆相關博客和論文,但都沒有發現解決問題的關鍵。

後來在一篇python爬取抖音用戶信息的博文中得到了啓示(博文地址:https://blog.csdn.net/weixin_43582101/article/details/92658860),自己就摸索着寫出了一個java版本的解析器代碼並封裝成工具類,下面貼出工具類相關代碼。


package com.jf.bnln.goods.common;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alibaba.druid.util.StringUtils;
import com.alibaba.fastjson.JSONObject;

/**
 * Scrapes a Douyin user's public profile data from a profile share link.
 *
 * <p>The share page renders numeric values (id, following, fans, likes, works) with a
 * custom icon font, so the HTML contains numeric character references such as
 * {@code &#xe602;} instead of digits. This class maps those references back to digits
 * using the two lookup tables held in {@link GetInstance}.
 *
 * <p>Obtain the shared instance via {@link #instance()}.
 *
 * @author ZL
 */
public class DouYinShareUtil {

	private static final Logger LOGGER = LoggerFactory.getLogger(DouYinShareUtil.class);

	/** Matches any character in the CJK range U+4E00..U+9FA5; compiled once and reused. */
	private static final Pattern CHINESE_CHAR = Pattern.compile("[\u4e00-\u9fa5]");

	/** Placeholder that replaces "&#" so Jsoup does not decode the font character references. */
	private static final String ENTITY_PLACEHOLDER = "hzsd";

	/** Prefix of a masked Douyin font code, e.g. "hzsdxe602;". */
	private static final String FONT_CODE_PREFIX = "hzsdxe6";

	/** Connect timeout in milliseconds, so an unreachable host cannot block the caller forever. */
	private static final int CONNECT_TIMEOUT_MS = 10000;

	/** Read timeout in milliseconds, so a stalled response cannot block the caller forever. */
	private static final int READ_TIMEOUT_MS = 10000;

	private DouYinShareUtil(){}

	/** Lazy holder for the singleton instance and the font lookup tables. */
	private static final class GetInstance {
		static final DouYinShareUtil INSTANCE = new DouYinShareUtil();
		// Douyin font code point -> glyph name
		static final JSONObject mapCode2Name = JSONObject.parseObject("{\"0xe602\":\"num_\",\"0xe605\":\"num_3\",\"0xe606\":\"num_4\",\"0xe603\":\"num_1\",\"0xe604\":\"num_2\",\"0xe618\":\"num_\",\"0xe619\":\"num_4\",\"0xe60a\":\"num_8\",\"0xe60b\":\"num_9\",\"0xe60e\":\"num_\",\"0xe60f\":\"num_5\",\"0xe60c\":\"num_4\",\"0xe60d\":\"num_1\",\"0xe612\":\"num_6\",\"0xe613\":\"num_8\",\"0xe610\":\"num_3\",\"0xe611\":\"num_2\",\"0xe616\":\"num_1\",\"0xe617\":\"num_3\",\"0xe614\":\"num_9\",\"0xe615\":\"num_7\",\"0xe609\":\"num_7\",\"0xe607\":\"num_5\",\"0xe608\":\"num_6\",\"0xe61b\":\"num_5\",\"0xe61c\":\"num_8\",\"0xe61a\":\"num_2\",\"0xe61f\":\"num_6\",\"0xe61d\":\"num_9\",\"0xe61e\":\"num_7\"}");

		// Glyph name -> the digit that glyph displays
		static final JSONObject mapCode2Font = JSONObject.parseObject("{\"num_9\":8,\"num_5\":5,\"num_6\":6,\"num_\":1,\"num_7\":9,\"num_8\":7,\"num_1\":0,\"num_2\":3,\"num_3\":2,\"num_4\":4}");
	}

	/**
	 * Returns the shared instance.
	 */
	public static DouYinShareUtil instance() {
		return GetInstance.INSTANCE;
	}

	/**
	 * Scrapes user information from a profile share link.
	 *
	 * <p>On success the result contains the keys {@code headUrl}, {@code nickName},
	 * {@code id}, {@code sign}, {@code focus}, {@code fans}, {@code liked} and
	 * {@code works}. If the page cannot be fetched the result is empty; if parsing fails
	 * part-way, the result holds only the keys extracted before the failure. Failures are
	 * logged, not thrown.
	 *
	 * @param shareUrl share link of the Douyin user's profile card
	 * @return the extracted fields; never {@code null}
	 */
	public JSONObject getDouyinInfo( String shareUrl ) {
		JSONObject data = new JSONObject();

		String html = getHTMLSource(shareUrl);
		if (html == null) {
			// The fetch failed and was already logged; previously this path threw a NullPointerException.
			return data;
		}
		// Mask the character references so Jsoup keeps the font codes as plain text
		// ("&#xe602;" becomes "hzsdxe602;") for formatNum to decode.
		html = html.replace("&#", ENTITY_PLACEHOLDER);

		try {
			Document doc = Jsoup.parse(html);

			// Avatar
			Elements headUrl = doc.select("[class=avatar]");
			data.put("headUrl", headUrl.attr("src"));

			// Nickname
			Elements nickName = doc.select("[class=nickname]");
			data.put("nickName", nickName.get(0).text());

			// Douyin id
			Elements idEle = doc.select("[class=shortid]");
			data.put("id", decodeNumber(idEle.get(0).text()));

			// Signature
			Elements sign = doc.select("[class=signature]");
			data.put("sign", sign.get(0).text());

			// Follow statistics container
			Elements followInfo = doc.select("[class=follow-info]");

			// Following count
			Elements focusBlock = followInfo.select("[class=focus block]");
			data.put("focus", decodeNumber(focusBlock.select("[class=num]").text()));

			// Fans count
			Elements fansBlock = followInfo.select("[class=follower block]");
			data.put("fans", decodeNumber(fansBlock.select("[class=num]").text()));

			// Likes count
			Elements likedBlock = followInfo.select("[class=liked-num block]");
			data.put("liked", decodeNumber(likedBlock.select("[class=num]").text()));

			// Works count
			Elements works = doc.select("[class=user-tab active tab get-list]");
			data.put("works", decodeNumber(works.select("[class=num]").text()));
		} catch (Exception e) {
			LOGGER.error("analyse douyin user info has error.",e);
		}
		return data;
	}

	/**
	 * Decodes a space-separated run of tokens by passing each token through
	 * {@link #formatNum(String)} and concatenating the results.
	 */
	private String decodeNumber(String text) {
		StringBuilder decoded = new StringBuilder();
		for (String token : text.split(" ")) {
			decoded.append(formatNum(token));
		}
		return decoded.toString();
	}

	/**
	 * Fetches the HTML source behind {@code url}.
	 *
	 * <p>A first request is made only to look for a {@code Location} response header;
	 * if present, the page is fetched from that address, otherwise from {@code url}
	 * itself. The response lines are concatenated without line separators.
	 *
	 * @return the page source, or {@code null} if the URL is malformed or an I/O error
	 *         occurs (the error is logged)
	 */
	private String getHTMLSource(String url){
		HttpURLConnection probe = null;
		HttpURLConnection conn = null;

		try {
			URL target = new URL(url);

			// Probe request: look for a redirect that was not followed automatically.
			probe = (HttpURLConnection) target.openConnection();
			probe.setConnectTimeout(CONNECT_TIMEOUT_MS);
			probe.setReadTimeout(READ_TIMEOUT_MS);
			String location = probe.getHeaderField("Location");
			if (location != null && !location.isEmpty()) {
				// Resolving against the original URL also handles a relative Location value.
				target = new URL(target, location);
			}

			conn = (HttpURLConnection) target.openConnection();
			conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
			conn.setReadTimeout(READ_TIMEOUT_MS);
			conn.setRequestProperty("accept", "*/*");
			conn.setRequestProperty("connection", "Keep-Alive");
			conn.setRequestProperty("Accept-Charset", "UTF-8");
			conn.setRequestProperty("contentType", "UTF-8");
			conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
			conn.setRequestProperty("Content-type", "application/x-www-form-urlencoded");
			conn.setRequestProperty("Accept-Language", Locale.getDefault().toString());

			// Open the actual connection
			conn.connect();

			try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
				StringBuilder html = new StringBuilder();
				String line;
				while ((line = reader.readLine()) != null) {
					html.append(line);
				}
				return html.toString();
			}
		} catch (MalformedURLException e) {
			LOGGER.error("getHTMLSource has MalformedURLException.",e);
		} catch (IOException e) {
			LOGGER.error("getHTMLSource has IOException.",e);
		} finally {
			if (probe != null) {
				probe.disconnect();
			}
			if (conn != null) {
				conn.disconnect();
			}
		}

		return null;
	}

	/**
	 * Returns {@code true} if {@code str} contains at least one character in the
	 * range U+4E00..U+9FA5.
	 */
	private boolean isChinese(String str){
		return CHINESE_CHAR.matcher(str).find();
	}

	/**
	 * Decodes one token of Douyin's custom font encoding into an Arabic digit.
	 *
	 * <ul>
	 *   <li>A token containing Chinese characters yields the empty string.</li>
	 *   <li>A token shorter than 8 characters, or without the masked font-code prefix,
	 *       is returned unchanged.</li>
	 *   <li>Otherwise the token (e.g. {@code "hzsdxe602;"}) is converted to its code
	 *       key ({@code "0xe602"}) and looked up; if the lookup finds nothing, the
	 *       code key is returned.</li>
	 * </ul>
	 */
	private String formatNum(String str) {
		if (isChinese(str)) {
			return "";
		}
		if ( str.length() < 8 || str.indexOf(FONT_CODE_PREFIX) < 0) {
			return str;
		}
		// Drop the 4-char placeholder and the trailing ';', then prefix "0": "hzsdxe602;" -> "0xe602".
		String code = "0"+str.substring(4,str.length()-1);

		String glyphName = GetInstance.mapCode2Name.getString(code);
		if (glyphName == null) {
			return code;
		}
		String digit = GetInstance.mapCode2Font.getString(glyphName);
		if (StringUtils.isEmpty(digit)) {
			return code;
		}
		return digit;
	}

	public static void main(String[] args) {
		System.out.println(DouYinShareUtil.instance().getDouyinInfo("http://v.douyin.com/D6fEmD/"));
	}

}

 

 

如果我的博客能幫到大家,麻煩大家在下方留言回覆一下,讓我知道我的博客都幫助了多少碰到類似問題的人,謝謝大家~

 

學無止境,生生不息。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章