最近一個項目需要用到獲取抖音用戶數據的模塊,無奈抖音沒有相關的API給我們調用,只能從用戶分享鏈接入手,爬取html相應的值。但是中途發現了一個頭疼的問題,抖音對粉絲、關注等數據是採用woff字體編碼的形式展現出來的,爬取到的是一個特殊符號,並不是直接可用的數值,後面就在這個問題上糾結了很久,網上也找了一大堆相關博客和論文,但都沒有發現解決問題的關鍵。
後來在一篇Python爬取抖音用戶信息的博文中得到了啓示(博文地址:https://blog.csdn.net/weixin_43582101/article/details/92658860),自己就摸索着寫出了一個Java版本的解析器代碼並封裝成工具類,下面貼出工具類相關代碼。
package com.jf.bnln.goods.common;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.druid.util.StringUtils;
import com.alibaba.fastjson.JSONObject;
/**
 * Scrapes public profile information from a Douyin user share link.
 *
 * <p>The share page renders numbers (id, following, fans, likes, works) with a
 * custom icon font: the HTML carries numeric character references instead of
 * digits. This class fetches the page, keeps those references as plain text and
 * translates each one back to a digit through two lookup tables.
 *
 * <p>Obtain the shared instance through {@link #instance()}.
 *
 * @author ZL
 */
public class DouYinShareUtil {

    /** Prefix of an HTML numeric character reference. */
    private static final String ENTITY_PREFIX = "&#";

    /**
     * Placeholder substituted for {@link #ENTITY_PREFIX} before parsing.
     * Presumably this keeps the HTML parser from decoding the references, so
     * that they survive as literal text such as {@code hzsdxe602;}.
     */
    private static final String ENTITY_MARKER = "hzsd";

    /** Matches a single character in the CJK range used to detect Chinese text. */
    private static final Pattern CHINESE_CHAR = Pattern.compile("[\u4e00-\u9fa5]");

    private static final Logger logger = LoggerFactory.getLogger(DouYinShareUtil.class);

    private DouYinShareUtil() {
    }

    /** Holder for the singleton instance and the font lookup tables. */
    private static final class GetInstance {
        static final DouYinShareUtil INSTANCE = new DouYinShareUtil();
        // Douyin font code point -> glyph name
        static final JSONObject mapCode2Name = JSONObject.parseObject("{\"0xe602\":\"num_\",\"0xe605\":\"num_3\",\"0xe606\":\"num_4\",\"0xe603\":\"num_1\",\"0xe604\":\"num_2\",\"0xe618\":\"num_\",\"0xe619\":\"num_4\",\"0xe60a\":\"num_8\",\"0xe60b\":\"num_9\",\"0xe60e\":\"num_\",\"0xe60f\":\"num_5\",\"0xe60c\":\"num_4\",\"0xe60d\":\"num_1\",\"0xe612\":\"num_6\",\"0xe613\":\"num_8\",\"0xe610\":\"num_3\",\"0xe611\":\"num_2\",\"0xe616\":\"num_1\",\"0xe617\":\"num_3\",\"0xe614\":\"num_9\",\"0xe615\":\"num_7\",\"0xe609\":\"num_7\",\"0xe607\":\"num_5\",\"0xe608\":\"num_6\",\"0xe61b\":\"num_5\",\"0xe61c\":\"num_8\",\"0xe61a\":\"num_2\",\"0xe61f\":\"num_6\",\"0xe61d\":\"num_9\",\"0xe61e\":\"num_7\"}");
        // Glyph name -> the digit that glyph displays
        static final JSONObject mapCode2Font = JSONObject.parseObject("{\"num_9\":8,\"num_5\":5,\"num_6\":6,\"num_\":1,\"num_7\":9,\"num_8\":7,\"num_1\":0,\"num_2\":3,\"num_3\":2,\"num_4\":4}");
    }

    /**
     * Returns the shared instance.
     *
     * @return the singleton {@code DouYinShareUtil}
     */
    public static DouYinShareUtil instance() {
        return GetInstance.INSTANCE;
    }

    /**
     * Scrapes user information from a share link.
     *
     * <p>The result may contain the keys {@code headUrl}, {@code nickName},
     * {@code id}, {@code sign}, {@code focus}, {@code fans}, {@code liked} and
     * {@code works}. Extraction is best effort: if the page cannot be fetched the
     * result is empty, and if parsing fails part-way the keys collected so far
     * are returned. Failures are logged, not thrown.
     *
     * @param shareUrl share link of a Douyin user profile card
     * @return the collected fields; never {@code null}
     */
    public JSONObject getDouyinInfo(String shareUrl) {
        JSONObject data = new JSONObject();
        String html = getHTMLSource(shareUrl);
        if (html == null) {
            // The fetch failed and was already logged; avoid dereferencing null.
            logger.error("analyse douyin user info has error: no html for {}", shareUrl);
            return data;
        }
        // Neutralise the custom-font character references before parsing.
        html = html.replace(ENTITY_PREFIX, ENTITY_MARKER);
        try {
            Document doc = Jsoup.parse(html);
            // Avatar
            Elements headUrl = doc.select("[class=avatar]");
            data.put("headUrl", headUrl.attr("src"));
            // Nickname
            Elements nickName = doc.select("[class=nickname]");
            data.put("nickName", nickName.get(0).text());
            // Id
            Elements idEle = doc.select("[class=shortid]");
            data.put("id", decodeNumber(idEle.get(0).text()));
            // Signature
            Elements sign = doc.select("[class=signature]");
            data.put("sign", sign.get(0).text());
            // Follow statistics container
            Elements followInfo = doc.select("[class=follow-info]");
            // Following count
            Elements focusBlock = followInfo.select("[class=focus block]");
            data.put("focus", decodeNumber(focusBlock.select("[class=num]").text()));
            // Fans count
            Elements fansBlock = followInfo.select("[class=follower block]");
            data.put("fans", decodeNumber(fansBlock.select("[class=num]").text()));
            // Likes count
            Elements likedBlock = followInfo.select("[class=liked-num block]");
            data.put("liked", decodeNumber(likedBlock.select("[class=num]").text()));
            // Works count
            Elements works = doc.select("[class=user-tab active tab get-list]");
            data.put("works", decodeNumber(works.select("[class=num]").text()));
        } catch (Exception e) {
            // Boundary catch: a missing element must not break the caller.
            logger.error("analyse douyin user info has error.", e);
        }
        return data;
    }

    /**
     * Decodes a space-separated run of tokens by passing each one through
     * {@link #formatNum(String)} and concatenating the results.
     *
     * @param text element text whose tokens are separated by single spaces
     * @return the concatenated decoded tokens
     */
    private String decodeNumber(String text) {
        StringBuilder decoded = new StringBuilder();
        for (String token : text.split(" ")) {
            decoded.append(formatNum(token));
        }
        return decoded.toString();
    }

    /**
     * Downloads the HTML source behind {@code url}.
     *
     * <p>A first request is made only to look for a {@code Location} response
     * header; when one is present the page is requested from that address
     * instead. Lines of the response are concatenated without line separators.
     *
     * @param url address to fetch
     * @return the page source, or {@code null} if the URL is malformed or an I/O
     *         error occurs (the error is logged)
     */
    private String getHTMLSource(String url) {
        BufferedReader buf = null;
        HttpURLConnection conn = null;
        try {
            URL theUrl = new URL(url);
            HttpURLConnection probe = (HttpURLConnection) theUrl.openConnection();
            try {
                // Header-name lookup here is not tied to the exact case the
                // server used, unlike comparing raw map keys.
                String location = probe.getHeaderField("Location");
                if (location != null && location.length() > 0) {
                    // Resolving against the request URL also handles relative values.
                    theUrl = new URL(theUrl, location);
                }
            } finally {
                // The probe's body is never read, so release it explicitly.
                probe.disconnect();
            }
            conn = (HttpURLConnection) theUrl.openConnection();
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("Accept-Charset", "UTF-8");
            conn.setRequestProperty("contentType", "UTF-8");
            conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            conn.setRequestProperty("Content-type", "application/x-www-form-urlencoded");
            conn.setRequestProperty("Accept-Language", Locale.getDefault().toString());
            // Open the actual connection
            conn.connect();
            buf = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
            StringBuilder str = new StringBuilder();
            String line;
            while ((line = buf.readLine()) != null) {
                str.append(line);
            }
            return str.toString();
        } catch (MalformedURLException e) {
            logger.error("getHTMLSource has MalformedURLException.", e);
        } catch (IOException e) {
            logger.error("getHTMLSource has IOException.", e);
        } finally {
            try {
                if (buf != null) {
                    buf.close();
                }
            } catch (IOException e) {
                logger.error("getHTMLSource close stream IOException.", e);
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
        return null;
    }

    /**
     * Tells whether the string contains at least one Chinese character.
     *
     * @param str text to inspect
     * @return {@code true} if a character in the matched CJK range is present
     */
    private boolean isChinese(String str) {
        return CHINESE_CHAR.matcher(str).find();
    }

    /**
     * Translates one token of Douyin's custom-font encoding to its digit.
     *
     * <p>Tokens containing Chinese characters are dropped (empty string). Tokens
     * that are too short or do not contain the font marker are returned as-is.
     * Otherwise the token is assumed to have the form {@code hzsdxe6NN;}: the
     * marker and the trailing character are stripped and the remainder is looked
     * up as {@code 0xe6NN}. If the code is unknown, that lookup key is returned.
     *
     * @param str a single whitespace-free token of element text
     * @return the decoded digit, the untouched token, the lookup key, or an
     *         empty string, as described above
     */
    private String formatNum(String str) {
        if (isChinese(str)) {
            return "";
        }
        if (str.length() < 8 || str.indexOf("hzsdxe6") < 0) {
            return str;
        }
        // Drop the 4-char marker and the trailing ';', then prefix "0" -> "0xe6NN".
        String code = "0" + str.substring(4, str.length() - 1);
        String digit = GetInstance.mapCode2Font.getString(GetInstance.mapCode2Name.getString(code));
        if (StringUtils.isEmpty(digit)) {
            return code;
        }
        return digit;
    }

    /**
     * Manual smoke test: scrapes a sample share link and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(DouYinShareUtil.instance().getDouyinInfo("http://v.douyin.com/D6fEmD/"));
    }
}
如果我的博客能幫到大家,麻煩大家在下方留言回覆一下,讓我知道我的博客都幫助了多少碰到類似問題的人,謝謝大家~
學無止境,生生不息。