(終於找到了獲取有效_signature的方法)博客搬家系列(六)-爬取今日頭條文章(二)
一.前情回顧
博客搬家系列(六)-爬取今日頭條文章:https://blog.csdn.net/rico_zhou/article/details/83619564
上回我們說到了使用java htmlunit爬取今日頭條的文章列表難度很大,關鍵在於_signature這個參數的加密算法,經過百度查詢也發現了大家大多數都是使用Python selenium來獲取,但是需要安裝瀏覽器和瀏覽器驅動,這不是我們想要的,並且我們也測試出了以下幾點,
1.使用網上找到的轉化版js是可以獲取到_signature的,但是只能在瀏覽器中打開此html獲取的才能用,而使用htmlunit爬取本地html文件得到的卻不能使用。
2.直接執行今日頭條的TAC.sign()方法獲取到的參數依然無法使用,由於每次獲取的參數都是不一樣的,也無法判斷到底是缺少什麼東西。
基於以上兩點我們開始今天的嘗試(其實就是昨天的事兒,不甘心那!)
二.整體分析
首先上一下此html:
<html>
<head></head>
<body>
<input id="as">
<input id="cp">
<input id="_signature">
<input id="user_id" value="50080767248">
<input id="max_behot_time" value="0">
<textarea id="textarea1"></textarea>
<script>
// Read the two request parameters from the form fields above and sign them
// as soon as the page loads.
var user_id = document.getElementById('user_id').value,
    max_behot_time = document.getElementById('max_behot_time').value;
getHoney(user_id, max_behot_time);
function getHoney(user_id,max_behot_time){
/**
 * Combines three operands with `+` in the order x1, x3, x2.
 * Only referenced from a commented-out line in `e`.
 */
function t1(x1, x2, x3) {
    var head = x1 + x3;
    return head + x2;
}
/**
 * Same contract as t1: returns x1 + x3 + x2, evaluated left to right.
 */
function t2(x1, x2, x3) {
    var result = x1;
    result = result + x3;
    result = result + x2;
    return result;
}
/**
 * Applies a binary operator, given as source text, to two operands.
 *
 * The operator function is compiled once with `t` (window.Function) as
 * "return x <op> y" and cached in `b` under the operator text. The previous
 * version stored into `b` but never read it back, so every call recompiled
 * the function.
 *
 * NOTE(security): this compiles source text at runtime. It is only safe
 * because the operator text comes from the embedded program string, never
 * from user input.
 *
 * @param e operator text, e.g. "+", "<", ">>>"
 * @param a right-hand operand (passed as y)
 * @param r left-hand operand (passed as x)
 * @returns the value of `r <op> a`
 */
function e(e, a, r) {
    var operator = b[e] || (b[e] = t("x,y", "return x " + e + " y"));
    return operator(r, a);
}
/**
 * Constructor hook called by the interpreter (opcode 78 in `c`).
 *
 * The page's original helper built `new x[y](x[y+1], ...)` dynamically. It
 * is replaced here by a fixed date string so that the generated signature no
 * longer varies between runs. All three arguments are ignored.
 *
 * `kk` is now a declared local. It was previously assigned without a
 * declaration, which leaked a global and throws in strict mode.
 *
 * @param e operand stack (unused)
 * @param a index of the constructor on the stack (unused)
 * @param r number of constructor arguments (unused)
 * @returns the fixed date string
 */
function a(e, a, r) {
    var kk = 'Fri Nov 02 2017 10:41:59 GMT+0800 (中國標準時間)';
    return kk;
}
/**
 * Builds a new frame object for the interpreter and runs a program in it.
 *
 * The frame records its nesting depth in `d`, references itself under
 * "$<depth>", copies every "$0".."$<depth-1>" entry from the parent frame,
 * and stores the call arguments at numeric indices together with `length`.
 *
 * @param e program text handed to `c`
 * @param a array-like of arguments for the new frame
 * @param r parent frame, or a falsy value for the outermost call
 * @returns whatever `c(e, 0, frame)` returns
 */
function r(e, a, r) {
    var frame = {};
    var depth = r ? r.d + 1 : 0;
    frame.d = depth;
    frame["$" + depth] = frame;
    for (var level = 0; level < depth; level++) {
        var scopeKey = "$" + level;
        frame[scopeKey] = r[scopeKey];
    }
    var argCount = a.length;
    frame.length = argCount;
    for (var idx = 0; idx < argCount; idx++) {
        frame[idx] = a[idx];
    }
    return c(e, 0, frame);
}
/**
 * Stack-based interpreter for the encoded program string.
 *
 * Every instruction is a single character; its opcode is the character code
 * minus 32. Operands (small integers, length-prefixed strings, jump offsets)
 * are read from the characters that follow the opcode.
 *
 * @param t program text
 * @param b index in t at which execution starts
 * @param k frame object: read/written by numeric index (opcodes 62, 76, 83)
 *          and by "$<depth>" key (opcode 79)
 * @returns the value popped by opcode 0, or a value propagated out of a
 *          nested guarded run by opcode 8
 */
function c(t, b, k) {
// Push a value onto the operand stack v (x is the stack pointer).
function u(e) {
v[x++] = e
}
// Read a length-prefixed string operand: one character holds (length + 32),
// followed by that many characters of text.
function f() {
return g = t.charCodeAt(b++) - 32,
t.substring(b, b += g)
}
// Run a nested interpreter from the current position inside try/catch.
// On success y holds the nested result; on an exception the error is kept
// in h and y is set to l itself as a "caught" sentinel.
function l() {
try {
y = c(t, b, k)
} catch (e) {
h = e,
y = l
}
}
// h, y, d, g are scratch registers; v is the operand stack, x its pointer.
for (var h, y, d, g, v = [], x = 0;;)
switch (g = t.charCodeAt(b++) - 32) {
// 1: replace the top of stack with its logical negation.
case 1:
u(!v[--x]);
break;
// 4: push a literal string operand.
case 4:
v[x++] = f();
break;
// 5: pop a value and push a stepping function over it; each call pushes
// the next element onto this stack and returns whether one was available.
case 5:
u(function (e) {
var a = 0,
r = e.length;
return function () {
var c = a < r;
return c && u(e[a++]),
c
}
}(v[--x]));
break;
// 6: call a function with one argument (argument on top, function below).
case 6:
y = v[--x],
u(v[--x](y));
break;
// 8: guarded block. Reads one offset, runs the following code via l(),
// skips that offset, then reads a second offset. If the nested run returned
// the sentinel c the second offset is skipped as well; if it returned
// anything other than the "caught" sentinel l, that value is returned from
// this run; otherwise execution simply continues.
case 8:
if (g = t.charCodeAt(b++) - 32,
l(),
b += g,
g = t.charCodeAt(b++) - 32,
y === c)
b += g;
else if (y !== l)
return y;
break;
// 9: push the interpreter function itself (the sentinel tested by opcode 8).
case 9:
v[x++] = c;
break;
// 10: push s(value), where s is Object.keys or its fallback in getHoney.
case 10:
u(s(v[--x]));
break;
// 11: pop two values and push their `+` result.
case 11:
y = v[--x],
u(v[--x] + y);
break;
// 12: push an obfuscated string literal; each character is XOR-ed with
// (its index + the string length) to recover the text.
case 12:
for (y = f(),
d = [],
g = 0; g < y.length; g++)
d[g] = y.charCodeAt(g) ^ g + y.length;
u(String.fromCharCode.apply(null, d));
break;
// 13: delete object[key] (key on top, object below); the result goes to h.
case 13:
y = v[--x],
h = delete v[--x][y];
break;
// 14: push a small integer operand (next character code minus 32).
case 14:
v[x++] = t.charCodeAt(b++) - 32;
break;
// 59: pop the top g stack entries into a new array (empty array when g is 0).
case 59:
u((g = t.charCodeAt(b++) - 32) ? (y = x,
v.slice(x -= g, y)) : []);
break;
// 61: push popped[index], with the index given as an operand.
case 61:
u(v[--x][t.charCodeAt(b++) - 32]);
break;
// 62: pop an index g and fold k[1].charCodeAt(g) into k[0]:
// k[0] = (65599 * k[0] + code) truncated to an unsigned 32-bit value.
case 62:
g = v[--x],
k[0] = 65599 * k[0] + k[1].charCodeAt(g) >>> 0;
break;
// 65: object[key] = value (value on top, then key, then object).
case 65:
h = v[--x],
y = v[--x],
v[--x][y] = h;
break;
// 66: binary operator whose text is the single next character, applied via e().
case 66:
u(e(t.substr(b++, 1), v[--x], v[--x]));
break;
// 67: call. y = argument array, d = this value, g = callee. Callees tagged
// with x === c were created by opcode 89 and run through r() with their
// stored program g.y; anything else is invoked with apply.
case 67:
y = v[--x];
d = v[--x];
g = v[--x];
u(g.x === c ? r(g.y, y, k) : g.apply(d, y));
break;
// 68: binary operator with multi-character text: a length-prefixed string
// when the next character sorts before "<", otherwise that character doubled.
case 68:
u(e((g = t.substr(b++, 1)) < "<" ? (b--,
f()) : g + g, v[--x], v[--x]));
break;
// 70: push false.
case 70:
u(!1);
break;
// 71: push n (set to window in getHoney).
case 71:
v[x++] = n;
break;
// 72: push a numeric literal parsed with unary plus.
case 72:
v[x++] = +f();
break;
// 73: push an integer literal written in base 36.
case 73:
u(parseInt(f(), 36));
break;
// 75: conditional jump. When the popped value is truthy, skip the offset
// character and continue; otherwise fall through to opcode 74 and jump.
case 75:
if (v[--x]) {
b++;
break
}
// 74: relative jump by a signed offset operand.
case 74:
g = t.charCodeAt(b++) - 32 << 16 >> 16,
b += g;
break;
// 76: push k[index] from the current frame.
case 76:
u(k[t.charCodeAt(b++) - 32]);
break;
// 77: push object[key] (key on top, object below).
case 77:
y = v[--x],
u(v[--x][y]);
break;
// 78: construct. g is the argument count; the constructor and its arguments
// occupy the top g + 1 stack slots and are handed to a().
case 78:
g = t.charCodeAt(b++) - 32,
u(a(v, x -= g + 1, g));
break;
// 79: push the frame stored under "$<g>".
case 79:
g = t.charCodeAt(b++) - 32,
u(k["$" + g]);
break;
// 81: object[name] = value, with the property name as a string operand.
case 81:
h = v[--x],
v[--x][f()] = h;
break;
// 82: push object[name], with the property name as a string operand.
case 82:
u(v[--x][f()]);
break;
// 83: store the popped value into k[index].
case 83:
h = v[--x],
k[t.charCodeAt(b++) - 32] = h;
break;
// 84: push true.
case 84:
v[x++] = !0;
break;
// 85: push undefined.
case 85:
v[x++] = void 0;
break;
// 86: duplicate the top of stack.
case 86:
u(v[x - 1]);
break;
// 88: swap the two topmost stack entries.
case 88:
h = v[--x],
y = v[--x],
v[x++] = h,
v[x++] = y;
break;
// 89: push a function whose program text is the string operand; it is tagged
// with x = c so that opcode 67 recognises it, and runs via r() when called.
case 89:
u(function () {
function e() {
return r(e.y, arguments, k)
}
return e.y = f(),
e.x = c,
e
}());
break;
// 90: push null.
case 90:
v[x++] = null;
break;
// 91: push the h register (last caught error / last stored value).
case 91:
v[x++] = h;
break;
// 93: pop into the h register.
case 93:
h = v[--x];
break;
// 0: return the top of stack to the caller.
case 0:
return v[--x];
// Any other opcode encodes a small integer literal: opcode - 16.
default:
u((g << 16 >> 16) - 16)
}
}
var n = window;
//document.getElementById('textarea1').value=JSON.stringify(window);
var t = n.Function,
s = Object.keys || function (e) {
var a = {},
r = 0;
for (var c in e)
a[r++] = c;
return a.length = r,
a
},
b = {},
k = {};
var rrr= decodeURIComponent("gr%24Daten%20%D0%98b%2Fs!l%20y%CD%92y%C4%B9g%2C(lfi~ah%60%7Bmv%2C-n%7CjqewVxp%7Brvmmx%2C%26eff%7Fkx%5B!cs%22l%22.Pq%25widthl%22%40q%26heightl%22vr*getContextx%24%222d%5B!cs%23l%23%2C*%3B%3F%7Cu.%7Cuc%7Buq%24fontl%23vr(fillTextx%24%24%E9%BE%98%E0%B8%91%E0%B8%A0%EA%B2%BD2%3C%5B%23c%7Dl%232q*shadowBlurl%231q-shadowOffsetXl%23%24%24limeq%2BshadowColorl%23vr%23arcx88802%5B%25c%7Dl%23vr%26strokex%5B%20c%7Dl%22v%2C)%7DeOmyoZB%5Dmx%5B%20cs!0s%24l%24Pb%3Ck7l%20l!r%26lengthb%25%5El%241%2Bs%24j%02l%20%20s%23i%241ek1s%24gr%23tack4)zgr%23tac%24!%20%2B0o!%5B%23cj%3Fo%20%5D!l%24b%25s%22o%20%5D!l%22l%24b*b%5E0d%23%3E%3E%3Es!0s%25yA0s%22l%22l!r%26lengthb%3Ck%2Bl%22%5El%221%2Bs%22j%05l%20%20s%26l%26z0l!%24%20%2B%5B%22cs'(0l%23i'1ps9wxb%26s()%20%26%7Bs)%2Fs(gr%26Stringr%2CfromCharCodes)0s*yWl%20._b%26s%20o!%5D)l%20l%20Jb%3Ck%24.aj%3Bl%20.Tb%3Ck%24.gj%2Fl%20.%5Eb%3Ck%26i%22-4j!%1F%2B%26%20s%2ByPo!%5D%2Bs!l!l%20Hd%3E%26l!l%20Bd%3E%26%2Bl!l%20%3Cd%3E%26%2Bl!l%206d%3E%26%2Bl!l%20%26%2B%20s%2Cy%3Do!o!%5D%2Fq%2213o!l%20q%2210o!%5D%2Cl%202d%3E%26%20s.%7Bs-yMo!o!%5D0q%2213o!%5D*Ld%3Cl%204d%23%3E%3E%3Eb%7Cs!o!l%20q%2210o!%5D%2Cl!%26%20s%2FyIo!o!%5D.q%2213o!%5D%2Co!%5D*Jd%3Cl%206d%23%3E%3E%3Eb%7C%26o!%5D%2Bl%20%26%2B%20s0l-l!%26l-l!i'1z141z4b%2F%40d%3Cl%22b%7C%26%2Bl-l(l!b%5E%26%2Bl-l%26zl'g%2C)gk%7Dejo%7B%7Fcm%2C)%7Cyn~Lij~em%5B%22cl%24b%25%40d%3Cl%26zl'l%20%24%20%2B%5B%22cl%24b%25b%7C%26%2Bl-l%258d%3C%40b%7Cl!b%5E%26%2B%20q%24sign%20");
r(decodeURIComponent("gr%24Daten%20%D0%98b%2Fs!l%20y%CD%92y%C4%B9g%2C(lfi~ah%60%7Bmv%2C-n%7CjqewVxp%7Brvmmx%2C%26eff%7Fkx%5B!cs%22l%22.Pq%25widthl%22%40q%26heightl%22vr*getContextx%24%222d%5B!cs%23l%23%2C*%3B%3F%7Cu.%7Cuc%7Buq%24fontl%23vr(fillTextx%24%24%E9%BE%98%E0%B8%91%E0%B8%A0%EA%B2%BD2%3C%5B%23c%7Dl%232q*shadowBlurl%231q-shadowOffsetXl%23%24%24limeq%2BshadowColorl%23vr%23arcx88802%5B%25c%7Dl%23vr%26strokex%5B%20c%7Dl%22v%2C)%7DeOmyoZB%5Dmx%5B%20cs!0s%24l%24Pb%3Ck7l%20l!r%26lengthb%25%5El%241%2Bs%24j%02l%20%20s%23i%241ek1s%24gr%23tack4)zgr%23tac%24!%20%2B0o!%5B%23cj%3Fo%20%5D!l%24b%25s%22o%20%5D!l%22l%24b*b%5E0d%23%3E%3E%3Es!0s%25yA0s%22l%22l!r%26lengthb%3Ck%2Bl%22%5El%221%2Bs%22j%05l%20%20s%26l%26z0l!%24%20%2B%5B%22cs'(0l%23i'1ps9wxb%26s()%20%26%7Bs)%2Fs(gr%26Stringr%2CfromCharCodes)0s*yWl%20._b%26s%20o!%5D)l%20l%20Jb%3Ck%24.aj%3Bl%20.Tb%3Ck%24.gj%2Fl%20.%5Eb%3Ck%26i%22-4j!%1F%2B%26%20s%2ByPo!%5D%2Bs!l!l%20Hd%3E%26l!l%20Bd%3E%26%2Bl!l%20%3Cd%3E%26%2Bl!l%206d%3E%26%2Bl!l%20%26%2B%20s%2Cy%3Do!o!%5D%2Fq%2213o!l%20q%2210o!%5D%2Cl%202d%3E%26%20s.%7Bs-yMo!o!%5D0q%2213o!%5D*Ld%3Cl%204d%23%3E%3E%3Eb%7Cs!o!l%20q%2210o!%5D%2Cl!%26%20s%2FyIo!o!%5D.q%2213o!%5D%2Co!%5D*Jd%3Cl%206d%23%3E%3E%3Eb%7C%26o!%5D%2Bl%20%26%2B%20s0l-l!%26l-l!i'1z141z4b%2F%40d%3Cl%22b%7C%26%2Bl-l(l!b%5E%26%2Bl-l%26zl'g%2C)gk%7Dejo%7B%7Fcm%2C)%7Cyn~Lij~em%5B%22cl%24b%25%40d%3Cl%26zl'l%20%24%20%2B%5B%22cl%24b%25b%7C%26%2Bl-l%258d%3C%40b%7Cl!b%5E%26%2B%20q%24sign%20"), [TAC = {}]);
tt = TAC.sign(user_id+"" + max_behot_time);
var i = Math.floor((new Date).getTime() / 1e3)
, e = i.toString(16).toUpperCase()
, t = md5(i).toString().toUpperCase();
if (8 != e.length)
return {
as: "479BB4B7254C150",
cp: "7E0AC8874BB0985"
};
for (var n = t.slice(0, 5), o = t.slice(-5), s = "", a = 0; 5 > a; a++)
{ s += n[a] + e[a];}
for (var l = "", r = 0; 5 > r; r++){
l += e[r + 3] + o[r];
}
var as="A1" + s + e.slice(-3);
var cp=e.slice(0, 3) + l + "E1";
document.getElementById('as').value=as;
document.getElementById('cp').value=cp;
document.getElementById('_signature').value=tt;
console.log(tt.substring(18,19))
return {
as: as,
cp: cp,
_signature:tt
}
}
/**
 * Computes the MD5 digest (RFC 1321) of a value's string form.
 *
 * The input is converted with toString(), CRLF pairs are normalised to LF
 * (kept for backward compatibility with the previous behaviour), and the
 * text is hashed as UTF-8.
 *
 * Fix: characters outside the Basic Multilingual Plane are now encoded as a
 * single 4-byte UTF-8 sequence. Previously each surrogate half was encoded
 * separately as 3 bytes, which produced a digest that no standard MD5
 * implementation would agree with.
 *
 * @param string value to hash (anything with a toString method)
 * @returns {string} 32-character lowercase hexadecimal digest
 */
function md5(string) {
    // 32-bit left rotation.
    function md5_RotateLeft(lValue, iShiftBits) {
        return (lValue << iShiftBits) | (lValue >>> (32 - iShiftBits));
    }
    // Addition modulo 2^32 that avoids losing the top bits in signed arithmetic.
    function md5_AddUnsigned(lX, lY) {
        var lX4, lY4, lX8, lY8, lResult;
        lX8 = (lX & 0x80000000);
        lY8 = (lY & 0x80000000);
        lX4 = (lX & 0x40000000);
        lY4 = (lY & 0x40000000);
        lResult = (lX & 0x3FFFFFFF) + (lY & 0x3FFFFFFF);
        if (lX4 & lY4) {
            return (lResult ^ 0x80000000 ^ lX8 ^ lY8);
        }
        if (lX4 | lY4) {
            if (lResult & 0x40000000) {
                return (lResult ^ 0xC0000000 ^ lX8 ^ lY8);
            } else {
                return (lResult ^ 0x40000000 ^ lX8 ^ lY8);
            }
        } else {
            return (lResult ^ lX8 ^ lY8);
        }
    }
    // The four MD5 auxiliary functions.
    function md5_F(x, y, z) {
        return (x & y) | ((~x) & z);
    }
    function md5_G(x, y, z) {
        return (x & z) | (y & (~z));
    }
    function md5_H(x, y, z) {
        return (x ^ y ^ z);
    }
    function md5_I(x, y, z) {
        return (y ^ (x | (~z)));
    }
    // One step of each of the four rounds.
    function md5_FF(a, b, c, d, x, s, ac) {
        a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_F(b, c, d), x), ac));
        return md5_AddUnsigned(md5_RotateLeft(a, s), b);
    }
    function md5_GG(a, b, c, d, x, s, ac) {
        a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_G(b, c, d), x), ac));
        return md5_AddUnsigned(md5_RotateLeft(a, s), b);
    }
    function md5_HH(a, b, c, d, x, s, ac) {
        a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_H(b, c, d), x), ac));
        return md5_AddUnsigned(md5_RotateLeft(a, s), b);
    }
    function md5_II(a, b, c, d, x, s, ac) {
        a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_I(b, c, d), x), ac));
        return md5_AddUnsigned(md5_RotateLeft(a, s), b);
    }
    // Packs a byte string into little-endian 32-bit words and appends the MD5
    // padding (0x80 marker plus the 64-bit message length in bits).
    function md5_ConvertToWordArray(string) {
        var lWordCount;
        var lMessageLength = string.length;
        var lNumberOfWords_temp1 = lMessageLength + 8;
        var lNumberOfWords_temp2 = (lNumberOfWords_temp1 - (lNumberOfWords_temp1 % 64)) / 64;
        var lNumberOfWords = (lNumberOfWords_temp2 + 1) * 16;
        var lWordArray = Array(lNumberOfWords - 1);
        var lBytePosition = 0;
        var lByteCount = 0;
        while (lByteCount < lMessageLength) {
            lWordCount = (lByteCount - (lByteCount % 4)) / 4;
            lBytePosition = (lByteCount % 4) * 8;
            lWordArray[lWordCount] = (lWordArray[lWordCount] | (string.charCodeAt(lByteCount) << lBytePosition));
            lByteCount++;
        }
        lWordCount = (lByteCount - (lByteCount % 4)) / 4;
        lBytePosition = (lByteCount % 4) * 8;
        lWordArray[lWordCount] = lWordArray[lWordCount] | (0x80 << lBytePosition);
        lWordArray[lNumberOfWords - 2] = lMessageLength << 3;
        lWordArray[lNumberOfWords - 1] = lMessageLength >>> 29;
        return lWordArray;
    }
    // Formats a 32-bit word as 8 hex digits, least significant byte first.
    function md5_WordToHex(lValue) {
        var WordToHexValue = "",
            WordToHexValue_temp = "",
            lByte, lCount;
        for (lCount = 0; lCount <= 3; lCount++) {
            lByte = (lValue >>> (lCount * 8)) & 255;
            WordToHexValue_temp = "0" + lByte.toString(16);
            WordToHexValue = WordToHexValue + WordToHexValue_temp.substr(WordToHexValue_temp.length - 2, 2);
        }
        return WordToHexValue;
    }
    // Encodes the text as UTF-8, one output character per byte.
    function md5_Utf8Encode(string) {
        string = string.toString().replace(/\r\n/g, "\n");
        var utftext = "";
        for (var n = 0; n < string.length; n++) {
            var c = string.charCodeAt(n);
            // Combine a valid surrogate pair into one code point. A lone
            // surrogate is left alone and encoded as 3 bytes, as before.
            if (c >= 0xD800 && c <= 0xDBFF && n + 1 < string.length) {
                var low = string.charCodeAt(n + 1);
                if (low >= 0xDC00 && low <= 0xDFFF) {
                    c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                    n++;
                }
            }
            if (c < 128) {
                utftext += String.fromCharCode(c);
            } else if (c < 2048) {
                utftext += String.fromCharCode((c >> 6) | 192);
                utftext += String.fromCharCode((c & 63) | 128);
            } else if (c < 65536) {
                utftext += String.fromCharCode((c >> 12) | 224);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
            } else {
                utftext += String.fromCharCode((c >> 18) | 240);
                utftext += String.fromCharCode(((c >> 12) & 63) | 128);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
            }
        }
        return utftext;
    }
    var x = Array();
    var k, AA, BB, CC, DD, a, b, c, d;
    // Per-round rotation amounts.
    var S11 = 7,
        S12 = 12,
        S13 = 17,
        S14 = 22;
    var S21 = 5,
        S22 = 9,
        S23 = 14,
        S24 = 20;
    var S31 = 4,
        S32 = 11,
        S33 = 16,
        S34 = 23;
    var S41 = 6,
        S42 = 10,
        S43 = 15,
        S44 = 21;
    string = md5_Utf8Encode(string);
    x = md5_ConvertToWordArray(string);
    // Initial chaining values.
    a = 0x67452301;
    b = 0xEFCDAB89;
    c = 0x98BADCFE;
    d = 0x10325476;
    // Process the message in 16-word (512-bit) blocks.
    for (k = 0; k < x.length; k += 16) {
        AA = a;
        BB = b;
        CC = c;
        DD = d;
        a = md5_FF(a, b, c, d, x[k + 0], S11, 0xD76AA478);
        d = md5_FF(d, a, b, c, x[k + 1], S12, 0xE8C7B756);
        c = md5_FF(c, d, a, b, x[k + 2], S13, 0x242070DB);
        b = md5_FF(b, c, d, a, x[k + 3], S14, 0xC1BDCEEE);
        a = md5_FF(a, b, c, d, x[k + 4], S11, 0xF57C0FAF);
        d = md5_FF(d, a, b, c, x[k + 5], S12, 0x4787C62A);
        c = md5_FF(c, d, a, b, x[k + 6], S13, 0xA8304613);
        b = md5_FF(b, c, d, a, x[k + 7], S14, 0xFD469501);
        a = md5_FF(a, b, c, d, x[k + 8], S11, 0x698098D8);
        d = md5_FF(d, a, b, c, x[k + 9], S12, 0x8B44F7AF);
        c = md5_FF(c, d, a, b, x[k + 10], S13, 0xFFFF5BB1);
        b = md5_FF(b, c, d, a, x[k + 11], S14, 0x895CD7BE);
        a = md5_FF(a, b, c, d, x[k + 12], S11, 0x6B901122);
        d = md5_FF(d, a, b, c, x[k + 13], S12, 0xFD987193);
        c = md5_FF(c, d, a, b, x[k + 14], S13, 0xA679438E);
        b = md5_FF(b, c, d, a, x[k + 15], S14, 0x49B40821);
        a = md5_GG(a, b, c, d, x[k + 1], S21, 0xF61E2562);
        d = md5_GG(d, a, b, c, x[k + 6], S22, 0xC040B340);
        c = md5_GG(c, d, a, b, x[k + 11], S23, 0x265E5A51);
        b = md5_GG(b, c, d, a, x[k + 0], S24, 0xE9B6C7AA);
        a = md5_GG(a, b, c, d, x[k + 5], S21, 0xD62F105D);
        d = md5_GG(d, a, b, c, x[k + 10], S22, 0x2441453);
        c = md5_GG(c, d, a, b, x[k + 15], S23, 0xD8A1E681);
        b = md5_GG(b, c, d, a, x[k + 4], S24, 0xE7D3FBC8);
        a = md5_GG(a, b, c, d, x[k + 9], S21, 0x21E1CDE6);
        d = md5_GG(d, a, b, c, x[k + 14], S22, 0xC33707D6);
        c = md5_GG(c, d, a, b, x[k + 3], S23, 0xF4D50D87);
        b = md5_GG(b, c, d, a, x[k + 8], S24, 0x455A14ED);
        a = md5_GG(a, b, c, d, x[k + 13], S21, 0xA9E3E905);
        d = md5_GG(d, a, b, c, x[k + 2], S22, 0xFCEFA3F8);
        c = md5_GG(c, d, a, b, x[k + 7], S23, 0x676F02D9);
        b = md5_GG(b, c, d, a, x[k + 12], S24, 0x8D2A4C8A);
        a = md5_HH(a, b, c, d, x[k + 5], S31, 0xFFFA3942);
        d = md5_HH(d, a, b, c, x[k + 8], S32, 0x8771F681);
        c = md5_HH(c, d, a, b, x[k + 11], S33, 0x6D9D6122);
        b = md5_HH(b, c, d, a, x[k + 14], S34, 0xFDE5380C);
        a = md5_HH(a, b, c, d, x[k + 1], S31, 0xA4BEEA44);
        d = md5_HH(d, a, b, c, x[k + 4], S32, 0x4BDECFA9);
        c = md5_HH(c, d, a, b, x[k + 7], S33, 0xF6BB4B60);
        b = md5_HH(b, c, d, a, x[k + 10], S34, 0xBEBFBC70);
        a = md5_HH(a, b, c, d, x[k + 13], S31, 0x289B7EC6);
        d = md5_HH(d, a, b, c, x[k + 0], S32, 0xEAA127FA);
        c = md5_HH(c, d, a, b, x[k + 3], S33, 0xD4EF3085);
        b = md5_HH(b, c, d, a, x[k + 6], S34, 0x4881D05);
        a = md5_HH(a, b, c, d, x[k + 9], S31, 0xD9D4D039);
        d = md5_HH(d, a, b, c, x[k + 12], S32, 0xE6DB99E5);
        c = md5_HH(c, d, a, b, x[k + 15], S33, 0x1FA27CF8);
        b = md5_HH(b, c, d, a, x[k + 2], S34, 0xC4AC5665);
        a = md5_II(a, b, c, d, x[k + 0], S41, 0xF4292244);
        d = md5_II(d, a, b, c, x[k + 7], S42, 0x432AFF97);
        c = md5_II(c, d, a, b, x[k + 14], S43, 0xAB9423A7);
        b = md5_II(b, c, d, a, x[k + 5], S44, 0xFC93A039);
        a = md5_II(a, b, c, d, x[k + 12], S41, 0x655B59C3);
        d = md5_II(d, a, b, c, x[k + 3], S42, 0x8F0CCC92);
        c = md5_II(c, d, a, b, x[k + 10], S43, 0xFFEFF47D);
        b = md5_II(b, c, d, a, x[k + 1], S44, 0x85845DD1);
        a = md5_II(a, b, c, d, x[k + 8], S41, 0x6FA87E4F);
        d = md5_II(d, a, b, c, x[k + 15], S42, 0xFE2CE6E0);
        c = md5_II(c, d, a, b, x[k + 6], S43, 0xA3014314);
        b = md5_II(b, c, d, a, x[k + 13], S44, 0x4E0811A1);
        a = md5_II(a, b, c, d, x[k + 4], S41, 0xF7537E82);
        d = md5_II(d, a, b, c, x[k + 11], S42, 0xBD3AF235);
        c = md5_II(c, d, a, b, x[k + 2], S43, 0x2AD7D2BB);
        b = md5_II(b, c, d, a, x[k + 9], S44, 0xEB86D391);
        a = md5_AddUnsigned(a, AA);
        b = md5_AddUnsigned(b, BB);
        c = md5_AddUnsigned(c, CC);
        d = md5_AddUnsigned(d, DD);
    }
    return (md5_WordToHex(a) + md5_WordToHex(b) + md5_WordToHex(c) + md5_WordToHex(d)).toLowerCase();
}
</script>
</body>
</html>
基於上面分析的第二點,我們可以先看看這個算法到底是怎樣的(核心代碼來源於網上,暫時忘了url,找到後再補上)。本人js水平有限,大概也就注意到了以下幾點:
1.直接將js部分用nodejs執行肯定是不行的,因爲代碼中出現了window對象,且使用了window.Function構建動態函數,沒辦法,不會將其構造成純js運行,那麼我們的目標就是將直接打開html獲取的參數param1,和使用htmlunit讀取本地html文件獲取的參數param2比對,找出其中的規律,讓其一樣或者都生效(param1是生效的)。
2.根據代碼中構建函數t=window.Function,我們找到了兩個相關的使用地方:
// Binary-operator helper as found in the page script: compiles
// "return x <op> y" with t (window.Function), stores the result in b under
// the operator text e, and immediately calls it as fn(r, a).
function e(e, a, r) {
return ((b[e] = t("x,y", "return x " + e + " y")))(r, a);
}
// Constructor helper as found in the page script: for r arguments it builds
// "return new x[y](x[++y],...)" (one ",x[++y]" per argument, leading comma
// stripped by substr(1)), caches the compiled function in k[r], and calls it
// with x = e and y = a.
function a(e, a, r) {
return (k[r] || (k[r] = t("x,y", "return new x[y](" + Array(r + 1).join(",x[++y]").substr(1) + ")")))(e, a);
}
說實話,只知道是構建函數但是具體幹嘛的還是不知道,咋辦呢?那就console.log()輸出一下唄,改一下:
// Instrumented copy of the operator helper: logs each of its three
// arguments (prefixed with a marker number) before evaluating `r <op> a`.
function e(e, a, r) {
console.log(2222222+" "+e)
console.log(3333333+" "+a)
console.log(4444444+" "+r)
var aa=((b[e] = t("x,y", "return x " + e + " y")))(r, a);
return aa
//return (b[e] || (b[e] = t1))(r, a)
}
// Instrumented copy of the constructor helper: logs its arguments, builds
// the object through the cached "new x[y](...)" function, and logs the
// constructed value with the 5555555 marker before returning it.
function a(e, a, r) {
console.log(2222222+" "+e)
console.log(3333333+" "+a)
console.log(4444444+" "+r)
var kk=(k[r] || (k[r] = t("x,y", "return new x[y](" + Array(r + 1).join(",x[++y]").substr(1) + ")")))(e, a);
console.log(5555555+" "+kk)
return kk;
}
經過多次打開運行驗證,發現函數e運行了很多次,而且也看不出啥規律,但是函數a卻只運行了一次
注意是55555開頭的,顯然這是時間戳,而且只出現一次,那麼我就先將其寫死,畢竟這個是可以自行獲取不再需要加密了,
// Constructor helper with its result pinned: the object is still built and
// logged as before, but the return value is overwritten with a fixed date
// string so that repeated runs produce the same signature.
function a(e, a, r) {
console.log(2222222+" "+e)
console.log(3333333+" "+a)
console.log(4444444+" "+r)
var kk=(k[r] || (k[r] = t("x,y", "return new x[y](" + Array(r + 1).join(",x[++y]").substr(1) + ")")))(e, a);
console.log(5555555+" "+kk)
// hard-code the value for now
kk='Fri Nov 02 2017 10:41:59 GMT+0800 (中國標準時間)'
return kk;
}
再次運行
結果:AAAAAAAAAABXN0hbnCxxawAAAB
多次運行,發現參數沒有變化,而且參數變得簡潔了,改變一下時間值發現還是沒有變化,說明此參數的變化是隻跟user_id,max_behot_time有關係,若固定則參數值固定,至於爲什麼沒改之前卻一直變化,大概是k這個函數變化吧,不管了,那麼先看看能不能用,將對應的user_id和max_behot_time帶入,發現確實可以使用,此時欣喜若狂,既然頁面的參數param1固定了而且也可以使用了,那就和使用htmlunit獲取的參數param2好比對了,
趕緊使用htmlunit讀取一下本地剛剛的html
/**
 * Loads the local signing page with HtmlUnit and prints its rendered text.
 *
 * The WebClient is opened in try-with-resources so its windows and
 * JavaScript engine are released when the method finishes; previously the
 * client was never closed.
 *
 * @throws Exception if the page cannot be loaded
 */
public static void test3() throws Exception {
    String urlOne = "file:///C:/Users/rzhou6/Desktop/toutiao/newd.html";
    // Simulate a Chrome browser.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // Keep going when the page script throws.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // CSS is not needed to compute the parameters.
        webClient.getOptions().setCssEnabled(false);
        // Do not fail on non-2xx responses (e.g. a referenced JS file is missing).
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        HtmlPage page2 = webClient.getPage(urlOne);
        System.out.println(page2.asText());
    }
}
結果:AAAAAAAAAAAYyN2oFg9xawAAAB
簡單比對一下
param1:AAAAAAAAAABXN0hbnCxxawAAAB(有效)
param2:AAAAAAAAAAAYyN2oFg9xawAAAB(無效)
中間的幾位不同,爲了能找到規律,那就得多測試,選擇不同的user_id,每個id不同的頁面請求,並將其參數分割,數據如下:
//同一id 101528687217 2,3,4頁,網頁可行,htmlunit本地文件不可行
1539419885
>AAAAAAAAAA BXN0hbnC x FHA AAAB
>AAAAAAAAAA AYyN2oFg 9 FHA AAAB
1537250643
>AAAAAAAAAA BXN0hbnC x Mew AAAB
>AAAAAAAAAA AYyN2oFg 9 Mew AAAB
1536065316
>AAAAAAAAAA BXN0hbnC w -ig AAAB
>AAAAAAAAAA AYyN2oFg 8 -ig AAAB
usrid 50080767248
1540553612
>AAAAAAAAAA BXN0hbnC x 2Gg AAAB
>AAAAAAAAAA AYyN2oFg 9 2Gg AAAB
1540133733
>AAAAAAAAAA BXN0hbnC w ipA AAAB
>AAAAAAAAAA AYyN2oFg 8 ipA AAAB
1539776774
>AAAAAAAAAA BXN0hbnC x RsQ AAAB
>AAAAAAAAAA AYyN2oFg 9 RsQ AAAB
1539406769
>AAAAAAAAAA BXN0hbnC z Swg AAAB
>AAAAAAAAAA AYyN2oFg . Swg AAAB
1538986022
>AAAAAAAAAA BXN0hbnC z qDw AAAB
>AAAAAAAAAA AYyN2oFg . qDw AAAB
1538388819
>AAAAAAAAAA BXN0hbnC y jAg AAAB
>AAAAAAAAAA AYyN2oFg - jAg AAAB
上一行爲頁面瀏覽器獲取的參數param1,下面的是使用htmlunit讀取html獲取的參數param2,多方比對發現,不管是不是同id或者第幾頁,所有有效的param1中間部分BXN0hbnC,在參數param2中都變成了AYyN2oFg,好辦,那我們只要反向替換一下即可,接下來就只有第19位字符不一樣,接下來的都是一樣的,猜測加密算法中對max_behot_time的值進行加密然後獲取新值,暫時我們是可以發現x對應9,w對應8,z對應.,y對應-,其他的暫時未知,只要我們知道了第19位字符的對應規律就能反向替換,最終通過htmlunit讀取本地html就能獲取到有效的參數_signature,現在更改一下html文件,我們循環一下max_behot_time看看參數第19位都有哪些值出現
// Sweep 1000 consecutive max_behot_time values for the current user_id so
// the console shows which characters appear at position 19 of the signature.
for(var i=1000000000;i<1000001000;i++){
getHoney(user_id,i);
}
注意i值不要過大,循環次數不要過多,不然容易卡死,
運行
多次改變i初始值發現,第19位只有y,z,w,x這四個值出現,那麼我們只需要把htmlunit得到的對應字符(-,.,8,9)反向替換成它們即可
接下來寫代碼
/**
 * Builds the request URL for one page of a Toutiao user's article list.
 *
 * The local signing page is first rewritten with the given user id and
 * max_behot_time, then executed in HtmlUnit to obtain the as, cp and
 * _signature parameters. The signature is passed through
 * {@link #getRightSign(String)} before it is placed in the URL.
 *
 * The WebClient is opened in try-with-resources so it is always closed;
 * previously a new client was leaked on every call.
 *
 * @date Oct 31, 2018 3:59:49 PM
 * @param blogMove       source of the user id (via getMoveUserId())
 * @param num            not used by this method
 * @param max_behot_time paging cursor placed in the page and in the URL
 * @return the formatted article-list URL
 * @throws Exception if the local page cannot be rewritten or loaded
 */
public static String getTouTiaoListUrl(Blogmove blogMove, int num, String max_behot_time)
        throws Exception {
    String oneUrl = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s";
    String user_id = blogMove.getMoveUserId();
    String as = "";
    String cp = "";
    String _signature = "";
    // Write the current parameters into the local page before running it.
    updateHtmlFile("C:/Users/rzhou6/Desktop/toutiao/newd.html", user_id, max_behot_time);
    String urlOne = "file:///C:/Users/rzhou6/Desktop/toutiao/newd.html";
    // Simulate a Chrome browser.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // Keep going when the page script throws.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // CSS is not needed to compute the parameters.
        webClient.getOptions().setCssEnabled(false);
        // Do not fail on non-2xx responses (e.g. a referenced JS file is missing).
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        HtmlPage page2 = webClient.getPage(urlOne);
        System.out.println(page2.asText());
        // The page script has stored its results in these input fields.
        as = page2.getElementById("as").asText();
        cp = page2.getElementById("cp").asText();
        _signature = page2.getElementById("_signature").asText();
    }
    System.out.println(_signature);
    _signature = getRightSign(_signature);
    System.out.println(as);
    System.out.println(cp);
    System.out.println(_signature);
    oneUrl = String.format(oneUrl, user_id, max_behot_time, as, cp, _signature);
    System.out.println(oneUrl);
    return oneUrl;
}
/**
 * Rewrites the local signing page so that its user_id and max_behot_time
 * input fields carry the given values.
 *
 * @date Nov 2, 2018 12:36:27 PM
 * @param string         path of the HTML file to rewrite
 * @param user_id        value for the element with id "user_id"
 * @param max_behot_time value for the element with id "max_behot_time"
 */
private static void updateHtmlFile(String string, String user_id, String max_behot_time) {
    Document page = Jsoup.parse(FileUtils.getFileToString(string));
    page.getElementById("user_id").attr("value", user_id);
    page.getElementById("max_behot_time").attr("value", max_behot_time);
    // Replace the file: remove the old copy, then write the updated markup.
    new File(string).delete();
    FileUtils.appendFile(string, page.html());
}
/**
 * Converts a signature produced under HtmlUnit into the form a real browser
 * produces, using the mapping observed in the article's comparison.
 *
 * The fixed prefix "AAAAAAAAAABXN0hbnC" and suffix "AAAB" are always
 * emitted. Character 19 (index 18) is mapped 8→w, 9→x, -→y, .→z and left
 * unchanged otherwise; characters 20-22 (indices 19-21) are copied as-is.
 *
 * Example: AAAAAAAAAA AYyN2oFg - jAQ AAAB
 *       →  AAAAAAAAAA BXN0hbnC y jAQ AAAB
 *
 * @date Nov 2, 2018 12:24:42 PM
 * @param _signature signature read from the HtmlUnit page (at least 22 characters)
 * @return the corrected signature
 */
private static String getRightSign(String _signature) {
    final String marker = _signature.substring(18, 19);
    final String tail = _signature.substring(19, 22);
    final String mapped;
    switch (marker) {
        case "8":
            mapped = "w";
            break;
        case "9":
            mapped = "x";
            break;
        case "-":
            mapped = "y";
            break;
        case ".":
            mapped = "z";
            break;
        default:
            mapped = marker;
            break;
    }
    return "AAAAAAAAAABXN0hbnC" + mapped + tail + "AAAB";
}
經驗證發現,所得的url均可用
PS:其中最後一步出現了不小的波折,在使用htmlunit時突然獲取的參數跟測試時不一樣了,規律也不一樣,但是代碼是完全一樣的啊,經過對比,發現了是jar包版本的問題,真是奇怪,也沒有任何衝突,總之獲取的就是不一樣,大概這也是起初兩個參數獲取不一樣的原因吧,畢竟htmlunit是模擬而不是實實在在的瀏覽器,htmlunit使用2.27版本即可,使用2.32版本獲取的參數規律不再是上文所說的了。
PPS:最後又發現了問題,雖然獲取的url在瀏覽器中是完全可以獲取到json數據的,但是htmlunit發送此get請求時,居然偶爾可行,偶爾不行,估計是今日頭條的反爬又有限制了,不過沒關係,獲取了正確的url還怕取不到數據?
歡迎交流學習!
完整源碼請見github: