0x01 起因
因爲手速慢,漏洞刷不過別人,一個個手補確實慢,所以想自己寫一個api,一鍵抓取百度搜索結果,然後就可以打批量了 ovo(真是太妙了!)
0x02 動工
1.抓包做準備
要做一個抓取的,當然是先抓包啦~
拿出我的bp,和zoomeye篇一樣先輸入一個關鍵字,方便在bp裏面找我輸入的關鍵字
然後回車~
發現我輸入的關鍵字在這個GET的請求包裏面
把這個請求發送到Repeater模塊重放後:
發現返回了我要的搜索結果
2.使用php的curl來模擬訪問
PHP支持的由Daniel Stenberg創建的libcurl庫允許你與各種的服務器使用各種類型的協議進行連接和通訊。
libcurl目前支持http、https、ftp、gopher、telnet、dict、file和ldap協議。libcurl同時也支持HTTPS認證、HTTP POST、HTTP PUT、 FTP 上傳(這個也能通過PHP的FTP擴展完成)、HTTP 基於表單的上傳、代理、cookies和用戶名+密碼的認證。
PHP中使用cURL實現Get和Post請求的方法
這些函數在PHP 4.0.2中被引入。
就是說,在php4.0.2中就已經引入了curl,而且還可以做post和get,真是太有用了有木有
拿出我剛剛記錄好的請求包~
Connection: close
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36
Sec-Fetch-Dest: document
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Sec-Fetch-Site: none
Sec-Fetch-Mode: navigate
Sec-Fetch-User: ?1
Accept-Language: zh-CN,zh;q=0.9
Cookie: PSTM=1588249253; BAIDUID=C34E0834A4B2DA6CBA0B25FA3A67FC8D:FG=1; BIDUPSID=735A45B6473102ED12E4236A4401AE21; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=2FaaXBqcDhaY0p-U1JvMXJ3dnVVdnJlSklhelZkSEQ1aGF1a1lWYjh3WnNVOUplSVFBQUFBJCQAAAAAAAAAAAEAAADfyRotztLX3MrHzvvO-7n-uf4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGzGql5sxqpeZ; H_PS_PSSID=1461_31325_21098_31254_31342_31271_31464_30824_31164_22158; delPer=0; BD_CK_SAM=1; PSINO=5; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=fe7bDpIqw0Ye%2FC9V9rTqXv5ARp5x3G1lJcPTrEHREGKf1YbuRCoB6oR0frw
然後用php語言來描述他:
<?php
/**
 * Fetch $url with an HTTP GET that replays the browser headers captured in the proxy.
 *
 * Despite the name, no POST body is sent; the name is kept so existing callers keep working.
 * Example URL:
 * https://www.baidu.com/s?wd=%22Office%20Anywhere%22&tn=93348797_hao_pg&ie=utf-8&ch=1&pn=0
 *
 * @param string $url Absolute URL to request.
 * @return string|false Raw response (response headers followed by the body, because
 *                      CURLOPT_HEADER is enabled), or false when the handle cannot be
 *                      created or the transfer fails.
 */
function curl_post($url){
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    // Headers copied verbatim from the captured browser request.
    // NOTE(review): the Cookie header carries a logged-in session (BDUSS); replace it with
    // your own and never publish a live value.
    $headers = array(
        'Connection: close',
        'Upgrade-Insecure-Requests: 1',
        'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'Sec-Fetch-Dest: document',
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site: none',
        'Sec-Fetch-Mode: navigate',
        'Sec-Fetch-User: ?1',
        'Accept-Language: zh-CN,zh;q=0.9',
        'Cookie: PSTM=1588249253; BAIDUID=C34E0834A4B2DA6CBA0B25FA3A67FC8D:FG=1; BIDUPSID=735A45B6473102ED12E4236A4401AE21; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=2FaaXBqcDhaY0p-U1JvMXJ3dnVVdnJlSklhelZkSEQ1aGF1a1lWYjh3WnNVOUplSVFBQUFBJCQAAAAAAAAAAAEAAADfyRotztLX3MrHzvvO-7n-uf4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGzGql5sxqpeZ; H_PS_PSSID=1461_31325_21098_31254_31342_31271_31464_30824_31164_22158; delPer=0; BD_CK_SAM=1; PSINO=5; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=fe7bDpIqw0Ye%2FC9V9rTqXv5ARp5x3G1lJcPTrEHREGKf1YbuRCoB6oR0frw',
    );

    curl_setopt_array($ch, array(
        CURLOPT_URL => $url,
        // NOTE(review): certificate checks are disabled, so the session cookie above can be
        // intercepted by a man-in-the-middle. Left as-is for compatibility with hosts that
        // have no CA bundle configured; enable verification where possible.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_RETURNTRANSFER => 1,
        // Keep the response headers in the returned string.
        CURLOPT_HEADER => true,
        CURLOPT_HTTPHEADER => $headers,
        // Bound both phases so one stalled request cannot hang a multi-page crawl.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 30,
    ));

    $output = curl_exec($ch);
    curl_close($ch);
    return $output;
}
?>
這樣子就可以完成一次php中curl對baidu的請求了
然後用正則的方法取出其中的鏈接(http://www.baidu.com/link?url=xxxxx)
// Fetch the result page; $a holds whatever curl_post() returned.
$a=curl_post($url);
// Case-insensitive pattern for result anchors; capture group 1 is the href value.
$pattern = '/<a target="_blank" href="(.*?)"(.*?)" class="(.*?)/i';
// Collect every match: $match[0] holds the full matches, $match[1..3] the capture groups.
preg_match_all($pattern, $a, $match);
如果有不會的可以看我的第一篇(zoomeye篇)
(另外,curl_post裏把CURLOPT_HEADER設爲TRUE,是爲了在返回值中保留響應的head)
這樣請求的返回值就保存在$a中,正則匹配到的鏈接則保存在$match中(想看的話可以自行打印出來)
但是百度的返回值是www.baidu.com/link?url=xxxxxxxx
所以要獲取真實鏈接:
/**
 * Resolve a redirect link by requesting it over plain HTTP and reading the Location header.
 *
 * Only the response headers are read; the body is never downloaded.
 * The request is sent unencrypted (port 80 unless the URL names a port), so https-only
 * targets are not supported here.
 *
 * @param string $url Absolute URL of the redirect link.
 * @return string The value of the last Location header seen, or '' when the URL has no
 *                host, the connection fails, or the response carries no Location header.
 */
function get_real($url){
    $info = parse_url($url);
    // parse_url() yields false for badly malformed input and omits 'host' for relative URLs.
    if (!is_array($info) || !isset($info['host']) || $info['host'] === '') {
        return '';
    }

    $target = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';
    if (isset($info['query'])) {
        $target .= '?' . $info['query'];
    }
    $port = isset($info['port']) ? $info['port'] : 80;

    $fp = fsockopen($info['host'], $port, $errno, $errstr, 30);
    if ($fp === false) {
        return '';
    }

    fwrite($fp, "GET {$target} HTTP/1.1\r\n");
    fwrite($fp, "Host: {$info['host']}\r\n");
    fwrite($fp, "Connection: close\r\n\r\n");

    $rewrite = '';
    while (!feof($fp)) {
        $line = fgets($fp);
        // A blank line ends the header section; false means the read failed or timed out.
        if ($line === false || $line === "\r\n") {
            break;
        }
        // Header names are case-insensitive; anchoring at column 0 avoids matching
        // headers such as Content-Location.
        if (stripos($line, 'Location:') === 0) {
            $rewrite = trim(substr($line, strlen('Location:')));
        }
    }

    fclose($fp);
    return $rewrite;
}
最後輸出即可
0x03 完整代碼如下:
<?php
/**
 * Resolve a redirect link by requesting it over plain HTTP and reading the Location header.
 *
 * Only the response headers are read; the body is never downloaded.
 * The request is sent unencrypted (port 80 unless the URL names a port), so https-only
 * targets are not supported here.
 *
 * @param string $url Absolute URL of the redirect link.
 * @return string The value of the last Location header seen, or '' when the URL has no
 *                host, the connection fails, or the response carries no Location header.
 */
function get_real($url){
    $info = parse_url($url);
    // parse_url() yields false for badly malformed input and omits 'host' for relative URLs.
    if (!is_array($info) || !isset($info['host']) || $info['host'] === '') {
        return '';
    }

    $target = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';
    if (isset($info['query'])) {
        $target .= '?' . $info['query'];
    }
    $port = isset($info['port']) ? $info['port'] : 80;

    $fp = fsockopen($info['host'], $port, $errno, $errstr, 30);
    if ($fp === false) {
        return '';
    }

    fwrite($fp, "GET {$target} HTTP/1.1\r\n");
    fwrite($fp, "Host: {$info['host']}\r\n");
    fwrite($fp, "Connection: close\r\n\r\n");

    $rewrite = '';
    while (!feof($fp)) {
        $line = fgets($fp);
        // A blank line ends the header section; false means the read failed or timed out.
        if ($line === false || $line === "\r\n") {
            break;
        }
        // Header names are case-insensitive; anchoring at column 0 avoids matching
        // headers such as Content-Location.
        if (stripos($line, 'Location:') === 0) {
            $rewrite = trim(substr($line, strlen('Location:')));
        }
    }

    fclose($fp);
    return $rewrite;
}
/**
 * Return the text of $str that lies between $leftStr and the next $rightStr.
 *
 * Legacy behaviour kept on purpose: when $leftStr does not occur in $str, extraction
 * proceeds as if it had been found at offset 0 (the first strlen($leftStr) bytes are
 * skipped and $rightStr is searched from the start). get_url() in this file relies on
 * that fallback, so it must not be turned into an empty result here.
 *
 * @param string $str      Text to search.
 * @param string $leftStr  Left delimiter.
 * @param string $rightStr Right delimiter.
 * @return string The enclosed text, or '' when $rightStr is missing or nothing lies
 *                between the delimiters.
 */
function getSubstr($str, $leftStr, $rightStr)
{
    $leftLen = strlen($leftStr);
    $left = strpos($str, $leftStr);

    if ($left === false) {
        // strpos() signals "not found" with false, never a negative number.
        $left = 0;
        $searchFrom = 0;
    } else {
        // Look for the right delimiter only after the left one, so a right delimiter
        // that also appears inside the left delimiter is not picked up.
        $searchFrom = $left + $leftLen;
    }

    $right = strpos($str, $rightStr, $searchFrom);
    if ($right === false) {
        return '';
    }

    $length = $right - $left - $leftLen;
    // A negative length would make substr() count from the end of the string instead.
    if ($length <= 0) {
        return '';
    }

    return substr($str, $left + $leftLen, $length);
}
/**
 * Fetch $url with an HTTP GET that replays the browser headers captured in the proxy.
 *
 * Despite the name, no POST body is sent; the name is kept so existing callers keep working.
 * Example URL:
 * https://www.baidu.com/s?wd=%22Office%20Anywhere%22&tn=93348797_hao_pg&ie=utf-8&ch=1&pn=0
 *
 * @param string $url Absolute URL to request.
 * @return string|false Raw response (response headers followed by the body, because
 *                      CURLOPT_HEADER is enabled), or false when the handle cannot be
 *                      created or the transfer fails.
 */
function curl_post($url){
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    // Headers copied verbatim from the captured browser request.
    // NOTE(review): the Cookie header carries a logged-in session (BDUSS); replace it with
    // your own and never publish a live value.
    $headers = array(
        'Connection: close',
        'Upgrade-Insecure-Requests: 1',
        'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'Sec-Fetch-Dest: document',
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site: none',
        'Sec-Fetch-Mode: navigate',
        'Sec-Fetch-User: ?1',
        'Accept-Language: zh-CN,zh;q=0.9',
        'Cookie: PSTM=1588249253; BAIDUID=C34E0834A4B2DA6CBA0B25FA3A67FC8D:FG=1; BIDUPSID=735A45B6473102ED12E4236A4401AE21; BD_UPN=12314353; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=2FaaXBqcDhaY0p-U1JvMXJ3dnVVdnJlSklhelZkSEQ1aGF1a1lWYjh3WnNVOUplSVFBQUFBJCQAAAAAAAAAAAEAAADfyRotztLX3MrHzvvO-7n-uf4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGzGql5sxqpeZ; H_PS_PSSID=1461_31325_21098_31254_31342_31271_31464_30824_31164_22158; delPer=0; BD_CK_SAM=1; PSINO=5; sug=3; sugstore=0; ORIGIN=2; bdime=0; H_PS_645EC=fe7bDpIqw0Ye%2FC9V9rTqXv5ARp5x3G1lJcPTrEHREGKf1YbuRCoB6oR0frw',
    );

    curl_setopt_array($ch, array(
        CURLOPT_URL => $url,
        // NOTE(review): certificate checks are disabled, so the session cookie above can be
        // intercepted by a man-in-the-middle. Left as-is for compatibility with hosts that
        // have no CA bundle configured; enable verification where possible.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_RETURNTRANSFER => 1,
        // Keep the response headers in the returned string.
        CURLOPT_HEADER => true,
        CURLOPT_HTTPHEADER => $headers,
        // Bound both phases so one stalled request cannot hang a multi-page crawl.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 30,
    ));

    $output = curl_exec($ch);
    curl_close($ch);
    return $output;
}
/**
 * Fetch one Baidu result page and resolve every redirect link on it to its real target.
 *
 * SET THE SEARCH KEYWORD HERE: change the default value of $keyword below, or pass the
 * keyword as the second argument.
 *
 * @param int    $page    Result offset sent as the "pn" query parameter (0, 10, 20, ...).
 * @param string $keyword Search keyword; it is URL-encoded before use.
 * @return array Resolved targets as returned by get_real(), keyed 1..n in page order
 *               (entries may be '' when a link could not be resolved); an empty array
 *               when the page could not be fetched or contains no result links.
 */
function get_url($page, $keyword = "'Office Anywhere'"){
    $url = "https://www.baidu.com/s?wd=" . urlencode($keyword) . "&tn=93348797_hao_pg&ie=utf-8&ch=1&pn=" . (int) $page;

    $links = array();

    $html = curl_post($url);
    // curl_post() hands back false when the transfer fails.
    if (!is_string($html) || $html === '') {
        return $links;
    }

    // Capture group 1 is the href value of each result anchor.
    $pattern = '/<a target="_blank" href="(.*?)"(.*?)" class="(.*?)/i';
    if (!preg_match_all($pattern, $html, $match)) {
        return $links;
    }

    // Keys start at 1 because the calling script indexes the result from 1.
    $index = 1;
    foreach ($match[1] as $href) {
        // Only Baidu redirect links need resolving; other anchors are skipped.
        if (strpos($href, 'http://www.baidu.com/link?url=') === 0) {
            $links[$index] = get_real($href);
            $index++;
        }
    }

    return $links;
}
// Walk the result pages (pn = 0, 10, ..., 500) and print every resolved link, one
// paragraph per link, in page order.
for ($offset = 0; $offset <= 500; $offset += 10) {
    $links = get_url($offset);
    // A page that produced no links may come back as a non-array; skip it.
    if (!is_array($links)) {
        continue;
    }
    foreach ($links as $link) {
        if ($link != '') {
            // The link text comes from a remote server, so escape it for HTML output.
            echo '<p>' . htmlspecialchars($link, ENT_QUOTES, 'UTF-8') . '</p>';
        }
    }
}
?>
關鍵詞要自己設定哦。我在源碼中已經標明設定關鍵詞的地方遼