如何扒網站

首先定義一個cg_http的類
使用 PHP 的 cURL 擴展抓取網頁內容,再將內容寫入指定的文件夾

/**
 * Small cURL wrapper that downloads the body of a single URL.
 *
 * Typical use: construct with a URL, then call send_request() to get the
 * response body as a string (or false on failure; see get_error()).
 */
class cg_http
{
    // User-Agent header sent with every request.
    private $agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36';
    // Form fields to POST; when non-empty the request becomes a POST.
    private $post_fields = array();
    private $post_files = array();
    // Extra request headers, one "Name: value" string per entry.
    private $headers = array();
    // Either a ready-made cookie string or an array of "name=value" pairs.
    private $cookies = array();
    private $curl;
    private $url = '';
    // Connection timeout in seconds. Previously read without being declared.
    private $timeout = 30;
    // True when $post_fields carries file uploads and must be sent as
    // multipart/form-data. Previously read without being declared.
    private $has_file = false;
    // Error message of the most recent send_request() call, '' when none.
    private $error = '';

    /**
     * @param string $url Optional target URL; when given it is applied via set_url().
     */
    public function __construct($url = '')
    {
        $this->curl = curl_init();
        if ($url != '')
        {
            $this->set_url($url);
        }
    }

    /**
     * Releases the cURL handle on PHP versions where it is still a resource.
     * On PHP 8+ the handle is an object and is freed automatically.
     */
    public function __destruct()
    {
        if (is_resource($this->curl))
        {
            curl_close($this->curl);
        }
    }

    /**
     * Sets the target URL and the baseline transfer options.
     *
     * @param string $url Address of the page to fetch.
     */
    public function set_url($url)
    {
        $this->url = $url;
        curl_setopt($this->curl, CURLOPT_URL, $this->url);
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->curl, CURLOPT_FOLLOWLOCATION, true);
        // NOTE(review): certificate verification is disabled, which allows
        // man-in-the-middle tampering. Kept for compatibility with the
        // original behaviour; enable it when the target has a valid certificate.
        curl_setopt($this->curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($this->curl, CURLOPT_AUTOREFERER, true);
        curl_setopt($this->curl, CURLOPT_CONNECTTIMEOUT, $this->timeout);
        // Do not include response headers in the returned body.
        curl_setopt($this->curl, CURLOPT_HEADER, false);
        $this->set_agent($this->agent);
    }

    /**
     * Overrides the User-Agent header for subsequent requests.
     *
     * @param string $agent User-Agent string.
     */
    public function set_agent($agent)
    {
        curl_setopt($this->curl, CURLOPT_USERAGENT, $agent);
    }

    /**
     * Returns the cURL error message of the last send_request() call.
     *
     * @return string Empty string when the last request succeeded.
     */
    public function get_error()
    {
        return $this->error;
    }

    /**
     * Executes the request and returns the response body.
     *
     * Terminates the script with die() when no URL has been set.
     *
     * @param mixed $data        Optional request body; when non-empty it is
     *                           passed to CURLOPT_POSTFIELDS and overrides the
     *                           stored post fields.
     * @param bool  $transcoding When true, a body that is not valid UTF-8 is
     *                           converted from GBK to UTF-8.
     * @return string|false Response body, or false when the transfer failed.
     */
    public function send_request($data = '', $transcoding = false)
    {
        if ($this->url == '')
        {
            die('url is empty');
        }
        if (is_string($this->cookies))
        {
            curl_setopt($this->curl, CURLOPT_COOKIE, $this->cookies);
        }
        elseif (is_array($this->cookies))
        {
            if (count($this->cookies) > 0)
            {
                curl_setopt($this->curl, CURLOPT_COOKIE, implode(';', $this->cookies));
            }
        }

        if (count($this->post_fields) > 0)
        {
            curl_setopt($this->curl, CURLOPT_POST, true);
            if ($this->has_file)
            {
                // An array makes cURL send multipart/form-data, needed for uploads.
                curl_setopt($this->curl, CURLOPT_POSTFIELDS, $this->post_fields);
            }
            else
            {
                // A string is sent as application/x-www-form-urlencoded.
                curl_setopt($this->curl, CURLOPT_POSTFIELDS, http_build_query($this->post_fields));
            }
        }
        if (count($this->headers) > 0)
        {
            curl_setopt($this->curl, CURLOPT_HTTPHEADER, $this->headers);
        }
        if (!empty($data))
        {
            curl_setopt($this->curl, CURLOPT_POSTFIELDS, $data);
        }
        // An empty string accepts every compression the cURL build supports
        // (gzip etc.) and decodes the body transparently.
        curl_setopt($this->curl, CURLOPT_ENCODING, '');

        $result = curl_exec($this->curl);
        // Keep the error available to the caller instead of discarding it.
        $this->error = curl_error($this->curl);

        // Only transcode real bodies. Validating as UTF-8 first avoids
        // misdetecting UTF-8 pages as GB2312/GBK and corrupting them.
        if ($transcoding && is_string($result) && !mb_check_encoding($result, 'UTF-8'))
        {
            // GBK is a superset of GB2312, so it covers both encodings.
            $converted = iconv('gbk', 'utf-8//IGNORE', $result);
            if ($converted !== false)
            {
                $result = $converted;
            }
        }

        // The handle is intentionally not closed here so the object can send
        // more than one request; it is released in __destruct().
        return $result;
    }
}
// $web_url is the address of the target page.
$web_url = "http://www.baidu.com?curl.php";
$webpage = new cg_http($web_url);
$content = $webpage->send_request();
// basename() of a URL can contain characters that are illegal in file names
// (for example "?" on Windows), so replace anything outside a safe set.
$filename = preg_replace('/[^A-Za-z0-9._-]/', '_', basename($web_url));
// Store the downloaded page inside the "aa" folder.
$save_dir = "D://aa";
if (!is_dir($save_dir))
{
    mkdir($save_dir, 0777, true);
}
// send_request() returns false when the transfer failed; write nothing then.
if ($content !== false)
{
    // Arguments are (path, data): the file name belongs in the path.
    file_put_contents($save_dir . '/' . $filename, $content);
}

如果是多個url地址也可以批量操作

// $arr_url must hold the URLs to process; it has to be defined before this loop.
foreach ($arr_url as $key => $urls)
{
    $webpage = new cg_http($urls);
    $content = $webpage->send_request();
    // Skip URLs whose download failed (send_request() returned false).
    if ($content === false)
    {
        continue;
    }
    // Each page is stored under its array key.
    $filename = "{$key}";
    // The separator is required; without it the file lands next to the
    // folder (e.g. "D://aa0") instead of inside it.
    file_put_contents("D://aa/" . $filename, $content);
}

讀取已寫入 D://aa 文件夾中的文件

// Walk over every file stored in the download folder and run a regex on it.
$dir = "D://aa";
$handler = opendir($dir);
if ($handler !== false)
{
    // Strict comparison: an entry named "0" must not end the loop early.
    while (($file = readdir($handler)) !== false)
    {
        if ($file == ".." || $file == ".")
        {
            continue;
        }
        $path = $dir . '/' . $file;
        // Only regular files can be read with file_get_contents().
        if (!is_file($path))
        {
            continue;
        }
        $content = file_get_contents($path);
        if ($content === false)
        {
            continue;
        }
        // Strip newlines, tabs and carriage returns from the page source.
        $content = str_replace(array("\n", "\t", "\r"), '', $content);
        // Placeholder pattern: replace "PCRE" with the expression you want to
        // match. PCRE patterns need delimiters, hence the surrounding slashes.
        $pattern = '/PCRE/';
        // $matches is overwritten on every iteration; use it inside the loop.
        preg_match_all($pattern, $content, $matches);
    }
    closedir($handler);
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章