Simple Python code for a crawler: scraping data and inflating page views

Python is the darling of artificial intelligence and big data, so naturally I had to learn it. As a beginner, the first feature I implemented was a crawler: scraping and collecting data. I'll use crawling my own CSDN blog as the example, with the code attached so we can all learn together.

I also used IP proxying here, so I'm sharing that as well. The script works in three steps: scrape a list of free proxies, collect the blog's article URLs, then request each article through the proxies.

#!/usr/bin/python
# -*- coding:utf-8 -*-
# Python 2 script: uses httplib and BeautifulSoup 3
import httplib
import BeautifulSoup
import time


class BaiduImage(object):
    def __init__(self):
        super(BaiduImage, self).__init__()
        print u'Crawling, press CTRL+C to quit...'
        self.page = 2  # total pages; range(1, self.page) below crawls pages 1..page-1

    def requestIp(self):
        iplist = []

        conn = httplib.HTTPConnection("www.xicidaili.com")  # free IP proxy listing site
        request_url = "/nt"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
                   'Content-type': 'text/html'}

        conn.request('GET', request_url, headers=headers)

        r = conn.getresponse()
        if r.status == 200:
            # parse the HTML
            data = r.read()
            soup = BeautifulSoup.BeautifulSoup(data)
            result = soup.findAll('tr')
            for row in result[1:]:
                ipitem = IPItem()
                mylist = row.findAll('td')
                ipitem.ip = mylist[1].text
                ipitem.port = mylist[2].text
                ipitem.addr = mylist[3].text
                ipitem.type = mylist[5].text  # http or https
                iplist.append(ipitem)

        self.request(iplist)

    def request(self, iplist):  # fetch the list of CSDN blog articles to crawl
        mylis = []
        for num in range(1, self.page):
            conn = httplib.HTTPConnection("blog.csdn.net")
            request_url = "/songyan_love/article/category/6261675/%s" % (num)
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
                       'Content-type': 'text/html'}

            conn.request('GET', request_url, headers=headers)

            r = conn.getresponse()
            if r.status == 200:
                # parse the article URLs out of this page
                data = r.read()
                soup = BeautifulSoup.BeautifulSoup(data)
                result = soup.find('div', 'list_item_new')
                items = result.findAll("div", "list_item article_item")
                for row in items:
                    a = row.find('a')
                    mylis.append(a['href'])  # relative URL of one article
        # loop over these article URLs and request each one through the proxies
        self.forrequest(mylis, iplist)

    def forrequest(self, mylis, iplist):
        for item in iplist[0:10]:  # just as an example, I use only the first 10 proxies
            for urllist in mylis:
                time.sleep(1)  # sleep 1 second; requesting too fast fails with [Errno 10060]
                ipAddress = "%s:%s" % (item.ip, item.port)  # proxy address, e.g. 61.155.164.109:3128
                headers = {'Host': 'blog.csdn.net',
                           'Connection': 'keep-alive',
                           'Cache-Control': ' max-age=0',
                           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
                           'Upgrade-Insecure-Requests': '1',
                           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                           'Accept-Encoding': 'gzip, deflate',
                           'Accept-Language': 'zh-CN,zh;q=0.9',
                           'Cookie': 'uuid_tt_dd=10_19443020900-1513178520094-768335; gr_user_id=fc73959f-68e8-43bd-95e8-4d293c1b111e; bdshare_firstime=1513211345964; kd_user_id=6f361b81-886d-466f-b1a5-9960b742d462; _ga=GA1.2.993355856.1513212738; __utma=17226283.993355856.1513215738.1514427431.1514512093.14; __utmz=17226283.1513215738.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); UN=songyan_love; BT=1514860667627; __yadk_uid=9PFquxFHCElENSRifm5lJAqMLpVog7Ad; Hm_lvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1515143143,1515375437; __message_district_code=000000; uuid=8e1ade52-fae7-48f3-b3cb-fb786e4c7afe; TY_SESSION_ID=670b7d71-fe0c-4b81-b8c5-c504fa320cdf; ADHOC_MEMBERSHIP_CLIENT_ID1.0=8cb01310-dcd7-5119-b554-5d8ed76572e9; __message_sys_msg_id=0; __message_gu_msg_id=0; __message_cnel_msg_id=0; __message_in_school=0; avh=78599641; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1515466838,1515476031,1515477617,1515479260; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1515479260; dc_tos=p29za4; dc_session_id=10_1515461736402.866875',
                           'RA-Ver': '3.0.8',
                           'RA-Sid': 'B781E81A-20150402-024118-ce25e1-ba5345',
                           'If-None-Match': 'W/"3a3ef0fe6385d9241fdcae8c24d1da08',
                           "X-Forwarded-For": ipAddress, }  # X-Forwarded-For claims a real client IP, so the target does not treat the request as coming from a proxy (as I understand it)
                print ipAddress
                con2 = httplib.HTTPConnection(ipAddress)  # connect to the proxy itself
                try:
                    myurl = "%s%s" % ('http://blog.csdn.net', urllist)  # absolute URL of the article
                    print myurl
                    con2.request("GET", myurl, headers=headers)  # send the GET through the proxy
                    r = con2.getresponse()
                    if r.status == 200:
                        print "request succeeded %s" % r.status
                    else:
                        print "request failed %s" % r.status
                        break
                except Exception, e:
                    print "request error %s" % e
                    break


class IPItem:
    def __init__(self):
        self.ip = ''  # IP address
        self.port = ''  # port
        self.addr = ''  # location
        self.type = ''  # type: http or https
        self.speed = -1  # speed


if __name__ == '__main__':
    # entry point: scrape the proxy list first, then crawl
    bi = BaiduImage()
    bi.requestIp()
The headers in the code are the ones I captured from a real browser request.
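As a side note, the script above is Python 2 only (httplib and BeautifulSoup 3 do not exist on Python 3). Here is a minimal sketch of the same flow for Python 3, assuming the third-party requests and beautifulsoup4 packages; the link selector, the sample proxy, and the function names are illustrative placeholders, not part of the original script.

# Minimal Python 3 sketch of the same flow using requests + BeautifulSoup 4.
# pip install requests beautifulsoup4
import time

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}


def fetch_article_links(list_url):
    """Download one blog list page and return the links found on it."""
    resp = requests.get(list_url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # The selector is site-specific; here we simply take every <a href>.
    return [a['href'] for a in soup.find_all('a', href=True)]


def visit_through_proxy(url, proxy):
    """Request one page through an HTTP proxy given as 'ip:port'."""
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        resp = requests.get(url, headers=HEADERS, proxies=proxies, timeout=10)
        print('status %s for %s' % (resp.status_code, url))
    except requests.RequestException as e:
        print('request failed: %s' % e)
    time.sleep(1)  # throttle, same as the original script


if __name__ == '__main__':
    links = fetch_article_links('http://blog.csdn.net/songyan_love')
    for link in links[:5]:
        visit_through_proxy(link, '61.155.164.109:3128')  # placeholder proxy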
Hope this helps everyone.