Python 分頁爬取

 二十、Python 分頁爬取(百思不得姐信息爬取)

import requests

from lxml import etree

import datetime

 

#獲取段子的內容

def getJokeList(basurl='http://www.budejie.com/text/{0}', timeout=10):
    """Yield the text of every joke, walking the listing page by page.

    Pages are requested starting at page 1 by substituting the page number
    into ``basurl``.  Crawling stops after the first page that contains no
    ``<a class="pagenxt">`` element.

    Args:
        basurl: URL template; ``{0}`` is replaced with the page number.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the crawl forever.  Pass ``None`` to wait
            indefinitely.

    Yields:
        Each text node matched by
        ``//*/div[@class="j-r-list-c-desc"]/a/text()`` on every page.
    """
    pageNum = 1
    while True:
        response = requests.get(basurl.format(pageNum), timeout=timeout)
        selector = etree.HTML(response.text)

        for joke in selector.xpath('//*/div[@class="j-r-list-c-desc"]/a/text()'):
            yield joke

        # The "next page" anchor is the only end-of-listing signal used here.
        if not selector.xpath('//a[@class="pagenxt"]'):
            break
        pageNum += 1

 

#獲取段子內容、贊、分享、收藏數

def getJokeOfAllList(basurl='http://www.budejie.com/text/{0}', timeout=10):
    """Yield ``(joke, like, down, share, comment)`` for every listed joke.

    Pages are requested starting at page 1 by substituting the page number
    into ``basurl``.  Crawling stops after the first page that contains no
    ``<a class="pagenxt">`` element.  After each page the page counter is
    printed to stdout as a progress trace.

    List items that lack any one of the five expected nodes are skipped, so
    a single malformed entry does not abort the whole crawl with IndexError.

    Args:
        basurl: URL template; ``{0}`` is replaced with the page number.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the crawl forever.  Pass ``None`` to wait
            indefinitely.

    Yields:
        A 5-tuple of the first text node found for the joke body, the
        up-vote counter, the down-vote counter, the share counter (with the
        label prefix stripped) and the comment counter.
    """
    # XPath expressions, relative to one <li> of the listing, in yield order.
    fieldPaths = (
        'div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]/a/text()',
        'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-up"]/span/text()',
        'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-l "]/ul/li[@class="j-r-list-tool-l-down "]/span/text()',
        'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-ct"]/div[@class="j-r-list-tool-ct-share-c"]/span/text()',
        'div[@class="j-r-list-tool"]/div[@class="j-r-list-tool-r j-r-list-tool-cc"]/ul/li[@class=" f-tac j-comment j-comment-width  j-comment-down-width"]/a/span[@class="comment-counts"]/text()',
    )

    pageNum = 1
    while True:
        response = requests.get(basurl.format(pageNum), timeout=timeout)
        selector = etree.HTML(response.text)

        for item in selector.xpath('//*/div[@class="j-r-list"]/ul/li'):
            fields = _firstTexts(item, fieldPaths)
            if fields is None:
                # Not a complete joke entry; skip it instead of crashing.
                continue
            joke, like, down, share, comment = fields
            yield joke, like, down, share.replace(u"分享??", ""), comment

        hasNext = bool(selector.xpath('//a[@class="pagenxt"]'))
        if hasNext:
            pageNum += 1
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(pageNum)
        if not hasNext:
            return


def _firstTexts(node, paths):
    """Return the first XPath match for each path, or None if any has none."""
    values = []
    for path in paths:
        matches = node.xpath(path)
        if not matches:
            return None
        values.append(matches[0])
    return values

 

if __name__ == "__main__":
    # io.open provides a text-mode file with an explicit encoding on both
    # Python 2 and Python 3.
    import io

    # Dump one tab-separated record per line:
    #     joke, like, down, share, comment
    # (Iterate getJokeList() here instead to dump only the joke text.)
    #
    # The `with` block closes the file even if the crawl raises part-way.
    with io.open('basejie.txt', 'w', encoding='utf-8') as f:
        for joke, like, down, share, comment in getJokeOfAllList():
            # Build the record as text and encode exactly once on output;
            # mixing already-encoded bytes with unicode triggers an implicit
            # ASCII decode on Python 2 and fails on non-ASCII content.
            record = u'\t'.join(
                [joke, like, down, share.replace(u'??', u''), comment]
            )
            print(record.encode('utf-8'))
            f.write(record + u'\n')

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章