用python+selenium抓取豆瓣讀書中最受關注圖書並按照評分排序

抓取豆瓣讀書(http://book.douban.com/)中的最受關注圖書,按照評分從高到低排序,並保存至txt文件中。需要抓取書籍的名稱、作者、評分、體裁和一句話評論。

方法一:

#coding=utf-8
from selenium import webdriver
from time import sleep

class DoubanPopularBook:
    """Scrape the popular-books list on Douban Books and save it by rating.

    Creating an instance starts Chrome, loads https://book.douban.com/ and
    stores the scraped entries in ``popular_books_list``, ordered from the
    highest rating to the lowest. Every entry is the list of text lines of
    one ``<li>`` element of the list.
    """

    def __init__(self):
        self.dr = webdriver.Chrome()
        try:
            self.popular_books_list = self.get_douban_popular_books()
        except Exception:
            # Nobody else holds a reference to the driver yet, so close the
            # browser here instead of leaking it, then report the failure.
            self.dr.quit()
            raise

    @staticmethod
    def _rating_of(row):
        """Return the numeric rating of *row*, or 0.0 when it has none.

        The rating is read from the first three characters of the second
        text line. Rows that are too short, or whose rating text is not a
        number, count as 0.0 so one odd entry cannot abort the whole scrape.
        """
        try:
            return float(row[1][0:3])
        except (IndexError, ValueError):
            return 0.0

    def get_douban_popular_books(self, limit=10):
        """Load the Douban Books home page and return its popular books.

        Args:
            limit: Maximum number of list entries to read (default 10).

        Returns:
            A list of rows sorted by rating, highest first. Each row is the
            text of one list entry split on newlines.
        """
        # Imported here so this block works without touching the file's
        # import lines. The generic find_elements(By, value) call is used
        # because the find_elements_by_* helpers were removed in Selenium 4.
        from selenium.webdriver.common.by import By

        self.dr.get('https://book.douban.com/')
        sleep(3)  # fixed pause after navigation, presumably to let the page render
        # Query the list once; slicing tolerates pages with fewer entries.
        entries = self.dr.find_elements(
            By.CSS_SELECTOR, "[class='list-col list-col2 list-summary s']>li")
        books = [entry.text.split('\n') for entry in entries[:limit]]
        # list.sort is stable, so equally rated books keep their page order.
        books.sort(key=self._rating_of, reverse=True)
        return books

    def get_popular_books_rank_file(self):
        """Write ``popular_books_list`` to a UTF-8 text file.

        The file is created in the current working directory and named
        after ``file_title`` with a ``.txt`` suffix. Each book is written
        as a separator line followed by its title, rating, third text line
        (presumably the author; written without a label), genre and
        one-line comment.
        """
        self.file_title = '豆瓣最受關注圖書榜之評分排行'
        separate_line = '~~~~~~~~~~~~~~~~~~~~~~~~\n'
        # The context manager closes the file even if a write fails.
        with open(self.file_title + '.txt', 'wb') as out:
            self.file = out  # attribute kept for backward compatibility
            for item in self.popular_books_list:
                # Some entries carry an extra "e-book available" line before
                # the comment; skip over it when it is present.
                has_ebook_line = len(item) > 4 and item[4] == '有電子書'
                comment_at = 5 if has_ebook_line else 4
                # Entries without a comment get an empty comment line.
                comment = item[comment_at] if len(item) > comment_at else ''
                record = [
                    separate_line,
                    '書籍名稱:' + item[0] + '\n',
                    '評分:' + item[1] + '\n',
                    item[2] + '\n',
                    '體裁:' + item[3] + '\n',
                    '一句話評論:' + comment + '\n',
                ]
                out.write(''.join(record).encode('utf-8'))

    def quit(self):
        """Close the browser and end the WebDriver session."""
        self.dr.quit()

if __name__ == '__main__':
    # Scrape the list (done by the constructor), write the report file,
    # and always close the browser afterwards.
    popular_books = DoubanPopularBook()
    try:
        popular_books.get_popular_books_rank_file()
    finally:
        # Runs even when writing the report fails, so Chrome is not left open.
        popular_books.quit()
    
方法二:

#coding=utf-8
from selenium import webdriver
from time import sleep

class DoubanPopularBook:
    """Scrape the popular-books list on Douban Books and save it by rating.

    Creating an instance starts Chrome, loads https://book.douban.com/ and
    stores the scraped books in ``popular_books_list``, ordered from the
    highest rating to the lowest. Every book is a list of five strings:
    title, rating, author, genre and one-line comment.
    """

    def __init__(self):
        self.dr = webdriver.Chrome()
        try:
            self.popular_books_list = self.get_douban_popular_books()
        except Exception:
            # Nobody else holds a reference to the driver yet, so close the
            # browser here instead of leaking it, then report the failure.
            self.dr.quit()
            raise

    @staticmethod
    def _rating_of(row):
        """Return the numeric rating of *row*, or 0.0 when it has none.

        The rating is the second field of the row. Rows that are too short,
        or whose rating text is not a number, count as 0.0 so one odd entry
        cannot abort the whole scrape.
        """
        try:
            return float(row[1])
        except (IndexError, ValueError):
            return 0.0

    def get_douban_popular_books(self, limit=10):
        """Load the Douban Books home page and return its popular books.

        Args:
            limit: Maximum number of books to read (default 10).

        Returns:
            A list of ``[title, rating, author, genre, comment]`` rows
            sorted by rating, highest first.
        """
        # Imported here so this block works without touching the file's
        # import lines. The generic find_elements(By, value) call is used
        # because the find_elements_by_* helpers were removed in Selenium 4.
        from selenium.webdriver.common.by import By

        self.dr.get('https://book.douban.com/')
        sleep(3)  # fixed pause after navigation, presumably to let the page render
        # Run each locator once instead of once per book.
        titles = self.dr.find_elements(By.XPATH, "//h4[@class='title']/a")
        grades = self.dr.find_elements(By.CSS_SELECTOR, '.average-rating')
        authors = self.dr.find_elements(By.XPATH, "//p[@class='author']")
        genres = self.dr.find_elements(
            By.CSS_SELECTOR, '.book-list-classification')
        comments = self.dr.find_elements(By.CSS_SELECTOR, '.reviews')
        # NOTE(review): the five lists are matched by position; this assumes
        # every locator yields one element per book in the same order --
        # confirm against the live page. zip() stops at the shortest list,
        # so a missing element no longer raises IndexError.
        matched = list(zip(titles, grades, authors, genres, comments))[:limit]
        books = [[element.text for element in group] for group in matched]
        # sorted() is stable, so equally rated books keep their page order.
        return sorted(books, key=self._rating_of, reverse=True)

    def get_popular_books_rank_file(self):
        """Write ``popular_books_list`` to a UTF-8 text file.

        The file is created in the current working directory and named
        after ``file_title`` with a ``.txt`` suffix. Each book is written
        as a separator line followed by its title, rating, author (without
        a label), genre and one-line comment.
        """
        self.file_title = '豆瓣最受關注圖書榜之評分排行'
        separate_line = '~~~~~~~~~~~~~~~~~~~~~~~~\n'
        # The context manager closes the file even if a write fails.
        with open(self.file_title + '.txt', 'wb') as out:
            self.file = out  # attribute kept for backward compatibility
            for item in self.popular_books_list:
                record = [
                    separate_line,
                    '書籍名稱:' + item[0] + '\n',
                    '評分:' + item[1] + '\n',
                    item[2] + '\n',
                    '體裁:' + item[3] + '\n',
                    '一句話評論:' + item[4] + '\n',
                ]
                out.write(''.join(record).encode('utf-8'))

    def quit(self):
        """Close the browser and end the WebDriver session."""
        self.dr.quit()

if __name__ == '__main__':
    # Scrape the list (done by the constructor), write the report file,
    # and always close the browser afterwards.
    popular_books = DoubanPopularBook()
    try:
        popular_books.get_popular_books_rank_file()
    finally:
        # Runs even when writing the report fails, so Chrome is not left open.
        popular_books.quit()

網頁如下:

942023-20161214221546354-484427625.png

942023-20161214221630354-618661845.png

生成txt效果如下:


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章