Python3抓取電影天堂的電影數據列表,豆瓣評分排序,導出Excel,MySQL

 測試程序;

# coding = utf-8

import requests
import os
import re
from bs4 import BeautifulSoup
import xlwt
import pymysql


# Default HTTP headers sent with every request; the fixed User-Agent makes
# the scraper look like a desktop Firefox browser.
head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) Firefox/21.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,zh-CN,zh;q=0.9"
    }
moviePageCount = 10      # max number of listing pages to fetch (overwritten by user input in __main__)
timeOut = 60             # network request timeout, in seconds


def clear_html_re(src_html):
    '''
    Strip HTML tags, whitespace, and single quotes from a text fragment.
    :param src_html: raw HTML text
    :return: cleaned plain text
    '''
    cleaned = re.sub(r"</?(.+?)>", "", src_html)  # remove HTML tags
    cleaned = re.sub(r"\s+", "", cleaned)         # remove all whitespace
    # single quotes are dropped so the value is safe to embed elsewhere
    return cleaned.replace("'", '')


def writeTextToFile(path='./temp', text=''):
    '''
    Write text to a file as UTF-8, replacing any existing content.
    :param path: destination file path
    :param text: content to write
    :return:
    '''
    with open(path, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)


def writeTextToExcel(path='./temp_excel.xls', movieArray=None):
    '''
    Append movie records to the 'data' sheet of an .xls workbook.
    :param path: output Excel file path
    :param movieArray: list of movie dicts as produced by getMovieDetail()
    :return:
    '''
    # BUGFIX: a mutable default argument ([]) is shared across calls; use
    # None as the sentinel instead.
    if movieArray is None:
        movieArray = []
    # open the workbook with UTF-8 so Chinese titles are stored correctly
    file = xlwt.Workbook(encoding='utf-8')
    try:
        # reuse the 'data' sheet if it already exists in this workbook
        table = file.get_sheet('data')
    except Exception:
        table = file.add_sheet('data', cell_overwrite_ok=True)
        # header row (column titles match the source site's fields)
        table_head = ['影名', '片名', '譯名', '又名', '上映時間', '豆瓣評分', '國家', '頁面詳情', '下載地址']
        for col, title in enumerate(table_head):
            table.write(0, col, title)

    nrows = len(table.rows)
    for i, movie in enumerate(movieArray):
        row = i + 1 + nrows
        table.write(row, 0, movie['name'])            # movie name
        table.write(row, 1, movie['title'])           # original title
        table.write(row, 2, movie['translateName'])   # translated title
        table.write(row, 3, movie['nickTitle'])       # alias / AKA
        table.write(row, 4, movie['belongTime'])      # release date
        table.write(row, 5, movie['gradeCore'])       # Douban score
        table.write(row, 6, movie['country'])         # country
        table.write(row, 7, movie['link'])            # detail-page URL
        # BUGFIX: the original wrote each URL into the same cell, so only
        # the last URL survived (with a trailing ', '); join them all.
        table.write(row, 8, ', '.join(movie['downloadUrl']))

        print('已寫入:' + movie['name'])

    file.save(path)


def writeDatatoDataBase(movieArray=None):
    '''
    Insert movie records into the `movie` table of the local MySQL database.
    :param movieArray: list of movie dicts as produced by getMovieDetail()
    :return:
    '''
    # BUGFIX: a mutable default argument ([]) is shared across calls
    if movieArray is None:
        movieArray = []
    # open the database (keyword args: positional connect() args are
    # deprecated in recent pymysql versions)
    db = pymysql.connect(host='localhost', user='root',
                         password='123', database='library')
    try:
        # create a cursor with the cursor() method
        cursor = db.cursor()

        for movie in movieArray:
            downloadUrl = ','.join(movie['downloadUrl'])
            # BUGFIX: the original interpolated values directly into the SQL
            # string, which broke on titles containing quotes and allowed
            # SQL injection; use a parameterized query instead.
            mysqlStr = ("insert into movie values"
                        "(null,%s,%s,%s,%s,%s,%s,%s,%s,%s);")
            params = (movie['name'], movie['title'], movie['translateName'],
                      movie['nickTitle'], movie['belongTime'], movie['country'],
                      movie['gradeCore'], movie['link'], downloadUrl)
            try:
                cursor.execute(mysqlStr, params)
                db.commit()
            except pymysql.MySQLError:
                # roll back just this record; keep inserting the rest
                db.rollback()
    finally:
        # always close the connection, even if an insert raises
        db.close()



def downloadMovieWithUrlArray(name, urlArray):
    '''
    Stub downloader: announce only the first download URL for a movie.
    :param name: movie name
    :param urlArray: list of download URLs (may be empty)
    :return:
    '''
    if urlArray:
        print('正在下載: ' + name + '   url:' + urlArray[0])
        print('')

def bubble_sort(lists, isAsc=True):
    '''
    Sort a list of movie dicts in place by their 'gradeCore' score.
    :param lists: list of dicts, each carrying a 'gradeCore' value
    :param isAsc: True for ascending order, False for descending
    :return: the same list, sorted

    BUGFIX: gradeCore is stored as a string, so the original pairwise
    string comparison ordered '10.0' before '9.0'. Compare numerically
    instead; unparseable scores sort as 0.0 (matching the script's '0'
    default for missing scores).
    '''
    def _score(movie):
        try:
            return float(movie['gradeCore'])
        except (TypeError, ValueError):
            return 0.0

    # Timsort via list.sort replaces the hand-rolled O(n^2) bubble sort
    lists.sort(key=_score, reverse=not isAsc)
    return lists


def getMovieDetail(urlStr, movieName):
    '''
    Fetch a movie's detail page and parse its metadata and download links.
    :param urlStr: URL of the movie's detail page
    :param movieName: display name of the movie
    :return: dict with keys name, link, title, translateName, nickTitle,
             belongTime, country, gradeCore, downloadUrl
    '''
    res = requests.get(urlStr, headers=head, timeout=timeOut)  # fetch page data
    # the site serves gb2312-encoded pages; set it so res.text decodes correctly
    res.encoding = 'gb2312'
    #writeTextToFile('./' + movieName + '.html', res.text)

    # parse the HTML
    soup = BeautifulSoup(res.text, 'html.parser')
    translateName = ''  # translated title
    title = ''          # original title
    nickTitle = ''      # alias / AKA
    belongTime = ''     # release date
    country = ''        # country
    gradeCore = ''      # Douban score
    downloadUrl = ''    # download link (unused; downloadLinkArray below is returned)
    for movieDetail in soup.find_all('p'):
        propertys = str(movieDetail.get_text())
        for text in propertys.split('\r\n'):
            # each metadata line starts with a '◎' label; text[5:] skips the
            # label prefix before the value is cleaned
            if '◎片  名' in text:
                title = clear_html_re(text[5:])
            elif '◎譯   名' in text:
                translateName = clear_html_re(text[5:])
            elif '◎又  名' in text:
                nickTitle = clear_html_re(text[5:])
            elif '◎上映日期' in text:
                belongTime = clear_html_re(text[5:])
            elif '◎國  家' in text:
                country = text[5:]
            elif '◎豆瓣評分' in text:
                gradeCore = text[5:]
                # score text presumably looks like '8.2/10 from N users' —
                # keep the part before '/', then pull the first number out of
                # its last 3 characters (TODO confirm against live pages)
                gradeArray = gradeCore.split('/')
                if len(gradeArray) > 0:
                    firstGrade = gradeArray[0]
                    gradeCore = firstGrade[len(firstGrade) - 3:]
                    scoreList = re.findall(r"\d+\.?\d*", gradeCore)
                    if len(scoreList) > 0:
                        gradeCore = scoreList[0]
            # elif '◎IMDb評分' in text:
            #     gradeCore = text[5:]

    # movies without a score default to '0' so later sorting still works
    if gradeCore == '':
        gradeCore = '0'

    # collect download links from the highlighted (#ffffbb) table cells
    downloadLinkArray =[]
    for link in soup.find_all('td', bgcolor='#ffffbb'):
        for linkTemp in link.find_all('a'):
            downloadLink = str(linkTemp.get('href'))
            # NOTE(review): str() never returns None, so this check is always
            # true; only non-http links (e.g. ftp/magnet style) are kept —
            # confirm excluding http links is intentional
            if downloadLink is not None:
                if 'http' not in downloadLink:
                    downloadLinkArray.append(downloadLink)

    print('正在獲取:' + movieName)
    # print('belongTime:' + belongTime)
    # print('country:' + country)
    # print('gradeCore:' + gradeCore)
    # for link in downloadLinkArray:
    #     print('downloadUrl:' + link)
    # print('')

    return {'name': movieName,
            'link': urlStr,
            'title': title,
            'translateName': translateName,
            'nickTitle': nickTitle,
            'belongTime': belongTime,
            'country': country,
            'gradeCore': gradeCore,
            'downloadUrl': downloadLinkArray}


if __name__ == '__main__':
    # fetch the home page to discover the available movie categories
    res = requests.get('http://www.dygang.net', headers=head, timeout=timeOut)  # fetch page data
    # the site serves gb2312-encoded HTML
    res.encoding = 'gb2312'
    #print(res.headers['content-Type'])
    #print(res.encoding)
    #print(requests.utils.get_encodings_from_content(res.text))  # detect the page's real encoding
    #print(res.text)
    #writeTextToFile('./page_home.html', res.text)

    # parse the HTML for category links
    soup = BeautifulSoup(res.text, 'html.parser')
    # '最新電影' (latest movies) is a fixed first entry; the rest are scraped below
    movieTypeArray = [{'type': '最新電影', 'link': 'http://www.dygang.net/ys/'}]
    for movieType in soup.find_all('a', target='_blank'):
        #newTitle = movieType.find('font', color='#3333cc')
        if movieType is not None:
            urlTemp = str(movieType.get('href'))
            # category links are the anchors whose href carries a 'searchid' parameter
            if 'searchid' in urlTemp:
                typeName = str(movieType.get_text())
                linkName = str(movieType.get('href'))
                newMovieType = {'type': typeName, 'link': linkName}
                movieTypeArray.append(newMovieType)

    print("請選擇序號,獲取電影列表:")
    for index, movieType in enumerate(movieTypeArray):
        print('%d  %s' % (index+1, movieType['type']))
    selectNum = input("輸入序號:")
    # keep only the digits from the user's answer before converting to int
    moviePageCount = int(re.sub('\D', '', input("要下載網頁上的多少頁電影:")))
    selectMovieTypeDict = movieTypeArray[int(selectNum) - 1]
    print('已選擇:' + selectMovieTypeDict['type'] + '----  ' + str(moviePageCount) + '頁數據。')

    # fetch the movie list for the chosen category, one page at a time
    for i in range(moviePageCount):
        movieArray = []
        print('正在下載第 %d 頁的電影...' % (i+1))
        res = None
        if selectNum == '1':
            # the 'latest movies' section paginates as .../index_N.htm,
            # except the first page which is the bare category URL
            url = selectMovieTypeDict['link'] + 'index_' + str(i+1) + '.htm'
            if i == 0:
                url = selectMovieTypeDict['link']
            res = requests.get(url, headers=head, timeout=timeOut)  # fetch page data
        else:
            # search-based categories paginate via a '&page=' query parameter
            res = requests.get(selectMovieTypeDict['link'] + '&page=' + str(i), headers=head, timeout=15)  # fetch page data

        res.encoding = 'gb2312'
        #writeTextToFile('./' + selectMovieTypeDict['type'] + '.html', res.text)
        # parse the listing page and fetch each linked movie's detail page
        soup = BeautifulSoup(res.text, 'html.parser')
        for movie in soup.find_all('a', class_='classlinkclass'):
            detailLink = str(movie.get('href'))
            movieName = str(movie.get_text())
            movieDict = getMovieDetail(detailLink, movieName)
            movieArray.append(movieDict)


        # sort the movies by Douban score:
        #movieArray = bubble_sort(movieArray, False)

        # export the ranked list to Excel
        #writeTextToExcel('./' + selectMovieTypeDict['type'] + '.xls', movieArray)

        # store this page's movies in the MySQL database
        writeDatatoDataBase(movieArray)
        print('寫入完成')

        # ask for the minimum score worth downloading:
        # wantDownloadUpScore = input('請輸入要下載幾分以上電影:')

        # download the movies
        # for movie in movieArray:
        #     if movie['gradeCore'] >= wantDownloadUpScore:
        #         downloadMovieWithUrlArray(movie['name'], movie['downloadUrl'])

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章