# Test program
# -*- coding: utf-8 -*-
import requests
import os
import re
from bs4 import BeautifulSoup
import xlwt
import pymysql
# Browser-like HTTP headers sent with every request (helps avoid naive bot blocking).
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) Firefox/21.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,zh-CN,zh;q=0.9"
}
moviePageCount = 10  # default number of listing pages to fetch (reassigned from user input in __main__)
timeOut = 60  # network request timeout, in seconds
def clear_html_re(src_html):
    '''
    Strip HTML tags, all whitespace, and single quotes from a text fragment.
    :param src_html: raw HTML/text fragment
    :return: cleaned text
    '''
    without_tags = re.sub(r"</?(.+?)>", "", src_html)       # drop <tag> / </tag>
    collapsed = re.sub(r"\s+", "", without_tags)            # drop all whitespace
    return collapsed.replace("'", "")                       # drop single quotes
def writeTextToFile(path='./temp', text=''):
    '''
    Write *text* to *path* as UTF-8, overwriting any existing file.
    :param path: destination file path
    :param text: content to write
    :return: None
    '''
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(text)
def writeTextToExcel(path='./temp_excel.xls', movieArray=None):
    '''
    Export a list of movie dicts to an .xls spreadsheet.
    :param path: output .xls file path
    :param movieArray: list of movie dicts (keys as produced by getMovieDetail)
    :return: None
    '''
    # BUGFIX: mutable default argument ([]) replaced with a None sentinel.
    movies = movieArray if movieArray is not None else []
    # Workbook opened with utf-8 encoding (headers/content are Chinese).
    book = xlwt.Workbook(encoding='utf-8')
    # Create (or reuse) the 'data' sheet.
    try:
        table = book.get_sheet('data')
    except Exception:
        table = book.add_sheet('data', cell_overwrite_ok=True)
    # Header row.
    table_head = ['影名', '片名', '譯名', '又名', '上映時間', '豆瓣評分', '國家', '頁面詳情', '下載地址']
    for col, caption in enumerate(table_head):
        table.write(0, col, caption)
    # Data rows start directly below the header.
    # BUGFIX: the old code offset every row by table.rows.__len__() (leaving a
    # blank row after the header) and wrote each download URL into the SAME
    # cell, so only the last URL survived. Join all URLs into one cell instead.
    for row, movie in enumerate(movies, start=1):
        table.write(row, 0, movie['name'])           # display name
        table.write(row, 1, movie['title'])          # original title
        table.write(row, 2, movie['translateName'])  # translated name
        table.write(row, 3, movie['nickTitle'])      # alternative title
        table.write(row, 4, movie['belongTime'])     # release date
        table.write(row, 5, movie['gradeCore'])      # Douban rating
        table.write(row, 6, movie['country'])        # country
        table.write(row, 7, movie['link'])           # detail page URL
        table.write(row, 8, ', '.join(movie['downloadUrl']))  # all download URLs
        print('已寫入:' + movie['name'])
    book.save(path)
def writeDatatoDataBase(movieArray=None):
    '''
    Insert each movie dict into the MySQL `movie` table (database `library`).
    Rolls back and continues on a per-row database error.
    :param movieArray: list of movie dicts (keys as produced by getMovieDetail)
    :return: None
    '''
    # BUGFIX: mutable default argument replaced with a None sentinel.
    movies = movieArray if movieArray is not None else []
    # Keyword arguments: positional connect() args are removed in newer PyMySQL.
    db = pymysql.connect(host='localhost', user='root',
                         password='123', database='library')
    # SECURITY FIX: the old code built the INSERT with %-string interpolation,
    # which is SQL-injectable and breaks on embedded quotes. Use parameterized
    # placeholders and let the driver escape values.
    sql = ("insert into movie values"
           "(null,%s,%s,%s,%s,%s,%s,%s,%s,%s);")
    try:
        cursor = db.cursor()
        for movie in movies:
            downloadUrl = ','.join(movie['downloadUrl'])
            try:
                cursor.execute(sql, (movie['name'], movie['title'],
                                     movie['translateName'], movie['nickTitle'],
                                     movie['belongTime'], movie['country'],
                                     movie['gradeCore'], movie['link'],
                                     downloadUrl))
                db.commit()
            except pymysql.Error:
                # Skip the failing row but keep the rest of the batch.
                db.rollback()
    finally:
        # Close the connection even if an unexpected error escapes the loop.
        db.close()
def downloadMovieWithUrlArray(name, urlArray):
    '''
    Announce the first download URL for a movie (placeholder: no real download).
    :param name: movie name
    :param urlArray: list of download URLs; only the first is reported
    :return: None
    '''
    # Only the first link matters — the original loop broke after one iteration.
    for first_link in urlArray[:1]:
        print('正在下載: ' + name + ' url:' + first_link)
        print('')
def bubble_sort(lists, isAsc=True):
    '''
    Sort a list of movie dicts in place by their 'gradeCore' value.

    Replaces the hand-rolled O(n^2) bubble sort with the built-in stable sort.
    Ordering of equal keys is preserved in both directions, matching the
    original strict-inequality bubble sort (which never swapped equal items).
    :param lists: list of dicts, each carrying a comparable 'gradeCore' key
    :param isAsc: True for ascending order, False for descending
    :return: the same list, sorted in place
    '''
    lists.sort(key=lambda movie: movie['gradeCore'], reverse=not isAsc)
    return lists
def getMovieDetail(urlStr, movieName):
    '''
    Fetch a movie detail page and extract its metadata.
    :param urlStr: URL of the movie detail page
    :param movieName: movie display name (used for logging and the result)
    :return: dict with keys name, link, title, translateName, nickTitle,
             belongTime, country, gradeCore, downloadUrl (list of strings)
    '''
    res = requests.get(urlStr, headers=head, timeout=timeOut)  # fetch page data
    # NOTE(review): site pages are assumed GB2312-encoded — confirm 'gbk' is not needed.
    res.encoding = 'gb2312'
    #writeTextToFile('./' + movieName + '.html', res.text)
    soup = BeautifulSoup(res.text, 'html.parser')
    translateName = ''  # translated name
    title = ''          # original title
    nickTitle = ''      # alternative title
    belongTime = ''     # release date
    country = ''        # country
    gradeCore = ''      # rating
    # Each labelled "◎..." property line sits inside a <p>; split on CRLF and match labels.
    for movieDetail in soup.find_all('p'):
        propertys = str(movieDetail.get_text())
        for text in propertys.split('\r\n'):
            if '◎片 名' in text:
                title = clear_html_re(text[5:])
            elif '◎譯 名' in text:
                translateName = clear_html_re(text[5:])
            elif '◎又 名' in text:
                nickTitle = clear_html_re(text[5:])
            elif '◎上映日期' in text:
                belongTime = clear_html_re(text[5:])
            elif '◎國 家' in text:
                country = text[5:]
            elif '◎豆瓣評分' in text:
                gradeCore = text[5:]
                # Rating text looks like "<score>/<count>..."; keep the numeric
                # score from the tail of the part before the first '/'.
                gradeArray = gradeCore.split('/')
                if len(gradeArray) > 0:
                    firstGrade = gradeArray[0]
                    gradeCore = firstGrade[len(firstGrade) - 3:]
                    scoreList = re.findall(r"\d+\.?\d*", gradeCore)
                    if len(scoreList) > 0:
                        gradeCore = scoreList[0]
            # elif '◎IMDb評分' in text:
            #     gradeCore = text[5:]
    if gradeCore == '':
        gradeCore = '0'  # default when no rating was found
    # Collect download links from the highlighted table cells.
    # BUGFIX: test the raw href for None BEFORE str() — str(None) == 'None'
    # made the old check always true and appended the literal string 'None'
    # for anchors without an href attribute.
    downloadLinkArray = []
    for link in soup.find_all('td', bgcolor='#ffffbb'):
        for linkTemp in link.find_all('a'):
            href = linkTemp.get('href')
            if href is not None:
                downloadLink = str(href)
                # Keep only non-HTTP links — presumably ftp/thunder mirrors; confirm against the site.
                if 'http' not in downloadLink:
                    downloadLinkArray.append(downloadLink)
    print('正在獲取:' + movieName)
    # print('belongTime:' + belongTime)
    # print('country:' + country)
    # print('gradeCore:' + gradeCore)
    # for link in downloadLinkArray:
    #     print('downloadUrl:' + link)
    # print('')
    return {'name': movieName,
            'link': urlStr,
            'title': title,
            'translateName': translateName,
            'nickTitle': nickTitle,
            'belongTime': belongTime,
            'country': country,
            'gradeCore': gradeCore,
            'downloadUrl': downloadLinkArray}
if __name__ == '__main__':
    # Fetch the site home page to discover the available movie categories.
    res = requests.get('http://www.dygang.net', headers=head, timeout=timeOut)  # fetch page data
    # NOTE(review): the site is assumed GB2312-encoded — confirm 'gbk' is not needed.
    res.encoding = 'gb2312'
    #print(res.headers['content-Type'])
    #print(res.encoding)
    #print(requests.utils.get_encodings_from_content(res.text)) # detect the page's real Chinese encoding
    #print(res.text)
    #writeTextToFile('./page_home.html', res.text)
    # Parse the home page: category anchors are those whose href contains 'searchid'.
    soup = BeautifulSoup(res.text, 'html.parser')
    movieTypeArray = [{'type': '最新電影', 'link': 'http://www.dygang.net/ys/'}]
    for movieType in soup.find_all('a', target='_blank'):
        #newTitle = movieType.find('font', color='#3333cc')
        if movieType is not None:
            urlTemp = str(movieType.get('href'))
            if 'searchid' in urlTemp:
                typeName = str(movieType.get_text())
                linkName = str(movieType.get('href'))
                newMovieType = {'type': typeName, 'link': linkName}
                movieTypeArray.append(newMovieType)
    # Interactive selection: category index (1-based) and page count.
    print("請選擇序號,獲取電影列表:")
    for index, movieType in enumerate(movieTypeArray):
        print('%d %s' % (index+1, movieType['type']))
    selectNum = input("輸入序號:")
    # Strip non-digits so stray characters in the answer do not crash int().
    moviePageCount = int(re.sub('\D', '', input("要下載網頁上的多少頁電影:")))
    selectMovieTypeDict = movieTypeArray[int(selectNum) - 1]
    print('已選擇:' + selectMovieTypeDict['type'] + '---- ' + str(moviePageCount) + '頁數據。')
    # Fetch each listing page of the chosen category.
    for i in range(moviePageCount):
        movieArray = []
        print('正在下載第 %d 頁的電影...' % (i+1))
        res = None
        if selectNum == '1':
            # The "latest movies" section paginates as index_<n>.htm; page 1 is the bare index.
            url = selectMovieTypeDict['link'] + 'index_' + str(i+1) + '.htm'
            if i == 0:
                url = selectMovieTypeDict['link']
            res = requests.get(url, headers=head, timeout=timeOut)  # fetch page data
        else:
            # Search-style categories paginate with a &page= query parameter.
            res = requests.get(selectMovieTypeDict['link'] + '&page=' + str(i), headers=head, timeout=15)  # fetch page data
        res.encoding = 'gb2312'
        #writeTextToFile('./' + selectMovieTypeDict['type'] + '.html', res.text)
        # Parse the listing page and fetch the detail record for each movie link.
        soup = BeautifulSoup(res.text, 'html.parser')
        for movie in soup.find_all('a', class_='classlinkclass'):
            detailLink = str(movie.get('href'))
            movieName = str(movie.get_text())
            movieDict = getMovieDetail(detailLink, movieName)
            movieArray.append(movieDict)
        # Sort movies by rating:
        #movieArray = bubble_sort(movieArray, False)
        # Export the ranking to Excel:
        #writeTextToExcel('./' + selectMovieTypeDict['type'] + '.xls', movieArray)
        # Persist this page's movies to the database.
        writeDatatoDataBase(movieArray)
        print('寫入完成')
    # Ask for a minimum rating and download matching movies:
    # wantDownloadUpScore = input('請輸入要下載幾分以上電影:')
    # for movie in movieArray:
    #     if movie['gradeCore'] >= wantDownloadUpScore:
    #         downloadMovieWithUrlArray(movie['name'], movie['downloadUrl'])