# -*- coding: utf-8 -*-
# Test program: scrape movie listings from www.dygang.net
import re

import requests
from bs4 import BeautifulSoup
import xlwt
import pymysql
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) Firefox/21.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,zh-CN,zh;q=0.9"
}
moviePageCount = 10  # maximum number of listing pages to fetch
timeOut = 60  # network request timeout in seconds
def clear_html_re(src_html):
    '''
    Strip HTML tags with a regular expression.
    :param src_html: source text
    :return: cleaned text
    '''
    content = re.sub(r"</?(.+?)>", "", src_html)  # remove tags
    dst_html = re.sub(r"\s+", "", content)  # remove whitespace
    dst_html = dst_html.replace("'", '')  # drop single quotes (originally a guard for the SQL insert below)
    return dst_html
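# A quick sanity check (hypothetical input, not taken from the site):
# clear_html_re("<b>◎片 名 Inception</b>") returns "◎片名Inception" --
# tags and all whitespace are stripped before the field text is stored.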
def writeTextToFile(path='./temp', text=''):
    '''
    Write text to a file.
    :param path: file path
    :param text: content
    :return:
    '''
    with open(path, 'w', encoding='utf-8') as fs:
        fs.write(text)
def writeTextToExcel(path='./temp_excel.xls', movieArray=None):
    '''
    Write the movie list to an Excel file.
    :param path: output .xls path
    :param movieArray: list of movie dicts
    '''
    movieArray = movieArray or []  # avoid a mutable default argument
    # open the workbook with utf-8 encoding
    file = xlwt.Workbook(encoding='utf-8')
    # create a "data" sheet (the workbook is fresh, so the sheet cannot exist yet)
    table = file.add_sheet('data', cell_overwrite_ok=True)
    # header row
    table_head = ['Name', 'Title', 'Translated title', 'Also known as', 'Release date',
                  'Douban score', 'Country', 'Detail page', 'Download links']
    for i in range(len(table_head)):
        table.write(0, i, table_head[i])
    for i, movie in enumerate(movieArray):
        row = i + 1  # data starts on the row right after the header
        table.write(row, 0, movie['name'])           # name
        table.write(row, 1, movie['title'])          # title
        table.write(row, 2, movie['translateName'])  # translated title
        table.write(row, 3, movie['nickTitle'])      # also known as
        table.write(row, 4, movie['belongTime'])     # release date
        table.write(row, 5, movie['gradeCore'])      # Douban score
        table.write(row, 6, movie['country'])        # country
        table.write(row, 7, movie['link'])           # detail page
        # join all download links into one cell instead of overwriting it per link
        table.write(row, 8, ', '.join(movie['downloadUrl']))
        print('Wrote: ' + movie['name'])
    file.save(path)
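# Minimal usage sketch (the dict mirrors what getMovieDetail returns;
# all values here are made up for illustration):
# writeTextToExcel('./demo.xls', [{
#     'name': 'Demo', 'title': 'Demo', 'translateName': '', 'nickTitle': '',
#     'belongTime': '2020-01-01', 'gradeCore': '8.5', 'country': 'US',
#     'link': 'http://www.dygang.net/...', 'downloadUrl': ['ftp://host/demo.mkv']
# }])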
def writeDatatoDataBase(movieArray=None):
    movieArray = movieArray or []  # avoid a mutable default argument
    # open the database (newer PyMySQL versions require keyword arguments)
    db = pymysql.connect(host='localhost', user='root', password='123', database='library')
    # create a cursor object with cursor()
    cursor = db.cursor()
    # run the INSERT with execute()
    for movie in movieArray:
        downloadUrl = ','.join(movie['downloadUrl'])
        # parameterized query: the driver escapes quotes, unlike raw string formatting
        mysqlStr = "insert into movie values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (movie['name'], movie['title'], movie['translateName'], movie['nickTitle'],
                  movie['belongTime'], movie['country'], movie['gradeCore'], movie['link'],
                  downloadUrl)
        try:
            cursor.execute(mysqlStr, params)
            db.commit()
        except Exception:
            db.rollback()
    # close the connection
    db.close()
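# The `movie` table is assumed to look roughly like this (a sketch only --
# the real schema is not part of this script; column order must match the INSERT):
# CREATE TABLE movie (
#     id INT PRIMARY KEY AUTO_INCREMENT,
#     name VARCHAR(255), title VARCHAR(255), translateName VARCHAR(255),
#     nickTitle VARCHAR(255), belongTime VARCHAR(64), country VARCHAR(64),
#     gradeCore VARCHAR(16), link VARCHAR(512), downloadUrl TEXT
# );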
def downloadMovieWithUrlArray(name, urlArray):
    # placeholder: only prints the first url; actual downloading is not implemented
    for url in urlArray:
        print('Downloading: ' + name + ' url:' + url)
        print('')
        break
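# A minimal download sketch, assuming the scraped links are plain ftp/http file
# urls (urllib handles both schemes; this is illustrative, not the author's code):
# from urllib.request import urlretrieve
# urlretrieve('ftp://host/demo.mkv', './demo.mkv')  # hypothetical url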
def bubble_sort(lists, isAsc=True):
    '''
    Bubble sort by Douban score.
    :param lists: list of movie dicts
    :param isAsc: sort ascending if True
    :return: the sorted list
    '''
    count = len(lists)
    for i in range(0, count - 1):
        for j in range(0, count - 1 - i):
            # scores are stored as strings, so compare them numerically
            itemJScore = float(lists[j]['gradeCore'])
            itemJNextScore = float(lists[j + 1]['gradeCore'])
            if (isAsc and itemJScore > itemJNextScore) or \
                    (not isAsc and itemJScore < itemJNextScore):
                lists[j], lists[j + 1] = lists[j + 1], lists[j]
    return lists
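# Usage sketch (made-up scores): a descending sort puts the highest score first.
# movies = [{'gradeCore': '7.2'}, {'gradeCore': '9.0'}, {'gradeCore': '8.1'}]
# bubble_sort(movies, isAsc=False)  # -> scores ordered 9.0, 8.1, 7.2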
def getMovieDetail(urlStr, movieName):
    '''
    Fetch a movie's detail page and parse its metadata.
    :param urlStr: detail page url
    :param movieName: movie name
    :return: dict of parsed fields
    '''
    res = requests.get(urlStr, headers=head, timeout=timeOut)  # fetch the page
    res.encoding = 'gb2312'
    # writeTextToFile('./' + movieName + '.html', res.text)  # optional: dump raw html for debugging
    # parse the html
    soup = BeautifulSoup(res.text, 'html.parser')
    translateName = ''  # translated title
    title = ''          # title
    nickTitle = ''      # also known as
    belongTime = ''     # release date
    country = ''        # country
    gradeCore = ''      # score
    for movieDetail in soup.find_all('p'):
        propertys = str(movieDetail.get_text())
        for text in propertys.split('\r\n'):
            if '◎片 名' in text:
                title = clear_html_re(text[5:])
            elif '◎译 名' in text:
                translateName = clear_html_re(text[5:])
            elif '◎又 名' in text:
                nickTitle = clear_html_re(text[5:])
            elif '◎上映日期' in text:
                belongTime = clear_html_re(text[5:])
            elif '◎国 家' in text:
                country = text[5:]
            elif '◎豆瓣评分' in text:
                # e.g. "7.8/10 from 1,234 users" -- keep only the numeric score
                gradeCore = text[5:]
                gradeArray = gradeCore.split('/')
                if len(gradeArray) > 0:
                    firstGrade = gradeArray[0]
                    gradeCore = firstGrade[-3:]
                    scoreList = re.findall(r"\d+\.?\d*", gradeCore)
                    if len(scoreList) > 0:
                        gradeCore = scoreList[0]
    if gradeCore == '':
        gradeCore = '0'
    # collect download links (ftp links on this site; skip plain http page links)
    downloadLinkArray = []
    for link in soup.find_all('td', bgcolor='#ffffbb'):
        for linkTemp in link.find_all('a'):
            downloadLink = linkTemp.get('href')  # may be None if the tag has no href
            if downloadLink is not None and 'http' not in downloadLink:
                downloadLinkArray.append(str(downloadLink))
    print('Fetching: ' + movieName)
    return {'name': movieName,
            'link': urlStr,
            'title': title,
            'translateName': translateName,
            'nickTitle': nickTitle,
            'belongTime': belongTime,
            'country': country,
            'gradeCore': gradeCore,
            'downloadUrl': downloadLinkArray}
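# Usage sketch (the url below is hypothetical; real detail urls come from the
# listing pages scraped in __main__):
# detail = getMovieDetail('http://www.dygang.net/ys/example.htm', 'Example')
# print(detail['gradeCore'], detail['downloadUrl'])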
if __name__ == '__main__':
    res = requests.get('http://www.dygang.net', headers=head, timeout=timeOut)  # fetch the home page
    res.encoding = 'gb2312'
    # writeTextToFile('./page_home.html', res.text)  # optional: dump raw html for debugging
    # parse the html
    soup = BeautifulSoup(res.text, 'html.parser')
    movieTypeArray = [{'type': 'Latest movies', 'link': 'http://www.dygang.net/ys/'}]
    for movieType in soup.find_all('a', target='_blank'):
        if movieType is not None:
            urlTemp = str(movieType.get('href'))
            if 'searchid' in urlTemp:
                typeName = str(movieType.get_text())
                linkName = str(movieType.get('href'))
                newMovieType = {'type': typeName, 'link': linkName}
                movieTypeArray.append(newMovieType)
    print("Choose a category number to fetch its movie list:")
    for index, movieType in enumerate(movieTypeArray):
        print('%d %s' % (index + 1, movieType['type']))
    selectNum = input("Category number: ")
    moviePageCount = int(re.sub(r'\D', '', input("How many listing pages to scrape: ")))
    selectMovieTypeDict = movieTypeArray[int(selectNum) - 1]
    print('Selected: ' + selectMovieTypeDict['type'] + ' ---- ' + str(moviePageCount) + ' page(s).')
    # fetch the movie list for each requested page
    for i in range(moviePageCount):
        movieArray = []
        print('Downloading page %d of movies...' % (i + 1))
        if selectNum == '1':
            # the "Latest movies" category paginates as index_N.htm, except page 1
            url = selectMovieTypeDict['link'] + 'index_' + str(i + 1) + '.htm'
            if i == 0:
                url = selectMovieTypeDict['link']
            res = requests.get(url, headers=head, timeout=timeOut)  # fetch the listing page
        else:
            # search-style categories paginate with a &page= query parameter
            res = requests.get(selectMovieTypeDict['link'] + '&page=' + str(i), headers=head, timeout=timeOut)
        res.encoding = 'gb2312'
        # writeTextToFile('./' + selectMovieTypeDict['type'] + '.html', res.text)  # optional debug dump
        # parse the html
        soup = BeautifulSoup(res.text, 'html.parser')
        for movie in soup.find_all('a', class_='classlinkclass'):
            detailLink = str(movie.get('href'))
            movieName = str(movie.get_text())
            movieDict = getMovieDetail(detailLink, movieName)
            movieArray.append(movieDict)
        # sort the movies by score:
        # movieArray = bubble_sort(movieArray, False)
        # export the ranked list to Excel:
        # writeTextToExcel('./' + selectMovieTypeDict['type'] + '.xls', movieArray)
        # write the movies to the database
        writeDatatoDataBase(movieArray)
        print('Write finished')
    # optional: only download movies above a minimum score
    # wantDownloadUpScore = input('Minimum score to download: ')
    # for movie in movieArray:
    #     if float(movie['gradeCore']) >= float(wantDownloadUpScore):
    #         downloadMovieWithUrlArray(movie['name'], movie['downloadUrl'])