python-爬虫-猫眼电影TOP100

原創

2020-07-03 07:38

#!/usr/bin/env python
#-*- coding:utf8 -*-
#__author__ = "LiDaguo"


import requests
import re
import xlwt

# Address of the Maoyan board page, without any paging offset.
url = 'https://maoyan.com/board/4?'


# Request headers sent with every download; the User-Agent is a desktop
# Chrome identification string.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
}



def get_page(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200.
        ``None`` when the status code differs or the request itself fails;
        a message is printed in both failure cases.
    """
    try:
        # A timeout keeps a stalled connection from blocking the script
        # indefinitely (requests waits forever by default).
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Connection errors, timeouts, malformed URLs, ...
        print(e)
        return None

    if response.status_code == 200:  # only a plain success counts
        return response.text

    print('获取网页失败')
    return None


# Compiled once at import time. Raw strings keep ``\d`` from being read as an
# (invalid) string escape. Capture groups, in order: rank, title, actors,
# release-time text, integer part of the score, fractional part of the score.
_MOVIE_PATTERN = re.compile(
    r'board-index .*?>(\d+)</i>.*?class="name"><.*?>(.*?)</a></p>.*?<p class="star">.*?'
    r'主演：(.*?) .*?</p>.*?<p class="releasetime">(.*?)</p>.*?<p class="score"><i class="integer">'
    r'(.*?)</i><i class="fraction">(\d+)</i></p>',
    re.S,  # let "." span newlines, the fields sit on separate lines
)


def get_info(page):
    """Extract movie records from the HTML text of a board page.

    Args:
        page: HTML text of the page, or ``None``/empty when the download
            failed.

    Yields:
        One dict per matched movie with the string-valued keys ``rank``,
        ``title``, ``actors``, ``date`` and ``score``. Nothing is yielded
        when *page* is ``None``, empty, or contains no match.
    """
    if not page:
        # get_page() returns None on failure; produce no records instead of
        # raising TypeError inside the regex engine.
        return

    for rank, title, actors, date, integer, fraction in _MOVIE_PATTERN.findall(page):
        yield {
            'rank': rank,
            'title': title,
            # The actors group stops at the first space, so it can still
            # carry the line break that precedes it.
            'actors': actors.replace('\n', ''),
            'date': date,
            # The score is split over two tags, e.g. "9." and "5".
            'score': integer + fraction,
        }


# One board URL per page; the ``offset`` query value advances by 10 per page.
# NOTE(review): range(1) fetches only the first page. The article title says
# TOP100, which would presumably need range(10) -- confirm before changing.
urls = ['https://maoyan.com/board/4?offset={}'.format(i * 10) for i in range(1)]
DATA = []  # one dict per movie, as yielded by get_info()


for page_url in urls:
    page = get_page(page_url)
    if not page:
        # get_page() returns None when the download fails; skip that page
        # rather than handing None to the regex in get_info().
        continue
    # get_info() is a generator; extend() drains it into DATA.
    DATA.extend(get_info(page))

# Column order of the sheet; the names double as the keys of each movie dict.
COLUMNS = ('rank', 'title', 'actors', 'date', 'score')

f = xlwt.Workbook(encoding='utf-8')     # create the workbook
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)    # add and name the sheet
# Header row (row 0)
for col, name in enumerate(COLUMNS):
    sheet01.write(0, col, name)
# Data rows, starting right below the header
for row, movie in enumerate(DATA, start=1):
    for col, name in enumerate(COLUMNS):
        sheet01.write(row, col, movie[name])

f.save('E:\\猫眼电影.xls')
# Report completion once, after the file is saved, not once per written row.
print('爬取完成')

效果：

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

python-爬虫-猫眼电影TOP100

2024年DataOps趋势预测：AI不会取代数据工程师

云原生周刊：K8s 中的服务和网络｜ 2024.4.29

通过Http链接地址爬取有赞微信商城商品信息及下载至EXCEL

多人同时导出 Excel 干崩服务器！新来的阿里大佬给出的解决方案太优雅了！

[转帖]cpupower

今天，昨天，近七天，近30天，近90天，js封装

华为云云原生FinOps解决方案，释放云原生最大价值

python-烏龜喫小魚(小遊戲)

python-字典-根據值查找鍵（批量處理（如刪除等）所查到的內容）

python-爬蟲-貓眼電影TOP100

python-request（基本用法）

python-類與對象的基本含義、格式和調用方法

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結