豆瓣電影 Top250 爬蟲:保存到 Excel、MySQL、MongoDB,並用 Flask + ECharts 可視化

1. 保存到 Excel

import requests
import pandas
from lxml import etree


class Spider(object):
    """Scrape the Douban Top 250 list pages and export the movies to Excel.

    Rows are accumulated in ``self.data`` as
    ``[title, pic_url, rank, detail_link, score, comment_count]``.
    """

    def __init__(self):
        # Browser User-Agent string sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # Rows scraped so far, across all pages.
        self.data = []

    def get_data(self, url):
        """Fetch one list page and append a row per movie to ``self.data``.

        :param url: URL of a Top 250 list page.
        :raises requests.RequestException: on timeout, connection failure
            or an HTTP error status.
        :raises IndexError: if a list item lacks one of the expected nodes.
        """
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        # Fail loudly on an error response instead of parsing it into zero rows.
        response.raise_for_status()
        page = etree.HTML(response.text)
        for node in page.xpath('//ol[@class="grid_view"]/li'):
            # Title
            title = node.xpath('.//div[@class="pic"]//img/@alt')[0]

            # Poster image URL
            pic_url = node.xpath('.//div[@class="pic"]//img/@src')[0]

            # Rank
            rank = node.xpath('.//div[@class="pic"]//em/text()')[0]

            # Detail page link
            detail_link = node.xpath('.//div[@class="pic"]//a/@href')[0]

            # Score
            score = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]

            # Number of ratings (text of the fourth <span> in the star block)
            comment_count = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')[0]

            data = [title, pic_url, rank, detail_link, score, comment_count]
            print(data)
            self.data.append(data)

    def save_excel(self):
        """Write every collected row to ``douban.xlsx`` without an index column."""
        frame = pandas.DataFrame(self.data, columns=['標題', '圖片地址', '排名', '詳情頁', '評分', '評論數'])
        frame.to_excel('douban.xlsx', index=False)

    def run(self):
        """Scrape the ten list pages (25 movies each), then write the workbook once."""
        completed = False
        try:
            for page_index in range(10):
                url = 'https://movie.douban.com/top250?start={}&filter='.format(page_index * 25)
                self.get_data(url)
            completed = True
        finally:
            # Write the workbook a single time instead of after every page.
            # If a page failed, still keep the rows gathered so far, but do
            # not produce an empty file when nothing was scraped at all.
            if completed or self.data:
                self.save_excel()


if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    Spider().run()

2. 保存到 MySQL

import requests
from public.operation_db import *
from lxml import etree


class Spider(object):
    """Scrape the Douban Top 250 list pages and store the movies in MySQL."""

    def __init__(self):
        # Browser User-Agent string sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_data(self, url):
        """Fetch one list page and return its movies.

        :param url: URL of a Top 250 list page.
        :return: list of rows
            ``[title, pic_url, rank, detail_link, score, comment_count]``.
        :raises requests.RequestException: on timeout, connection failure
            or an HTTP error status.
        :raises IndexError: if a list item lacks one of the expected nodes.
        """
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        # Fail loudly on an error response instead of parsing it into zero rows.
        response.raise_for_status()
        page = etree.HTML(response.text)
        data_list = []
        for node in page.xpath('//ol[@class="grid_view"]/li'):
            # Title
            title = node.xpath('.//div[@class="pic"]//img/@alt')[0]

            # Poster image URL
            pic_url = node.xpath('.//div[@class="pic"]//img/@src')[0]

            # Rank
            rank = node.xpath('.//div[@class="pic"]//em/text()')[0]

            # Detail page link
            detail_link = node.xpath('.//div[@class="pic"]//a/@href')[0]

            # Score
            score = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]

            # Number of ratings (text of the fourth <span> in the star block)
            comment_count = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')[0]

            data = [title, pic_url, rank, detail_link, score, comment_count]
            print(data)
            data_list.append(data)

        return data_list

    def save_mysql(self, data_list):
        """Insert the given rows into the ``douban`` table.

        :param data_list: rows as returned by :meth:`get_data`.
        """
        if not data_list:
            # Nothing to insert; skip the database call for an empty batch.
            return
        sql = 'insert into douban(title, pic, rank_, detail_link, score, comment_count) values(%s, %s,%s,%s,%s,%s)'
        # save_batch_data comes from public.operation_db; presumably it runs
        # the statement once per row - confirm against that module.
        save_batch_data(sql, data_list)

    def run(self):
        """Scrape the ten list pages (25 movies each), saving each page as it is fetched."""
        for page_index in range(10):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(page_index * 25)
            self.save_mysql(self.get_data(url))


if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    Spider().run()

3. 保存到 MongoDB

import requests
import pymongo
from public.operation_db import *
from lxml import etree


class Spider(object):
    """Scrape the Douban Top 250 list pages and store the movies in MongoDB."""

    def __init__(self):
        # Browser User-Agent string sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # MongoClient() with no arguments uses the driver's default connection settings.
        self.client = pymongo.MongoClient()
        self.db = self.client['douban']
        self.collection = self.db['douban']

    def get_data(self, url):
        """Fetch one list page and store one document per movie.

        :param url: URL of a Top 250 list page.
        :raises requests.RequestException: on timeout, connection failure
            or an HTTP error status.
        :raises IndexError: if a list item lacks one of the expected nodes.
        """
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        # Fail loudly on an error response instead of parsing it into zero rows.
        response.raise_for_status()
        page = etree.HTML(response.text)
        for node in page.xpath('//ol[@class="grid_view"]/li'):
            # Title
            title = node.xpath('.//div[@class="pic"]//img/@alt')[0]

            # Poster image URL
            pic_url = node.xpath('.//div[@class="pic"]//img/@src')[0]

            # Rank
            rank = node.xpath('.//div[@class="pic"]//em/text()')[0]

            # Detail page link
            detail_link = node.xpath('.//div[@class="pic"]//a/@href')[0]

            # Score
            score = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]

            # Number of ratings (text of the fourth <span> in the star block)
            comment_count = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')[0]

            items = {
                '標題': title,
                '圖片地址': pic_url,
                '排名': rank,
                '詳情頁': detail_link,
                '評分': score,
                '評論數': comment_count
            }
            print(items)
            self.save_mongo(items)

    def save_mongo(self, item):
        """Insert one movie document into the ``douban`` collection.

        :param item: dict describing one movie.
        """
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        self.collection.insert_one(item)

    def run(self):
        """Scrape the ten list pages (25 movies each)."""
        for page_index in range(10):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(page_index * 25)
            self.get_data(url)


if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    Spider().run()

4. 可視化(Flask + ECharts):以下先是 Flask 應用代碼,其後是模板文件 templates/test.html
from flask import Flask, jsonify, render_template
from public.operation_db import *

# template_folder='templates' sets the template directory (may be a relative or an absolute path)
app = Flask(__name__, template_folder='templates')


@app.route('/')
def template():
    """
    Render the score-distribution chart page.

    Queries the ``douban`` table for each distinct score and how many rows
    have it, then passes the two columns and a chart title to ``test.html``.
    :return: the rendered template
    """
    rows = list(select_data('select score, count(score) from douban GROUP by score'))
    # Template variables are passed directly as keyword arguments.
    return render_template(
        'test.html',
        score=[row[0] for row in rows],
        count=[row[1] for row in rows],
        title='豆瓣250評分分佈圖',
    )


if __name__ == '__main__':
    # Start Flask's built-in server with debug mode switched on.
    app.run(debug=True)

<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>ECharts</title>
    <!-- Load echarts.js; url_for builds the static path so it does not depend on the page URL -->
    <script src="{{ url_for('static', filename='js/echarts.min.js') }}"></script>
    <style>
        #main{
            width: 500px;
            height: 500px;
            {#background-color: grey;#}
            margin: 80px 200px;
            text-align: center;
        }
    </style>
</head>
<body>
    <!-- A DOM node with an explicit size (width and height) for ECharts to draw into -->
    <div id="main" style="width: 800px;height:500px;"></div>
    <script type="text/javascript">
        // Initialise the echarts instance on the prepared DOM node
        var myChart = echarts.init(document.getElementById('main'));

        // Chart options and data
        var option = {
            title: {
                text: {{ title | tojson }}
            },
            tooltip: {},
            legend: {
                data:[]
            },
            xAxis: {
                {# tojson renders the variable as a JSON literal #}
                data: {{ score | tojson }}
            },
            yAxis: {},
            series: [{
                // The bars show how many movies have each score
                name: '電影數量',
                type: 'bar',
                {# use tojson here as well so the list is always emitted as valid JavaScript #}
                data: {{ count | tojson }}
            }]
        };

        // Render the chart with the options defined above
        myChart.setOption(option);
    </script>
</body>
</html>

在這裏插入圖片描述

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章