# 1. 保存excel — save results to an Excel file
import requests
import pandas
from lxml import etree
class Spider(object):
    """Scrape the Douban Top 250 movie list and save it as an Excel file."""

    def __init__(self):
        # Browser-like User-Agent so Douban does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # One [title, pic_url, rank, detail_link, score, comment_count]
        # row per movie, accumulated across all pages.
        self.data = []

    def get_data(self, url):
        """Fetch one listing page and append every movie row to self.data.

        :param url: Douban Top 250 listing-page URL
        """
        # timeout so a stalled connection cannot hang the crawl forever
        response = requests.get(url, headers=self.headers, timeout=10).text
        page = etree.HTML(response)
        node_list = page.xpath('//ol[@class="grid_view"]/li')
        for node in node_list:
            title = node.xpath('.//div[@class="pic"]//img/@alt')[0]
            pic_url = node.xpath('.//div[@class="pic"]//img/@src')[0]
            rank = node.xpath('.//div[@class="pic"]//em/text()')[0]
            detail_link = node.xpath('.//div[@class="pic"]//a/@href')[0]
            score = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
            comment_count = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')[0]
            row = [title, pic_url, rank, detail_link, score, comment_count]
            print(row)
            self.data.append(row)

    def save_excel(self):
        """Write all collected rows to douban.xlsx without the index column."""
        # Not named `pd`: that would shadow the conventional pandas alias.
        frame = pandas.DataFrame(
            self.data,
            columns=['標題', '圖片地址', '排名', '詳情頁', '評分', '評論數'])
        # index=False is the documented way to omit the row index
        # (the original index=None only works because None is falsy).
        frame.to_excel('douban.xlsx', index=False)

    def run(self):
        """Crawl all 10 pages (25 movies each), then write the spreadsheet."""
        for start in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
            self.get_data(url)
        self.save_excel()
if __name__ == '__main__':
    # Script entry point: crawl the full Top 250 list and save the spreadsheet.
    Spider().run()
# 2. 保存mysql — save results to MySQL
import requests
from public.operation_db import *
from lxml import etree
class Spider(object):
    """Scrape the Douban Top 250 movie list and batch-insert it into MySQL."""

    def __init__(self):
        # Browser-like User-Agent so Douban does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_data(self, url):
        """Fetch one listing page and return its movie rows.

        :param url: Douban Top 250 listing-page URL
        :return: list of [title, pic_url, rank, detail_link, score,
                 comment_count] rows, one per movie on the page
        """
        # timeout so a stalled connection cannot hang the crawl forever
        response = requests.get(url, headers=self.headers, timeout=10).text
        page = etree.HTML(response)
        node_list = page.xpath('//ol[@class="grid_view"]/li')
        data_list = []
        for node in node_list:
            title = node.xpath('.//div[@class="pic"]//img/@alt')[0]
            pic_url = node.xpath('.//div[@class="pic"]//img/@src')[0]
            rank = node.xpath('.//div[@class="pic"]//em/text()')[0]
            detail_link = node.xpath('.//div[@class="pic"]//a/@href')[0]
            score = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
            comment_count = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')[0]
            row = [title, pic_url, rank, detail_link, score, comment_count]
            print(row)
            data_list.append(row)
        return data_list

    def save_mysql(self, data_list):
        """Batch-insert rows via parameterized placeholders (no SQL injection).

        :param data_list: rows as produced by get_data()
        """
        # column is `rank_` presumably because RANK is reserved in MySQL 8 — verify schema
        sql = 'insert into douban(title, pic, rank_, detail_link, score, comment_count) values(%s, %s,%s,%s,%s,%s)'
        save_batch_data(sql, data_list)

    def run(self):
        """Crawl all 10 pages (25 movies each), inserting each page's rows."""
        for start in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
            self.save_mysql(self.get_data(url))
if __name__ == '__main__':
    # Script entry point: crawl the full Top 250 list into MySQL.
    Spider().run()
# 3. 保存mongodb — save results to MongoDB
import requests
import pymongo
from public.operation_db import *
from lxml import etree
class Spider(object):
    """Scrape the Douban Top 250 movie list and store it in MongoDB."""

    def __init__(self):
        # Browser-like User-Agent so Douban does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # Default localhost connection; database and collection both "douban".
        self.client = pymongo.MongoClient()
        self.db = self.client['douban']
        self.collection = self.db['douban']

    def get_data(self, url):
        """Fetch one listing page and insert each movie as a document.

        :param url: Douban Top 250 listing-page URL
        """
        # timeout so a stalled connection cannot hang the crawl forever
        response = requests.get(url, headers=self.headers, timeout=10).text
        page = etree.HTML(response)
        node_list = page.xpath('//ol[@class="grid_view"]/li')
        for node in node_list:
            title = node.xpath('.//div[@class="pic"]//img/@alt')[0]
            pic_url = node.xpath('.//div[@class="pic"]//img/@src')[0]
            rank = node.xpath('.//div[@class="pic"]//em/text()')[0]
            detail_link = node.xpath('.//div[@class="pic"]//a/@href')[0]
            score = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
            comment_count = node.xpath('.//div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')[0]
            items = {
                '標題': title,
                '圖片地址': pic_url,
                '排名': rank,
                '詳情頁': detail_link,
                '評分': score,
                '評論數': comment_count
            }
            print(items)
            # Collection.insert() was deprecated in pymongo 3 and removed in
            # pymongo 4; insert_one() is the supported single-document API.
            self.collection.insert_one(items)

    def run(self):
        """Crawl all 10 pages (25 movies each)."""
        for start in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
            self.get_data(url)
if __name__ == '__main__':
    # Script entry point: crawl the full Top 250 list into MongoDB.
    Spider().run()
# 4. 可視化 (flask, echarts) — visualization with Flask and ECharts
from flask import Flask, jsonify, render_template
from public.operation_db import *
# Flask application; templates are served from the local "templates" folder.
app = Flask(__name__, template_folder='templates')
@app.route('/')
def template():
    """Render the ratings-distribution chart page.

    Queries the `douban` table for (score, occurrence-count) pairs grouped
    by score and hands the two parallel lists plus a chart title to the
    test.html template.

    :return: rendered HTML for the chart page
    """
    sql = 'select score, count(score) from douban GROUP by score'
    rows = select_data(sql)
    score = []
    count = []
    # Split each (score, count) row into the two parallel axis lists.
    for rating, occurrences in rows:
        score.append(rating)
        count.append(occurrences)
    return render_template(
        'test.html',
        score=score,
        count=count,
        title='豆瓣250評分分佈圖',
    )
if __name__ == '__main__':
    # Debug mode: auto-reload plus in-browser tracebacks; disable in production.
    app.run(debug=True)
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>ECharts</title>
    <script src="static/js/echarts.min.js"></script>
    <style>
        #main{
            width: 500px;
            height: 500px;
            margin: 80px 200px;
            text-align: center;
        }
    </style>
</head>
<body>
    <div id="main" style="width: 800px;height:500px;"></div>
    <script type="text/javascript">
        var myChart = echarts.init(document.getElementById('main'));
        var option = {
            title: {
                text: {{ title | tojson }}
            },
            tooltip: {},
            legend: {
                data:[]
            },
            xAxis: {
                {# tojson emits the variable as a JSON literal, safe to embed in JS #}
                data: {{ score | tojson }}
            },
            yAxis: {},
            series: [{
                name: '銷量',
                type: 'bar',
                {# tojson here too: a bare {{ count }} would render the Python
                   list repr, which breaks for any non-numeric values #}
                data: {{ count | tojson }}
            }]
        };
        myChart.setOption(option);
    </script>
</body>
</html>