前言
本節學習scrapy
包括數據分析、下載器中間件、爬蟲中間件、管道
1、pandas數據分析
延續上一節的例子
感受下數據分析
詳細的可學pandas cookbook
筆者之後會去翻下這本書
數據統計info.py
# encoding: utf-8
import pandas as pd

# --- Rental listings: quick look at the scraped data ---
rental = pd.read_json("zufang.json")
print(rental)
print(rental.columns)
# Summary statistics (count / mean / std / quartiles) for numeric columns.
print(rental.describe())
# Number of rental listings per district.
print(rental["district"].value_counts())

# --- Second-hand (resale) listings ---
resale = pd.read_json("ershoufang.json")
print(resale.describe())
# Number of resale listings per district.
print(resale["district"].value_counts())
餅圖pie_chart.py
import numpy as np
import pandas as pd
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from pylab import *

# Use a CJK-capable font so the Chinese labels render correctly.
mpl.rcParams['font.sans-serif'] = ['SimHei']
myfont = FontProperties(fname='/Users/seancheney/.matplotlib/mpl-data/fonts/ttf/SimHei.ttf')

# Districts to chart, in display order.
labels = '朝陽', '海淀', '昌平', '東城', '大興', '西城', '豐臺', '石景山', '通州', '順義'

# NOTE(review): despite the _zf (rental) name this reads the RESALE data;
# the chart title ("房屋出售分佈") suggests resale is intended — confirm.
df_zf = pd.read_json("ershoufang.json")

# Count listings per district ONCE, then look up each label.
# (The original recomputed value_counts() separately for all ten districts.)
district_counts = df_zf['district'].value_counts()
sizes = [district_counts[label] for label in labels]

# Pull the first wedge (朝陽) slightly out of the pie for emphasis.
explode = (0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0)

plt.subplot(121)
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=-90)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title("房屋出售分佈", fontproperties=myfont)
plt.rc('font', family=['SimHei'])
plt.show()
柱狀圖hist.py
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from pylab import *

# Use a CJK-capable font so the Chinese title/labels render.
mpl.rcParams['font.sans-serif'] = ['SimHei']

df = pd.read_json("ershoufang.json")
print(df.columns)

# Histogram of the per-square-metre asking price for resale homes.
unitprice_values = df.unitprice
plt.hist(unitprice_values,
         bins=25
         )
# Prices run up to 200,000 yuan per square metre.
plt.xlim(0, 200000)
plt.title(u"房屋出售每平米價格分佈")
# Unit fixed from 萬/平方米 to 元/平方米: the 0-200000 axis range only makes
# sense in yuan per square metre.
plt.xlabel(u'價格(單位:元/平方米)')
plt.ylabel(u'套數')
plt.show()
售租比ratio.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import *

mpl.rcParams['font.sans-serif'] = ['SimHei']

# Districts to chart, in display order.
district = ('西城', '石景山', '東城', '海淀', '豐臺', '昌平', '大興', '朝陽', '通州')

# --- Rental data: average monthly rent per square metre, by district ---
df_zf = pd.read_json("zufang.json")
unitprice_zf = df_zf['price'] / df_zf['area']
df_zf['unitprice'] = unitprice_zf
print(df_zf)
# Per-district mean: summed unit price divided by the listing count.
month_price = df_zf.groupby(by=['district']).sum()['unitprice'] / df_zf["district"].value_counts()
print(month_price)

# --- Resale data: average sale price per square metre, by district ---
df_esf = pd.read_json("ershoufang.json")
sell_price = df_esf.groupby(by=['district']).sum()['unitprice'] / df_esf["district"].value_counts()
print(sell_price)

# Sale-to-rent ratio: months of rent equivalent to one square metre's price.
# One comprehension replaces the nine copy-pasted per-district variables.
ratio = tuple(sell_price[d] / month_price[d] for d in district)

fig, ax = plt.subplots()
y_pos = np.arange(len(district))
ax.barh(y_pos, ratio, align='center', color='green', ecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels(district)
# ax.invert_yaxis()
ax.set_xlabel('售租比(單位:月)')
ax.set_title('各區房屋售租比')
plt.show()
2、下載器中間件
下載器中間件按照優先級被調用的:
- 當request從引擎向下載器傳遞時,數字小的下載器中間件先執行,數字大的後執行
- 當下載器將response向引擎傳遞,數字大的下載器中間件先執行,小的後執行
scrapy提供了一套基本的下載器中間件
{
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
}
下載器中間件是個類,類裏可以定義方法
例如process_request(),process_response(),process_exception()
process_request():
- 參數是request, spider
- 參數request是一個Request對象(不是字典),對象上帶有headers、url等屬性
- process_request()可以利用參數request裏面的信息,對請求做修改,這時一般返回的是None,典型的任務是修改User-agent、變換代理
- 如果根據參數request裏的url直接就去做抓取,返回response對象,返回的response對象就會不經過剩下的下載器中間件,直接返回到引擎
- 如果對請求做了修改,返回的是request對象,就會發回到調度器,等待調度
process_response(request, response, spider):
- 返回的必須是Response、Request或IgnoreRequest異常
3、爬蟲中間件
爬蟲中間件的作用:
- 處理引擎傳遞給爬蟲的響應
- 處理爬蟲傳遞給引擎的請求
- 處理爬蟲傳遞給引擎的數據項
4、管道
每個管道組件都是一個實現了某個功能的Python類,常見功能有:
- 清理html數據
- 驗證抓取到的數據(validation)
- 查重
- 存入數據庫
每個管道組件的類,必須實現process_item方法,其餘幾個方法是可選的:
- process_item(self, item, spider)
- open_spider(self, spider)
- close_spider(self, spider)
- from_crawler(cls, crawler)
一些操作如下
# 丟棄數據項
from scrapy.exceptions import DropItem
class PricePipeline(object):
    """Apply VAT to prices that exclude it; drop items with no price."""

    # Multiplier applied when the item's price excludes VAT.
    vat_factor = 1.15

    def process_item(self, item, spider):
        """Return the item with a VAT-inclusive price, or drop it."""
        if not item['price']:
            # No usable price: discard this item from the pipeline.
            raise DropItem("Missing price in %s" % item)
        if item['price_excludes_vat']:
            item['price'] = item['price'] * self.vat_factor
        return item
# 存儲到MongoDB
import pymongo
class MongoPipeline(object):
    """Persist scraped items into a MongoDB collection."""

    # Target collection inside the configured database.
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor reading MONGO_* values from settings."""
        settings = crawler.settings
        uri = settings.get('MONGO_URI')
        database = settings.get('MONGO_DATABASE', 'items')
        return cls(mongo_uri=uri, mongo_db=database)

    def open_spider(self, spider):
        # One client per spider run; released in close_spider().
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        """Insert the item into MongoDB and pass it along unchanged."""
        document = dict(item)
        self.db[self.collection_name].insert_one(document)
        return item
# 存儲到MySQL
class MysqlPipeline():
    """Persist scraped items into a MySQL table named by ``item.table``."""

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor reading the MYSQL_* values from settings."""
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # Imported here: the original snippet never imported pymysql at all,
        # which would raise NameError the first time a spider opened.
        import pymysql
        # PyMySQL 1.0 removed positional connect() arguments, so pass
        # everything by keyword (the original relied on positionals).
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password,
                                  database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        """Insert the item into its table and pass it along unchanged.

        (Stray debug ``print(item['title'])`` from the original removed.)
        """
        data = dict(item)
        keys = ', '.join(data.keys())
        placeholders = ', '.join(['%s'] * len(data))
        # item.table is trusted, code-defined metadata; all VALUES go through
        # parameterized placeholders, never string formatting.
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, placeholders)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
# 去重
from scrapy.exceptions import DropItem
class DuplicatesPipeline(object):
    """Drop any item whose 'id' was already seen during this crawl."""

    def __init__(self):
        # ids of every item that has passed through so far
        self.ids_seen = set()

    def process_item(self, item, spider):
        """Pass new items through; raise DropItem for repeated ids."""
        item_id = item['id']
        if item_id in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.ids_seen.add(item_id)
        return item
# 激活管道
# Enable the pipelines in settings.py. Values are priorities in 0-1000:
# lower numbers run first, so PricePipeline adjusts prices before
# JsonWriterPipeline serializes the items.
ITEM_PIPELINES = {
'myproject.pipelines.PricePipeline': 300,
'myproject.pipelines.JsonWriterPipeline': 800,
}
結語
對scrapy做了進一步的瞭解