###爬黃頁88網的所有企業信息http://b2b.huangye88.com/region/
首先得安裝scrapy 和 pymongo
安裝依賴和創建爬蟲項目的步驟我們簡單地過一下:
pip install scrapy
pip install pymongo
scrapy startproject sellsystem
在spiders目錄下創建我們的爬蟲文件
import copy
import scrapy
from ..items import SellItem
# 先下一頁
class indexSpider(scrapy.Spider):
    """Crawl all company contact info starting at http://b2b.huangye88.com/region/.

    Request chain:
        parse              region index     -> province/city pages
        parse_qu           city page        -> district pages
        parse_instury_list district page    -> industry pages (fills region fields)
        parse_instury      industry page    -> company contact pages + pagination
        parse_content      "contact us" page -> contact fields
        parse_content2     "introduction" page -> company fields, yields the item

    meta['item'] is always passed as copy.deepcopy(...): scrapy hands meta
    around by reference, so a shared item would be clobbered by concurrent
    requests.
    """
    name = 'sell_finally'
    all_province = []
    start_urls = [
        'http://b2b.huangye88.com/region/'
    ]
    page = 1

    @staticmethod
    def _li_text(response, position, sub='text()'):
        """Text of the 1-based Nth <li> in the contact list, or '' if absent."""
        values = response.xpath(
            '//ul[@class="con-txt"]/li[%d]/%s' % (position, sub)).extract()
        return values[0] if values else ''

    def parse(self, response):
        """Entry point: follow every region link on the index page."""
        for url in response.xpath('//dl[@id="clist"]/dd/a/@href').extract():
            self.logger.debug('region url: %s', url)
            yield scrapy.Request(url, callback=self.parse_qu)

    def parse_qu(self, response):
        """City page (e.g. http://b2b.huangye88.com/anyang/): follow its districts."""
        for url in response.xpath('//*[@id="subarealist"]/div[2]/a/@href').extract():
            self.logger.debug('district url: %s', url)
            yield scrapy.Request(url, callback=self.parse_instury_list)

    def parse_instury_list(self, response):
        """District page: record province/city/district, follow each industry link."""
        item = SellItem()
        # Breadcrumb entries carry a fixed 4-char suffix that is sliced off.
        item['privince'] = response.xpath(
            '//div[@class="subNav"]/a[2]/text()').extract_first('')[:-4]
        item['city'] = response.xpath(
            '//div[@class="subNav"]/a[3]/text()').extract_first('')[:-4]
        texts = response.xpath('/html/body/div[3]/div[1]/text()').extract()
        # Original indexed texts[2] unconditionally; guard against short pages.
        district = texts[2] if len(texts) > 2 else ''
        # Keep the part between '市' and the fixed 6-char trailing suffix.
        item['district'] = district[district.find('市') + 1:-6]
        for url in response.xpath('//div[@class="tag_tx"]/ul/li/a/@href').extract():
            yield scrapy.Request(url, callback=self.parse_instury,
                                 meta={'item': copy.deepcopy(item)}, dont_filter=True)

    def parse_instury(self, response):
        """Industry page: follow every company's contact page, then the next page."""
        seitem = response.meta['item']
        for url in response.xpath('//*[@id="jubao"]/dl/dt/h4/a/@href').extract():
            # Jump straight to the "contact us" sub-page of the company site.
            contact_url = url + 'company_contact.html'
            self.logger.debug('company url: %s', contact_url)
            yield scrapy.Request(contact_url, callback=self.parse_content,
                                 meta={'item': copy.deepcopy(seitem)}, dont_filter=True)
        # Pagination: the <a> immediately after the current-page <span>.
        hrefs = response.xpath(
            '//div[@class="page_tag Baidu_paging_indicator"]/span/following-sibling::a[1]/@href').extract()
        if hrefs:
            self.logger.debug('next page: %s', hrefs[0])
            yield scrapy.Request(hrefs[0], callback=self.parse_instury,
                                 meta={'item': copy.deepcopy(seitem)}, dont_filter=True)

    def parse_content(self, response):
        """'Contact us' page: scrape contact fields, then follow the intro page."""
        item = response.meta['item']
        item['page_url'] = response.url
        count = len(response.xpath('//ul[@class="con-txt"]/li').extract())
        for position in range(1, count + 1):
            labels = response.xpath(
                '//ul[@class="con-txt"]/li[%d]/label/text()' % position).extract()
            if not labels:
                continue
            title = labels[0]
            if title == '聯繫人:':
                # The contact person is sometimes wrapped in an <a> link.
                text = self._li_text(response, position)
                item['link_people'] = text or self._li_text(response, position, 'a/text()')
            elif title == '公司名稱:':
                item['company_name'] = self._li_text(response, position)
            elif title == '地址:':
                item['compay_place'] = self._li_text(response, position)
            elif title == '電話:':
                item['phone'] = self._li_text(response, position)
            elif title == '手機:':
                item['phone2'] = self._li_text(response, position)
            elif title == '公司主頁:':
                item['website'] = self._li_text(response, position, 'a/text()')
        uu2 = response.xpath('//ul[@class="meun"]/a[2]/@href').extract()
        # BUG FIX: the original printed uu2[0] BEFORE checking len(uu2), which
        # raised IndexError on pages with no introduction link.
        if uu2:
            yield scrapy.Request(url=uu2[0], callback=self.parse_content2,
                                 meta={'item': copy.deepcopy(item)}, dont_filter=True)

    # label text -> (item field, xpath suffix under the <li>) for the intro page
    _INTRO_FIELDS = {
        '成立時間:': ('establish_time', 'text()'),
        '員工人數:': ('company_people_num', 'text()'),
        '主營產品:': ('product', 'text()'),
        '主營行業:': ('industry', 'a/text()'),
        '企業法人:': ('faren', 'text()'),
    }

    def parse_content2(self, response):
        """'Introduction' page: scrape company fields and yield the final item."""
        item = response.meta['item']
        count = len(response.xpath('//ul[@class="con-txt"]/li').extract())
        for position in range(1, count + 1):
            labels = response.xpath(
                '//ul[@class="con-txt"]/li[%d]/label/text()' % position).extract()
            if not labels or labels[0] not in self._INTRO_FIELDS:
                continue
            field, sub = self._INTRO_FIELDS[labels[0]]
            item[field] = self._li_text(response, position, sub)
        item['introdocution'] = response.xpath('//p[@class="txt"]/text()').extract_first('')
        # Deep copy so later mutation of the meta item can never leak into the yield.
        yield copy.deepcopy(item)
需要注意的是,我們在這裏 yield 的 Request 所攜帶的 meta 數據,scrapy 默認是淺複製,多線程(併發)下會發生數據錯亂,改用深度複製 copy.deepcopy() 就可以了。
我們的item文件
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class SellItem(scrapy.Item):
    """One company record scraped from huangye88.com.

    NOTE(review): the keys 'privince', 'compay_place' and 'introdocution' are
    misspelled, but they are kept verbatim because the spider and the stored
    MongoDB documents reference these exact field names.
    """
    link_people = scrapy.Field()        # contact person
    phone = scrapy.Field()              # landline phone
    phone2 = scrapy.Field()             # mobile phone
    company_name = scrapy.Field()       # company name
    company_instury = scrapy.Field()    # main products (legacy field)
    compay_place = scrapy.Field()       # company address
    website = scrapy.Field()            # company homepage
    privince = scrapy.Field()           # province
    city = scrapy.Field()               # city
    district = scrapy.Field()           # district
    establish_time = scrapy.Field()     # founding date
    company_people_num = scrapy.Field() # number of employees
    product = scrapy.Field()            # main products
    industry = scrapy.Field()           # industry
    faren = scrapy.Field()              # legal representative
    introdocution = scrapy.Field()      # company introduction
    page_url = scrapy.Field()           # URL the record was scraped from
在 pipelines.py 中對採集後的數據進行處理,並保存到 MongoDB 裏:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from .items import SellItem
import pymongo
from scrapy.conf import settings
class SellsystemPipeline(object):
    """Persist every scraped item into a MongoDB collection.

    Connection parameters come from the project settings
    (MONGODB_HOST / MONGODB_PORT / MONGODB_DBNAME / MONGODB_DOCNAME).
    NOTE(review): `from scrapy.conf import settings` is deprecated (removed in
    modern Scrapy); prefer a `from_crawler` classmethod — confirm the Scrapy
    version in use before migrating.
    """

    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        # Keep the client on self so the connection can be closed at shutdown
        # (the original leaked it in a local variable).
        self.client = pymongo.MongoClient(host=host, port=port)
        database = self.client[db_name]
        self.post = database[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        """Insert one item and pass it on unchanged."""
        # insert_one replaces the deprecated Collection.insert (pymongo >= 3).
        self.post.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the crawl finishes."""
        self.client.close()
在setting.py 文件中設置MongoDB的參數
# MongoDB connection settings read by SellsystemPipeline.
MONGODB_HOST = '127.0.0.1'  # MongoDB server address (local instance)
MONGODB_PORT = 27017  # default MongoDB port
MONGODB_DBNAME = 'sell'  # database that receives the scraped data
MONGODB_DOCNAME = 'company'  # collection that stores the company records
在項目根目錄下創建一個main.py文件
from scrapy import cmdline

# Guard the launch so importing this module does not start a crawl.
if __name__ == '__main__':
    # Equivalent to running `scrapy crawl sell_finally` from the shell.
    cmdline.execute('scrapy crawl sell_finally'.split())
最後運行我們的main.py文件
大概20分鐘有10w多條數據,這個看個人網速