(My articles are also on Zhihu.)
Up front: whether this crawler still works depends on when you read this, and the page-parsing approach is fairly crude.
I am no crawler guru. I only got into scraping because a mathematical-modelling contest forced us to collect our own data (ugh). I was the only computer-science major on the team, so the job was mine, and I had to grit my teeth and get on with it. Unexpectedly, it turned out to be quite satisfying.
Alright, enough talk, straight to the code.
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 8 18:09:44 2018
@author: 白馬非馬
"""
# Requires selenium and PhantomJS to be installed beforehand. Not suited to
# large-scale scraping (too slow); this script only takes the first page of
# each industry, i.e. 20 companies per industry.
# Note: light manual supervision is needed. When the proxy IP unexpectedly
# times out and dies within 4 industries, close the program by hand; in
# theory the proxy could recover on its own, but in practice it never does.
from selenium import webdriver
import time
import pymysql
from bs4 import BeautifulSoup  # HTML parser
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
import json
import urllib.request

# Proxy-IP API
ipurl = "http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=6d22aed70f7d0479cbce55dff726a8d8a&count=1&expiryDate=5&format=1"
# MySQL connection settings
connect = pymysql.Connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='1234',
    db='user',
    charset='utf8'
)
# Fetch a fresh proxy IP from the API
def getip_port():
    req = urllib.request.Request(ipurl)
    data = urllib.request.urlopen(req).read()
    # json.loads: parse the JSON response into a dict
    s1 = json.loads(data)
    ipstrs = s1["msg"][0]["ip"] + ":" + s1["msg"][0]["port"]
    print("Proxy IP: " + ipstrs)
    return ipstrs
# Create the browser driver: PhantomJS behind a proxy, with a desktop user agent
def driver_open():
    proxy = Proxy(
        {
            'proxyType': ProxyType.MANUAL,
            'httpProxy': getip_port()  # proxy ip:port
        }
    )
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    desired_capabilities["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    # Add the proxy settings to the capabilities
    proxy.add_to_capabilities(desired_capabilities)
    driver = webdriver.PhantomJS(
        executable_path='phantomjs.exe',
        desired_capabilities=desired_capabilities
    )
    return driver
# Fetch a page and parse it
def get_content(driver, url):
    driver.get(url)
    # Wait for the dynamic page to load; adjust the delay to suit the site
    time.sleep(1)
    content = driver.page_source.encode('utf-8')
    soup = BeautifulSoup(content, 'lxml')
    return soup
# Parse the company page. The filtering is rough and does not match every
# layout: about three quarters of Tianyancha pages parsed correctly as of
# 2018-02-27. Improvements from crawler gurus are welcome.
def get_basic_info(soup, instr):
    company = soup.find(attrs={'class': 'f18 in-block vertival-middle sec-c2'}).text
    fddbr = soup.find(attrs={'class': 'f18 overflow-width sec-c3'}).text
    baseinfo = soup.find_all(attrs={'class': 'baseinfo-module-content-value'})
    zczb = baseinfo[0].text   # registered capital
    zt = baseinfo[2].text     # company status
    zcrq = baseinfo[1].text   # registration date
    foundAllTd = soup.find_all("td")
    print('Company name: ' + company)
    print('Legal representative: ' + fddbr)
    print('Registered capital: ' + zczb)
    print('Company status: ' + zt)
    print('Registration date: ' + zcrq)
    # Roughly classify the page layout by its <td> count. There are two kinds:
    # big companies with fuller reports run to about 800-1000 <td> tags, and
    # small companies mostly stay under 500. A few fall in between and cannot
    # be classified reliably, but they are rare enough not to matter
    # (as of 2018-02-26).
    if len(foundAllTd) > 600:
        # <td> indices in the big-company layout: [527] industry,
        # [523] enterprise type, [517] business registration number (unused),
        # [519] organisation code, [529] business term,
        # [533] registration authority, [531] approval date,
        # [521] unified social credit code, [537] registered address,
        # [539] business scope, [49]/[50] employee count
        sql = ("INSERT INTO company (instr, company_name, industry, business_scope, "
               "type_enterprise, regist_capital, legal_represent, regist_date, "
               "company_status, operat_period, registrat_body, approval_date, "
               "address, people_num) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        data = (instr, company, foundAllTd[527].text, foundAllTd[539].text,
                foundAllTd[523].text, zczb, fddbr, zcrq, zt,
                foundAllTd[529].text, foundAllTd[533].text, foundAllTd[531].text,
                foundAllTd[537].text, foundAllTd[49].text)
    else:
        # <td> indices in the small-company layout: [18] industry,
        # [14] enterprise type, [8] business registration number (unused),
        # [10] organisation code, [20] business term,
        # [24] registration authority, [22] approval date,
        # [16] unified social credit code, [28] registered address,
        # [30] business scope
        sql = ("INSERT INTO company (instr, company_name, industry, business_scope, "
               "type_enterprise, regist_capital, legal_represent, regist_date, "
               "company_status, operat_period, registrat_body, approval_date, "
               "address) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        data = (instr, company, foundAllTd[18].text, foundAllTd[30].text,
                foundAllTd[14].text, zczb, fddbr, zcrq, zt,
                foundAllTd[20].text, foundAllTd[24].text, foundAllTd[22].text,
                foundAllTd[28].text)
    # Insert the row. Parameterised execution instead of "sql % data" so that
    # quotes inside the scraped text cannot break the statement.
    cursor.execute(sql, data)
    connect.commit()
# Fetch executive info. Defunct (the selectors no longer match); keeping it
# does not affect the rest of the script.
def get_gg_info(soup):
    ggpersons = soup.find_all(attrs={"event-name": "company-detail-staff"})
    ggnames = soup.select('table.staff-table > tbody > tr > td.ng-scope > span.ng-binding')
    for i in range(len(ggpersons)):
        ggperson = ggpersons[i].text
        ggname = ggnames[i].text
        print(ggperson + " " + ggname)

# Fetch shareholder/investor info. Defunct; harmless to keep.
def get_gd_info(soup):
    tzfs = soup.find_all(attrs={"event-name": "company-detail-investment"})
    for i in range(len(tzfs)):
        tzf_split = tzfs[i].text.replace("\n", "").split()
        tzf = ' '.join(tzf_split)
        print(tzf)

# Fetch outbound-investment info. Defunct; harmless to keep.
def get_tz_info(soup):
    btzs = soup.select('a.query_name')
    for i in range(len(btzs)):
        btz_name = btzs[i].select('span')[0].text
        print(btz_name)
# Collect the industry links from the home page
def get_industry(soup):
    x = []
    buyao = 70  # number of links to skip; delete this when scraping for real
    hangye = soup.find_all('a')
    for item in hangye:
        if 'https://www.tianyancha.com/search/oc' in str(item.get("href")):
            print(item.get("href"))
            if buyao > 0:
                buyao -= 1
            else:
                x.append(str(item.get("href")))
    print("Number of industries:")
    print(len(x))
    return x

# Collect the company links on an industry page
def get_industry_company(soup):
    y = []
    companylist = soup.find_all('a')
    for item in companylist:
        if 'https://www.tianyancha.com/company/' in str(item.get("href")):
            print(item.get("href"))
            y.append(str(item.get("href")))
    return y
if __name__ == '__main__':
    cursor = connect.cursor()  # database cursor
    companycount = 0  # companies scraped so far
    instrcount = 0    # industries scraped so far
    theinscount = 0   # number of industry links to scrape; the proxy IP is
                      # rotated every 4 industries, 20 companies per industry
    driver = driver_open()
    url = "https://www.tianyancha.com/"
    soup = get_content(driver, url)
    instrlist = get_industry(soup)
    theinscount = len(instrlist)
    print()
    for instr in instrlist:  # iterate over the industry links
        instrcount += 1
        print(instrcount)
        print(instr)
        compsoup = get_content(driver, instr)
        complist = get_industry_company(compsoup)
        for comp in complist:  # iterate over the companies in this industry
            print(comp)
            companycount += 1
            print("Industries scraped: " + str(instrcount))
            try:
                infosoup = get_content(driver, comp)
                print('---- fetching basic info ----')
                get_basic_info(infosoup, instr)
            except:
                print('skipped on exception', end=' ')
        if instrcount % 4 == 0:
            # Rotate the proxy IP every 4 industry links so the site does not
            # ban it. This sometimes goes wrong (proxy timeouts and the like);
            # if so, kill the program or the PhantomJS processes.
            print("Switching proxy IP")
            # driver.close()  # stray PhantomJS windows may pile up and need closing
            driver = driver_open()
    cursor.close()
    connect.close()  # close the database connection
The code is commented in fair detail, so you can just read through it.
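One thing the script takes for granted is that the company table already exists in MySQL. The post does not show its schema, so here is a minimal sketch that matches the INSERT statements above; the column types and lengths are my own assumptions (everything is stored as raw strings):

import pymysql

# Minimal schema sketch matching the INSERT statements above.
# All types are assumptions: the scraper stores raw, still-obfuscated strings.
connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='1234', db='user', charset='utf8')
cursor = connect.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS company (
    id INT AUTO_INCREMENT PRIMARY KEY,
    instr VARCHAR(255),           -- industry link the company was found under
    company_name VARCHAR(255),
    industry VARCHAR(255),
    business_scope TEXT,
    type_enterprise VARCHAR(255),
    regist_capital VARCHAR(255),  -- still obfuscated at this point, see below
    legal_represent VARCHAR(255),
    regist_date VARCHAR(255),     -- still obfuscated at this point, see below
    company_status VARCHAR(255),
    operat_period VARCHAR(255),   -- still obfuscated at this point, see below
    registrat_body VARCHAR(255),
    approval_date VARCHAR(255),
    address VARCHAR(255),
    people_num VARCHAR(255)       -- only filled for the big-company layout
)
""")
cursor.close()
connect.close()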
The scraped results still need preprocessing, above all because of Tianyancha's silly data obfuscation. The obfuscated fields are registered capital, registration date, and business term, and the obfuscation method is primitive in the extreme. What I ran into was a plain digit substitution:

ciphertext  plaintext
7           4
5           8
4           .
3           9
0           1
.           5
9           2
6           0
1           3
8           6
2           7

That is all there is to it; I nearly died laughing when I found it. Decoding is straightforward, so go practise on it yourselves.
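For reference, a minimal decoding sketch based on the table above (the mapping is the one I observed; whether it is still current depends, as with everything here, on when you read this):

# Decode Tianyancha's digit substitution: map each obfuscated character
# back to the real one and pass everything else (units etc.) through.
CIPHER_MAP = {
    '7': '4', '5': '8', '4': '.', '3': '9', '0': '1', '.': '5',
    '9': '2', '6': '0', '1': '3', '8': '6', '2': '7',
}

def decode(text):
    return ''.join(CIPHER_MAP.get(ch, ch) for ch in text)

print(decode('0666萬'))  # an obfuscated registered capital; prints 1000萬

Run the stored regist_capital, regist_date and operat_period values through decode() before doing anything else with them.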
Some people ask why I did not scrape the National Enterprise Credit Information Publicity System instead. One reason only: I really could not be bothered to deal with its slider CAPTCHAs and click-the-characters CAPTCHAs; just looking at them annoys me (clearly I am not cut out to be a crawler engineer). If you need that site, have a look at this fellow's post. He says his method has already stopped working, but there is still experience worth borrowing: 【爬蟲】關於企業信用信息公示系統-加速樂最新反爬蟲機制
Luckily Tianyancha has no CAPTCHA, or my modelling teammates would have been driven mad by this noob.
Also, if you really do not want to scrape the data yourself and just want the data, you can message me for it. And yes, I did seriously consider buying data to get the modelling done, but one look at the prices put an end to that; see the screenshot.
The proxy IPs came from Mogu proxy (蘑菇代理): 6 yuan for 1000 high-anonymity IPs. The API above should still have about 700 left, so help yourselves; I am done with crawling anyway. A guru has also built a scraper for proxy IPs (link).
If I had to sum it up in one sentence: doing tech is merely tiring, while learning tech is both tiring and hard.
A pit, a crawler pit, and now it is filled in.