(My articles are also on Zhihu.)
Up front: whether this crawler still works depends on when you read this, and the page-parsing approach is fairly crude.
I am no crawler guru. I only got into scraping because a mathematical-modelling contest forced us to collect our own data (ugh). I was the only computer-science major on the team, so the job was mine, and I had to grit my teeth and get on with it. Unexpectedly, it turned out to be quite satisfying.
Alright, enough talk, straight to the code.
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 8 18:09:44 2018
@author: 白馬非馬
"""
# Requires selenium and PhantomJS to be installed beforehand. Not suited to
# large-scale scraping (too slow); this script only takes the first page of
# each industry, i.e. 20 companies per industry.
# Note: light manual supervision is needed. When the proxy IP unexpectedly
# times out and dies within 4 industries, close the program by hand; in
# theory the proxy could recover on its own, but in practice it never does.
from selenium import webdriver
import time
import pymysql
from bs4 import BeautifulSoup  # HTML parser
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
import json
import urllib.request

# Proxy-IP API
ipurl = "http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=6d22aed70f7d0479cbce55dff726a8d8a&count=1&expiryDate=5&format=1"
# MySQL connection settings
connect = pymysql.Connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='1234',
    db='user',
    charset='utf8'
)
# Fetch a fresh proxy IP from the API
def getip_port():
    req = urllib.request.Request(ipurl)
    data = urllib.request.urlopen(req).read()
    # json.loads: parse the JSON response into a dict
    s1 = json.loads(data)
    ipstrs = s1["msg"][0]["ip"] + ":" + s1["msg"][0]["port"]
    print("Proxy IP: " + ipstrs)
    return ipstrs
# Create the browser driver: PhantomJS behind a proxy, with a desktop user agent
def driver_open():
    proxy = Proxy(
        {
            'proxyType': ProxyType.MANUAL,
            'httpProxy': getip_port()  # proxy ip:port
        }
    )
    desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    desired_capabilities["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    # Add the proxy settings to the capabilities
    proxy.add_to_capabilities(desired_capabilities)
    driver = webdriver.PhantomJS(
        executable_path='phantomjs.exe',
        desired_capabilities=desired_capabilities
    )
    return driver
# Fetch a page and parse it
def get_content(driver, url):
    driver.get(url)
    # Wait for the dynamic page to load; adjust the delay to suit the site
    time.sleep(1)
    content = driver.page_source.encode('utf-8')
    soup = BeautifulSoup(content, 'lxml')
    return soup
# Parse the company page. The filtering is rough and does not match every
# layout: about three quarters of Tianyancha pages parsed correctly as of
# 2018-02-27. Improvements from crawler gurus are welcome.
def get_basic_info(soup, instr):
    company = soup.find(attrs={'class': 'f18 in-block vertival-middle sec-c2'}).text
    fddbr = soup.find(attrs={'class': 'f18 overflow-width sec-c3'}).text
    baseinfo = soup.find_all(attrs={'class': 'baseinfo-module-content-value'})
    zczb = baseinfo[0].text   # registered capital
    zt = baseinfo[2].text     # company status
    zcrq = baseinfo[1].text   # registration date
    foundAllTd = soup.find_all("td")
    print('Company name: ' + company)
    print('Legal representative: ' + fddbr)
    print('Registered capital: ' + zczb)
    print('Company status: ' + zt)
    print('Registration date: ' + zcrq)
    # Roughly classify the page layout by its <td> count. There are two kinds:
    # big companies with fuller reports run to about 800-1000 <td> tags, and
    # small companies mostly stay under 500. A few fall in between and cannot
    # be classified reliably, but they are rare enough not to matter
    # (as of 2018-02-26).
    if len(foundAllTd) > 600:
        # <td> indices in the big-company layout: [527] industry,
        # [523] enterprise type, [517] business registration number (unused),
        # [519] organisation code, [529] business term,
        # [533] registration authority, [531] approval date,
        # [521] unified social credit code, [537] registered address,
        # [539] business scope, [49]/[50] employee count
        sql = ("INSERT INTO company (instr, company_name, industry, business_scope, "
               "type_enterprise, regist_capital, legal_represent, regist_date, "
               "company_status, operat_period, registrat_body, approval_date, "
               "address, people_num) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        data = (instr, company, foundAllTd[527].text, foundAllTd[539].text,
                foundAllTd[523].text, zczb, fddbr, zcrq, zt,
                foundAllTd[529].text, foundAllTd[533].text, foundAllTd[531].text,
                foundAllTd[537].text, foundAllTd[49].text)
    else:
        # <td> indices in the small-company layout: [18] industry,
        # [14] enterprise type, [8] business registration number (unused),
        # [10] organisation code, [20] business term,
        # [24] registration authority, [22] approval date,
        # [16] unified social credit code, [28] registered address,
        # [30] business scope
        sql = ("INSERT INTO company (instr, company_name, industry, business_scope, "
               "type_enterprise, regist_capital, legal_represent, regist_date, "
               "company_status, operat_period, registrat_body, approval_date, "
               "address) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        data = (instr, company, foundAllTd[18].text, foundAllTd[30].text,
                foundAllTd[14].text, zczb, fddbr, zcrq, zt,
                foundAllTd[20].text, foundAllTd[24].text, foundAllTd[22].text,
                foundAllTd[28].text)
    # Insert the row. Parameterised execution instead of "sql % data" so that
    # quotes inside the scraped text cannot break the statement.
    cursor.execute(sql, data)
    connect.commit()
# Fetch executive info. Defunct (the selectors no longer match); keeping it
# does not affect the rest of the script.
def get_gg_info(soup):
    ggpersons = soup.find_all(attrs={"event-name": "company-detail-staff"})
    ggnames = soup.select('table.staff-table > tbody > tr > td.ng-scope > span.ng-binding')
    for i in range(len(ggpersons)):
        ggperson = ggpersons[i].text
        ggname = ggnames[i].text
        print(ggperson + " " + ggname)

# Fetch shareholder/investor info. Defunct; harmless to keep.
def get_gd_info(soup):
    tzfs = soup.find_all(attrs={"event-name": "company-detail-investment"})
    for i in range(len(tzfs)):
        tzf_split = tzfs[i].text.replace("\n", "").split()
        tzf = ' '.join(tzf_split)
        print(tzf)

# Fetch outbound-investment info. Defunct; harmless to keep.
def get_tz_info(soup):
    btzs = soup.select('a.query_name')
    for i in range(len(btzs)):
        btz_name = btzs[i].select('span')[0].text
        print(btz_name)
# Collect the industry links from the home page
def get_industry(soup):
    x = []
    buyao = 70  # number of links to skip; delete this when scraping for real
    hangye = soup.find_all('a')
    for item in hangye:
        if 'https://www.tianyancha.com/search/oc' in str(item.get("href")):
            print(item.get("href"))
            if buyao > 0:
                buyao -= 1
            else:
                x.append(str(item.get("href")))
    print("Number of industries:")
    print(len(x))
    return x

# Collect the company links on an industry page
def get_industry_company(soup):
    y = []
    companylist = soup.find_all('a')
    for item in companylist:
        if 'https://www.tianyancha.com/company/' in str(item.get("href")):
            print(item.get("href"))
            y.append(str(item.get("href")))
    return y
if __name__ == '__main__':
    cursor = connect.cursor()  # database cursor
    companycount = 0  # companies scraped so far
    instrcount = 0    # industries scraped so far
    theinscount = 0   # number of industry links to scrape; the proxy IP is
                      # rotated every 4 industries, 20 companies per industry
    driver = driver_open()
    url = "https://www.tianyancha.com/"
    soup = get_content(driver, url)
    instrlist = get_industry(soup)
    theinscount = len(instrlist)
    print()
    for instr in instrlist:  # iterate over the industry links
        instrcount += 1
        print(instrcount)
        print(instr)
        compsoup = get_content(driver, instr)
        complist = get_industry_company(compsoup)
        for comp in complist:  # iterate over the companies in this industry
            print(comp)
            companycount += 1
            print("Industries scraped: " + str(instrcount))
            try:
                infosoup = get_content(driver, comp)
                print('---- fetching basic info ----')
                get_basic_info(infosoup, instr)
            except:
                print('skipped on exception', end=' ')
        if instrcount % 4 == 0:
            # Rotate the proxy IP every 4 industry links so the site does not
            # ban it. This sometimes goes wrong (proxy timeouts and the like);
            # if so, kill the program or the PhantomJS processes.
            print("Switching proxy IP")
            # driver.close()  # stray PhantomJS windows may pile up and need closing
            driver = driver_open()
    cursor.close()
    connect.close()  # close the database connection
The code is commented in fair detail, so you can just read through it.
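One thing the script takes for granted is that the company table already exists in MySQL. The post does not show its schema, so here is a minimal sketch that matches the INSERT statements above; the column types and lengths are my own assumptions (everything is stored as raw strings):

import pymysql

# Minimal schema sketch matching the INSERT statements above.
# All types are assumptions: the scraper stores raw, still-obfuscated strings.
connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='1234', db='user', charset='utf8')
cursor = connect.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS company (
    id INT AUTO_INCREMENT PRIMARY KEY,
    instr VARCHAR(255),           -- industry link the company was found under
    company_name VARCHAR(255),
    industry VARCHAR(255),
    business_scope TEXT,
    type_enterprise VARCHAR(255),
    regist_capital VARCHAR(255),  -- still obfuscated at this point, see below
    legal_represent VARCHAR(255),
    regist_date VARCHAR(255),     -- still obfuscated at this point, see below
    company_status VARCHAR(255),
    operat_period VARCHAR(255),   -- still obfuscated at this point, see below
    registrat_body VARCHAR(255),
    approval_date VARCHAR(255),
    address VARCHAR(255),
    people_num VARCHAR(255)       -- only filled for the big-company layout
)
""")
cursor.close()
connect.close()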
The scraped results still need preprocessing, above all because of Tianyancha's silly data obfuscation. The obfuscated fields are registered capital, registration date, and business term, and the obfuscation method is primitive in the extreme. What I ran into was a plain digit substitution:

ciphertext  plaintext
7           4
5           8
4           .
3           9
0           1
.           5
9           2
6           0
1           3
8           6
2           7

That is all there is to it; I nearly died laughing when I found it. Decoding is straightforward, so go practise on it yourselves.
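For reference, a minimal decoding sketch based on the table above (the mapping is the one I observed; whether it is still current depends, as with everything here, on when you read this):

# Decode Tianyancha's digit substitution: map each obfuscated character
# back to the real one and pass everything else (units etc.) through.
CIPHER_MAP = {
    '7': '4', '5': '8', '4': '.', '3': '9', '0': '1', '.': '5',
    '9': '2', '6': '0', '1': '3', '8': '6', '2': '7',
}

def decode(text):
    return ''.join(CIPHER_MAP.get(ch, ch) for ch in text)

print(decode('0666萬'))  # an obfuscated registered capital; prints 1000萬

Run the stored regist_capital, regist_date and operat_period values through decode() before doing anything else with them.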
Some people ask why I did not scrape the National Enterprise Credit Information Publicity System instead. One reason only: I really could not be bothered to deal with its slider CAPTCHAs and click-the-characters CAPTCHAs; just looking at them annoys me (clearly I am not cut out to be a crawler engineer). If you need that site, have a look at this fellow's post. He says his method has already stopped working, but there is still experience worth borrowing: 【爬蟲】關於企業信用信息公示系統-加速樂最新反爬蟲機制
Luckily Tianyancha has no CAPTCHA, or my modelling teammates would have been driven mad by this noob.
Also, if you really do not want to scrape the data yourself and just want the data, you can message me for it. And yes, I did seriously consider buying data to get the modelling done, but one look at the prices put an end to that; see the screenshot.
The proxy IPs came from Mogu proxy (蘑菇代理): 6 yuan for 1000 high-anonymity IPs. The API above should still have about 700 left, so help yourselves; I am done with crawling anyway. A guru has also built a scraper for proxy IPs (link).
If I had to sum it up in one sentence: doing tech is merely tiring, while learning tech is both tiring and hard.
A pit, a crawler pit, and now it is filled in.