# python多線程採集高新技術企業名錄

#3601
#coded by 伊瑪目的門徒
import re
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd

# Module-level scratch lists kept from the original script.
urllist = []
titlelist = []

# Shared accumulator: do() appends the <tr> rows of every fetched page here.
list0 = []

# time.clock() was removed in Python 3.8.  Alias it to perf_counter() when it
# is missing so that every time.clock() call in this script keeps working.
if not hasattr(time, 'clock'):
    time.clock = time.perf_counter
start = time.clock()  # timing - start

from concurrent.futures import ThreadPoolExecutor

# Browser-like User-Agent header passed to requests.get() in do().
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.XXXX.XXX Safari/537.36'
    ),
}


def do(i, timeout=10):
    """Fetch listing page ``i`` and queue its table rows for later parsing.

    Downloads ``http://www.zcfudao.com/article/qyjs/p<i>.html``, parses it
    with BeautifulSoup/lxml and appends every ``<tr>`` except the first one
    to the module-level ``list0``.  On success the page number is removed
    from the module-level ``list1``; on failure it is left there, so a
    caller that loops over ``list1`` will pick the page up again.

    Args:
        i: Page number to fetch.
        timeout: Value passed as ``timeout`` to ``requests.get`` so a stalled
            connection cannot block a worker thread forever.

    Returns:
        None.  Results are delivered through ``list0`` and ``list1``.
    """
    url = 'http://www.zcfudao.com/article/qyjs/p' + str(i) + '.html'
    try:
        html = requests.get(url, headers=header, timeout=timeout)
        html.encoding = 'utf-8'
        Soup = BeautifulSoup(html.text, "lxml")
        # Drop the first <tr> of the page.
        ab = Soup.select('tr')[1:]
    except Exception as exc:
        # Best effort: report the failure and leave the page in list1.
        print('第{}頁採集失敗:{!r}'.format(i, exc))
        return

    # Publish the rows only after the whole page parsed successfully, so a
    # failed attempt never leaves partial rows behind.
    list0.extend(ab)

    print(i)

    try:
        list1.remove(i)
    except ValueError:
        # The page was not (or no longer) pending; nothing left to do.
        pass



# 多線程
def multithreading(max_workers=10, max_rounds=None):
    """Run ``do`` over every pending page until ``list1`` is empty.

    Each round submits the pages currently in the module-level ``list1`` to
    a thread pool.  ``do`` removes a page from ``list1`` when it succeeds,
    so pages that failed are submitted again in the next round.

    Args:
        max_workers: Number of worker threads per round.
        max_rounds: Optional cap on the number of rounds.  ``None`` (the
            default) keeps retrying until ``list1`` is empty, which never
            terminates if some page fails permanently.

    Returns:
        The total number of ``do`` calls that were executed, counting
        retries of the same page.
    """
    attempts = 0
    rounds = 0

    while list1 and (max_rounds is None or rounds < max_rounds):
        rounds += 1
        # Iterate over a snapshot: worker threads remove entries from list1
        # while the executor is still consuming the iterable, and iterating
        # the live list would silently skip pages.
        pending = list(list1)
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for _ in executor.map(do, pending):
                attempts += 1

    return attempts


# Pages still to be fetched; do() removes a page number once it succeeds.
list1 = list(range(1, 3602))

# Time the crawl with perf_counter(): time.clock() was removed in Python 3.8.
# The stopwatch is (re)started here so this section does not depend on how
# any earlier `start` value was produced.
start = time.perf_counter()  # timing - start

# Named total_attempts rather than `sum` to avoid shadowing the builtin.
total_attempts = multithreading()
print('還剩下{}頁'.format(list1))

end = time.perf_counter()  # timing - end
print(("爬取完成 用時:"))
print((end - start))

print('總爬取 %d 頁 ' % (total_attempts))


# One list per output column, filled from the rows collected in list0.
num = []
nam = []
pname = []
year_t = []
money = []
type0 = []

skipped = 0
for thing in list0:
    # Materialise the row's child nodes so they can be indexed.
    thing = list(thing)
    if not thing:
        continue
    # The six values are read from child positions 1, 3, ..., 11
    # (presumably the <td> cells, separated by whitespace text nodes -
    # TODO confirm against the page markup).  A shorter row would raise
    # IndexError, so skip it and report the count below.
    if len(thing) < 12:
        skipped += 1
        continue
    num.append(thing[1])
    nam.append(thing[3])
    pname.append(thing[5])
    year_t.append(thing[7])
    money.append(thing[9])
    type0.append(thing[11])

if skipped:
    print('略過{}行(欄位不足)'.format(skipped))

df = pd.DataFrame({
    'num': num,
    'nam': nam,
    'pname': pname,
    'year_t': year_t,
    'money': money,
    'type0': type0,
})

print(df)

df.to_excel('2.xlsx', index=False)


# Disabled alternative: dump every collected row to test.txt instead of Excel.
# The snippet is kept inside a bare string literal, so it is never executed.
'''

#可作爲TXT輸出
with open("test.txt","w") as f:
        for thing in list0:
            f.write(str(thing))
            f.write('\r\n')

'''

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章