Python 多線程採集高新技術企業名錄

# 3601 -- presumably the total number of listing pages (the page range used below is 1..3601)
#coded by 伊瑪目的門徒
import re
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd

from concurrent.futures import ThreadPoolExecutor

# Scratch lists; nothing in the visible script ever fills them.
urllist = []
titlelist = []

# Shared accumulator: the worker function appends every scraped <tr> here.
list0 = []

# Timing - start. time.clock() was removed in Python 3.8, so fall back to
# time.perf_counter() when it is missing instead of dying with AttributeError.
# Where time.clock() still exists it is kept, so the measurement stays
# comparable with the matching "end" reading taken later in the script.
start = time.clock() if hasattr(time, "clock") else time.perf_counter()

# Browser-like User-Agent sent with every request.
header = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.XXXX.XXX Safari/537.36'}


def do(i):
    """Fetch listing page ``i`` and collect its table rows.

    Downloads ``http://www.zcfudao.com/article/qyjs/p<i>.html``, parses it
    with BeautifulSoup/lxml and appends every ``<tr>`` element except the
    first one to the module-level ``list0``. On success ``i`` is removed from
    the module-level ``list1``; on any failure ``i`` is left in ``list1`` so
    the caller can retry it, and the error is printed instead of being
    silently discarded.

    Args:
        i: Page number to fetch.

    Returns:
        None. Results are delivered through ``list0`` and ``list1``.
    """
    url = 'http://www.zcfudao.com/article/qyjs/p' + str(i) + '.html'

    try:
        # Without a timeout a single stalled connection would block this
        # worker thread (and therefore the whole pool shutdown) forever.
        html = requests.get(url, headers=header, timeout=30)
        html.encoding = 'utf-8'

        soup = BeautifulSoup(html.text, "lxml")
        # Drop the first <tr> -- presumably the table's header row; confirm
        # against the page markup.
        rows = soup.select('tr')[1:]
    except Exception as exc:
        # Best-effort by design: report the failure and keep the page queued.
        print('page {} failed: {!r}'.format(i, exc))
        return

    list0.extend(rows)
    print(i)

    try:
        list1.remove(i)
    except ValueError:
        # Page is no longer queued; nothing left to mark as done.
        pass



# Multithreaded driver
def multithreading(max_stalled_rounds=3):
    """Run ``do`` over every page queued in ``list1`` using a 10-thread pool.

    Each round submits all pages still present in ``list1``; ``do`` removes a
    page from ``list1`` when it succeeds, so whatever remains after a round is
    retried in the next one. The loop stops when ``list1`` is empty or when
    ``max_stalled_rounds`` consecutive rounds removed nothing, which prevents
    an endless loop when some pages can never be fetched.

    Args:
        max_stalled_rounds: Number of consecutive rounds without any progress
            tolerated before giving up on the remaining pages.

    Returns:
        The number of pages removed from ``list1`` during this call, i.e. the
        pages that were fetched successfully (failed attempts are not counted).
    """
    initial = len(list1)
    stalled = 0

    while list1 and stalled < max_stalled_rounds:
        before = len(list1)
        # Iterate over a snapshot: workers remove entries from list1 while the
        # pool is still being fed, and mutating a list during iteration makes
        # the iterator skip elements.
        pending = list(list1)
        with ThreadPoolExecutor(max_workers=10) as executor:
            # Drain the result iterator so an exception raised inside a
            # worker is re-raised here instead of being lost.
            for _ in executor.map(do, pending):
                pass
        stalled = stalled + 1 if len(list1) >= before else 0

    return initial - len(list1)


# Work queue: page numbers 1..3601. do() removes a page once it is fetched.
list1 = list(range(1, 3602, 1))

# Named page_count rather than "sum" so the builtin sum() is not shadowed.
page_count = multithreading()
print('還剩下{}頁'.format(list1))


# Timing - end. time.clock() was removed in Python 3.8; use perf_counter()
# when it is unavailable, and keep clock() where it exists so this reading
# matches the clock used for "start".
end = time.clock() if hasattr(time, "clock") else time.perf_counter()
print(("爬取完成 用時："))
print((end - start))


print('總爬取 %d 頁 ' % (page_count))


# One list per output column of the spreadsheet.
num = []
nam = []
pname = []
year_t = []
money = []
type0 = []


for thing in list0:
    # Child nodes of the <tr>. The wanted cells sit at the odd positions --
    # presumably because whitespace text nodes separate the <td> elements;
    # confirm against the page markup.
    cells = list(thing)
    # Skip empty rows and rows too short to hold all six columns; indexing
    # them would raise IndexError and discard the whole crawl result.
    if len(cells) < 12:
        continue
    num.append(cells[1])
    nam.append(cells[3])
    pname.append(cells[5])
    year_t.append(cells[7])
    money.append(cells[9])
    type0.append(cells[11])

df = pd.DataFrame({
    'num': num,
    'nam': nam,
    'pname': pname,
    'year_t': year_t,
    'money': money,
    'type0': type0,
})

print(df)

df.to_excel('2.xlsx', index=False)


# Disabled alternative output path, kept as a bare string literal so it is
# never executed: it would write str() of every collected row to test.txt,
# one row per line.
'''

#可作爲TXT輸出
with open("test.txt","w") as f:
        for thing in list0:
            f.write(str(thing))
            f.write('\r\n')

'''

發表評論

所有評論

還沒有人評論，想成為第一個評論的人嗎？請在上方評論欄輸入並點擊發布。

python多線程採集高新技術企業名錄

985 碩士程序員，空窗 4 個月沒有 Offer！

一文搞懂 Spring 循環依賴

賽博鬥地主——使用大語言模型扮演Agent智能體玩牌類遊戲。

VScode右鍵打開(添加到右鍵)

記一次 .NET某工控視覺自動化系統卡死分析

給VPS linux CentOS 7 安裝Anaconda jupyter

用python寫期貨量化策略，期貨單品種MACD擇時加ATR止損

代理IP 多線程僞造表頭爬蟲小框架

SAS 二元邏輯迴歸預測下月是否會消費

python 百度api OCR識別表格

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結