requests_thread_gxrc_com.py
#!/usr/bin/env python3
# coding=utf-8
# Version: python3.6.1
# File: gxrc_com.py
# Author: LGSP_Harold
import requests
# queue.Queue (not multiprocessing.Queue): the workers are threads in one process
from queue import Queue
from lxml import etree
import threading
from handle_mongo import MongoClient


# Thread class that consumes page numbers
class CrawlPage(threading.Thread):
    # Override the parent's __init__ (required because the subclass takes extra
    # parameters); thread_name, page_queue and data_queue belong to the subclass
    # and must not be forwarded to the parent
    def __init__(self, thread_name, page_queue, data_queue, *args, **kwargs):
        # super resolves the parent class (threading.Thread) and calls its __init__()
        super(CrawlPage, self).__init__(*args, **kwargs)
        # Name of this thread
        self.thread_name = thread_name
        # Queue of page numbers
        self.page_queue = page_queue
        # Queue of fetched page data
        self.data_queue = data_queue
        # Default request headers
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Host': 's.gxrc.com',
            'Pragma': 'no-cache',
            'Referer': 'https://www.gxrc.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        }

    def run(self) -> None:
        print('Page-number worker thread started: %s' % self.thread_name)
        while not page_flag:
            # When calling put or get on a Queue, the block argument matters.
            # block defaults to True; it is set to False here, so an exception
            # (Empty/Full) is raised when the queue is empty or full.
            try:
                # get() pulls one page number out of the queue.
                # With block=True (the default) it would wait forever on an empty
                # queue, or until a timeout if one is set. With block=False it
                # raises immediately, which is caught below.
                page = self.page_queue.get(block=False)
                page_url = 'https://s.gxrc.com/sJob?schType=1&page=' + str(page)
                print('Constructed url: %s' % page_url)
                # Optional proxy configuration
                # proxy = {
                #     'http': 'http://xxxxxx:[email protected]:9999',
                #     'https': 'http://xxxxxx:[email protected]:9999'
                # }
                # Request the constructed url with requests
                # res = requests.get(url=page_url, headers=self.headers, proxies=proxy)
                res = requests.get(url=page_url, headers=self.headers)
                # Set the response encoding
                res.encoding = 'utf-8'
                # Put the fetched page into the data queue
                self.data_queue.put(res.text)
            except Exception:
                pass


# Thread class that parses the fetched HTML
class CrawlHtml(threading.Thread):
    # Data produced by the page threads is read back from data_queue
    def __init__(self, thread_name, data_queue, lock, db, collections, *args, **kwargs):
        super(CrawlHtml, self).__init__(*args, **kwargs)
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
        self.db = db
        self.collections = collections

    # Parse one page of HTML
    def parse(self, text):
        # Build the HTML document
        html = etree.HTML(text)
        items = html.xpath('//div[@class="rlOne"]/ul[@class="posDetailUL clearfix"]')
        data_list = []
        for item in items:
            data = {}
            data['job_name'] = item.xpath('.//a[@class="posName"]/text()')[0]
            data['company_name'] = item.xpath('.//a[@class="entName"]/text()')[0]
            try:
                data['company_address'] = item.xpath('.//li[@class="w4"]/text()')[0]
            except Exception:
                data['company_address'] = 'unknown'
            try:
                data['money'] = item.xpath('.//li[@class="w3"]/text()')[0]
            except Exception:
                data['money'] = 'negotiable'
            data['date'] = item.xpath('.//li[@class="w5"]/text()')[0]
            data_list.append(data)
        return data_list

    def run(self) -> None:
        print('Data worker thread started: %s' % self.thread_name)
        while not data_flag:
            try:
                # Take one page of HTML from the queue
                text = self.data_queue.get(block=False)
                # Parse it
                result = self.parse(text)
                # print(result)
                # Serialize DB writes with the lock
                with self.lock:
                    insert_data = MongoClient(self.db, self.collections)
                    insert_data.insert_db(result)
            except Exception:
                pass


# Two global exit flags
page_flag = False
data_flag = False


def main():
    # Two queues: one for page numbers, one for page HTML
    page_queue = Queue()
    data_queue = Queue()
    # One lock shared by the data threads
    lock = threading.Lock()
    # Fill the page-number queue
    for page in range(1, 504):
        # put() stores a page number in page_queue
        page_queue.put(page)
    # qsize() returns the current length of the queue
    print('Total pages queued: %s' % page_queue.qsize())
    # Thread names; start three page-number threads
    crawl_page_list = ['page thread 1', 'page thread 2', 'page thread 3']
    page_thread_list = []
    for thread_name_page in crawl_page_list:
        thread_page = CrawlPage(thread_name_page, page_queue, data_queue)
        # Start the thread
        thread_page.start()
        page_thread_list.append(thread_page)
    # Start three threads to parse the HTML
    parse_list = ['data thread 1', 'data thread 2', 'data thread 3']
    parse_thread_list = []
    db = 'db_gxrc'
    collections = 'collections_gxrc'
    for thread_name_parse in parse_list:
        thread_parse = CrawlHtml(thread_name_parse, data_queue, lock, db, collections)
        thread_parse.start()
        parse_thread_list.append(thread_parse)

    # Thread shutdown
    # Page-number threads: the while loop exits once page_queue is empty
    global page_flag
    while not page_queue.empty():
        pass
    page_flag = True
    # Wait for the page-number threads to finish
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, 'finished')

    # Data threads
    global data_flag
    while not data_queue.empty():
        pass
    data_flag = True
    for thread_data_join in parse_thread_list:
        thread_data_join.join()
        print(thread_data_join.thread_name, 'finished')


if __name__ == '__main__':
    # Entry point
    main()
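The shutdown above relies on busy-wait loops plus the global page_flag/data_flag pair. For comparison only, here is a minimal sketch of the stdlib alternative built on queue.Queue's task_done()/join() plus sentinel values; the worker body and the page range are placeholders, not the crawler's real fetch/parse logic.

# Sketch: cooperative shutdown with task_done()/join() and sentinels.
import threading
from queue import Queue

def worker(q):
    while True:
        item = q.get()
        if item is None:          # sentinel: no more work for this thread
            q.task_done()
            break
        print('processing page', item)  # stand-in for the real request/parse
        q.task_done()

q = Queue()
for page in range(1, 6):
    q.put(page)

threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()
for _ in threads:
    q.put(None)                   # one sentinel per worker
q.join()                          # blocks until every item is task_done()
for t in threads:
    t.join()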
handle_mongo.py
#!/usr/bin/env python3
# coding=utf-8
# Version: python3.6.1
# File: handle_mongo.py
# Author: LGSP_Harold
import pymongo


class MongoClient:
    def __init__(self, db, collections, *args, **kwargs):
        super(MongoClient, self).__init__(*args, **kwargs)
        # Connect to MongoDB and select the database and collection
        client = pymongo.MongoClient('mongodb://admin:[email protected]:27017')
        self.db = client[db]
        self.collections = self.db[collections]

    def insert_db(self, item):
        # item is a list of documents, so use insert_many
        self.collections.insert_many(item)
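For reference, a minimal usage sketch of the helper above. The database and collection names match the ones main() passes in; the sample record and the empty-list guard are illustrative additions (pymongo's insert_many raises an InvalidOperation error when given an empty list), not part of the original script.

# Sketch: assumes the MongoDB instance from the connection string is reachable.
from handle_mongo import MongoClient

records = [  # made-up sample document in the crawler's schema
    {'job_name': 'Python Developer', 'company_name': 'Example Co',
     'company_address': 'Nanning', 'money': '8k-12k', 'date': '2019-01-01'},
]

client = MongoClient('db_gxrc', 'collections_gxrc')
if records:  # insert_many() raises on an empty list
    client.insert_db(records)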