python Threading demo - 線程
python 多線程、多進程
線程與子進程的區別
進程和線程的區別在於粒度不同, 進程之間的變量(或者說是內存)是不能直接互相訪問的, 而線程可以, 線程一定會依附在某一個進程上執行.我舉個例子, 你在Windows下開一個IE瀏覽器, 這個IE瀏覽器是一個進程. 你用瀏覽器去打開一個pdf, IE就去調用Acrobat去打開, 這時Acrobat是一個獨立的進程, 就是IE的子進程.而IE自己本身同時用同一個進程開了2個網頁, 並且同時在跑兩個網頁上的腳本, 這兩個網頁的執行就是IE自己通過兩個線程實現的.值得注意的是, 線程仍然是IE的內容, 而子進程Acrobat嚴格來說就不屬於IE了, 是另外一個程序.
之所以是IE的子進程, 只是受IE調用而啓動的而已.
Linux系統的實現打破了純粹的進程與純粹的線程之間的差異。在Linux系統下二者是本質一致的
密集型
https://blog.csdn.net/qq_33020901/article/details/80207594
計算密集型
IO密集型
併發和並行
https://blog.csdn.net/qq_27825451/article/details/78850336
併發:同一時間間隔交替進行
並行:同時進行
併發和並行其實概念上最核心的區別就在於是否“同時”
並行是同時進行,併發則是在同一時間間隔內交替進行
【Python3之多線程】
https://www.cnblogs.com/smallmars/p/7149507.html
結論:
多線程用於IO密集型,如socket,爬蟲,web
多進程用於計算密集型,如金融分析
demo_1
# -*- coding: utf-8 -*-
import threading, time
def run(num):
    """Worker body for the single-thread demo.

    Announces the current thread, prints ``num`` between two half-second
    pauses, then announces that the thread is finishing.
    """
    thread_name = threading.current_thread().name
    print("subThread({}) is start...".format(thread_name))
    time.sleep(0.5)
    print(num)
    time.sleep(0.5)
    print("subThread({}) is stop.".format(thread_name))
if __name__ == '__main__':
    main_name = threading.current_thread().name
    print("mainThreading({}) is starting...".format(main_name))
    # Thread arguments used below:
    #   target - callable executed in the new thread
    #   name   - name given to the new thread
    #   args   - tuple of positional arguments passed to target
    subThreading = threading.Thread(target=run, args=(1,), name='runThreading')
    subThreading.start()
    # join() blocks the main thread until the worker has finished; without
    # it the final message below could be printed while the worker is still
    # running.
    subThreading.join()
    print("mainThreading({}) is stop.".format(main_name))
demo_2 多線程
- 定義全局變量,便於資源共享
- 不定義線程鎖。輸出數字混亂
- 定義線程函數,並聲明全局變量的使用範圍(本例不使用鎖,用於對比demo_3)
# -*- coding: utf-8 -*-
import threading
# Counter shared by the worker threads; nothing protects it in this demo.
num = 100


def run(n):
    """Add ``n`` to the global ``num`` and subtract it again, a million times.

    The two updates are not guarded by a lock, so when several threads run
    this function at once their updates may interleave and leave ``num``
    different from its starting value.
    """
    global num
    for _ in range(1000000):
        num += n
        num -= n
if __name__ == '__main__':
    # Two workers update the shared counter with different step sizes.
    t1, t2 = (threading.Thread(target=run, args=(step,)) for step in (6, 9))
    for worker in (t1, t2):
        worker.start()
    # Wait for both workers before reading the final counter value.
    for worker in (t1, t2):
        worker.join()
    print("num = {}".format(num))
    print("mainThreading({}) is stop.".format(threading.current_thread().name))
demo_3 多線程_Lock
- 定義全局變量,便於資源共享
- 定義線程鎖:同一把鎖由所有線程共用,但同一時刻只能被一個線程持有;持有者離開with代碼塊後鎖纔會被釋放,其他線程才能獲取
- 定義線程函數,並聲明全局變量的使用範圍,使用with對鎖操作
# -*- coding: utf-8 -*-
import threading
# Counter shared by the worker threads, and the lock that guards it.
num = 100
lock = threading.Lock()


def run(n):
    """Add ``n`` to the global ``num`` and subtract it again, a million times.

    Each add/subtract pair runs while holding ``lock``, so only one thread
    at a time can be in the middle of updating ``num``.
    """
    global num
    for _ in range(1000000):
        # The lock is taken and released once per iteration.
        with lock:
            num += n
            num -= n
if __name__ == '__main__':
    # Two workers update the shared counter with different step sizes.
    t1, t2 = (threading.Thread(target=run, args=(step,)) for step in (6, 9))
    for worker in (t1, t2):
        worker.start()
    # Wait for both workers before reading the final counter value.
    for worker in (t1, t2):
        worker.join()
    print("num = {}".format(num))
    print("mainThreading({}) is stop.".format(threading.current_thread().name))
demo_4 一起過馬路
- 湊夠3人才能過馬路
bar = threading.Barrier(3)
# -*- coding: utf-8 -*-
import threading, time
# Three people must gather before anyone may cross the road: the barrier
# is created for three parties.
bar = threading.Barrier(3)


def run():
    """Announce the thread, pause one second, then wait at the barrier.

    The closing message is printed only after ``bar.wait()`` returns.
    """
    me = threading.current_thread().name
    print("{} is starting...".format(me))
    time.sleep(1)
    bar.wait()
    print("{} is end.".format(me))
if __name__ == '__main__':
    # The barrier lets threads through in groups of ``bar.parties``.  A
    # thread count that is not a multiple of that (previously 5 against a
    # barrier of 3) leaves the leftover threads blocked in wait() forever,
    # so the process never exits.  Start two full groups instead.
    for _ in range(bar.parties * 2):
        threading.Thread(target=run).start()
demo_5 threading.Timer
定時線程
# -*- coding: utf-8 -*-
import threading
def run():
    """Print a ``-*`` separator line followed by the completion message."""
    separator = '-*' * 10
    print(separator)
    print("runThreading is end.")
if __name__ == '__main__':
    print("mainThreading is start...")
    # Timer calls run() once, three seconds after start(); join() makes the
    # main thread wait for that call to finish.
    timeThreading = threading.Timer(interval=3, function=run)
    timeThreading.start()
    timeThreading.join()
    print("mainThreading is end.")
多線程爬取demo
業務流
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Software : PyCharm
# @CreateTime: 2019-12-23 09:36
# @Author : spider
# @File : pyThread
import sys
import time
import pprint
import json
import re
from lxml import etree
import requests
import threading
from queue import Queue
"""
2類線程:3個下載,3個解析
內容隊列:下載線程往隊列中put數據,解析線程從隊列中get數據
url隊列: 下載線程從url隊列get數據
寫數據:上鎖,保證文件不能同時被修改
"""
# Module-level registries of worker threads: the factory functions append to
# them and main() starts and joins whatever they contain.
g_crawl_list, g_parse_list = [], []
class CrawThread(threading.Thread):
    """Download worker: turns queued page numbers into response bodies.

    Each instance repeatedly takes a page number from ``page_queue``,
    formats it into ``self.url``, requests that URL and, when the response
    is OK, puts the response text on ``data_queue``.  The thread finishes
    once ``page_queue`` has no page numbers left.

    Args:
        name: Thread name; also used in the log lines.
        page_queue: Queue of page numbers still to download.
        data_queue: Queue that receives the text of each successful response.
    """

    # Seconds to wait for a server before giving up on a page; without a
    # timeout a stalled connection would block the thread indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, name, page_queue, data_queue):
        super(CrawThread, self).__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        # URL template; "{}" is replaced by the page number.  It is a bare
        # placeholder here and must be set to a real pattern before use.
        self.url = r"{}"
        # Extra HTTP request headers.  The attribute name (spelling
        # included) is kept so existing users of the class keep working.
        self.hreaders = {}

    def run(self):
        """Download pages until ``page_queue`` is exhausted."""
        from queue import Empty  # local import: the file only imports Queue

        print("{} ---------- crawl_thread start".format(self.name))
        while True:
            # get_nowait() tests for emptiness and takes the item in a
            # single step.  A separate empty() check followed by get() lets
            # another thread take the last item in between, leaving this
            # thread blocked in get() forever.
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                break
            text = self._fetch(self.url.format(page))
            if text is not None:
                self.data_queue.put(text)
        print("{} ---------- crawl_thread stop".format(self.name))

    def _fetch(self, url):
        """Return the body of ``url``, or ``None`` when the request fails."""
        try:
            res = requests.get(
                url, headers=self.hreaders, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One bad page must not kill the worker; report it and move on.
            print("{} ---------- request failed for {}: {}".format(
                self.name, url, exc))
            return None
        return res.text if res.ok else None
class ParserThread(threading.Thread):
    """Parse worker: turns downloaded pages into JSON records in a file.

    Each instance repeatedly takes one page of HTML from ``data_queue``,
    builds one record per ``<li>`` element and appends the records to
    ``fp`` as a single JSON line.

    The thread finishes as soon as ``data_queue`` is empty, so it must be
    started only after the pages it should process have been queued.

    Args:
        name: Thread name; also used in the log lines.
        data_queue: Queue of HTML page sources to parse.
        fp: Writable text file shared by all parser threads.
        lock: Lock held while writing to ``fp``.
    """

    def __init__(self, name, data_queue, fp, lock):
        super(ParserThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.lock = lock
        self.fp = fp

    def run(self):
        """Parse pages until ``data_queue`` is exhausted."""
        from queue import Empty  # local import: the file only imports Queue

        print("{} ---------- parse_thread start".format(self.name))
        while True:
            # get_nowait() avoids the gap between an empty() check and a
            # blocking get(), in which another thread could take the last
            # page and leave this one waiting forever.
            try:
                pageSourceHtml = self.data_queue.get_nowait()
            except Empty:
                break
            self.parse_content(pageSourceHtml)
        print("{} ---------- parse_thread stop".format(self.name))

    def parse_content(self, pageSourceHtml):
        """Extract one record per ``<li>`` in the page and write them out."""
        tree = etree.HTML(pageSourceHtml)
        li_list = tree.xpath("//li")
        # NOTE(review): title/imgLink are constant placeholders; nothing is
        # read from the <li> nodes yet - real extraction still has to be
        # filled in.
        items = [{'title': 'title', 'imgLink': 'imgLink'} for _ in li_list]
        self._write_items(items)

    def _write_items(self, items):
        """Append ``items`` to the shared file as one JSON line."""
        # ensure_ascii belongs to json.dumps (not to write()) and keeps
        # non-ASCII text readable.  The trailing newline separates batches
        # so the file can be read back line by line.
        line = json.dumps(items, ensure_ascii=False) + "\n"
        # "with" releases the lock even if the write raises.
        with self.lock:
            self.fp.write(line)
def function():
    """Print a banner line followed by a row of ninety ``-=`` pairs."""
    # The former try/except/finally wrapped only ``pass``; its handler could
    # never run, so the dead scaffolding has been dropped.
    print("in function...")
    print("-=" * 90)
def create_queue(first_page=1, last_page=10):
    """Build the two queues shared by the download and parse threads.

    Args:
        first_page: First page number to enqueue (default 1).
        last_page: Last page number to enqueue, inclusive (default 10).

    Returns:
        A ``(page_queue, data_queue)`` tuple: ``page_queue`` holds the page
        numbers ``first_page..last_page`` in ascending order and
        ``data_queue`` is empty, ready to receive downloaded page content.
    """
    # Page-number queue, pre-filled with the pages to download.
    page_queue = Queue()
    for page in range(first_page, last_page + 1):
        page_queue.put(page)
    # Content queue, filled later by the download threads.
    data_queue = Queue()
    return page_queue, data_queue
def create_crawl_thread(page_queue, data_queue):
    """Create the three download threads and add them to ``g_crawl_list``.

    The threads are only constructed here; they are not started.
    """
    names = ('crawlThread1', 'crawlThread2', 'crawlThread3')
    g_crawl_list.extend(
        CrawThread(thread_name, page_queue, data_queue)
        for thread_name in names
    )
def create_parser_thread(data_queue, fp, lock):
    """Create the three parse threads and add them to ``g_parse_list``.

    The threads are only constructed here; they are not started.
    """
    names = ('parseThread1', 'parseThread2', 'parseThread3')
    g_parse_list.extend(
        ParserThread(thread_name, data_queue, fp, lock)
        for thread_name in names
    )
def main():
    """Download every queued page, then parse the results into jian.json.

    Builds the page/content queues, the download threads and the parse
    threads, runs the downloads to completion, then runs the parsers.
    Parsed records are appended to ``jian.json``.
    """
    # Lock that serialises writes to the shared output file.
    lock = threading.Lock()
    page_queue, data_queue = create_queue()
    # "with" closes (and flushes) the file even if a step below raises.
    with open('jian.json', 'a', encoding='utf8') as fp:
        create_crawl_thread(page_queue, data_queue)
        # Keyword arguments keep fp and lock from being swapped; the
        # parameter order is (data_queue, fp, lock).
        create_parser_thread(data_queue, fp=fp, lock=lock)

        # Stage 1: download everything.
        for tCrawl in g_crawl_list:
            tCrawl.start()
        for tCrawl in g_crawl_list:
            tCrawl.join()

        # Stage 2: parse.  A parser thread exits as soon as it finds the
        # content queue empty, so parsers are started only after the
        # downloads have filled it; starting both kinds together let the
        # parsers quit before any page had arrived.
        for tParse in g_parse_list:
            tParse.start()
        for tParse in g_parse_list:
            tParse.join()
if __name__ == '__main__':
    # Script entry point: print a banner and a separator row, then crawl.
    print("in startMain...", "-=" * 90, sep="\n")
    main()