Preface:
In exercise two I practiced crawling Dangdang and Douban, but fetching the data was rather slow, so this time the goal is to speed things up with multithreading.
Practice:
Basic exercise
Implementation code
import threading
import time

# Define a Thread subclass:
class MyThread(threading.Thread):
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        print("Starting thread: " + self.name)
        moyu_time(self.name, self.counter, 10)  # here counter doubles as the per-loop delay
        print("Exiting thread: " + self.name)

def moyu_time(threadName, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s starts slacking off %s" % (threadName, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        counter -= 1

# Create new threads:
# Xiaoming rests one second, Xiaohong rests two seconds
thread1 = MyThread(1, "Xiaoming", 1)
thread2 = MyThread(2, "Xiaohong", 2)

# Start the threads
thread1.start()
thread2.start()

# Wait for the threads to finish
thread1.join()
thread2.join()
print("Exiting main thread")
Result
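Incidentally, subclassing Thread is not required; the same example can be written by handing a target function straight to threading.Thread. A minimal sketch of that variant:

import threading
import time

def moyu_time(thread_name, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s starts slacking off %s" % (thread_name, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        counter -= 1

# target/args replace the Thread subclass above
t1 = threading.Thread(target=moyu_time, args=("Xiaoming", 1, 10))
t2 = threading.Thread(target=moyu_time, args=("Xiaohong", 2, 10))
t1.start()
t2.start()
t1.join()
t2.join()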
Thread pool practice 1:
Spawning threads wildly wastes resources, because threads keep getting created and destroyed. Better to put them in a pool where they can soak together -- a thread pool.
We can use ThreadPoolExecutor to implement a thread pool, so threads are no longer created and destroyed over and over.
Implementation code:
import time
from concurrent.futures import ThreadPoolExecutor

def moyu_time(threadName, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s starts slacking off %s" % (threadName, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        counter -= 1

if __name__ == '__main__':
    pool = ThreadPoolExecutor(29)
    for i in range(1, 5):
        # Pass the function plus its arguments; calling moyu_time() here would
        # run it in the main thread and submit its return value (None) instead
        pool.submit(moyu_time, 'YDLin' + str(i), 1, 3)
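A note on usage: submit returns a Future, and ThreadPoolExecutor also works as a context manager that waits for outstanding tasks on exit. A minimal sketch of collecting results that way (the square task is just a placeholder of mine):

from concurrent.futures import ThreadPoolExecutor, as_completed

def square(x):
    return x * x  # placeholder task

with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(square, i) for i in range(5)]
    for future in as_completed(futures):
        print(future.result())  # results arrive in completion order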
Using a queue as the thread pool
import threading
import time
from queue import Queue

class CustomThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.__queue = queue

    def run(self):
        while True:
            # Pull the next task off the queue and run it
            q_method = self.__queue.get()
            q_method()
            self.__queue.task_done()

def moyu():
    print("Starting to slack off %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

def queue_pool():
    queue = Queue(5)
    for i in range(queue.maxsize):
        t = CustomThread(queue)
        t.daemon = True  # daemonize each worker so it exits with the main thread
        t.start()
    for i in range(20):
        queue.put(moyu)
    queue.join()  # block until every queued task is marked done

if __name__ == '__main__':
    queue_pool()
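The daemon flag is what lets the program exit even though every worker sits in an infinite loop. If you prefer workers that shut down explicitly, a common alternative is a sentinel value; a minimal sketch of that pattern (the None sentinel is my own convention, not something from the code above):

import threading
from queue import Queue

def worker(queue):
    while True:
        task = queue.get()
        if task is None:   # sentinel: this worker should stop
            queue.task_done()
            break
        task()
        queue.task_done()

queue = Queue()
threads = [threading.Thread(target=worker, args=(queue,)) for _ in range(5)]
for t in threads:
    t.start()
for _ in range(20):
    queue.put(lambda: print("working"))
for _ in range(5):
    queue.put(None)        # one sentinel per worker
queue.join()
for t in threads:
    t.join()               # now every worker has exited cleanly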
Hands-on:
Improving the earlier Douban crawler:
with a pool of worker processes, the whole job is done in a few minutes.
import json
import multiprocessing
import re
import time

import requests
import xlwt
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_for_excel(soup):
    # Collect one page's movies as rows. The rows are returned rather than
    # written here, because each worker process gets its own copy of any
    # global workbook -- writes made in a child would be lost on save.
    rows = []
    movie_list = soup.find(class_='grid_view').find_all('li')
    for item in movie_list:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find(class_='').string  # the rank sits in <em class="">
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        item_intr = 'NULL'  # not every movie has a one-line quote
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string
        print('Crawling movie: ' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)
        rows.append([item_name, item_img, item_index, item_score, item_author, item_intr])
    return rows

def parse_one_page(html):
    pattern = re.compile(
        '<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>.*? <span class="rating_num" property="v:average">(.*?)</span>.*?<span class="inq">(.*?)</span>',
        re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {'index': item[0],
               'title': item[1],
               'score': item[2],
               'comment': item[3]
               }

def write_to_file(content):
    with open('douban250.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(url):
    html = get_one_page(url)
    soup = BeautifulSoup(html, 'lxml')
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
    return parse_for_excel(soup)

if __name__ == '__main__':
    start = time.time()
    urls = []
    for i in range(0, 10):
        url = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        urls.append(url)
    # Create a process pool sized to the number of CPU cores
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # map() runs main() on each url and gathers the returned rows in url order
    results = pool.map(main, urls)
    pool.close()
    pool.join()  # let every pool process finish before we save

    # Write the Excel file in the parent process, where all the rows are
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('Douban Movie Top250', cell_overwrite_ok=True)
    for col, header in enumerate(['Title', 'Image', 'Rank', 'Rating', 'Author', 'Summary']):
        sheet.write(0, col, header)
    n = 1
    for rows in results:
        for row in rows:
            for col, value in enumerate(row):
                sheet.write(n, col, value)
            n += 1
    book.save('douban_top250.xls')  # xlwt can only write the legacy .xls format
    print('Done in %.1f s' % (time.time() - start))
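Since the goal here was multithreading, note that the same fan-out also works with ThreadPoolExecutor; the requests are I/O-bound, so threads are a reasonable fit despite the GIL. A minimal sketch under that assumption, reusing main() from above:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    urls = ['https://movie.douban.com/top250?start=%d&filter=' % (i * 25) for i in range(10)]
    with ThreadPoolExecutor(max_workers=10) as pool:
        # map() here mirrors multiprocessing.Pool.map, but the workers share
        # one interpreter, so returned rows need no pickling between processes
        results = list(pool.map(main, urls))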