最近學習了disruptor,於是自己實現了一個極輕量化的多生產者多消費者c++版本disruptor,並用這個版本優化了github上排第一的threadpool項目,效率得到了一定的提升,特別是在執行函數本身耗時相對mutex鎖開銷較小的時候。
源碼地址:https://github.com/WoBuShiXiaoKang/DisruptorThreadPool/tree/master
老哥們順手去給個star唄~
目錄
1、disruptor概述
2、C++實現
(1)序號類Sequence和AtomicSequence
(2)Disruptor類
(3)BufConsumer類
3、應用於線程池
4、不足
1、disruptor概述
關於disruptor的詳細介紹可以拜讀Disruptor的論文,中文翻譯見https://www.cnblogs.com/daoqidelv/p/7043696.html,下面給出總結,歡迎補充指正:
- Disruptor的核心目的是消除鎖(mutex的開銷很大)。做法是隔離生產者與消費者之間,各生產者之間,各消費者之間對緩存隊列的操作(原始的單隊列緩存,每個生產者或消費者對隊列操作都需要加鎖,因爲push和pop都是對隊列的寫操作),具體做法是預先分配緩存隊列的大小,各生產者與消費者只是獲取其中的槽位下標,然後對自己的槽位進行寫或者讀操作,這樣就不需要對整個隊列讀寫操作過程進行加鎖。
- 減輕GC機制負擔(Disruptor原版爲Java實現)。普通隊列每pop一個元素實際都會回收/delete那個節點,對GC機制造成負擔;Disruptor預分配的環形buffer則複用槽位,無需反覆分配釋放。
2、C++實現
(1)序號類Sequence和AtomicSequence:
該類封裝了int64_t和std::atomic_int64_t,進行內存補齊保證_seq在一個緩存行中,以防止false sharing(http://ifeve.com/falsesharing/):
//Cache-line geometry: pad the stored sequence on both sides so that no other
//variable can ever share its cache line, preventing false sharing
//(http://ifeve.com/falsesharing/).
#define CACHELINE_SIZE_BYTES 64
#define CACHELINE_PADDING_FOR_ATOMIC_INT64_SIZE (CACHELINE_SIZE_BYTES - sizeof(std::atomic_int64_t))
#define CACHELINE_PADDING_FOR_INT64_SIZE (CACHELINE_SIZE_BYTES - sizeof(int64_t))
namespace Kang {
//Wraps std::atomic_int64_t. Padding alone does not guarantee that _seq starts
//on a cache-line boundary (the whole object could straddle two lines), so the
//class is additionally alignas(64)-aligned; the front/back padding then keeps
//neighbouring objects off _seq's line.
class alignas(CACHELINE_SIZE_BYTES) AtomicSequence
{
public:
    AtomicSequence(int64_t num = 0L) : _seq(num) {};
    ~AtomicSequence() {};
    AtomicSequence(const AtomicSequence&) = delete;
    AtomicSequence(const AtomicSequence&&) = delete;
    void operator=(const AtomicSequence&) = delete;
    //Atomically publish a new value (sequentially consistent by default).
    void store(const int64_t val)
    {
        _seq.store(val);
    }
    int64_t load()
    {
        return _seq.load();
    }
    //Returns the PREVIOUS value, like std::atomic::fetch_add.
    int64_t fetch_add(const int64_t increment)
    {
        return _seq.fetch_add(increment);
    }
private:
    //Padding on both sides so _seq never shares a cache line with neighbours.
    char _frontPadding[CACHELINE_PADDING_FOR_ATOMIC_INT64_SIZE];
    std::atomic_int64_t _seq;
    char _backPadding[CACHELINE_PADDING_FOR_ATOMIC_INT64_SIZE];
};
//Sequence counter with a single writer but many concurrent readers.
//FIX: the original stored a plain int64_t, but instances of this class
//(_lastRead / _lastWrote / _stopWorking in Disruptor) are written and read by
//different threads concurrently - a data race on a non-atomic object, i.e.
//undefined behavior. std::atomic with release/acquire ordering keeps it
//lock-free and nearly as cheap as a plain load/store on x86 while making the
//cross-thread publication well defined.
class alignas(CACHELINE_SIZE_BYTES) Sequence
{
public:
    Sequence(int64_t num = 0L) : _seq(num) {};
    ~Sequence() {};
    Sequence(const Sequence&) = delete;
    Sequence(const Sequence&&) = delete;
    void operator=(const Sequence&) = delete;
    //Release pairs with the acquire in load(): everything written before this
    //store becomes visible to a thread that observes the new value.
    void store(const int64_t val)
    {
        _seq.store(val, std::memory_order_release);
    }
    int64_t load()
    {
        return _seq.load(std::memory_order_acquire);
    }
private:
    //Padding on both sides so _seq never shares a cache line with neighbours.
    char _frontPadding[CACHELINE_PADDING_FOR_ATOMIC_INT64_SIZE];
    std::atomic_int64_t _seq;
    char _backPadding[CACHELINE_PADDING_FOR_ATOMIC_INT64_SIZE];
};
}
(2)Disruptor類
使用_ringbuf存放內容(實際爲一個分配好空間的數組);
每次有生產者要往buffer中寫時,獲取_writableSeq的值並將其+1。每次生產者寫完buffer後,等待_lastWrote等於自身seq-1後更新_lastWrote爲自身seq(這裏保證了_lastWrote前面的內容肯定是生產者寫過的內容)。
每次有消費者要讀取buffer中內容時,將最後一次派發給消費者的槽位_lastDispatch+1,然後獲取其值。每次消費者讀完buffer內容後,等待_lastRead等於自身seq-1後更新_lastRead爲自身seq(這裏保證了_lastRead前面的內容肯定是消費者讀過的內容)。
使用_stopWorking讓disruptor類停止工作,修改爲true後,不允許再向buffer中寫內容,並且若buffer中已經沒有內容時,有消費者來讀只會獲取一個無效的槽位下標-1。
namespace Kang {
//Default ring-buffer size; must be a power of two so that "seq % N" can be
//computed with the cheap mask "seq & (N - 1)".
constexpr size_t DefaultRingBufferSize = 262144;
//Lock-free multi-producer / multi-consumer ring buffer.
//Write side : WriteInBuf()
//Read side  : (1) GetReadableSeq() claims a readable slot index
//             (2) ReadFromBuf() returns that slot's content
//             (3) FinishReading() releases the slot when done
//NOTE: the read protocol is awkward to use directly; BufConsumer wraps it
//with RAII.
template<class ValueType, size_t N = DefaultRingBufferSize>
class Disruptor
{
public:
    Disruptor() : _lastRead(-1L), _lastWrote(-1L), _lastDispatch(-1L), _writableSeq(0L), _stopWorking(0L) {};
    ~Disruptor() {};
    Disruptor(const Disruptor&) = delete;
    Disruptor(const Disruptor&&) = delete;
    void operator=(const Disruptor&) = delete;
    static_assert(((N > 0) && ((N& (~N + 1)) == N)),
        "RingBuffer's size must be a positive power of 2");
    //Write into the buffer (rvalue overload).
    //FIX: the original assigned "= val", which COPIES even though val is an
    //rvalue reference (a named rvalue reference is an lvalue). Cast it back
    //to an rvalue (equivalent to std::move) so the slot is move-assigned.
    //Throws std::runtime_error if the disruptor has been stopped.
    void WriteInBuf(ValueType&& val)
    {
        //Claim a unique slot; fetch_add returns the pre-increment value.
        const int64_t writableSeq = _writableSeq.fetch_add(1);
        while (writableSeq - _lastRead.load() > N)
        {//Wait (spin) while the slot still holds unread data.
            if (_stopWorking.load())
                throw std::runtime_error("writing when stopped disruptor");
            //std::this_thread::yield();
        }
        //Move into the claimed slot.
        _ringBuf[writableSeq & (N - 1)] = static_cast<ValueType&&>(val);
        while (writableSeq - 1 != _lastWrote.load())
        {//Wait for every earlier writer to publish, so all slots up to
         //_lastWrote are guaranteed to be fully written.
        }
        _lastWrote.store(writableSeq);
    };
    //Write into the buffer (lvalue overload, copy-assigns the slot).
    void WriteInBuf(ValueType& val)
    {
        const int64_t writableSeq = _writableSeq.fetch_add(1);
        while (writableSeq - _lastRead.load() > N)
        {//Wait (spin) while the slot still holds unread data.
            if (_stopWorking.load())
                throw std::runtime_error("writing when stopped disruptor");
            //std::this_thread::yield();
        }
        _ringBuf[writableSeq & (N - 1)] = val;
        while (writableSeq - 1 != _lastWrote.load())
        {//Wait for every earlier writer to publish.
        }
        _lastWrote.store(writableSeq);
    };
    //Claim the next readable slot index. Returns -1L once the disruptor has
    //been stopped AND fully drained.
    const int64_t GetReadableSeq()
    {
        const int64_t readableSeq = _lastDispatch.fetch_add(1) + 1;
        while (readableSeq > _lastWrote.load())
        {//Wait (spin) until a producer has published this slot.
            if (_stopWorking.load() && empty())
            {
                return -1L;
            }
        }
        return readableSeq;
    };
    //Content of the given slot; the index must come from GetReadableSeq().
    ValueType& ReadFromBuf(const int64_t readableSeq)
    {
        if (readableSeq < 0)
        {
            throw("error : incorrect seq for ring Buffer when ReadFromBuf(seq)!");
        }
        return _ringBuf[readableSeq & (N - 1)];
    }
    //Release the slot after reading; no-op for the invalid index -1.
    void FinishReading(const int64_t seq)
    {
        if (seq < 0)
        {
            return;
        }
        while (seq - 1 != _lastRead.load())
        {//Wait for every earlier reader to finish, so all slots up to
         //_lastRead are guaranteed to have been consumed.
        }
        _lastRead.store(seq);
    };
    //True when every written slot has been consumed.
    bool empty()
    {
        return _writableSeq.load() - _lastRead.load() == 1;
    }
    //Stop the disruptor: further writes throw, and once the buffer is
    //drained GetReadableSeq() only returns -1L.
    void stop()
    {
        _stopWorking.store(1L);
    }
private:
    //Last slot consumed by a reader.
    Sequence _lastRead;
    //Last slot published by a writer.
    Sequence _lastWrote;
    //Non-zero once stop() has been called.
    Sequence _stopWorking;
    //Last slot handed out to a consumer.
    AtomicSequence _lastDispatch;
    //Next slot to hand out to a producer.
    AtomicSequence _writableSeq;
    //Ring storage; N must be a power of two for the "& (N - 1)" masking.
    std::array<ValueType, N> _ringBuf;
};
}
(3)BufConsumer類
由於Disruptor的讀接口使用起來麻煩,利用RAII封裝了一個BufConsumer類,實際就是調用Disruptor類的接口:構造的時候調用GetReadableSeq()獲取可讀序號,使用empty()判斷是否有可讀內容,使用GetContent()獲取buffer中的內容,析構的時候調用FinishReading()交還disruptor中ringbuffer的槽位使用權。
namespace Kang {
//RAII wrapper around the Disruptor read protocol:
//  - the constructor claims a readable slot via GetReadableSeq()
//  - empty() tells whether a valid slot was obtained
//  - GetContent() exposes the slot's payload
//  - the destructor hands the slot back via FinishReading()
//
//Typical usage:
//  std::function<void()> task;
//  {
//      BufConsumer<std::function<void()>> consumer(this->_tasks);
//      if (consumer.empty())
//      {
//          return;
//      }
//      task = std::move(consumer.GetContent());
//  }
//  task();
template<class ValueType>
class BufConsumer
{
public:
    //Claims a slot immediately; _seq stays -1 only if the disruptor is
    //stopped and drained.
    BufConsumer(Disruptor<ValueType>* disruptor)
        : _disruptor(disruptor), _seq(disruptor->GetReadableSeq())
    {
    }
    //Returns the slot to the disruptor (FinishReading ignores -1).
    ~BufConsumer()
    {
        _disruptor->FinishReading(_seq);
    }
    BufConsumer(const BufConsumer&) = delete;
    BufConsumer(const BufConsumer&&) = delete;
    void operator=(const BufConsumer&) = delete;
    //True when no valid slot could be claimed.
    bool empty()
    {
        return _seq < 0;
    }
    //Payload stored in the claimed slot; only meaningful when !empty().
    ValueType& GetContent()
    {
        return _disruptor->ReadFromBuf(_seq);
    }
private:
    Disruptor<ValueType>* _disruptor;
    int64_t _seq;
};
}
3、應用於線程池
對Github第一的線程池項目https://github.com/progschj/ThreadPool使用disruptor進行了優化:
namespace Kang {
//Thread pool whose task queue is the lock-free Disruptor ring buffer instead
//of the original mutex + condition_variable queue.
class ThreadPool {
public:
    ThreadPool(size_t);
    //FIX: the pool owns _tasks through a raw pointer, so the
    //compiler-generated copy operations would double-delete it on
    //destruction; forbid copying (Rule of Five).
    ThreadPool(const ThreadPool&) = delete;
    ThreadPool& operator=(const ThreadPool&) = delete;
    template<class F, class... Args>
    auto enqueue(F&& f, Args&&... args)
        ->std::future<typename std::result_of<F(Args...)>::type>;
    ~ThreadPool();
private:
    // need to keep track of threads so we can join them
    std::vector< std::thread > _workers;
    //Task queue: lock-free multi-producer / multi-consumer ring buffer.
    Disruptor<std::function<void()>> *_tasks;
};
// the constructor just launches some amount of workers
inline ThreadPool::ThreadPool(size_t threads)
{
    _tasks = new Disruptor<std::function<void()>>();
    _workers.reserve(threads);  //one allocation instead of repeated growth
    for (size_t i = 0; i < threads; ++i)
    {
        _workers.emplace_back(
            [this]
            {
                for (;;)
                {
                    std::function<void()> task;
                    {
                        //RAII: claims a slot, returns it on scope exit.
                        BufConsumer<std::function<void()>> consumer(this->_tasks);
                        //empty() means the pool was stopped and drained:
                        //the worker thread exits.
                        if (consumer.empty())
                        {
                            return;
                        }
                        //Move the task out before the slot is released so
                        //the slot can be reused while the task runs.
                        task = std::move(consumer.GetContent());
                    }
                    task();
                }
            }
        );
    }
}
// add new work item to the pool
//Returns a future for the task's result. Throws std::runtime_error (from
//WriteInBuf) if called after the pool has been stopped/destroyed.
template<class F, class... Args>
auto ThreadPool::enqueue(F&& f, Args&&... args)
    -> std::future<typename std::result_of<F(Args...)>::type>
{
    using return_type = typename std::result_of<F(Args...)>::type;
    //packaged_task is move-only; share it so the copyable std::function
    //wrapper below can hold it.
    auto task = std::make_shared< std::packaged_task<return_type()> >(
        std::bind(std::forward<F>(f), std::forward<Args>(args)...)
    );
    std::future<return_type> res = task->get_future();
    //The lambda converts to a std::function temporary that binds directly to
    //WriteInBuf's rvalue overload; the original wrapped it in a redundant
    //std::move (std::move on a prvalue is a no-op).
    _tasks->WriteInBuf([task]() { (*task)(); });
    return res;
}
// the destructor joins all threads
inline ThreadPool::~ThreadPool()
{
    //Stop accepting writes; workers drain the remaining tasks, then exit.
    _tasks->stop();
    for (std::thread& worker : _workers)
        worker.join();
    delete _tasks;
}
}
在此之前,我對github這個線程池項目進行過雙緩存隊列改造,所以在這裏一起進行對比,打印兩個時間:
- 寫操作入隊時間
- 所有任務執行完時間
Disruptor改造的線程池的寫操作入隊時間主要受限於ringbuffer的大小(寫操作一般很快,buffer不夠大需要等待前面的讀完纔有空位寫),所以測試都修改爲ringbuffer很大,這裏PO一個3生產者-3消費者的測試結果,第一個爲雙緩存隊列改造的線程池,第二個爲github上面的原始線程池,第三個爲disruptor改造的線程池:
可見效率提高了不少。
測試代碼如下,使用的gtest框架:
#define THREADPOOL_SIZE 3
#define PRODUCER_SIZE 3
#define FUNC_TIMES 2000
#define ENQUEUE_NUMS 60000
//Busy-work task for the benchmark: sums the integers 1 .. n-1
//(returns 0 when n <= 1).
int sum(int n)
{
    int total = 0;
    int i = 1;
    while (i < n)
    {
        total += i;
        ++i;
    }
    return total;
}
//Producer loop: submits ENQUEUE_NUMS sum() tasks to the disruptor pool.
void MyKangWrite(Kang::ThreadPool* threadPool)
{
    int remaining = ENQUEUE_NUMS;
    while (remaining-- > 0)
    {
        threadPool->enqueue(sum, FUNC_TIMES);
    }
}
//Producer loop: submits ENQUEUE_NUMS sum() tasks to the dual-queue pool.
void MyCCWrite(CCThreadPool* threadPool)
{
    int remaining = ENQUEUE_NUMS;
    while (remaining-- > 0)
    {
        threadPool->enqueue(sum, FUNC_TIMES);
    }
}
//Producer loop: submits ENQUEUE_NUMS sum() tasks to the original github pool.
void MyGitWrite(ThreadPool* threadPool)
{
    int remaining = ENQUEUE_NUMS;
    while (remaining-- > 0)
    {
        threadPool->enqueue(sum, FUNC_TIMES);
    }
}
//Benchmark the dual-queue pool: prints enqueue time, then total time
//(the pool destructor at the inner scope's end waits for all tasks).
TEST(ThreadPoolTest, Dul_QueueThreadPool) {
    clock_t enstart, enfinish, finish;
    double duration;
    {
        CCThreadPool threadPool(THREADPOOL_SIZE);
        std::vector< std::thread > writers;
        enstart = clock();
        for (size_t i = 0; i < PRODUCER_SIZE; ++i)
        {
            writers.emplace_back(MyCCWrite, &threadPool);
        }
        for (std::thread& w : writers)
        {
            w.join();
        }
        enfinish = clock();
        duration = ((double)(enfinish - enstart) / CLOCKS_PER_SEC) * 1000;
        printf("Dul-Queue Threadpool enqueue %f ms!\n", duration);
    }
    finish = clock();
    duration = ((double)(finish - enstart) / CLOCKS_PER_SEC) * 1000;
    printf("Dul-Queue Threadpool Run %f ms!\n", duration);
}
//Benchmark the original github pool: prints enqueue time, then total time
//(the pool destructor at the inner scope's end waits for all tasks).
TEST(ThreadPoolTest, GithubThreadPool) {
    clock_t enstart, enfinish, finish;
    double duration;
    {
        ThreadPool threadPool(THREADPOOL_SIZE);
        std::vector< std::thread > writers;
        enstart = clock();
        for (size_t i = 0; i < PRODUCER_SIZE; ++i)
        {
            writers.emplace_back(MyGitWrite, &threadPool);
        }
        for (std::thread& w : writers)
        {
            w.join();
        }
        enfinish = clock();
        duration = ((double)(enfinish - enstart) / CLOCKS_PER_SEC) * 1000;
        printf("Github Threadpool Multi enqueue %f ms!\n", duration);
    }
    finish = clock();
    duration = ((double)(finish - enstart) / CLOCKS_PER_SEC) * 1000;
    printf("Github Threadpool Multi Run %f ms!\n", duration);
}
//Benchmark the disruptor pool: prints enqueue time, then total time
//(the pool destructor at the inner scope's end waits for all tasks).
TEST(ThreadPoolTest, DisruptorThreadPool) {
    clock_t enstart, enfinish, finish;
    double duration;
    {
        Kang::ThreadPool threadPool(THREADPOOL_SIZE);
        std::vector< std::thread > writers;
        enstart = clock();
        for (size_t i = 0; i < PRODUCER_SIZE; ++i)
        {
            writers.emplace_back(MyKangWrite, &threadPool);
        }
        for (std::thread& w : writers)
        {
            w.join();
        }
        enfinish = clock();
        duration = ((double)(enfinish - enstart) / CLOCKS_PER_SEC) * 1000;
        printf("Disruptor Threadpool Multi enqueue %f ms!\n", duration);
    }
    finish = clock();
    duration = ((double)(finish - enstart) / CLOCKS_PER_SEC) * 1000;
    printf("Disruptor Threadpool Multi Run %f ms!\n", duration);
}
4、不足
(1)沒有寫等待策略類,寫得很輕量,所有需要等待的地方都是自旋的,這個很快,但是對CPU負擔也不小,所以這裏不適合消費者和生產者很多的情況,很多的話需要用yield或者bloking wait。
(2)自旋的等待有些實際是可以優化掉的,比如像線程池這種確定了消費者數量的情況,可以創建一個數組,專門存放每個消費者lastRead的下標,然後通過遍歷這個數組來更新_lastRead,就不需要每個線程等待別的線程讀到自己的上一個位置再更新_lastRead了。生產者數量確定的話也同理。
(3)單生產者單消費者連原子變量都不需要了,不過這個也不算不足。