原文作者:@玄冬Wong
好久没做过benchmark了。这次之所以想测试一下,是担心std::atomic的性能不如Windows的Interlocked系列API。实测发现两者性能差不多,Interlocked的微弱领先优势可以忽略不计。
先公布结果:Windows Interlocked、std::atomic、boost::atomic三者的性能几乎相同,Windows的Interlocked略好一点点。
测试代码:
#ifdef _WIN64 #ifndef _DEBUG #pragma comment(lib, "libboost_atomic-vc140-mt-1_60.lib") #endif #endif //#include "stdafx.h" #include <windows.h> #include <iostream> #include <atomic> #include <boost/atomic.hpp> #include <time.h> #include <thread> #include <list> #define MAX_THREADS 16 #define LOOP_COUNT 10000000 volatile long g_CountWin = 0; std::atomic<long> g_CountStd = 0; boost::atomic_long g_CountBoost(0); void Interlocked_fun() { for (int i = 0; i < LOOP_COUNT; i++) { InterlockedIncrement((LPLONG)&g_CountWin); } } void std_atomic_fun() { for (int i = 0; i < LOOP_COUNT; i++) { ++g_CountStd; } } void boost_atomic_fun() { for (int i = 0; i < LOOP_COUNT; i++) { ++g_CountBoost; } } void test_Interlocked() { std::list<std::thread*> threadlist; //测试Interlocked printf("testing Interlocked...\n"); clock_t start = clock(); for (int i = 0; i < MAX_THREADS; ++i) { std::thread *t1 = new std::thread((&Interlocked_fun)); threadlist.push_back(t1); } for (std::list<std::thread*>::const_iterator i = threadlist.begin(); i != threadlist.end(); i++) { (*i)->join(); } clock_t finish = clock(); printf("result:%d\n", g_CountWin); printf("cost:%dms\n", finish - start); for (std::list<std::thread*>::const_iterator i = threadlist.begin(); i != threadlist.end(); i++) { delete(*i); } } void test_std_atomic() { std::list<std::thread*> threadlist; //测试std::atomic printf("testing std::atomic...\n"); clock_t start = clock(); for (int i = 0; i < MAX_THREADS; ++i) { std::thread *t1 = new std::thread((&std_atomic_fun)); threadlist.push_back(t1); } for (std::list<std::thread*>::const_iterator i = threadlist.begin(); i != threadlist.end(); i++) { (*i)->join(); } clock_t finish = clock(); printf("result:%d\n", g_CountStd); printf("cost:%dms\n", finish - start); for (std::list<std::thread*>::const_iterator i = threadlist.begin(); i != threadlist.end(); i++) { delete(*i); } } void test_boost_atomic() { std::list<std::thread*> threadlist; //测试boost::atomic printf("testing boost::atomic...\n"); clock_t 
start = clock(); for (int i = 0; i < MAX_THREADS; ++i) { std::thread *t1 = new std::thread((&boost_atomic_fun)); threadlist.push_back(t1); } for (std::list<std::thread*>::const_iterator i = threadlist.begin(); i != threadlist.end(); i++) { (*i)->join(); } clock_t finish = clock(); printf("result:%d\n", g_CountBoost); printf("cost:%dms\n", finish - start); for (std::list<std::thread*>::const_iterator i = threadlist.begin(); i != threadlist.end(); i++) { delete(*i); } } int main(char* args, int size) { test_Interlocked(); //test_std_atomic(); //test_boost_atomic(); }
三种API的测试线程数都是16个并发线程,测试输出结果如下(跑了5次,取的平均值):
testing Interlocked...
result:160000000
cost:4926ms
testing std::atomic...
result:160000000
cost:4952ms
testing boost::atomic...
result:160000000
cost:4949ms
测试环境:
boost 1.60
windows 10 pro x64
VS2015企业版 update2,release x64
CPU:i7二代移动版