little thread: Lock-free는 과연 빠른가? (2)

수십개의 쓰레드가 1~n개의 변수를 각각 atomic하게 변경하는 함수를 매우 빈번하게 호출한다고 가정하자.
해당 함수를 lock-free 로 구현한 버젼이 있고, spin-lock 이나 mutex 로 구현한 버젼이 각각 있다. 어떤 함수가 더 빠를까? 게다가 쓰레드 수가 충분히 많다면? 그리고 C++ 로 구현한 lock-free 함수와 자바 언어로 구현한 함수를 비교하면?
아마도 많은 개발자들이 쉽게 아래와 같이 대답할 것 같다.
1) 병렬처리 기법 즉, Lock-Free를 이용한 함수가 더 빠를 것이다.
2) 쓰레드 수가 많아질수록 Lock-free 기법을 이용한 함수가 더 빠를 것이다.
3) 질문자 수준이 의심된다. 당연히 C언어 Lock-Free 가 월등히 빠를 것이다.

과연 그럴까?
결론부터 말하면 위의 세 개의 답은 모두 틀렸다.
아래는 테스트에 사용된 소스코드와 그 결과값이다.
물론, 아래 테스트 결과는 매우 제한된 조건에서만 유의미하다. 다만, 잘못된 상식을 벗어나나게 하는 용도로는 큰 효과가 있을 것이다.

include <Windows.h>
#include <stdio.h>

using namespace std;

CRITICAL_SECTION cs;

class SpinLock {
public:
SpinLock(int cntSpin) {
this->spin_lock = 0;
this->cntSpin = cntSpin;
}

void lock() {
int v;
while (::InterlockedCompareExchange(&spin_lock, (LONG)&v, 0) != 0) {
if (cntSpin <= 0) {
::Sleep(1);
}
else {
for (int i = 0; i < cntSpin; i ++) {
_asm nop;
}
}
}
}

void unlock() {
spin_lock = 0;
}

class Auto {
public:
Auto(SpinLock* spinLock) {
this->spinLock = spinLock;
spinLock->lock();
}

~Auto(){
spinLock->unlock();
}

private:
SpinLock* spinLock;
};

private:
volatile LONG spin_lock;
bool cntSpin;
};

static SpinLock gSlowSpinLock(0);
static SpinLock gFastSpinLock(1);
volatile LONG gSharedData[20] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, };
static int TEST_LOOP;

DWORD WINAPI test_slow_lock(LPVOID param) {
int cntTest = (int) param;
for (int i = 0; i < TEST_LOOP; i++) {
SpinLock::Auto lock(&gSlowSpinLock);
volatile LONG* p = (LONG*)gSharedData;
for (int j = 0; j < cntTest; j++) {
*p += 1;
p++;
}
}
return 0;
}

DWORD WINAPI test_fast_lock(LPVOID param) {
int cntTest = (int) param;
for (int i = 0; i < TEST_LOOP; i++) {
SpinLock::Auto lock(&gFastSpinLock);
volatile LONG* p = (LONG*)gSharedData;
for (int j = 0; j < cntTest; j++) {
*p += 1;
p++;
}
}
return 0;
}

DWORD WINAPI test_lock_free(LPVOID param) {
int cntTest = (int)param;
for (int i = 0; i < TEST_LOOP; i++) {
volatile LONG* p = gSharedData;
for (int j = 0; j < cntTest; j++) {
::InterlockedIncrement(p);
p++;
}
}
return 0;
}

DWORD WINAPI test_critical_section(LPVOID param) {
int cntTest = (int) param;
for (int i = 0; i < TEST_LOOP; i++) {
::EnterCriticalSection(&cs);
volatile LONG* p = (LONG*)gSharedData;
for (int j = 0; j < cntTest; j++) {
*p += 1;
p++;
}
::LeaveCriticalSection(&cs);
}
return 0;
}

static const int MAX_THREADS = 32;
int doLockTest(int cntThread, LPTHREAD_START_ROUTINE test_fn, int cntAtomicUpdate) {

HANDLE th[MAX_THREADS];
memset((void*)gSharedData, 0, cntAtomicUpdate*sizeof(int));

ULONGLONG t0 = ::GetTickCount64();
for (int i = 0; i < cntThread; ++i)
th[i] = CreateThread(NULL, NULL, test_fn, (void *)cntAtomicUpdate, NULL, NULL);

WaitForMultipleObjects(cntThread, th, TRUE, INFINITE);
ULONGLONG t1 = ::GetTickCount64();

for (int i = 0; i < cntAtomicUpdate; ++i) {
if (gSharedData[i] != TEST_LOOP * cntThread) {
printf("!!! Invalid test function");
exit(-1);
}
}

return (int)(t1 - t0);
}

int _tmain(int argc, _TCHAR* argv[])
{
::InitializeCriticalSection(&cs);
for (int cntThread = 1; cntThread <= MAX_THREADS; cntThread *= 2) {
TEST_LOOP = 12800 / cntThread * 1000;
printf("\n\n=========== Threads = %d, Test loop = %d*1000 ===============\n",
cntThread, TEST_LOOP/1000);
printf("atomic_update | lock_free | slow_spin | fast_spin | crit_sect \n");
for (int cntAtomicUpdate = 1; cntAtomicUpdate <= 5; cntAtomicUpdate ++) {
int t1 = doLockTest(cntThread, test_lock_free, cntAtomicUpdate);
int t2 = doLockTest(cntThread, test_slow_lock, cntAtomicUpdate);
int t3 = doLockTest(cntThread, test_fast_lock, cntAtomicUpdate);
int t4 = doLockTest(cntThread, test_critical_section, cntAtomicUpdate);
printf("% 7d % 14d % 11d % 11d % 11d\n", cntAtomicUpdate, t1, t2, t3, t4);
}
}
return 0;
}

아래가 위 소스코드를 실행한 결과이다.

=========== Threads = 1, Test loop = 12800*1000 ===============
atomic_update | lock_free | slow_spin | fast_spin | crit_sect
1 109 125 125 281
2 203 124 141 265
3 296 141 140 297
4 374 140 141 296
5 453 156 140 296

=========== Threads = 2, Test loop = 6400*1000 ===============
atomic_update | lock_free | slow_spin | fast_spin | crit_sect
1 359 125 452 749
2 609 140 437 733
3 827 140 531 592
4 1061 141 499 702
5 1139 140 936 686

=========== Threads = 4, Test loop = 3200*1000 ===============
atomic_update | lock_free | slow_spin | fast_spin | crit_sect
1 344 124 1014 874
2 608 141 1014 874
3 873 141 1076 593
4 1107 141 1045 874
5 1060 141 1310 624

=========== Threads = 8, Test loop = 1600*1000 ===============
atomic_update | lock_free | slow_spin | fast_spin | crit_sect
1 312 125 1747 874
2 546 125 1778 952
3 795 141 1778 593
4 1092 140 1794 1045
5 1092 156 1935 593

=========== Threads = 16, Test loop = 800*1000 ===============
atomic_update | lock_free | slow_spin | fast_spin | crit_sect
1 296 140 3573 920
2 531 140 2793 936
3 811 140 2808 593
4 1076 141 3167 1060
5 1092 156 3339 593

=========== Threads = 32, Test loop = 400*1000 ===============
atomic_update | lock_free | slow_spin | fast_spin | crit_sect
1 296 156 5881 921
2 530 156 4945 952
3 811 156 4883 577
4 1092 156 6209 998
5 1108 172 7004 593

아마 위 결과를 보고 믿지 못하는 분도 계실지 모르겠다. 그런 경우 소스코드를 직접 실행해 보시기 바란다.
보다시피 쓰레드 수가 2 이상인 경우, 단 한 개의 변수를 변경할 때도 SlowSpinLock 은 다른 모든 방식보다 보다 2~3배 이상 빠르다. (첨고로, FastSpinLock 은 다른 모든 방식보다 매우 큰 차이로 느리다.)
이러한 차이는 SlowSpinLock 이 매우 효과적으로 쓰레드간 경쟁상태를 최소화하기 때문에 발생하는 것으로 판단된다. CriticalSection은 FastSpinLock을 일부 포함하고 있어서 어정쩡한 결과가 나오는 것이라면, SlowSpinLock 과 유사하게 구현된 java synchronized 는 어떤 결과가 나올까?
아래가 그 결과이다. Windows 용 JDK 1.6.3 에서 테스트한 것이다.
놀랍게도 Thread 수에 관계없이 atomic 하게 변경하는 변수가 2 또는 3개 이상이면 java synchronized 가 C++ 로 구현한 Lock-free 코드보다 더 빨라지기 시작한다.

=========== Threads = 1, Test loop = 12800*1000 ===============
atomic_update = 1 : java_synchronized = 181
atomic_update = 2 : java_synchronized = 181
atomic_update = 3 : java_synchronized = 179
atomic_update = 4 : java_synchronized = 180
atomic_update = 5 : java_synchronized = 184

=========== Threads = 2, Test loop = 6400*1000 ===============
atomic_update = 1 : java_synchronized = 414
atomic_update = 2 : java_synchronized = 572
atomic_update = 3 : java_synchronized = 482
atomic_update = 4 : java_synchronized = 472
atomic_update = 5 : java_synchronized = 426

=========== Threads = 4, Test loop = 3200*1000 ===============
atomic_update = 1 : java_synchronized = 576
atomic_update = 2 : java_synchronized = 555
atomic_update = 3 : java_synchronized = 559
atomic_update = 4 : java_synchronized = 490
atomic_update = 5 : java_synchronized = 513

=========== Threads = 8, Test loop = 1600*1000 ===============
atomic_update = 1 : java_synchronized = 493
atomic_update = 2 : java_synchronized = 501
atomic_update = 3 : java_synchronized = 586
atomic_update = 4 : java_synchronized = 526
atomic_update = 5 : java_synchronized = 530

=========== Threads = 16, Test loop = 800*1000 ===============
atomic_update = 1 : java_synchronized = 530
atomic_update = 2 : java_synchronized = 538
atomic_update = 3 : java_synchronized = 524
atomic_update = 4 : java_synchronized = 522
atomic_update = 5 : java_synchronized = 569

=========== Threads = 32, Test loop = 400*1000 ===============
atomic_update = 1 : java_synchronized = 574
atomic_update = 2 : java_synchronized = 571
atomic_update = 3 : java_synchronized = 563
atomic_update = 4 : java_synchronized = 557
atomic_update = 5 : java_synchronized = 568

과연 자바는 최강의 멀티쓰레드 플랫폼이라 불릴만하다.

참고로, pthread_mutex 를 사용하면 CriticalSection 보다 2~3배 느려진다. Linux에서 Java 버젼도 이런 영향을 받아선지 성능이 뚝 떨어지지만 그래도 pthread_mutex 보다는 2배 이상 빠르다.

결론은 멀티쓰레드 코딩 방식에는 절대 강자는 없으며, 병렬컴퓨팅이 명확히 필요한 경우가 아니라면 lock-free 에 집착할 필요가 없다는 것이다. 위 테스트 결과처럼, 때로는 매우 단순한 CAS 함수와 Sleep 함수 하나 만으로도 매우 탁월한 퍼포먼스를 얻어낼 수 있다. Thread 간 충돌이 빈번히 발생하는 상황에서 Thread 블로킹을 최소화하려면 Lock-Free가 필요하지만 오히려 Lock을 적절히 사용하여 Thread 간 충돌을 최소화하여 퍼포먼스를 더 높일 수 있는 방법도 존재하는 것이다. 즉, 때로는 Lock-free 가, 때로는 FastSpinLock 이, 때로는 CriticalSection 이 가장 빠를 때가 있으며, 그때 그때 상황에 맞춰 해결하여야 한다. 다만, 자바 프로그래머라면 이런 고민에서 완전히 해방감을 느껴도 무방하다. ^^

누군가 C++11 의 mutex 관련 테스트자료도 올려주시면 고맙겠는데...
참고로, 아래에 자바 소스 코드도 첨부합니다.

public class TestMain {
static int gSharedData[] = new int[20];
static int TEST_LOOP;

static class JSync extends Thread {
JSync(int c) {
this.cntTest = c;
start();
}
public void run() {
for (int i = 0; i < TEST_LOOP; i++) {
synchronized (gSharedData) {
for (int j = 0; j < cntTest; j++) {
gSharedData[j] ++;
}
}
}
}
int cntTest;
}

static final int MAX_THREADS = 32;
static int doLockTest(int cntThread, int cntAtomicUpdate, boolean testSync) throws InterruptedException {
Thread th[] = new Thread[MAX_THREADS];
for (int i = 0; i < cntAtomicUpdate; i ++) {
gSharedData[i] = 0;
}

long t0 = System.currentTimeMillis();

for (int i = 0; i < cntThread; ++i) {
th[i] = new JSync(cntAtomicUpdate);
}
for (int i = 0; i < cntThread; ++i) {
th[i].join();
}

long t1 = System.currentTimeMillis();

for (int i = 0; i < cntAtomicUpdate; ++i) {
if (gSharedData[i] != TEST_LOOP * cntThread) {
System.out.print("!!! Invalid test function");
System.exit(-1);
}
}

return (int)(t1 - t0);
}

public static void main(String[] args) {

try {
for (int cntThread = 1; cntThread <= MAX_THREADS; cntThread *= 2) {
TEST_LOOP = 12800 / cntThread * 1000;
System.out.print("\n\n=========== Threads = " + cntThread + ", Test loop = " + TEST_LOOP/1000 + "*1000 ===============\n");

doLockTest(cntThread, 1, true);
for (int cntAtomicUpdate = 1; cntAtomicUpdate <= 5; cntAtomicUpdate += 1) {
int t1 = doLockTest(cntThread, cntAtomicUpdate, true);
System.out.print("atomic_update = " + cntAtomicUpdate
+ " : java_synchronized = " + t1 + "\n");
}
}
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}

little thread

2014년 8월 22일 금요일

Lock-free는 과연 빠른가? (2)

댓글 없음:

댓글 쓰기

블로그 보관함