diff options
author | Yunqing Wang <yunqingwang@google.com> | 2013-12-27 15:25:54 -0800 |
---|---|---|
committer | Yunqing Wang <yunqingwang@google.com> | 2014-01-31 14:44:53 -0800 |
commit | 903801f1ef7ac8d13d4f57571d048b604e8aaafd (patch) | |
tree | 23567c0947d8492ea9333ff924ed02e0d505c8bb /vp9/decoder/vp9_thread.h | |
parent | e78c174e540117dcfcdff505d38478d4ac6df844 (diff) | |
download | libvpx-903801f1ef7ac8d13d4f57571d048b604e8aaafd.tar libvpx-903801f1ef7ac8d13d4f57571d048b604e8aaafd.tar.gz libvpx-903801f1ef7ac8d13d4f57571d048b604e8aaafd.tar.bz2 libvpx-903801f1ef7ac8d13d4f57571d048b604e8aaafd.zip |
vp9 decoder: row-based multi-threaded loopfilter
Implemented parallel loopfiltering, which uses the existing tile-
decoding threads. Each thread works on one row and, once that row
has been loopfiltered, moves on to the next unattended row. To ensure
the correct filtering order, the threads are synchronized: a
superblock is filtered only after the superblocks it depends on have
already been filtered.
To reduce synchronization overhead and speed up the decoder, we use
nsync > 1 for high-resolution video.
Performance tests:
1. on desktop:
8-tile 4k video using 8 threads, speedup: 70% - 80%
4-tile HD video using 4 threads, speedup: ~35%
2. on a mobile device (Nexus 7):
4-tile 1080p video using 4 threads, speedup: 18% - 25%
4-tile 1080p video using 2 threads, speedup: 10% - 15%
Change-Id: If54b4a11960dd706c22d5ad145ad94156031f36a
Diffstat (limited to 'vp9/decoder/vp9_thread.h')
-rw-r--r-- | vp9/decoder/vp9_thread.h | 119 |
1 files changed, 114 insertions, 5 deletions
```diff
diff --git a/vp9/decoder/vp9_thread.h b/vp9/decoder/vp9_thread.h
index bc69cfa1f..2f8728dcf 100644
--- a/vp9/decoder/vp9_thread.h
+++ b/vp9/decoder/vp9_thread.h
@@ -26,7 +26,8 @@ extern "C" {
 #if CONFIG_MULTITHREAD
 
 #if defined(_WIN32)
-
+#include <errno.h>  // NOLINT
+#include <process.h>  // NOLINT
 #include <windows.h>  // NOLINT
 typedef HANDLE pthread_t;
 typedef CRITICAL_SECTION pthread_mutex_t;
@@ -36,12 +37,120 @@ typedef struct {
   HANDLE signal_event_;
 } pthread_cond_t;
 
-#else
-
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static INLINE int pthread_create(pthread_t* const thread, const void* attr,
+                                 unsigned int (__stdcall *start)(void*),
+                                 void* arg) {
+  (void)attr;
+  *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
+                                      0,      /* unsigned stack_size */
+                                      start,
+                                      arg,
+                                      0,      /* unsigned initflag */
+                                      NULL);  /* unsigned *thrdaddr */
+  if (*thread == NULL) return 1;
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
+  (void)value_ptr;
+  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void* mutexattr) {
+  (void)mutexattr;
+  InitializeCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  EnterCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  LeaveCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  DeleteCriticalSection(mutex);
+  return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  int ok = 1;
+  ok &= (CloseHandle(condition->waiting_sem_) != 0);
+  ok &= (CloseHandle(condition->received_sem_) != 0);
+  ok &= (CloseHandle(condition->signal_event_) != 0);
+  return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void* cond_attr) {
+  (void)cond_attr;
+  condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+  if (condition->waiting_sem_ == NULL ||
+      condition->received_sem_ == NULL ||
+      condition->signal_event_ == NULL) {
+    pthread_cond_destroy(condition);
+    return 1;
+  }
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  int ok = 1;
+  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+    // a thread is waiting in pthread_cond_wait: allow it to be notified
+    ok = SetEvent(condition->signal_event_);
+    // wait until the event is consumed so the signaler cannot consume
+    // the event via its own pthread_cond_wait.
+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+           WAIT_OBJECT_0);
+  }
+  return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok;
+  // note that there is a consumer available so the signal isn't dropped in
+  // pthread_cond_signal
+  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
+    return 1;
+  // now unlock the mutex so pthread_cond_signal may be issued
+  pthread_mutex_unlock(mutex);
+  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+        WAIT_OBJECT_0);
+  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
+  pthread_mutex_lock(mutex);
+  return !ok;
+}
+#else  // _WIN32
 #include <pthread.h>  // NOLINT
+# define THREADFN void*
+# define THREAD_RETURN(val) val
+#endif
 
-#endif /* _WIN32 */
-#endif /* CONFIG_MULTITHREAD */
+#endif  // CONFIG_MULTITHREAD
 
 // State of the worker thread object
 typedef enum {
```