summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x build/make/configure.sh 3
-rwxr-xr-x configure 1
-rw-r--r-- vp8/common/arm/neon/loopfilter_neon.asm 15
-rw-r--r-- vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm 16
-rw-r--r-- vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm 16
-rw-r--r-- vp8/common/arm/neon/mbloopfilter_neon.asm 27
-rw-r--r-- vp8/common/generic/systemdependent.c 48
-rw-r--r-- vp8/common/onyxc_int.h 3
-rw-r--r-- vp8/decoder/threading.c 10
-rw-r--r-- vp8/encoder/ethreading.c 10
-rw-r--r-- vp8/encoder/onyx_if.c 4
-rw-r--r-- vp8/encoder/onyx_int.h 2
12 files changed, 91 insertions, 64 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 3324be36e..28a0247d8 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -980,6 +980,9 @@ EOF
esac
fi
+ # for sysconf(3) and friends.
+ check_header unistd.h
+
# glibc needs these
if enabled linux; then
add_cflags -D_LARGEFILE_SOURCE
diff --git a/configure b/configure
index 0f30df2cb..bbe58069a 100755
--- a/configure
+++ b/configure
@@ -211,6 +211,7 @@ HAVE_LIST="
alt_tree_layout
pthread_h
sys_mman_h
+ unistd_h
"
EXPERIMENT_LIST="
extend_qrange
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
index d3a79f640..e73dd6401 100644
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -308,7 +308,6 @@
; q9 q2
; q10 q3
|vp8_loop_filter_neon| PROC
- ldr r12, _lf_coeff_
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
@@ -339,7 +338,7 @@
vqadd.u8 q9, q9, q2 ; a = b + a
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
- vld1.u8 {q0}, [r12]!
+ vmov.u8 q0, #0x80 ; 0x80
; vp8_filter() function
; convert to signed
@@ -348,7 +347,7 @@
veor q5, q5, q0 ; ps1
veor q8, q8, q0 ; qs1
- vld1.u8 {q10}, [r12]!
+ vmov.u8 q10, #3 ; #3
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
@@ -367,7 +366,7 @@
vaddw.s8 q2, q2, d2
vaddw.s8 q11, q11, d3
- vld1.u8 {q9}, [r12]!
+ vmov.u8 q9, #4 ; #4
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
@@ -399,12 +398,4 @@
;-----------------
-_lf_coeff_
- DCD lf_coeff
-lf_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
-
END
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
index 5fe7e7e6d..7c5ea3644 100644
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -22,20 +22,19 @@
; r1 int p, //pitch
; r2 const signed char *flimit,
; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
+; stack(r4) const signed char *thresh (unused)
; //stack(r5) int count --unused
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines
- ldr r12, _lfhy_coeff_
vld1.u8 {q5}, [r0], r1 ; p1
vld1.s8 {d2[], d3[]}, [r2] ; flimit
vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
vld1.u8 {q6}, [r0], r1 ; p0
- vld1.u8 {q0}, [r12]! ; 0x80
+ vmov.u8 q0, #0x80 ; 0x80
vld1.u8 {q7}, [r0], r1 ; q0
- vld1.u8 {q10}, [r12]! ; 0x03
+ vmov.u8 q10, #0x03 ; 0x03
vld1.u8 {q8}, [r0] ; q1
;vp8_filter_mask() function
@@ -66,7 +65,7 @@
vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0)
vadd.s16 q12, q3, q3
- vld1.u8 {q9}, [r12]! ; 0x04
+ vmov.u8 q9, #0x04 ; 0x04
vadd.s16 q2, q2, q11
vadd.s16 q3, q3, q12
@@ -105,11 +104,4 @@
;-----------------
-_lfhy_coeff_
- DCD lfhy_coeff
-lfhy_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
-
END
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
index c30378b9c..a7f7b690e 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -22,7 +22,7 @@
; r1 int p, //pitch
; r2 const signed char *flimit,
; r3 const signed char *limit,
-; stack(r4) const signed char *thresh,
+; stack(r4) const signed char *thresh (unused)
; //stack(r5) int count --unused
|vp8_loop_filter_simple_vertical_edge_neon| PROC
@@ -32,7 +32,6 @@
vld1.s8 {d2[], d3[]}, [r2] ; flimit
vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
- ldr r12, _vlfy_coeff_
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
@@ -41,11 +40,11 @@
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vld1.u8 {q0}, [r12]! ; 0x80
+ vmov.u8 q0, #0x80 ; 0x80
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
- vld1.u8 {q11}, [r12]! ; 0x03
+ vmov.u8 q11, #0x03 ; 0x03
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vld1.u8 {q12}, [r12]! ; 0x04
+ vmov.u8 q12, #0x04 ; 0x04
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
@@ -146,11 +145,4 @@
;-----------------
-_vlfy_coeff_
- DCD vlfy_coeff
-vlfy_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
-
END
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
index 981adffd1..72f0f9271 100644
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -372,7 +372,6 @@
; q10 q3
|vp8_mbloop_filter_neon| PROC
- ldr r12, _mblf_coeff_
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
@@ -396,7 +395,7 @@
vld1.s8 {d4[], d5[]}, [r2] ; flimit
- vld1.u8 {q0}, [r12]!
+ vmov.u8 q0, #0x80 ; 0x80
vadd.u8 q2, q2, q2 ; flimit * 2
vadd.u8 q2, q2, q1 ; flimit * 2 + limit
@@ -431,12 +430,12 @@
vadd.s16 q2, q2, q10
vadd.s16 q13, q13, q11
- vld1.u8 {q12}, [r12]! ; #3
+ vmov.u8 q12, #3 ; #3
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
- vld1.u8 {q11}, [r12]! ; #4
+ vmov.u8 q11, #4 ; #4
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
@@ -444,16 +443,16 @@
vand q1, q1, q15 ; vp8_filter &= mask
- vld1.u8 {q15}, [r12]! ; #63
- ;
+ vmov.u16 q15, #63 ; #63
+
vand q13, q1, q14 ; Filter2 &= hev
- vld1.u8 {d7}, [r12]! ; #9
+ vmov.u8 d7, #9 ; #9
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
- vld1.u8 {d6}, [r12]! ; #18
+ vmov.u8 d6, #18 ; #18
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
@@ -463,7 +462,7 @@
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
- vld1.u8 {d5}, [r12]! ; #27
+ vmov.u8 d5, #27 ; #27
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
@@ -507,14 +506,4 @@
;-----------------
-_mblf_coeff_
- DCD mblf_coeff
-mblf_coeff
- DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
- DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
- DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
- DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
- DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
- DCD 0x1b1b1b1b, 0x1b1b1b1b
-
END
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index fea6dcd23..c7fbb3e09 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -17,9 +17,54 @@
#include "vp8/common/idct.h"
#include "vp8/common/onyxc_int.h"
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#endif
+#endif
+
extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
+#if CONFIG_MULTITHREAD
+static int get_cpu_count() /* Report how many processors the host offers.
+ *
+ * Starts from a fallback of 16 and replaces it with the value reported by
+ * sysconf() when unistd.h is available, or with the processor count from
+ * SYSTEM_INFO on Windows. The returned value is never less than 1.
+ */
+{
+ int core_count = 16; /* fallback kept when no branch below assigns a value */
+
+#if HAVE_UNISTD_H
+#if defined(_SC_NPROCESSORS_ONLN)
+ core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+ core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+ {
+ PGNSI pGNSI;
+ SYSTEM_INFO sysinfo;
+
+ /* Call GetNativeSystemInfo if supported or
+ * GetSystemInfo otherwise. The function is looked up at run time
+ * through GetProcAddress, so a kernel32.dll without that export
+ * yields NULL and takes the GetSystemInfo path below. */
+
+ pGNSI = (PGNSI) GetProcAddress(
+ GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+ if (pGNSI != NULL)
+ pGNSI(&sysinfo);
+ else
+ GetSystemInfo(&sysinfo);
+
+ core_count = sysinfo.dwNumberOfProcessors;
+ }
+#else
+ /* other platforms: nothing to query, the fallback value is used */
+#endif
+
+ return core_count > 0 ? core_count : 1; /* zero or negative results are reported as a single core */
+}
+#endif
+
void vp8_machine_specific_config(VP8_COMMON *ctx)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -98,4 +143,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
#endif
+#if CONFIG_MULTITHREAD
+ ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
}
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index e67d39cbb..0565127e1 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -196,6 +196,9 @@ typedef struct VP8Common
#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON_RTCD rtcd;
#endif
+#if CONFIG_MULTITHREAD
+ int processor_core_count;
+#endif
struct postproc_state postproc_state;
} VP8_COMMON;
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 56275940e..9ef85e9cd 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -439,12 +439,18 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
pbi->b_multithreaded_rd = 0;
pbi->allocated_decoding_thread_count = 0;
- core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads;
+
+ /* limit decoding threads to the max number of token partitions */
+ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
+
+ /* limit decoding threads to the available cores */
+ if (core_count > pbi->common.processor_core_count)
+ core_count = pbi->common.processor_core_count;
if (core_count > 1)
{
pbi->b_multithreaded_rd = 1;
- pbi->decoding_thread_count = core_count -1;
+ pbi->decoding_thread_count = core_count - 1;
CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index f5006ddab..c92a366e8 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -459,15 +459,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
cpi->b_multi_threaded = 0;
cpi->encoding_thread_count = 0;
- cpi->processor_core_count = 32; //vp8_get_proc_core_count();
- if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
+ if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
{
int ithread;
int th_count = cpi->oxcf.multi_threaded - 1;
- if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
- th_count = cpi->processor_core_count - 1;
+ /* don't allocate more threads than cores available */
+ if (cpi->oxcf.multi_threaded > cm->processor_core_count)
+ th_count = cm->processor_core_count - 1;
/* we have th_count + 1 (main) threads processing one row each */
/* no point to have more threads than the sync range allows */
@@ -514,6 +514,7 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;
sem_init(&cpi->h_event_start_lpf, 0, 0);
+ sem_init(&cpi->h_event_end_picklpf, 0, 0);
sem_init(&cpi->h_event_end_lpf, 0, 0);
lpfthd->ptr1 = (void *)cpi;
@@ -547,6 +548,7 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
sem_destroy(&cpi->h_event_end_encoding);
sem_destroy(&cpi->h_event_end_lpf);
+ sem_destroy(&cpi->h_event_end_picklpf);
sem_destroy(&cpi->h_event_start_lpf);
//free thread related resources
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 6a8b70e9f..e07b994bf 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3215,7 +3215,7 @@ void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
#if CONFIG_MULTITHREAD
if (cpi->b_multi_threaded)
- sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+ sem_post(&cpi->h_event_end_picklpf); /* signal that we have set filter_level */
#endif
if (cm->filter_level > 0)
@@ -4221,7 +4221,7 @@ static void encode_frame_to_data_rate
#if CONFIG_MULTITHREAD
/* wait that filter_level is picked so that we can continue with stream packing */
if (cpi->b_multi_threaded)
- sem_wait(&cpi->h_event_end_lpf);
+ sem_wait(&cpi->h_event_end_picklpf);
#endif
// build the bitstream
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index a66056dff..c2fcff88c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -576,7 +576,6 @@ typedef struct
// multithread data
int * mt_current_mb_col;
int mt_sync_range;
- int processor_core_count;
int b_multi_threaded;
int encoding_thread_count;
@@ -591,6 +590,7 @@ typedef struct
sem_t *h_event_start_encoding;
sem_t h_event_end_encoding;
sem_t h_event_start_lpf;
+ sem_t h_event_end_picklpf;
sem_t h_event_end_lpf;
#endif