diff options
Diffstat (limited to 'vp8/common')
-rw-r--r-- | vp8/common/arm/arm_systemdependent.c | 6 | ||||
-rw-r--r-- | vp8/common/arm/armv6/loopfilter_v6.asm | 64 | ||||
-rw-r--r-- | vp8/common/arm/armv6/simpleloopfilter_v6.asm | 29 | ||||
-rw-r--r-- | vp8/common/arm/loopfilter_arm.c | 191 | ||||
-rw-r--r-- | vp8/common/arm/loopfilter_arm.h | 34 | ||||
-rw-r--r-- | vp8/common/arm/neon/loopfilter_neon.asm | 294 | ||||
-rw-r--r-- | vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm | 100 | ||||
-rw-r--r-- | vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm | 178 | ||||
-rw-r--r-- | vp8/common/arm/neon/mbloopfilter_neon.asm | 422 | ||||
-rw-r--r-- | vp8/common/generic/systemdependent.c | 4 | ||||
-rw-r--r-- | vp8/common/loopfilter.c | 696 | ||||
-rw-r--r-- | vp8/common/loopfilter.h | 67 | ||||
-rw-r--r-- | vp8/common/loopfilter_filters.c | 80 | ||||
-rw-r--r-- | vp8/common/onyxc_int.h | 21 | ||||
-rw-r--r-- | vp8/common/x86/loopfilter_mmx.asm | 78 | ||||
-rw-r--r-- | vp8/common/x86/loopfilter_sse2.asm | 63 | ||||
-rw-r--r-- | vp8/common/x86/loopfilter_x86.c | 170 | ||||
-rw-r--r-- | vp8/common/x86/loopfilter_x86.h | 24 | ||||
-rw-r--r-- | vp8/common/x86/x86_systemdependent.c | 10 |
19 files changed, 1202 insertions, 1329 deletions
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index 8aab0ff03..c0467cd84 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -54,9 +54,11 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6; + rtcd->loopfilter.simple_mb_v = + vp8_loop_filter_simple_vertical_edge_armv6; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6; + rtcd->loopfilter.simple_mb_h = + vp8_loop_filter_simple_horizontal_edge_armv6; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6; rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm index c7441b055..1cbbbcdef 100644 --- a/vp8/common/arm/armv6/loopfilter_v6.asm +++ b/vp8/common/arm/armv6/loopfilter_v6.asm @@ -53,14 +53,11 @@ count RN r5 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, -;r2 const char *flimit, +;r2 const char *blimit, ;r3 const char *limit, ;stack const char *thresh, ;stack int count -;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. - ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- @@ -72,14 +69,18 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r9, [src], pstep ; p3 - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r10, [src], pstep ; p2 - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r11, [src], pstep ; p1 - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r6], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |Hnext8| ; vp8_filter_mask() function @@ -275,14 +276,18 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r9, [src], pstep ; p3 - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r10, [src], pstep ; p2 - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r11, [src], pstep ; p1 - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r6], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |MBHnext8| @@ -584,15 +589,19 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r7, [src], pstep - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r8, [src], pstep - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r12], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |Vnext8| @@ -855,18 +864,22 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit pld [src, #23] ldr r7, [src], pstep - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit pld [src, #23] ldr r8, [src], pstep - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r12], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 pld [src, #23] ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |MBVnext8| ; vp8_filter_mask() function @@ -906,6 +919,7 @@ count RN r5 str lr, [sp, #8] ldr lr, [src], pstep + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 ldr lr, [sp, #8] ; load back (f)limit accumulator @@ -954,6 +968,7 @@ count RN r5 beq mbvskip_filter ; skip filtering + ;vp8_hevmask() function ;calculate high edge variance @@ -1121,6 +1136,7 @@ count RN r5 smlabb r8, r6, lr, r7 smlatb r6, r6, lr, r7 smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 ssat r8, #8, r8, asr #7 ssat r6, #8, r6, asr #7 diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm index 40a71f49d..5e00cf01b 100644 --- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -45,35 +45,28 @@ MEND + src RN r0 pstep RN r1 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, -;r2 const char *flimit, -;r3 const char *limit, -;stack const char *thresh, -;stack int count - -; All 16 elements in flimit are equal. So, in the code, only one load is needed -; for flimit. Same applies to limit. thresh is not used in simple looopfilter +;r2 const char *blimit ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_simple_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r3] ; limit + ldrb r12, [r2] ; blimit ldr r3, [src, -pstep, lsl #1] ; p1 ldr r4, [src, -pstep] ; p0 ldr r5, [src] ; q0 ldr r6, [src, pstep] ; q1 - ldr r7, [r2] ; flimit + orr r12, r12, r12, lsl #8 ; blimit ldr r2, c0x80808080 - ldr r9, [sp, #40] ; count for 8-in-parallel - uadd8 r7, r7, r7 ; flimit * 2 - mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time - uadd8 r12, r7, r12 ; flimit * 2 + limit + orr r12, r12, r12, lsl #16 ; blimit + mov r9, #4 ; double the count. we're doing 4 at a time mov lr, #0 ; need 0 in a couple places |simple_hnext8| @@ -148,34 +141,32 @@ pstep RN r1 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r2] ; r12: flimit + ldrb r12, [r2] ; r12: blimit ldr r2, c0x80808080 - ldr r7, [r3] ; limit + orr r12, r12, r12, lsl #8 ; load soure data to r7, r8, r9, r10 ldrh r3, [src, #-2] pld [src, #23] ; preload for next block ldrh r4, [src], pstep - uadd8 r12, r12, r12 ; flimit * 2 + orr r12, r12, r12, lsl #16 ldrh r5, [src, #-2] pld [src, #23] ldrh r6, [src], pstep - uadd8 r12, r12, r7 ; flimit * 2 + limit pkhbt r7, r3, r4, lsl #16 ldrh r3, [src, #-2] pld [src, #23] ldrh r4, [src], pstep - ldr r11, [sp, #40] ; count (r11) for 8-in-parallel pkhbt r8, r5, r6, lsl #16 ldrh r5, [src, #-2] pld [src, #23] ldrh r6, [src], pstep - mov r11, r11, lsl #1 ; 4-in-parallel + mov r11, #4 ; double the count. we're doing 4 at a time |simple_vnext8| ; vp8_simple_filter_mask() function diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c index 6d1caa485..c841d455a 100644 --- a/vp8/common/arm/loopfilter_arm.c +++ b/vp8/common/arm/loopfilter_arm.c @@ -9,30 +9,34 @@ */ -#include "vpx_ports/config.h" -#include <math.h> +#include "vpx_config.h" #include "vp8/common/loopfilter.h" #include "vp8/common/onyxc_int.h" +#if HAVE_ARMV6 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); - -extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon); -extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon); -extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon); -extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon); -extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon); -extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon); - -extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon; -extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon; -extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon; -extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; +#endif +#if HAVE_ARMV7 +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh, + unsigned char *v); + +extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; + +extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; +#endif #if HAVE_ARMV6 /*ARMV6 loopfilter functions*/ @@ -40,96 +44,72 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); } /* Vertical B Filtering */ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); } #endif @@ -139,83 +119,58 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - -void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); -void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride); -} + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); -void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); } /* Vertical B Filtering */ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4); -} + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); -void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); } #endif diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h index cd62207d7..390a547b0 100644 --- a/vp8/common/arm/loopfilter_arm.h +++ b/vp8/common/arm/loopfilter_arm.h @@ -12,15 +12,17 @@ #ifndef LOOPFILTER_ARM_H #define LOOPFILTER_ARM_H +#include "vpx_config.h" + #if HAVE_ARMV6 extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_lf_normal_mb_v @@ -36,28 +38,29 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6 #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6 +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6 #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6 #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6 +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6 #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6 -#endif -#endif +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV6 */ #if HAVE_ARMV7 extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon); extern prototype_loopfilter_block(vp8_loop_filter_bv_neon); extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon); extern prototype_loopfilter_block(vp8_loop_filter_bh_neon); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_lf_normal_mb_v @@ -83,7 +86,8 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon -#endif -#endif +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV7 */ -#endif +#endif /* LOOPFILTER_ARM_H */ diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm index e73dd6401..e44be0a1e 100644 --- a/vp8/common/arm/neon/loopfilter_neon.asm +++ b/vp8/common/arm/neon/loopfilter_neon.asm @@ -14,109 +14,97 @@ EXPORT |vp8_loop_filter_vertical_edge_y_neon| EXPORT |vp8_loop_filter_vertical_edge_uv_neon| ARM - REQUIRE8 - PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -; flimit, limit, and thresh should be positive numbers. -; All 16 elements in these variables are equal. - -; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) ; r0 unsigned char *src ; r1 int pitch -; r2 const signed char *flimit -; r3 const signed char *limit -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_loop_filter_horizontal_edge_y_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {q3}, [r2], r1 ; p3 - vld1.u8 {q4}, [r2], r1 ; p2 - vld1.u8 {q5}, [r2], r1 ; p1 - vld1.u8 {q6}, [r2], r1 ; p0 - vld1.u8 {q7}, [r2], r1 ; q0 - vld1.u8 {q8}, [r2], r1 ; q1 - vld1.u8 {q9}, [r2], r1 ; q2 - vld1.u8 {q10}, [r2] ; q3 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - sub r0, r0, r1, lsl #1 + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1 + add r1, r1, r1 + + vdup.u8 q2, r3 ; duplicate thresh + + vld1.u8 {q3}, [r2@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r2@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r2@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r2@128] ; q2 + vld1.u8 {q10}, [r12@128] ; q3 + + sub r2, r2, r1, lsl #1 + sub r12, r12, r1, lsl #1 bl vp8_loop_filter_neon - vst1.u8 {q5}, [r0], r1 ; store op1 - vst1.u8 {q6}, [r0], r1 ; store op0 - vst1.u8 {q7}, [r0], r1 ; store oq0 - vst1.u8 {q8}, [r0], r1 ; store oq1 + vst1.u8 {q5}, [r2@128], r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r2@128], r1 ; store oq0 + vst1.u8 {q8}, [r12@128], r1 ; store oq1 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| -; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) + ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v |vp8_loop_filter_horizontal_edge_uv_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + ldr r12, [sp, #4] ; load thresh ldr r2, [sp, #8] ; load v ptr + vdup.u8 q2, r12 ; duplicate thresh sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - vld1.u8 {d6}, [r3], r1 ; p3 - vld1.u8 {d8}, [r3], r1 ; p2 - vld1.u8 {d10}, [r3], r1 ; p1 - vld1.u8 {d12}, [r3], r1 ; p0 - vld1.u8 {d14}, [r3], r1 ; q0 - vld1.u8 {d16}, [r3], r1 ; q1 - vld1.u8 {d18}, [r3], r1 ; q2 - vld1.u8 {d20}, [r3] ; q3 - - ldr r3, [sp, #4] ; load thresh pointer - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - vld1.u8 {d7}, [r12], r1 ; p3 - vld1.u8 {d9}, [r12], r1 ; p2 - vld1.u8 {d11}, [r12], r1 ; p1 - vld1.u8 {d13}, [r12], r1 ; p0 - vld1.u8 {d15}, [r12], r1 ; q0 - vld1.u8 {d17}, [r12], r1 ; q1 - vld1.u8 {d19}, [r12], r1 ; q2 - vld1.u8 {d21}, [r12] ; q3 - vld1.s8 {d4[], d5[]}, [r3] ; thresh + vld1.u8 {d6}, [r3@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r3@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r3@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r3@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r3@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r3@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r3@64] ; q3 + vld1.u8 {d21}, [r12@64] ; q3 bl vp8_loop_filter_neon sub r0, r0, r1, lsl #1 sub r2, r2, r1, lsl #1 - vst1.u8 {d10}, [r0], r1 ; store u op1 - vst1.u8 {d11}, [r2], r1 ; store v op1 - vst1.u8 {d12}, [r0], r1 ; store u op0 - vst1.u8 {d13}, [r2], r1 ; store v op0 - vst1.u8 {d14}, [r0], r1 ; store u oq0 - vst1.u8 {d15}, [r2], r1 ; store v oq0 - vst1.u8 {d16}, [r0] ; store u oq1 - vst1.u8 {d17}, [r2] ; store v oq1 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r2@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r2@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r2@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64] ; store u oq1 + vst1.u8 {d17}, [r2@64] ; store v oq1 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| ; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, @@ -124,39 +112,38 @@ ; const signed char *limit, ; const signed char *thresh, ; int count) -; r0 unsigned char *src, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r0 unsigned char *src +; r1 int pitch +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, + |vp8_loop_filter_vertical_edge_y_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - sub r2, r0, #4 ; src ptr down by 4 columns - sub r0, r0, #2 ; dst ptr - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {d6}, [r2], r1 ; load first 8-line src data - vld1.u8 {d8}, [r2], r1 + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, #4 ; src ptr down by 4 columns + add r1, r1, r1 + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1, asr #1 + + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d8}, [r12], r1 vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r2], r1 + vld1.u8 {d12}, [r12], r1 vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r2], r1 + vld1.u8 {d16}, [r12], r1 vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r2], r1 - - vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {d20}, [r12], r1 vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r2], r1 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r2], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d19}, [r2], r1 - vld1.u8 {d21}, [r2] + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d19}, [r2] + vld1.u8 {d21}, [r12] ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -164,6 +151,8 @@ vtrn.32 q5, q9 vtrn.32 q6, q10 + vdup.u8 q2, r3 ; duplicate thresh + vtrn.16 q3, q5 vtrn.16 q4, q6 vtrn.16 q7, q9 @@ -178,28 +167,34 @@ vswp d12, d11 vswp d16, d13 + + sub r0, r0, #2 ; dst ptr + vswp d14, d12 vswp d16, d15 + add r12, r0, r1, asr #1 + ;store op1, op0, oq0, oq1 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1 - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0] + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_y_neon| ; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch @@ -209,38 +204,36 @@ ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v |vp8_loop_filter_vertical_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r12, r0, #4 ; move u pointer down by 4 columns - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + sub r12, r0, #4 ; move u pointer down by 4 columns ldr r2, [sp, #8] ; load v ptr - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r12], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d20}, [r12] - + vdup.u8 q1, r3 ; duplicate limit sub r3, r2, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r12], r1 ;load u data vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d8}, [r12], r1 vld1.u8 {d9}, [r3], r1 + vld1.u8 {d10}, [r12], r1 vld1.u8 {d11}, [r3], r1 + vld1.u8 {d12}, [r12], r1 vld1.u8 {d13}, [r3], r1 + vld1.u8 {d14}, [r12], r1 vld1.u8 {d15}, [r3], r1 + vld1.u8 {d16}, [r12], r1 vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r12], r1 vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r12] vld1.u8 {d21}, [r3] - ldr r12, [sp, #4] ; load thresh pointer + ldr r12, [sp, #4] ; load thresh ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -248,6 +241,8 @@ vtrn.32 q5, q9 vtrn.32 q6, q10 + vdup.u8 q2, r12 ; duplicate thresh + vtrn.16 q3, q5 vtrn.16 q4, q6 vtrn.16 q7, q9 @@ -258,18 +253,16 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - bl vp8_loop_filter_neon - sub r0, r0, #2 - sub r2, r2, #2 - vswp d12, d11 vswp d16, d13 vswp d14, d12 vswp d16, d15 + sub r0, r0, #2 + sub r2, r2, #2 + ;store op1, op0, oq0, oq1 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 @@ -288,7 +281,7 @@ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| ; void vp8_loop_filter_neon(); @@ -316,42 +309,44 @@ vabd.u8 q14, q8, q7 ; abs(q1 - q0) vabd.u8 q3, q9, q8 ; abs(q2 - q1) vabd.u8 q4, q10, q9 ; abs(q3 - q2) - vabd.u8 q9, q6, q7 ; abs(p0 - q0) vmax.u8 q11, q11, q12 vmax.u8 q12, q13, q14 vmax.u8 q3, q3, q4 vmax.u8 q15, q11, q12 + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + ; vp8_hevmask vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 vmax.u8 q15, q15, q3 - vadd.u8 q0, q0, q0 ; flimit * 2 - vadd.u8 q0, q0, q1 ; flimit * 2 + limit - vcge.u8 q15, q1, q15 + vmov.u8 q10, #0x80 ; 0x80 vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - vshr.u8 q2, q2, #1 ; a = a / 2 - vqadd.u8 q9, q9, q2 ; a = b + a - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - vmov.u8 q0, #0x80 ; 0x80 + vcge.u8 q15, q1, q15 ; vp8_filter() function ; convert to signed - veor q7, q7, q0 ; qs0 - veor q6, q6, q0 ; ps0 - veor q5, q5, q0 ; ps1 - veor q8, q8, q0 ; qs1 + veor q7, q7, q10 ; qs0 + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 vmov.u8 q10, #3 ; #3 vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q11, d15, d13 + vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 + vmovl.u8 q4, d20 vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) @@ -378,19 +373,20 @@ vshr.s8 q2, q2, #3 ; Filter2 >>= 3 vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) ; outer tap adjustments: ++vp8_filter >> 1 vrshr.s8 q1, q1, #1 vbic q1, q1, q14 ; vp8_filter &= ~hev - + vmov.u8 q0, #0x80 ; 0x80 vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter) vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter) - veor q5, q13, q0 ; *op1 = u^0x80 veor q6, q11, q0 ; *op0 = u^0x80 veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 veor q8, q12, q0 ; *oq1 = u^0x80 bx lr diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm index 7c5ea3644..adf848b9c 100644 --- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm @@ -9,99 +9,109 @@ ; - EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + EXPORT |vp8_loop_filter_bhs_neon| + EXPORT |vp8_loop_filter_mbhs_neon| ARM - REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit -;are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. -; r0 unsigned char *s, -; r1 int p, //pitch -; r2 const signed char *flimit, -; r3 const signed char *limit, -; stack(r4) const signed char *thresh (unused) -; //stack(r5) int count --unused + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE |vp8_loop_filter_simple_horizontal_edge_neon| PROC - sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines - vld1.u8 {q5}, [r0], r1 ; p1 - vld1.s8 {d2[], d3[]}, [r2] ; flimit - vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 - vld1.u8 {q6}, [r0], r1 ; p0 - vmov.u8 q0, #0x80 ; 0x80 - vld1.u8 {q7}, [r0], r1 ; q0 - vmov.u8 q10, #0x03 ; 0x03 - vld1.u8 {q8}, [r0] ; q1 + sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines + + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q5}, [r3@128], r1 ; p0 + vld1.u8 {q8}, [r0@128] ; q1 + vld1.u8 {q6}, [r3@128] ; p1 - ;vp8_filter_mask() function vabd.u8 q15, q6, q7 ; abs(p0 - q0) vabd.u8 q14, q5, q8 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q13, #3 vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - ;vp8_filter() function veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value - vadd.u8 q1, q1, q1 ; flimit * 2 - vadd.u8 q1, q1, q13 ; flimit * 2 + limit - vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 -;;;;;;;;;; - ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q3, d15, d13 vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) - ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) - vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0) - vadd.s16 q12, q3, q3 + vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) + vmul.s16 q3, q3, q13 + vmov.u8 q10, #0x03 ; 0x03 vmov.u8 q9, #0x04 ; 0x04 - vadd.s16 q2, q2, q11 - vadd.s16 q3, q3, q12 - vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) vaddw.s8 q3, q3, d9 - ;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d9, q3 -;;;;;;;;;;;;; - vand q4, q4, q15 ; vp8_filter &= mask + vand q14, q4, q15 ; vp8_filter &= mask - vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q4, q4, #3 ; Filter1 >>= 3 + vshr.s8 q4, q3, #3 ; Filter1 >>= 3 - sub r0, r0, r1, lsl #1 + sub r0, r0, r1 ;calculate output vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1) - add r3, r0, r1 - veor q6, q11, q0 ; *op0 = u^0x80 veor q7, q10, q0 ; *oq0 = u^0x80 - vst1.u8 {q6}, [r0] ; store op0 - vst1.u8 {q7}, [r3] ; store oq0 + vst1.u8 {q6}, [r3@128] ; store op0 + vst1.u8 {q7}, [r0@128] ; store oq0 bx lr ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| -;----------------- +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bhs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate blim + + add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 + add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride + pop {r4, lr} + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_bhs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_mbhs_neon| PROC + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_bhs_neon| END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm index a7f7b690e..e690df2f7 100644 --- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -9,59 +9,54 @@ ; - EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + EXPORT |vp8_loop_filter_bvs_neon| + EXPORT |vp8_loop_filter_mbvs_neon| ARM - REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit -;are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. -; r0 unsigned char *s, -; r1 int p, //pitch -; r2 const signed char *flimit, -; r3 const signed char *limit, -; stack(r4) const signed char *thresh (unused) -; //stack(r5) int count --unused + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE |vp8_loop_filter_simple_vertical_edge_neon| PROC sub r0, r0, #2 ; move src pointer down by 2 columns - - vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1 - vld1.s8 {d2[], d3[]}, [r2] ; flimit - vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 - vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1 - vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1 - vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1 - vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1 - vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1 - vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1 - vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1 - - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vmov.u8 q0, #0x80 ; 0x80 - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vmov.u8 q11, #0x03 ; 0x03 - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vmov.u8 q12, #0x04 ; 0x04 - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + add r12, r1, r1 + add r3, r0, r1 + + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 + vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 + + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] vswp d7, d10 vswp d12, d9 - ;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6 ;vp8_filter_mask() function ;vp8_hevmask() function sub r0, r0, r1, lsl #4 vabd.u8 q15, q5, q4 ; abs(p0 - q0) vabd.u8 q14, q3, q6 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q11, #3 vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value @@ -69,80 +64,91 @@ veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value - vadd.u8 q1, q1, q1 ; flimit * 2 - vadd.u8 q1, q1, q13 ; flimit * 2 + limit vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 - ;vp8_filter() function -;;;;;;;;;; - ;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0) vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) vsubl.s8 q13, d9, d11 - vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vmul.s16 q13, q13, q11 - ;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) - vadd.s16 q14, q13, q13 - vadd.s16 q2, q2, q10 - vadd.s16 q13, q13, q14 + vmov.u8 q11, #0x03 ; 0x03 + vmov.u8 q12, #0x04 ; 0x04 - ;vqadd.s8 q1, q1, q2 - vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d3 + vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d29 - vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d3, q13 + vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d29, q13 add r0, r0, #1 - add r2, r0, r1 -;;;;;;;;;;; + add r3, r0, r1 - vand q1, q1, q15 ; vp8_filter &= mask + vand q14, q14, q15 ; vp8_filter &= mask - vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + vshr.s8 q14, q3, #3 ; Filter1 >>= 3 ;calculate output - vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1) - veor q7, q10, q0 ; *oq0 = u^0x80 veor q6, q11, q0 ; *op0 = u^0x80 - - add r3, r2, r1 + veor q7, q10, q0 ; *oq0 = u^0x80 + add r12, r1, r1 vswp d13, d14 - add r12, r3, r1 ;store op1, op0, oq0, oq1 - vst2.8 {d12[0], d13[0]}, [r0] - vst2.8 {d12[1], d13[1]}, [r2] - vst2.8 {d12[2], d13[2]}, [r3] - vst2.8 {d12[3], d13[3]}, [r12], r1 - add r0, r12, r1 - vst2.8 {d12[4], d13[4]}, [r12] - vst2.8 {d12[5], d13[5]}, [r0], r1 - add r2, r0, r1 - vst2.8 {d12[6], d13[6]}, [r0] - vst2.8 {d12[7], d13[7]}, [r2], r1 - add r3, r2, r1 - vst2.8 {d14[0], d15[0]}, [r2] - vst2.8 {d14[1], d15[1]}, [r3], r1 - add r12, r3, r1 - vst2.8 {d14[2], d15[2]}, [r3] - vst2.8 {d14[3], d15[3]}, [r12], r1 - add r0, r12, r1 - vst2.8 {d14[4], d15[4]}, [r12] - vst2.8 {d14[5], d15[5]}, [r0], r1 - add r2, r0, r1 - vst2.8 {d14[6], d15[6]}, [r0] - vst2.8 {d14[7], d15[7]}, [r2] + vst2.8 {d12[0], d13[0]}, [r0], r12 + vst2.8 {d12[1], d13[1]}, [r3], r12 + vst2.8 {d12[2], d13[2]}, [r0], r12 + vst2.8 {d12[3], d13[3]}, [r3], r12 + vst2.8 {d12[4], d13[4]}, [r0], r12 + vst2.8 {d12[5], d13[5]}, [r3], r12 + vst2.8 {d12[6], d13[6]}, [r0], r12 + vst2.8 {d12[7], d13[7]}, [r3], r12 + vst2.8 {d14[0], d15[0]}, [r0], r12 + vst2.8 {d14[1], d15[1]}, [r3], r12 + vst2.8 {d14[2], d15[2]}, [r0], r12 + vst2.8 {d14[3], d15[3]}, [r3], r12 + vst2.8 {d14[4], d15[4]}, [r0], r12 + vst2.8 {d14[5], d15[5]}, [r3], r12 + vst2.8 {d14[6], d15[6]}, [r0], r12 + vst2.8 {d14[7], d15[7]}, [r3] bx lr ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| -;----------------- - +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bvs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + mov r4, r0 + add r0, r0, #4 + vdup.s8 q1, r3 ; duplicate blim + bl vp8_loop_filter_simple_vertical_edge_neon + ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1 + add r0, r4, #8 + bl vp8_loop_filter_simple_vertical_edge_neon + add r0, r4, #12 + pop {r4, lr} + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_bvs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_mbvs_neon| PROC + ldrb r3, [r2] ; load mblim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_bvs_neon| END diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm index 72f0f9271..f41c156df 100644 --- a/vp8/common/arm/neon/mbloopfilter_neon.asm +++ b/vp8/common/arm/neon/mbloopfilter_neon.asm @@ -14,155 +14,143 @@ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| ARM - REQUIRE8 - PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -; flimit, limit, and thresh should be positive numbers. -; All 16 elements in these variables are equal. - ; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) ; r0 unsigned char *src, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_mbloop_filter_horizontal_edge_y_neon| PROC - stmdb sp!, {lr} - sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {q3}, [r0], r1 ; p3 - vld1.s8 {d2[], d3[]}, [r3] ; limit - vld1.u8 {q4}, [r0], r1 ; p2 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - vld1.u8 {q5}, [r0], r1 ; p1 - vld1.u8 {q6}, [r0], r1 ; p0 - vld1.u8 {q7}, [r0], r1 ; q0 - vld1.u8 {q8}, [r0], r1 ; q1 - vld1.u8 {q9}, [r0], r1 ; q2 - vld1.u8 {q10}, [r0], r1 ; q3 + push {lr} + add r1, r1, r1 ; double stride + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + add r12, r0, r1, lsr #1 ; move src pointer up by 1 line + + vld1.u8 {q3}, [r0@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r0@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r0@128], r1 ; q2 + vld1.u8 {q10}, [r12@128], r1 ; q3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #3 - add r0, r0, r1 - add r2, r0, r1 - add r3, r2, r1 - - vst1.u8 {q4}, [r0] ; store op2 - vst1.u8 {q5}, [r2] ; store op1 - vst1.u8 {q6}, [r3], r1 ; store op0 - add r12, r3, r1 - vst1.u8 {q7}, [r3] ; store oq0 - vst1.u8 {q8}, [r12], r1 ; store oq1 - vst1.u8 {q9}, [r12] ; store oq2 - - ldmia sp!, {pc} + sub r12, r12, r1, lsl #2 + add r0, r12, r1, lsr #1 + + vst1.u8 {q4}, [r12@128],r1 ; store op2 + vst1.u8 {q5}, [r0@128],r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r0@128],r1 ; store oq0 + vst1.u8 {q8}, [r12@128] ; store oq1 + vst1.u8 {q9}, [r0@128] ; store oq2 + + pop {pc} ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| ; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v + |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines - vld1.s8 {d2[], d3[]}, [r3] ; limit - ldr r3, [sp, #8] ; load v ptr - ldr r12, [sp, #4] ; load thresh pointer - sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r0], r1 ; p3 - vld1.u8 {d7}, [r3], r1 ; p3 - vld1.u8 {d8}, [r0], r1 ; p2 - vld1.u8 {d9}, [r3], r1 ; p2 - vld1.u8 {d10}, [r0], r1 ; p1 - vld1.u8 {d11}, [r3], r1 ; p1 - vld1.u8 {d12}, [r0], r1 ; p0 - vld1.u8 {d13}, [r3], r1 ; p0 - vld1.u8 {d14}, [r0], r1 ; q0 - vld1.u8 {d15}, [r3], r1 ; q0 - vld1.u8 {d16}, [r0], r1 ; q1 - vld1.u8 {d17}, [r3], r1 ; q1 - vld1.u8 {d18}, [r0], r1 ; q2 - vld1.u8 {d19}, [r3], r1 ; q2 - vld1.u8 {d20}, [r0], r1 ; q3 - vld1.u8 {d21}, [r3], r1 ; q3 - - vld1.s8 {d4[], d5[]}, [r12] ; thresh + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r0@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r0@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r0@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r0@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r0@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r0@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r0@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r0@64], r1 ; q3 + vld1.u8 {d21}, [r12@64], r1 ; q3 bl vp8_mbloop_filter_neon sub r0, r0, r1, lsl #3 - sub r3, r3, r1, lsl #3 + sub r12, r12, r1, lsl #3 add r0, r0, r1 - add r3, r3, r1 - - vst1.u8 {d8}, [r0], r1 ; store u op2 - vst1.u8 {d9}, [r3], r1 ; store v op2 - vst1.u8 {d10}, [r0], r1 ; store u op1 - vst1.u8 {d11}, [r3], r1 ; store v op1 - vst1.u8 {d12}, [r0], r1 ; store u op0 - vst1.u8 {d13}, [r3], r1 ; store v op0 - vst1.u8 {d14}, [r0], r1 ; store u oq0 - vst1.u8 {d15}, [r3], r1 ; store v oq0 - vst1.u8 {d16}, [r0], r1 ; store u oq1 - vst1.u8 {d17}, [r3], r1 ; store v oq1 - vst1.u8 {d18}, [r0], r1 ; store u oq2 - vst1.u8 {d19}, [r3], r1 ; store v oq2 - - ldmia sp!, {pc} + add r12, r12, r1 + + vst1.u8 {d8}, [r0@64], r1 ; store u op2 + vst1.u8 {d9}, [r12@64], r1 ; store v op2 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r12@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r12@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r12@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64], r1 ; store u oq1 + vst1.u8 {d17}, [r12@64], r1 ; store v oq1 + vst1.u8 {d18}, [r0@64], r1 ; store u oq2 + vst1.u8 {d19}, [r12@64], r1 ; store v oq2 + + pop {pc} ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| ; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) ; r0 unsigned char *src, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_mbloop_filter_vertical_edge_y_neon| PROC - stmdb sp!, {lr} + push {lr} + ldr r12, [sp, #4] ; load thresh sub r0, r0, #4 ; move src pointer down by 4 columns + vdup.s8 q2, r12 ; thresh + add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines vld1.u8 {d6}, [r0], r1 ; load first 8-line src data - ldr r12, [sp, #4] ; load thresh pointer + vld1.u8 {d7}, [r12], r1 ; load second 8-line src data vld1.u8 {d8}, [r0], r1 - sub sp, sp, #32 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r12], r1 vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 vld1.u8 {d20}, [r0], r1 - - vld1.u8 {d7}, [r0], r1 ; load second 8-line src data - vld1.u8 {d9}, [r0], r1 - vld1.u8 {d11}, [r0], r1 - vld1.u8 {d13}, [r0], r1 - vld1.u8 {d15}, [r0], r1 - vld1.u8 {d17}, [r0], r1 - vld1.u8 {d19}, [r0], r1 - vld1.u8 {d21}, [r0], r1 + vld1.u8 {d21}, [r12], r1 ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -180,29 +168,17 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - vld1.s8 {d2[], d3[]}, [r3] ; limit - mov r12, sp - vst1.u8 {q3}, [r12]! - vst1.u8 {q10}, [r12]! + sub r0, r0, r1, lsl #3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #4 - - add r2, r0, r1 - - add r3, r2, r1 - - vld1.u8 {q3}, [sp]! - vld1.u8 {q10}, [sp]! + sub r12, r12, r1, lsl #3 ;transpose to 16x8 matrix vtrn.32 q3, q7 vtrn.32 q4, q8 vtrn.32 q5, q9 vtrn.32 q6, q10 - add r12, r3, r1 vtrn.16 q3, q5 vtrn.16 q4, q6 @@ -215,36 +191,30 @@ vtrn.8 q9, q10 ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0] - vst1.8 {d8}, [r2] - vst1.8 {d10}, [r3] - vst1.8 {d12}, [r12], r1 - add r0, r12, r1 - vst1.8 {d14}, [r12] - vst1.8 {d16}, [r0], r1 - add r2, r0, r1 - vst1.8 {d18}, [r0] - vst1.8 {d20}, [r2], r1 - add r3, r2, r1 - vst1.8 {d7}, [r2] - vst1.8 {d9}, [r3], r1 - add r12, r3, r1 - vst1.8 {d11}, [r3] + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r12], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r12], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r12], r1 + vst1.8 {d12}, [r0], r1 vst1.8 {d13}, [r12], r1 - add r0, r12, r1 - vst1.8 {d15}, [r12] - vst1.8 {d17}, [r0], r1 - add r2, r0, r1 - vst1.8 {d19}, [r0] - vst1.8 {d21}, [r2] - - ldmia sp!, {pc} + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r12], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r12], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] + + pop {pc} ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| ; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, @@ -253,30 +223,29 @@ ; sp const signed char *thresh, ; sp+4 unsigned char *v |vp8_mbloop_filter_vertical_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r0, r0, #4 ; move src pointer down by 4 columns - vld1.s8 {d2[], d3[]}, [r3] ; limit - ldr r3, [sp, #8] ; load v ptr - ldr r12, [sp, #4] ; load thresh pointer - - sub r3, r3, #4 ; move v pointer down by 4 columns + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, #4 ; move u pointer down by 4 columns + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, #4 ; move v pointer down by 4 columns vld1.u8 {d6}, [r0], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d7}, [r12], r1 ;load v data vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r3], r1 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r3], r1 + vld1.u8 {d11}, [r12], r1 vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r3], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r3], r1 + vld1.u8 {d15}, [r12], r1 vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r3], r1 + vld1.u8 {d17}, [r12], r1 vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r3], r1 + vld1.u8 {d19}, [r12], r1 vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r3], r1 + vld1.u8 {d21}, [r12], r1 ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -294,19 +263,11 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - sub sp, sp, #32 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - mov r12, sp - vst1.u8 {q3}, [r12]! - vst1.u8 {q10}, [r12]! + sub r0, r0, r1, lsl #3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #3 - sub r3, r3, r1, lsl #3 - - vld1.u8 {q3}, [sp]! - vld1.u8 {q10}, [sp]! + sub r12, r12, r1, lsl #3 ;transpose to 16x8 matrix vtrn.32 q3, q7 @@ -326,23 +287,23 @@ ;store op2, op1, op0, oq0, oq1, oq2 vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r3], r1 + vst1.8 {d7}, [r12], r1 vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r3], r1 + vst1.8 {d9}, [r12], r1 vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r3], r1 + vst1.8 {d11}, [r12], r1 vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r3], r1 + vst1.8 {d13}, [r12], r1 vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r3], r1 + vst1.8 {d15}, [r12], r1 vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r3], r1 + vst1.8 {d17}, [r12], r1 vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r3], r1 - vst1.8 {d20}, [r0], r1 - vst1.8 {d21}, [r3], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| ; void vp8_mbloop_filter_neon() @@ -350,26 +311,19 @@ ; functions do the necessary load, transpose (if necessary), preserve (if ; necessary) and store. -; TODO: -; The vertical filter writes p3/q3 back out because two 4 element writes are -; much simpler than ordering and writing two 3 element sets (or three 2 elements -; sets, or whichever other combinations are possible). -; If we can preserve q3 and q10, the vertical filter will be able to avoid -; storing those values on the stack and reading them back after the filter. - ; r0,r1 PRESERVE -; r2 flimit -; r3 PRESERVE -; q1 limit +; r2 mblimit +; r3 limit + ; q2 thresh -; q3 p3 +; q3 p3 PRESERVE ; q4 p2 ; q5 p1 ; q6 p0 ; q7 q0 ; q8 q1 ; q9 q2 -; q10 q3 +; q10 q3 PRESERVE |vp8_mbloop_filter_neon| PROC @@ -378,12 +332,12 @@ vabd.u8 q12, q4, q5 ; abs(p2 - p1) vabd.u8 q13, q5, q6 ; abs(p1 - p0) vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q1, q9, q8 ; abs(q2 - q1) vabd.u8 q0, q10, q9 ; abs(q3 - q2) vmax.u8 q11, q11, q12 vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q0 + vmax.u8 q1, q1, q0 vmax.u8 q15, q11, q12 vabd.u8 q12, q6, q7 ; abs(p0 - q0) @@ -391,44 +345,46 @@ ; vp8_hevmask vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 - vmax.u8 q15, q15, q3 + vmax.u8 q15, q15, q1 - vld1.s8 {d4[], d5[]}, [r2] ; flimit + vdup.u8 q1, r3 ; limit + vdup.u8 q2, r2 ; mblimit vmov.u8 q0, #0x80 ; 0x80 - vadd.u8 q2, q2, q2 ; flimit * 2 - vadd.u8 q2, q2, q1 ; flimit * 2 + limit vcge.u8 q15, q1, q15 vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 - vshr.u8 q1, q1, #1 ; a = a / 2 - vqadd.u8 q12, q12, q1 ; a = b + a - vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + vmov.u16 q11, #3 ; #3 ; vp8_filter ; convert to signed veor q7, q7, q0 ; qs0 + vshr.u8 q1, q1, #1 ; a = a / 2 veor q6, q6, q0 ; ps0 veor q5, q5, q0 ; ps1 + + vqadd.u8 q12, q12, q1 ; a = b + a + veor q8, q8, q0 ; qs1 veor q4, q4, q0 ; ps2 veor q9, q9, q0 ; qs2 vorr q14, q13, q14 ; vp8_hevmask + vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + vsubl.s8 q2, d14, d12 ; qs0 - ps0 vsubl.s8 q13, d15, d13 vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0) - vadd.s16 q11, q13, q13 + vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vand q15, q15, q12 ; vp8_filter_mask - vadd.s16 q2, q2, q10 - vadd.s16 q13, q13, q11 + vmul.i16 q13, q13, q11 vmov.u8 q12, #3 ; #3 @@ -447,23 +403,19 @@ vand q13, q1, q14 ; Filter2 &= hev - vmov.u8 d7, #9 ; #9 - vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - vmov.u8 d6, #18 ; #18 + vmov q0, q15 vshr.s8 q2, q2, #3 ; Filter1 >>= 3 vshr.s8 q13, q13, #3 ; Filter2 >>= 3 - vmov q10, q15 + vmov q11, q15 vmov q12, q15 vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - vmov.u8 d5, #27 ; #27 - vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) vbic q1, q1, q14 ; vp8_filter &= ~hev @@ -471,35 +423,43 @@ ; roughly 1/7th difference across boundary ; roughly 2/7th difference across boundary ; roughly 3/7th difference across boundary - vmov q11, q15 + + vmov.u8 d5, #9 ; #9 + vmov.u8 d4, #18 ; #18 + vmov q13, q15 vmov q14, q15 - vmlal.s8 q10, d2, d7 ; Filter2 * 9 - vmlal.s8 q11, d3, d7 - vmlal.s8 q12, d2, d6 ; Filter2 * 18 - vmlal.s8 q13, d3, d6 - vmlal.s8 q14, d2, d5 ; Filter2 * 27 + vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 + vmlal.s8 q11, d3, d5 + vmov.u8 d5, #27 ; #27 + vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 + vmlal.s8 q13, d3, d4 + vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 vmlal.s8 q15, d3, d5 - vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7) - vqshrn.s16 d21, q11, #7 + + vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d1, q11, #7 vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) vqshrn.s16 d25, q13, #7 vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) vqshrn.s16 d29, q15, #7 - vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u) - vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u) + vmov.u8 q1, #0x80 ; 0x80 + + vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) + vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) - veor q9, q11, q0 ; *oq2 = s^0x80 - veor q4, q10, q0 ; *op2 = s^0x80 - veor q8, q13, q0 ; *oq1 = s^0x80 - veor q5, q12, q0 ; *op2 = s^0x80 - veor q7, q15, q0 ; *oq0 = s^0x80 - veor q6, q14, q0 ; *op0 = s^0x80 + + veor q9, q11, q1 ; *oq2 = s^0x80 + veor q4, q0, q1 ; *op2 = s^0x80 + veor q8, q13, q1 ; *oq1 = s^0x80 + veor q5, q12, q1 ; *op2 = s^0x80 + veor q7, q15, q1 ; *oq0 = s^0x80 + veor q6, q14, q1 ; *op0 = s^0x80 bx lr ENDP ; |vp8_mbloop_filter_neon| diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 133938097..c61629407 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -108,9 +108,9 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c; #if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS) diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index a3242716f..be3f53593 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c @@ -9,152 +9,149 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "loopfilter.h" #include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" typedef unsigned char uc; - prototype_loopfilter(vp8_loop_filter_horizontal_edge_c); prototype_loopfilter(vp8_loop_filter_vertical_edge_c); prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c); prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c); + +prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); +prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c); /* Horizontal MB filtering */ -void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Vertical MB Filtering */ -void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Horizontal B Filtering */ -void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit); } /* Vertical B Filtering */ -void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); } -void vp8_init_loop_filter(VP8_COMMON *cm) +static void lf_init_lut(loop_filter_info_n *lfi) { - loop_filter_info *lfi = cm->lf_info; - LOOPFILTERTYPE lft = cm->filter_type; - int sharpness_lvl = cm->sharpness_level; - int frame_type = cm->frame_type; - int i, j; + int filt_lvl; - int block_inside_limit = 0; - int HEVThresh; - - /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */ - for (i = 0; i <= MAX_LOOP_FILTER; i++) + for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) { - int filt_lvl = i; - - if (frame_type == KEY_FRAME) + if (filt_lvl >= 40) { - if (filt_lvl >= 40) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3; + } + else if (filt_lvl >= 20) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2; + } + else if (filt_lvl >= 15) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1; } else { - if (filt_lvl >= 40) - HEVThresh = 3; - else if (filt_lvl >= 20) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0; } + } + + lfi->mode_lf_lut[DC_PRED] = 1; + lfi->mode_lf_lut[V_PRED] = 1; + lfi->mode_lf_lut[H_PRED] = 1; + lfi->mode_lf_lut[TM_PRED] = 1; + lfi->mode_lf_lut[B_PRED] = 0; + + lfi->mode_lf_lut[ZEROMV] = 1; + lfi->mode_lf_lut[NEARESTMV] = 2; + lfi->mode_lf_lut[NEARMV] = 2; + lfi->mode_lf_lut[NEWMV] = 2; + lfi->mode_lf_lut[SPLITMV] = 3; + +} + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl) +{ + int i; + + /* For each possible value for the loop filter fill out limits */ + for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + int block_inside_limit = 0; /* Set loop filter paramaeters that control sharpness. */ block_inside_limit = filt_lvl >> (sharpness_lvl > 0); @@ -169,119 +166,120 @@ void vp8_init_loop_filter(VP8_COMMON *cm) if (block_inside_limit < 1) block_inside_limit = 1; - for (j = 0; j < 16; j++) - { - lfi[i].lim[j] = block_inside_limit; - lfi[i].mbflim[j] = filt_lvl + 2; - lfi[i].flim[j] = filt_lvl; - lfi[i].thr[j] = HEVThresh; - } - + vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), + SIMD_WIDTH); + vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + SIMD_WIDTH); } +} - /* Set up the function pointers depending on the type of loop filtering selected */ - if (lft == NORMAL_LOOPFILTER) - { - cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v); - cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v); - cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h); - cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h); - } - else +void vp8_loop_filter_init(VP8_COMMON *cm) +{ + loop_filter_info_n *lfi = &cm->lf_info; + int i; + + /* init limits for given sharpness*/ + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + + /* init LUT for lvl and hev thr picking */ + lf_init_lut(lfi); + + /* init hev threshold const vectors */ + for(i = 0; i < 4 ; i++) { - cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v); - cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v); - cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h); - cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h); + vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); } } -/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding - * each frame. Check last_frame_type to skip the function most of times. - */ -void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type) +void vp8_loop_filter_frame_init(VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl, + int sharpness_lvl) { - int HEVThresh; - int i, j; + int seg, /* segment number */ + ref, /* index in ref_lf_deltas */ + mode; /* index in mode_lf_deltas */ - /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */ - for (i = 0; i <= MAX_LOOP_FILTER; i++) + loop_filter_info_n *lfi = &cm->lf_info; + + /* update limits if sharpness has changed */ + if(cm->last_sharpness_level != sharpness_lvl) { - int filt_lvl = i; + vp8_loop_filter_update_sharpness(lfi, sharpness_lvl); + cm->last_sharpness_level = sharpness_lvl; + } - if (frame_type == KEY_FRAME) - { - if (filt_lvl >= 40) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; - } - else + for(seg = 0; seg < MAX_MB_SEGMENTS; seg++) + { + int lvl_seg = default_filt_lvl; + int lvl_ref, lvl_mode; + + /* Note the baseline filter values for each segment */ + if (mbd->segmentation_enabled) { - if (filt_lvl >= 40) - HEVThresh = 3; - else if (filt_lvl >= 20) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; + /* Abs value */ + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + { + lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } + else /* Delta Value */ + { + lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0; + } } - for (j = 0; j < 16; j++) + if (!mbd->mode_ref_lf_delta_enabled) { - /*lfi[i].lim[j] = block_inside_limit; - lfi[i].mbflim[j] = filt_lvl+2;*/ - /*lfi[i].flim[j] = filt_lvl;*/ - lfi[i].thr[j] = HEVThresh; + /* we could get rid of this if we assume that deltas are set to + * zero when not in use; encoder always uses deltas + */ + vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 ); + continue; } - } -} + lvl_ref = lvl_seg; -int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level) -{ - MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi; + /* INTRA_FRAME */ + ref = INTRA_FRAME; - if (mbd->mode_ref_lf_delta_enabled) - { /* Apply delta for reference frame */ - filter_level += mbd->ref_lf_deltas[mbmi->ref_frame]; + lvl_ref += mbd->ref_lf_deltas[ref]; - /* Apply delta for mode */ - if (mbmi->ref_frame == INTRA_FRAME) - { - /* Only the split mode BPRED has a further special case */ - if (mbmi->mode == B_PRED) - filter_level += mbd->mode_lf_deltas[0]; - } - else + /* Apply delta for Intra modes */ + mode = 0; /* B_PRED */ + /* Only the split mode BPRED has a further special case */ + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ + + lfi->lvl[seg][ref][mode] = lvl_mode; + + mode = 1; /* all the rest of Intra modes */ + lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */ + lfi->lvl[seg][ref][mode] = lvl_mode; + + /* LAST, GOLDEN, ALT */ + for(ref = 1; ref < MAX_REF_FRAMES; ref++) { - /* Zero motion mode */ - if (mbmi->mode == ZEROMV) - filter_level += mbd->mode_lf_deltas[1]; + int lvl_ref = lvl_seg; - /* Split MB motion mode */ - else if (mbmi->mode == SPLITMV) - filter_level += mbd->mode_lf_deltas[3]; + /* Apply delta for reference frame */ + lvl_ref += mbd->ref_lf_deltas[ref]; - /* All other inter motion modes (Nearest, Near, New) */ - else - filter_level += mbd->mode_lf_deltas[2]; - } + /* Apply delta for Inter modes */ + for (mode = 1; mode < 4; mode++) + { + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ - /* Range check */ - if (filter_level > MAX_LOOP_FILTER) - filter_level = MAX_LOOP_FILTER; - else if (filter_level < 0) - filter_level = 0; + lfi->lvl[seg][ref][mode] = lvl_mode; + } + } } - return filter_level; } - void vp8_loop_filter_frame ( VP8_COMMON *cm, @@ -290,49 +288,23 @@ void vp8_loop_filter_frame ) { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - loop_filter_info *lfi = cm->lf_info; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + FRAME_TYPE frame_type = cm->frame_type; int mb_row; int mb_col; - - int baseline_filter_level[MAX_MB_SEGMENTS]; int filter_level; - int alt_flt_enabled = mbd->segmentation_enabled; - int i; unsigned char *y_ptr, *u_ptr, *v_ptr; - mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */ - - /* Note the baseline filter values for each segment */ - if (alt_flt_enabled) - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - /* Delta Value */ - else - { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ - } - } - } - else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; /* Initialize the loop filter for this frame. */ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, cm->sharpness_level); /* Set up the buffer pointers */ y_ptr = post->y_buffer; @@ -344,51 +316,79 @@ void vp8_loop_filter_frame { for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; - /* Distance of Mb to the various image edges. - * These specified to 8th pel as they are always compared to values that are in 1/8th pel units - * Apply any context driven MB level adjustment - */ - filter_level = vp8_adjust_mb_lf_value(mbd, filter_level); + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); - - /* don't apply across umv border */ - if (mb_row > 0) - cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; u_ptr += 8; v_ptr += 8; - mbd->mode_info_context++; /* step to next MB */ + mode_info_context++; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; u_ptr += post->uv_stride * 8 - post->uv_width; v_ptr += post->uv_stride * 8 - post->uv_width; - mbd->mode_info_context++; /* Skip border mb */ + mode_info_context++; /* Skip border mb */ } } - void vp8_loop_filter_frame_yonly ( VP8_COMMON *cm, @@ -399,49 +399,28 @@ void vp8_loop_filter_frame_yonly { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - int i; unsigned char *y_ptr; int mb_row; int mb_col; - loop_filter_info *lfi = cm->lf_info; - int baseline_filter_level[MAX_MB_SEGMENTS]; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + int filter_level; - int alt_flt_enabled = mbd->segmentation_enabled; FRAME_TYPE frame_type = cm->frame_type; - (void) sharpness_lvl; + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; - /*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */ - mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */ + sharpness_lvl = cm->sharpness_level; - /* Note the baseline filter values for each segment */ - if (alt_flt_enabled) - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - /* Delta Value */ - else - { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ - } - } - } - else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif /* Initialize the loop filter for this frame. */ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, sharpness_lvl); /* Set up the buffer pointers */ y_ptr = post->y_buffer; @@ -451,44 +430,75 @@ void vp8_loop_filter_frame_yonly { for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; - /* Apply any context driven MB level adjustment */ - filter_level = vp8_adjust_mb_lf_value(mbd, filter_level); + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - /* don't apply across umv border */ - if (mb_row > 0) - cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; - mbd->mode_info_context ++; /* step to next MB */ + mode_info_context ++; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; - mbd->mode_info_context ++; /* Skip border mb */ + mode_info_context ++; /* Skip border mb */ } } - void vp8_loop_filter_partial_frame ( VP8_COMMON *cm, @@ -500,25 +510,32 @@ void vp8_loop_filter_partial_frame { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - int i; unsigned char *y_ptr; int mb_row; int mb_col; - /*int mb_rows = post->y_height >> 4;*/ int mb_cols = post->y_width >> 4; - int linestocopy; + int linestocopy, i; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; - loop_filter_info *lfi = cm->lf_info; - int baseline_filter_level[MAX_MB_SEGMENTS]; int filter_level; int alt_flt_enabled = mbd->segmentation_enabled; FRAME_TYPE frame_type = cm->frame_type; - (void) sharpness_lvl; + const MODE_INFO *mode_info_context; + + int lvl_seg[MAX_MB_SEGMENTS]; + + sharpness_lvl = cm->sharpness_level; - /*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */ - mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); /* Point at base of Mb MODE_INFO list */ +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); linestocopy = (post->y_height >> (4 + Fraction)); @@ -531,29 +548,24 @@ void vp8_loop_filter_partial_frame if (alt_flt_enabled) { for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ + { /* Abs value */ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + { + lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + } /* Delta Value */ else { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ + lvl_seg[i] = default_filt_lvl + + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + lvl_seg[i] = (lvl_seg[i] > 0) ? + ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0; } } } else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } + lvl_seg[0] = default_filt_lvl; - /* Initialize the loop filter for this frame. */ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); /* Set up the buffer pointers */ y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride; @@ -563,32 +575,64 @@ void vp8_loop_filter_partial_frame { for (mb_col = 0; mb_col < mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + if (alt_flt_enabled) + filter_level = lvl_seg[mode_info_context->mbmi.segment_id]; + else + filter_level = lvl_seg[0]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; - mbd->mode_info_context += 1; /* step to next MB */ + mode_info_context += 1; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; - mbd->mode_info_context += 1; /* Skip border mb */ + mode_info_context += 1; /* Skip border mb */ } } diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h index ca136b3a4..2d6dad306 100644 --- a/vp8/common/loopfilter.h +++ b/vp8/common/loopfilter.h @@ -13,6 +13,7 @@ #define loopfilter_h #include "vpx_ports/mem.h" +#include "vpx_config.h" #define MAX_LOOP_FILTER 63 @@ -22,27 +23,46 @@ typedef enum SIMPLE_LOOPFILTER = 1 } LOOPFILTERTYPE; -/* FRK - * Need to align this structure so when it is declared and +#if ARCH_ARM +#define SIMD_WIDTH 1 +#else +#define SIMD_WIDTH 16 +#endif + +/* Need to align this structure so when it is declared and * passed it can be loaded into vector registers. */ typedef struct { - DECLARE_ALIGNED(16, signed char, lim[16]); - DECLARE_ALIGNED(16, signed char, flim[16]); - DECLARE_ALIGNED(16, signed char, thr[16]); - DECLARE_ALIGNED(16, signed char, mbflim[16]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]); + unsigned char lvl[4][4][4]; + unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1]; + unsigned char mode_lf_lut[10]; +} loop_filter_info_n; + +typedef struct +{ + const unsigned char * mblim; + const unsigned char * blim; + const unsigned char * lim; + const unsigned char * hev_thr; } loop_filter_info; #define prototype_loopfilter(sym) \ - void sym(unsigned char *src, int pitch, const signed char *flimit,\ - const signed char *limit, const signed char *thresh, int count) + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) #define prototype_loopfilter_block(sym) \ - void sym(unsigned char *y, unsigned char *u, unsigned char *v,\ + void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ int ystride, int uv_stride, loop_filter_info *lfi) +#define prototype_simple_loopfilter(sym) \ + void sym(unsigned char *y, int ystride, const unsigned char *blimit) + #if ARCH_X86 || ARCH_X86_64 #include "x86/loopfilter_x86.h" #endif @@ -71,38 +91,39 @@ extern prototype_loopfilter_block(vp8_lf_normal_mb_h); #endif extern prototype_loopfilter_block(vp8_lf_normal_b_h); - #ifndef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_mb_v); +extern prototype_simple_loopfilter(vp8_lf_simple_mb_v); #ifndef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_b_v); +extern prototype_simple_loopfilter(vp8_lf_simple_b_v); #ifndef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_mb_h); +extern prototype_simple_loopfilter(vp8_lf_simple_mb_h); #ifndef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_b_h); +extern prototype_simple_loopfilter(vp8_lf_simple_b_h); typedef prototype_loopfilter_block((*vp8_lf_block_fn_t)); +typedef prototype_simple_loopfilter((*vp8_slf_block_fn_t)); + typedef struct { vp8_lf_block_fn_t normal_mb_v; vp8_lf_block_fn_t normal_b_v; vp8_lf_block_fn_t normal_mb_h; vp8_lf_block_fn_t normal_b_h; - vp8_lf_block_fn_t simple_mb_v; - vp8_lf_block_fn_t simple_b_v; - vp8_lf_block_fn_t simple_mb_h; - vp8_lf_block_fn_t simple_b_h; + vp8_slf_block_fn_t simple_mb_v; + vp8_slf_block_fn_t simple_b_v; + vp8_slf_block_fn_t simple_mb_h; + vp8_slf_block_fn_t simple_b_h; } vp8_loopfilter_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT @@ -115,9 +136,9 @@ typedef void loop_filter_uvfunction ( unsigned char *u, /* source pointer */ int p, /* pitch */ - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, unsigned char *v ); diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c index 694052924..10228ae09 100644 --- a/vp8/common/loopfilter_filters.c +++ b/vp8/common/loopfilter_filters.c @@ -24,8 +24,9 @@ static __inline signed char vp8_signed_char_clamp(int t) /* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_filter_mask(signed char limit, signed char flimit, - uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3) +static __inline signed char vp8_filter_mask(uc limit, uc blimit, + uc p3, uc p2, uc p1, uc p0, + uc q0, uc q1, uc q2, uc q3) { signed char mask = 0; mask |= (abs(p3 - p2) > limit) * -1; @@ -34,13 +35,13 @@ static __inline signed char vp8_filter_mask(signed char limit, signed char flimi mask |= (abs(q1 - q0) > limit) * -1; mask |= (abs(q2 - q1) > limit) * -1; mask |= (abs(q3 - q2) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = ~mask; return mask; } /* is there high variance internal edge ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1) +static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) { signed char hev = 0; hev |= (abs(p1 - p0) > thresh) * -1; @@ -48,7 +49,8 @@ static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, return hev; } -static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1) +static __inline void vp8_filter(signed char mask, uc hev, uc *op1, + uc *op0, uc *oq0, uc *oq1) { signed char ps0, qs0; @@ -98,9 +100,9 @@ void vp8_loop_filter_horizontal_edge_c ( unsigned char *s, int p, /* pitch */ - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -113,11 +115,11 @@ void vp8_loop_filter_horizontal_edge_c */ do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4*p], s[-3*p], s[-2*p], s[-1*p], s[0*p], s[1*p], s[2*p], s[3*p]); - hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); @@ -130,9 +132,9 @@ void vp8_loop_filter_vertical_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -145,10 +147,10 @@ void vp8_loop_filter_vertical_edge_c */ do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); - hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); vp8_filter(mask, hev, s - 2, s - 1, s, s + 1); @@ -157,7 +159,7 @@ void vp8_loop_filter_vertical_edge_c while (++i < count * 8); } -static __inline void vp8_mbfilter(signed char mask, signed char hev, +static __inline void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2) { signed char s, u; @@ -216,9 +218,9 @@ void vp8_mbloop_filter_horizontal_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -232,11 +234,11 @@ void vp8_mbloop_filter_horizontal_edge_c do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4*p], s[-3*p], s[-2*p], s[-1*p], s[0*p], s[1*p], s[2*p], s[3*p]); - hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); @@ -251,9 +253,9 @@ void vp8_mbloop_filter_vertical_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -264,10 +266,10 @@ void vp8_mbloop_filter_vertical_edge_c do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); - hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); @@ -278,13 +280,13 @@ void vp8_mbloop_filter_vertical_edge_c } /* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1) +static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1) { /* Why does this cause problems for win32? * error C2143: syntax error : missing ';' before 'type' * (void) limit; */ - signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1; + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; return mask; } @@ -317,47 +319,37 @@ void vp8_loop_filter_simple_horizontal_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, - int count + const unsigned char *blimit ) { signed char mask = 0; int i = 0; - (void) thresh; do { - /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/ - mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } - while (++i < count * 8); + while (++i < 16); } void vp8_loop_filter_simple_vertical_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, - int count + const unsigned char *blimit ) { signed char mask = 0; int i = 0; - (void) thresh; do { - /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/ - mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]); + mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); s += p; } - while (++i < count * 8); + while (++i < 16); } diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index a381dfe87..4356b5133 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -83,6 +83,7 @@ typedef struct VP8_COMMON_RTCD } VP8_COMMON_RTCD; typedef struct VP8Common + { struct vpx_internal_error_info error; @@ -107,7 +108,8 @@ typedef struct VP8Common YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG temp_scale_frame; - FRAME_TYPE last_frame_type; /* Save last frame's frame type for loopfilter init checking and motion search. */ + + FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ FRAME_TYPE frame_type; int show_frame; @@ -148,11 +150,9 @@ typedef struct VP8Common INTERPOLATIONFILTERTYPE mcomp_filter_type; LOOPFILTERTYPE last_filter_type; LOOPFILTERTYPE filter_type; - loop_filter_info lf_info[MAX_LOOP_FILTER+1]; - prototype_loopfilter_block((*lf_mbv)); - prototype_loopfilter_block((*lf_mbh)); - prototype_loopfilter_block((*lf_bv)); - prototype_loopfilter_block((*lf_bh)); + + loop_filter_info_n lf_info; + int filter_level; int last_sharpness_level; int sharpness_level; @@ -205,10 +205,9 @@ typedef struct VP8Common struct postproc_state postproc_state; } VP8_COMMON; - -int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level); -void vp8_init_loop_filter(VP8_COMMON *cm); -void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type); -extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); +void vp8_loop_filter_init(VP8_COMMON *cm); +void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, + int default_filt_lvl, int sharpness_lvl); +void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); #endif diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm index c6c215c3c..ad47284cf 100644 --- a/vp8/common/x86/loopfilter_mmx.asm +++ b/vp8/common/x86/loopfilter_mmx.asm @@ -16,7 +16,7 @@ ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -122,12 +122,10 @@ next8_h: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit - movq mm2, [rdx] ; flimit mm2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm7, mm2 ; flimit * 2 + limit (less than 255) + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm5 pxor mm5, mm5 pcmpeqb mm1, mm5 ; mask mm1 @@ -230,7 +228,7 @@ next8_h: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -406,9 +404,9 @@ next8_v: pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw mm5, 1 ; abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; + mov rdx, arg(2) ;blimit ; - movq mm2, [rdx] ;flimit mm2 + movq mm4, [rdx] ;blimit movq mm1, mm3 ; mm1=mm3=p0 movq mm7, mm6 ; mm7=mm6=q0 @@ -419,10 +417,7 @@ next8_v: paddusb mm1, mm1 ; abs(q0-p0)*2 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm4, mm2 ; flimit * 2 + limit (less than 255) - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm0; ; mask pxor mm0, mm0 @@ -603,7 +598,7 @@ next8_v: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -719,17 +714,15 @@ next8_mbh: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit - movq mm2, [rdx] ; flimit mm2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm7, mm2 ; flimit * 2 + limit (less than 255) + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm5 pxor mm5, mm5 pcmpeqb mm1, mm5 ; mask mm1 - ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) ; mm6 = p0, ; calculate high edge variance @@ -922,7 +915,7 @@ next8_mbh: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1108,9 +1101,9 @@ next8_mbv: pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw mm5, 1 ; abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; + mov rdx, arg(2) ;blimit ; - movq mm2, [rdx] ;flimit mm2 + movq mm4, [rdx] ;blimit movq mm1, mm3 ; mm1=mm3=p0 movq mm7, mm6 ; mm7=mm6=q0 @@ -1121,10 +1114,7 @@ next8_mbv: paddusb mm1, mm1 ; abs(q0-p0)*2 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm4, mm2 ; flimit * 2 + limit (less than 255) - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm0; ; mask pxor mm0, mm0 @@ -1392,16 +1382,13 @@ next8_mbv: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit ;) global sym(vp8_loop_filter_simple_horizontal_edge_mmx) sym(vp8_loop_filter_simple_horizontal_edge_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 GET_GOT rbx push rsi push rdi @@ -1410,14 +1397,10 @@ sym(vp8_loop_filter_simple_horizontal_edge_mmx): mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - movsxd rcx, dword ptr arg(5) ;count + mov rcx, 2 ; count nexts8_h: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit ; get blimit movq mm3, [rdx] ; - paddb mm3, mm3 ; flimit*2 (less than 255) - paddb mm3, mm7 ; flimit * 2 + limit (less than 255) mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -1445,7 +1428,7 @@ nexts8_h: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor mm3, mm3 pcmpeqb mm5, mm3 @@ -1515,16 +1498,13 @@ nexts8_h: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit ;) global sym(vp8_loop_filter_simple_vertical_edge_mmx) sym(vp8_loop_filter_simple_vertical_edge_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 GET_GOT rbx push rsi push rdi @@ -1539,7 +1519,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx): movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? lea rsi, [rsi + rax*4- 2]; ; - movsxd rcx, dword ptr arg(5) ;count + mov rcx, 2 ; count nexts8_v: lea rdi, [rsi + rax]; @@ -1602,14 +1582,10 @@ nexts8_v: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit ; get blimit movq mm7, [rdx] - mov rdx, arg(3) ; get limit - movq mm6, [rdx] - paddb mm7, mm7 ; flimit*2 (less than 255) - paddb mm7, mm6 ; flimit * 2 + limit (less than 255) - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor mm7, mm7 pcmpeqb mm5, mm7 ; mm5 = mask diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index c2ce1a106..4efff7eb5 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -110,7 +110,7 @@ psubusb xmm6, xmm5 ; p1-=p0 por xmm6, xmm4 ; abs(p1 - p0) - mov rdx, arg(2) ; get flimit + mov rdx, arg(2) ; get blimit movdqa t1, xmm6 ; save to t1 @@ -123,7 +123,7 @@ psubusb xmm1, xmm7 por xmm2, xmm3 ; abs(p1-q1) - movdqa xmm4, XMMWORD PTR [rdx] ; flimit + movdqa xmm7, XMMWORD PTR [rdx] ; blimit movdqa xmm3, xmm0 ; q0 pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero @@ -134,13 +134,11 @@ psrlw xmm2, 1 ; abs(p1-q1)/2 psubusb xmm5, xmm3 ; p0-=q0 - paddb xmm4, xmm4 ; flimit*2 (less than 255) psubusb xmm3, xmm6 ; q0-=p0 por xmm5, xmm3 ; abs(p0 - q0) paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddb xmm7, xmm4 ; flimit * 2 + limit (less than 255) movdqa xmm4, t0 ; hev get abs (q1 - q0) @@ -150,7 +148,7 @@ movdqa xmm2, XMMWORD PTR [rdx] ; hev - psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit psubusb xmm4, xmm2 ; hev psubusb xmm3, xmm2 ; hev @@ -278,7 +276,7 @@ ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -328,7 +326,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -574,7 +572,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -624,7 +622,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): ;( ; unsigned char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -904,7 +902,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): movdqa xmm4, XMMWORD PTR [rdx]; limit pmaxub xmm0, xmm7 - mov rdx, arg(2) ; flimit + mov rdx, arg(2) ; blimit psubusb xmm0, xmm4 movdqa xmm5, xmm2 ; q1 @@ -921,12 +919,11 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): psrlw xmm5, 1 ; abs(p1-q1)/2 psubusb xmm6, xmm3 ; q0-p0 - movdqa xmm2, XMMWORD PTR [rdx]; flimit + movdqa xmm4, XMMWORD PTR [rdx]; blimit mov rdx, arg(4) ; get thresh por xmm1, xmm6 ; abs(q0-p0) - paddb xmm2, xmm2 ; flimit*2 (less than 255) movdqa xmm6, t0 ; get abs (q1 - q0) @@ -939,10 +936,9 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh - paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255) psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh por xmm1, xmm0 ; mask @@ -1014,7 +1010,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1081,7 +1077,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): ;( ; unsigned char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -1239,7 +1235,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1308,7 +1304,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): ;( ; unsigned char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -1376,16 +1372,13 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit, ;) global sym(vp8_loop_filter_simple_horizontal_edge_sse2) sym(vp8_loop_filter_simple_horizontal_edge_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 SAVE_XMM 7 GET_GOT rbx push rsi @@ -1394,13 +1387,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit movdqa xmm3, XMMWORD PTR [rdx] - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - paddb xmm3, xmm3 ; flimit*2 (less than 255) - paddb xmm3, xmm7 ; flimit * 2 + limit (less than 255) mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -1428,7 +1416,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm3, xmm3 pcmpeqb xmm5, xmm3 @@ -1493,16 +1481,13 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit, ;) global sym(vp8_loop_filter_simple_vertical_edge_sse2) sym(vp8_loop_filter_simple_vertical_edge_sse2): push rbp ; save old base pointer value. mov rbp, rsp ; set new base pointer value. - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 SAVE_XMM 7 GET_GOT rbx ; save callee-saved reg push rsi @@ -1607,14 +1592,10 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit + mov rdx, arg(2) ;blimit movdqa xmm7, XMMWORD PTR [rdx] - mov rdx, arg(3) ; get limit - movdqa xmm6, XMMWORD PTR [rdx] - paddb xmm7, xmm7 ; flimit*2 (less than 255) - paddb xmm7, xmm6 ; flimit * 2 + limit (less than 255) - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm7, xmm7 pcmpeqb xmm5, xmm7 ; mm5 = mask diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c index a52420c98..9360ac17c 100644 --- a/vp8/common/x86/loopfilter_x86.c +++ b/vp8/common/x86/loopfilter_x86.c @@ -9,30 +9,18 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "vp8/common/loopfilter.h" -prototype_loopfilter(vp8_loop_filter_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_vertical_edge_c); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c); -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c); - prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2); prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2); prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2); prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2); -prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2); extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; @@ -44,23 +32,13 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - - -void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } @@ -68,23 +46,13 @@ void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - - -void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } @@ -92,27 +60,23 @@ void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit); } @@ -120,27 +84,23 @@ void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); } #endif @@ -150,20 +110,10 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - - -void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); } @@ -171,20 +121,10 @@ void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - - -void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); } @@ -192,24 +132,20 @@ void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride); + vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride); } -void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit); } @@ -217,36 +153,20 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4); + vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4); } -void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); } #endif - -#if 0 -void vp8_fast_loop_filter_vertical_edges_sse(unsigned char *y_ptr, - int y_stride, - loop_filter_info *lfi) -{ - - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); -} -#endif diff --git a/vp8/common/x86/loopfilter_x86.h b/vp8/common/x86/loopfilter_x86.h index 80dbebc8d..1ed6c213f 100644 --- a/vp8/common/x86/loopfilter_x86.h +++ b/vp8/common/x86/loopfilter_x86.h @@ -24,10 +24,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_mmx); extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx); extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx); extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_mmx); #if !CONFIG_RUNTIME_CPU_DETECT @@ -44,13 +44,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx); #define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_mmx +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_mmx #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_mmx +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_mmx #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx @@ -63,10 +63,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_sse2); extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2); extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2); extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_sse2); #if !CONFIG_RUNTIME_CPU_DETECT @@ -83,13 +83,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2); #define vp8_lf_normal_b_h vp8_loop_filter_bh_sse2 #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_sse2 +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_sse2 #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2 #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_sse2 +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_sse2 #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2 diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index 87374f3c6..33a984b79 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -9,7 +9,7 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "vpx_ports/x86.h" #include "vp8/common/g_common.h" #include "vp8/common/subpixel.h" @@ -63,9 +63,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_mmx; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_mmx; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_mmx; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_mmx; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_mmx; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_mmx; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_mmx; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_mmx; #if CONFIG_POSTPROC @@ -101,9 +101,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_sse2; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_sse2; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_sse2; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_sse2; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_sse2; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_sse2; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_sse2; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_sse2; #if CONFIG_POSTPROC |