summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--vp8/common/loopfilter_filters.c1
-rw-r--r--vp8/common/rtcd_defs.sh4
-rw-r--r--vp8/common/x86/loopfilter_mmx.asm467
-rw-r--r--vp8/common/x86/loopfilter_sse2.asm414
-rw-r--r--vp8/common/x86/loopfilter_x86.c133
-rw-r--r--vp8/vp8_common.mk1
6 files changed, 122 insertions, 898 deletions
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
index 8ed5fc60f..323d48de8 100644
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -271,7 +271,6 @@ void vp8_mbloop_filter_horizontal_edge_c
} while (++i < count * 8);
}
-
void vp8_mbloop_filter_vertical_edge_c
(
unsigned char *s,
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index 8d7318007..1d5ae5bf0 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -125,13 +125,13 @@ specialize vp8_comp_intra_uv4x4_predict;
# Loopfilter
#
prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbv;
+specialize vp8_loop_filter_mbv sse2
prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_bv;
prototype void vp8_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bv8x8;
+specialize vp8_loop_filter_bv8x8 sse2
prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp8_loop_filter_mbh sse2
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index ab1525fd6..63b72385b 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -594,473 +594,6 @@ sym(vp8_loop_filter_vertical_edge_mmx):
ret
-;void vp8_mbloop_filter_vertical_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_mbloop_filter_vertical_edge_mmx)
-sym(vp8_mbloop_filter_vertical_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi + rax*4 - 4]
-
- movsxd rcx, dword ptr arg(5) ;count
-.next8_mbv:
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-
- ;transpose
- movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
- movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
-
- movq mm7, mm6 ; 77 76 75 74 73 72 71 70
- punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
-
- punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
- movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
-
- movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
- movq mm5, mm4 ; 47 46 45 44 43 42 41 40
-
- punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
- punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
-
- movq mm3, mm5 ; 57 47 56 46 55 45 54 44
- punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
-
- punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
- movq mm2, mm4 ; 53 43 52 42 51 41 50 40
-
- punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
- punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
-
- neg rax
-
- movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
- movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
-
- movq mm1, mm6 ; 27 26 25 24 23 22 21 20
- punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24
-
- punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
-
- movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
- punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
-
- movq mm0, mm7 ; 17 07 16 06 15 05 14 04
- punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
-
- punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
- movq mm6, mm7 ; 37 27 17 07 36 26 16 06
-
- punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
- punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
-
- lea rdx, srct
- movq mm5, mm6 ; 76 66 56 46 36 26 16 06
-
- movq [rdx+56], mm7
- psubusb mm5, mm7 ; q2-q3
-
-
- movq [rdx+48], mm6
- psubusb mm7, mm6 ; q3-q2
-
- por mm7, mm5; ; mm7=abs (q3-q2)
- movq mm5, mm0 ; 35 25 15 05 34 24 14 04
-
- punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
- punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
-
- movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
- psubusb mm3, mm6 ; q1-q2
-
- psubusb mm6, mm5 ; q2-q1
- por mm6, mm3 ; mm6=abs(q2-q1)
-
- movq [rdx+40], mm5 ; save q1
- movq [rdx+32], mm0 ; save q0
-
- movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
- punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
-
- movq mm0, mm3 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 31 21 11 01 30 20 10 00
-
- punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
- punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
-
- movq [rdx], mm0 ; save p3
- movq [rdx+8], mm1 ; save p2
-
- movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
- psubusb mm2, mm0 ; p2-p3
-
- psubusb mm0, mm1 ; p3-p2
- por mm0, mm2 ; mm0=abs(p3-p2)
-
- movq mm2, mm3 ; 33 23 13 03 32 22 12 02
- punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
-
- punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
- movq [rdx+24], mm3 ; save p0
-
- movq [rdx+16], mm2 ; save p1
- movq mm5, mm2 ; mm5 = p1
-
- psubusb mm2, mm1 ; p1-p2
- psubusb mm1, mm5 ; p2-p1
-
- por mm1, mm2 ; mm1=abs(p2-p1)
- mov rdx, arg(3) ;limit
-
- movq mm4, [rdx] ; mm4 = limit
- psubusb mm7, mm4 ; abs(q3-q2) > limit
-
- psubusb mm0, mm4 ; abs(p3-p2) > limit
- psubusb mm1, mm4 ; abs(p2-p1) > limit
-
- psubusb mm6, mm4 ; abs(q2-q1) > limit
- por mm7, mm6 ; or
-
- por mm0, mm1 ;
- por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movq mm1, mm5 ; p1
-
- movq mm7, mm3 ; mm3=mm7=p0
- psubusb mm7, mm5 ; p0 - p1
-
- psubusb mm5, mm3 ; p1 - p0
- por mm5, mm7 ; abs(p1-p0)
-
- movq t0, mm5 ; save abs(p1-p0)
- lea rdx, srct
-
- psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
- por mm0, mm5 ; mm0=mask
-
- movq mm5, [rdx+32] ; mm5=q0
- movq mm7, [rdx+40] ; mm7=q1
-
- movq mm6, mm5 ; mm6=q0
- movq mm2, mm7 ; q1
- psubusb mm5, mm7 ; q0-q1
-
- psubusb mm7, mm6 ; q1-q0
- por mm7, mm5 ; abs(q1-q0)
-
- movq t1, mm7 ; save abs(q1-q0)
- psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit
-
- por mm0, mm7 ; mask
-
- movq mm5, mm2 ; q1
- psubusb mm5, mm1 ; q1-=p1
- psubusb mm1, mm2 ; p1-=q1
- por mm5, mm1 ; abs(p1-q1)
- pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ;
-
- movq mm4, [rdx] ;blimit
- movq mm1, mm3 ; mm1=mm3=p0
-
- movq mm7, mm6 ; mm7=mm6=q0
- psubusb mm1, mm7 ; p0-q0
-
- psubusb mm7, mm3 ; q0-p0
- por mm1, mm7 ; abs(q0-p0)
- paddusb mm1, mm1 ; abs(q0-p0)*2
- paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm0; ; mask
-
- pxor mm0, mm0
- pcmpeqb mm1, mm0
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx]
- ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7 ; abs(q1 - q0) > thresh
-
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7 ; abs(p1 - p0)> thresh
-
- por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb mm4, mm0
-
- pcmpeqb mm0, mm0
- pxor mm4, mm0
-
-
-
-
- ; start work on filters
- lea rdx, srct
-
- ; start work on filters
- movq mm2, [rdx+16] ; p1
- movq mm7, [rdx+40] ; q1
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
-
- movq mm6, [rdx+24] ; p0
- movq mm0, [rdx+32] ; q0
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
-
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0)
- paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
-
- ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
- movq mm2, mm1 ; vp8_filter
- pand mm2, mm4; ; Filter2 = vp8_filter & hev
-
- movq mm5, mm2 ;
- paddsb mm5, [GLOBAL(t3)];
-
- pxor mm0, mm0 ; 0
- pxor mm7, mm7 ; 0
-
- punpcklbw mm0, mm5 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- punpckhbw mm7, mm5 ; a0b0c0d0
- psraw mm7, 11 ; sign extended shift right by 3
- packsswb mm0, mm7 ; Filter2 >>=3;
-
- movq mm5, mm0 ; Filter2
-
- paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
- pxor mm0, mm0 ; 0
- pxor mm7, mm7 ; 0
-
- punpcklbw mm0, mm2 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- punpckhbw mm7, mm2 ; a0b0c0d0
- psraw mm7, 11 ; sign extended shift right by 3
- packsswb mm0, mm7 ; Filter2 >>=3;
-
- ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
- psubsb mm3, mm0 ; qs0 =qs0 - filter1
- paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
-
- ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
- ; vp8_filter &= ~hev;
- ; Filter2 = vp8_filter;
- pandn mm4, mm1 ; vp8_filter&=~hev
-
-
- ; mm3=qs0, mm4=filter2, mm6=ps0
-
- ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
- ; s = vp8_signed_char_clamp(qs0 - u);
- ; *oq0 = s^0x80;
- ; s = vp8_signed_char_clamp(ps0 + u);
- ; *op0 = s^0x80;
- pxor mm0, mm0
-
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s27)]
- pmulhw mm2, [GLOBAL(s27)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
- psubsb mm3, mm1
- paddsb mm6, mm1
-
- pxor mm3, [GLOBAL(t80)]
- pxor mm6, [GLOBAL(t80)]
- movq [rdx+24], mm6
- movq [rdx+32], mm3
-
- ; roughly 2/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
- ; s = vp8_signed_char_clamp(qs1 - u);
- ; *oq1 = s^0x80;
- ; s = vp8_signed_char_clamp(ps1 + u);
- ; *op1 = s^0x80;
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s18)]
- pmulhw mm2, [GLOBAL(s18)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
- movq mm3, [rdx + 40]
- movq mm6, [rdx + 16] ; p1
- pxor mm3, [GLOBAL(t80)]
- pxor mm6, [GLOBAL(t80)]
-
- paddsb mm6, mm1
- psubsb mm3, mm1
-
- pxor mm6, [GLOBAL(t80)]
- pxor mm3, [GLOBAL(t80)]
- movq [rdx + 40], mm3
- movq [rdx + 16], mm6
-
- ; roughly 1/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
- ; s = vp8_signed_char_clamp(qs2 - u);
- ; *oq2 = s^0x80;
- ; s = vp8_signed_char_clamp(ps2 + u);
- ; *op2 = s^0x80;
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s9)]
- pmulhw mm2, [GLOBAL(s9)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
- movq mm6, [rdx+ 8]
- movq mm3, [rdx+48]
-
- pxor mm6, [GLOBAL(t80)]
- pxor mm3, [GLOBAL(t80)]
-
- paddsb mm6, mm1
- psubsb mm3, mm1
-
- pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
- pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
-
- ; tranpose and write back
- movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
- movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
-
- punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
- punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40
-
- movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
- movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02
-
- punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
- punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42
-
- movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
- punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
-
- punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
- movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
-
- punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
- punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
-
- movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
- punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
-
- movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06
- punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
-
- movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
- punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
-
- punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
- movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
-
- punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
- punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
-
- movq [rsi+rax*4], mm0 ; write out
- movq [rdi+rax*4], mm6 ; write out
-
- movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
- punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20
-
- punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
- movq [rsi+rax*2], mm0 ; write out
-
- movq [rdi+rax*2], mm5 ; write out
- movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
-
- punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44
- punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
-
- movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44
- punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
-
- punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
- movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
-
- movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60
- punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
-
- punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
- movq [rsi], mm0 ; write out
-
- movq [rdi], mm1 ; write out
- neg rax
-
- punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
- punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60
-
- movq [rsi+rax*2], mm3
- movq [rdi+rax*2], mm4
-
- lea rsi, [rsi+rax*8]
- dec rcx
-
- jnz .next8_mbv
-
- add rsp, 96
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
;void vp8_loop_filter_simple_horizontal_edge_mmx
;(
; unsigned char *src_ptr,
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index fed819688..6f6531c86 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -380,193 +380,6 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
ret
-%macro MB_FILTER_AND_WRITEBACK 1
-%if %1 == 0
- movdqa xmm2, p1 ; p1
- movdqa xmm7, q1 ; q1
-%elif %1 == 1
- movdqa xmm2, [rsi+2*rax] ; p1
- movdqa xmm7, [rdi] ; q1
-
- mov rcx, rax
- neg rcx
-%elif %1 == 2
- lea rdx, srct
-
- movdqa xmm2, [rdx+32] ; p1
- movdqa xmm7, [rdx+80] ; q1
- movdqa xmm6, [rdx+48] ; p0
- movdqa xmm0, [rdx+64] ; q0
-%endif
-
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- movdqa xmm3, xmm0 ; q0
-
- psubsb xmm0, xmm6 ; q0 - p0
-
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
-
- paddsb xmm2, xmm0 ; 2 * (q0 - p0)
-
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
-
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1 ; vp8_filter
-
- pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
- pxor xmm0, xmm0
-
- pandn xmm4, xmm1 ; vp8_filter&=~hev
- pxor xmm1, xmm1
-
- punpcklbw xmm0, xmm4 ; Filter 2 (hi)
- movdqa xmm5, xmm2
-
- punpckhbw xmm1, xmm4 ; Filter 2 (lo)
- paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
-
- pmulhw xmm1, [GLOBAL(s9)] ; Filter 2 (lo) * 9
-
- pmulhw xmm0, [GLOBAL(s9)] ; Filter 2 (hi) * 9
-
- punpckhbw xmm7, xmm5 ; axbxcxdx
- paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
-
- punpcklbw xmm5, xmm5 ; exfxgxhx
- psraw xmm7, 11 ; sign extended shift right by 3
-
- psraw xmm5, 11 ; sign extended shift right by 3
- punpckhbw xmm4, xmm2 ; axbxcxdx
-
- punpcklbw xmm2, xmm2 ; exfxgxhx
- psraw xmm4, 11 ; sign extended shift right by 3
-
- packsswb xmm5, xmm7 ; Filter2 >>=3;
- psraw xmm2, 11 ; sign extended shift right by 3
-
- packsswb xmm2, xmm4 ; Filter1 >>=3;
- movdqa xmm7, xmm1
-
- paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
- movdqa xmm4, xmm1
-
- psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1
- movdqa xmm5, xmm0
-
- movdqa xmm2, xmm5
- paddw xmm0, [GLOBAL(s63)] ; Filter 2 (hi) * 9 + 63
-
- paddw xmm1, [GLOBAL(s63)] ; Filter 2 (lo) * 9 + 63
- paddw xmm5, xmm5 ; Filter 2 (hi) * 18
-
- paddw xmm7, xmm7 ; Filter 2 (lo) * 18
- paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
-
- paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
- paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
-
- paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
- psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
-
- psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
- psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
-
- packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
- psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
-
- psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
- packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
-
- psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
-
- packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
-
- psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
- paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3)
-
-%if %1 == 0
- movdqa xmm5, q2 ; q2
- movdqa xmm1, q1 ; q1
- movdqa xmm4, p1 ; p1
- movdqa xmm7, p2 ; p2
-
-%elif %1 == 1
- movdqa xmm5, XMMWORD PTR [rdi+rcx] ; q2
- movdqa xmm1, XMMWORD PTR [rdi] ; q1
- movdqa xmm4, XMMWORD PTR [rsi+rax*2] ; p1
- movdqa xmm7, XMMWORD PTR [rdi+rax*4] ; p2
-%elif %1 == 2
- movdqa xmm5, XMMWORD PTR [rdx+96] ; q2
- movdqa xmm1, XMMWORD PTR [rdx+80] ; q1
- movdqa xmm4, XMMWORD PTR [rdx+32] ; p1
- movdqa xmm7, XMMWORD PTR [rdx+16] ; p2
-%endif
-
- pxor xmm3, [GLOBAL(t80)] ; *oq0 = sq^0x80
- pxor xmm6, [GLOBAL(t80)] ; *oq0 = sp^0x80
-
- pxor xmm1, [GLOBAL(t80)]
- pxor xmm4, [GLOBAL(t80)]
-
- psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
- paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2)
-
- pxor xmm1, [GLOBAL(t80)] ; *oq1 = sq^0x80;
- pxor xmm4, [GLOBAL(t80)] ; *op1 = sp^0x80;
-
- pxor xmm7, [GLOBAL(t80)]
- pxor xmm5, [GLOBAL(t80)]
-
- paddsb xmm7, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u)
- psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
-
- pxor xmm7, [GLOBAL(t80)] ; *op2 = sp^0x80;
- pxor xmm5, [GLOBAL(t80)] ; *oq2 = sq^0x80;
-
-%if %1 == 0
- lea rsi, [rsi+rcx*2]
- lea rdi, [rdi+rcx*2]
-
- movq MMWORD PTR [rsi], xmm6 ; p0
- movhps MMWORD PTR [rdi], xmm6
- movq MMWORD PTR [rsi + rcx], xmm3 ; q0
- movhps MMWORD PTR [rdi + rcx], xmm3
-
- movq MMWORD PTR [rsi+rcx*2], xmm1 ; q1
- movhps MMWORD PTR [rdi+rcx*2], xmm1
-
- movq MMWORD PTR [rsi + rax], xmm4 ; p1
- movhps MMWORD PTR [rdi + rax], xmm4
-
- movq MMWORD PTR [rsi+rax*2], xmm7 ; p2
- movhps MMWORD PTR [rdi+rax*2], xmm7
-
- lea rsi, [rsi + rcx]
- lea rdi, [rdi + rcx]
- movq MMWORD PTR [rsi+rcx*2], xmm5 ; q2
- movhps MMWORD PTR [rdi+rcx*2], xmm5
-%elif %1 == 1
- movdqa XMMWORD PTR [rdi+rcx], xmm5 ; q2
- movdqa XMMWORD PTR [rdi], xmm1 ; q1
- movdqa XMMWORD PTR [rsi], xmm3 ; q0
- movdqa XMMWORD PTR [rsi+rax ],xmm6 ; p0
- movdqa XMMWORD PTR [rsi+rax*2],xmm4 ; p1
- movdqa XMMWORD PTR [rdi+rax*4],xmm7 ; p2
-%elif %1 == 2
- movdqa XMMWORD PTR [rdx+80], xmm1 ; q1
- movdqa XMMWORD PTR [rdx+64], xmm3 ; q0
- movdqa XMMWORD PTR [rdx+48], xmm6 ; p0
- movdqa XMMWORD PTR [rdx+32], xmm4 ; p1
-%endif
-
-%endmacro
-
%macro TRANSPOSE_16X8 2
movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
@@ -1032,233 +845,6 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
pop rbp
ret
-%macro MBV_TRANSPOSE 0
- movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
- punpcklbw xmm0, xmm7 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm7 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
- movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-
- punpckhwd xmm3, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
- punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
- movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
- movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
- punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
-
- movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
- punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
-
- punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
- movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-
- punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
- punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
-%endmacro
-
-%macro MBV_WRITEBACK_1 0
- movq QWORD PTR [rsi], xmm0
- movhps MMWORD PTR [rdi], xmm0
-
- movq QWORD PTR [rsi+2*rax], xmm6
- movhps MMWORD PTR [rdi+2*rax], xmm6
-
- movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
-
- punpckhdq xmm3, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
-
- movq QWORD PTR [rsi+4*rax], xmm0
- movhps MMWORD PTR [rdi+4*rax], xmm0
-
- movq QWORD PTR [rsi+2*rcx], xmm3
- movhps MMWORD PTR [rdi+2*rcx], xmm3
-
- movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- punpckhbw xmm5, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
- movdqa xmm0, xmm2
-
- punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
- punpckhwd xmm2, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
-
- movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
-
- punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
-%endmacro
-
-%macro MBV_WRITEBACK_2 0
- movq QWORD PTR [rsi], xmm1
- movhps MMWORD PTR [rdi], xmm1
-
- movq QWORD PTR [rsi+2*rax], xmm5
- movhps MMWORD PTR [rdi+2*rax], xmm5
-
- movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
- punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
-
- movq QWORD PTR [rsi+4*rax], xmm1
- movhps MMWORD PTR [rdi+4*rax], xmm1
-
- movq QWORD PTR [rsi+2*rcx], xmm4
- movhps MMWORD PTR [rdi+2*rcx], xmm4
-%endmacro
-
-
-;void vp8_mbloop_filter_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_mbloop_filter_vertical_edge_sse2)
-sym(vp8_mbloop_filter_vertical_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 160 ; reserve 160 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
-
- mov rsi, arg(0) ; src_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax*2+rax]
-
- ; Transpose
- TRANSPOSE_16X8 1, 0
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 0
-
- neg rax
- ; start work on filters
- MB_FILTER_AND_WRITEBACK 2
-
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
-
- ; transpose and write back
- MBV_TRANSPOSE
-
- neg rax
-
- MBV_WRITEBACK_1
-
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
- MBV_WRITEBACK_2
-
- add rsp, 160
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_mbloop_filter_vertical_edge_uv_sse2
-;(
-; unsigned char *u,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; unsigned char *v
-;)
-global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
-sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 160 ; reserve 160 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
-
- mov rsi, arg(0) ; u_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax+2*rax]
-
- lea rdx, srct
-
- ; Transpose
- TRANSPOSE_16X8 0, 0
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 0
-
- ; start work on filters
- MB_FILTER_AND_WRITEBACK 2
-
- ; transpose and write back
- MBV_TRANSPOSE
-
- mov rsi, arg(0) ;u_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax]
- MBV_WRITEBACK_1
- mov rsi, arg(5) ;v_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax]
- MBV_WRITEBACK_2
-
- add rsp, 160
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
; unsigned char *src_ptr,
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index 17fa5aa7c..1fa8ed431 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -13,17 +13,14 @@
#include "vpx_config.h"
#include "vp8/common/loopfilter.h"
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
-extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
#if HAVE_MMX
/* Horizontal MB filtering */
@@ -35,13 +32,6 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, struct loop_filter_info *lfi) {
- vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
@@ -340,6 +330,107 @@ void vp8_mbloop_filter_horizontal_edge_c_sse2
}
}
}
+/* Transposes one or more 8x8 byte tiles with SSE2 unpack operations.
+ *
+ * src                   - array of pointers, one per tile, to the top-left
+ *                         byte of each input tile
+ * in_p                  - byte distance between consecutive input rows
+ * dst                   - array of pointers, one per tile, to the top-left
+ *                         byte of each output tile
+ * out_p                 - byte distance between consecutive output rows
+ * num_8x8_to_transpose  - number of tiles to process; the loop is a
+ *                         do/while, so tile 0 is always processed
+ *
+ * Each tile is read as eight 8-byte rows and written as eight 8-byte rows.
+ * The two-digit pairs in the comments below are "row column" of the input.
+ */
+static __inline void transpose(unsigned char *src[], int in_p,
+                               unsigned char *dst[], int out_p,
+                               int num_8x8_to_transpose) {
+  int idx8x8 = 0;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+  do {
+    unsigned char *in = src[idx8x8];
+    unsigned char *out = dst[idx8x8];
+
+    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+    x0 = _mm_unpacklo_epi8(x0, x1);
+    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+    x1 = _mm_unpacklo_epi8(x2, x3);
+    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+    x2 = _mm_unpacklo_epi8(x4, x5);
+    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+    x3 = _mm_unpacklo_epi8(x6, x7);
+    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    x4 = _mm_unpacklo_epi16(x0, x1);
+    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+    x5 = _mm_unpacklo_epi16(x2, x3);
+    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    /* _mm_castsi128_pd reinterprets the integer register as packed doubles
+     * without converting any bits. A C-style cast between vector types is a
+     * compiler extension and is not accepted by every supported compiler. */
+    _mm_storel_pd((double *)(out + 0*out_p),
+                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
+    _mm_storeh_pd((double *)(out + 1*out_p),
+                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    _mm_storel_pd((double *)(out + 2*out_p),
+                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
+    _mm_storeh_pd((double *)(out + 3*out_p),
+                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
+
+    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi16(x0, x1);
+    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi16(x2, x3);
+    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 4*out_p),
+                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
+    _mm_storeh_pd((double *)(out + 5*out_p),
+                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    _mm_storel_pd((double *)(out + 6*out_p),
+                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
+    _mm_storeh_pd((double *)(out + 7*out_p),
+                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+/* Filters a vertical edge by transposing the pixels around it into a 16x16
+ * scratch buffer, running the horizontal-edge filter on that buffer, and
+ * transposing the result back into the frame.
+ *
+ * s                      - pixel pointer into the frame; the region read
+ *                          starts 5 bytes to its left, so column s becomes
+ *                          row 5 of the scratch buffer
+ * p                      - frame stride in bytes
+ * blimit, limit, thresh  - passed through unchanged to the horizontal filter
+ * count                  - passed through to the horizontal filter; also
+ *                          selects how many 8x8 tiles are transposed:
+ *                          (1 << count) going in, (1 << (count - 1)) coming
+ *                          back. Callers in this file pass 1 or 2.
+ */
+void vp8_mbloop_filter_vertical_edge_c_sse2
+(
+  unsigned char *s,
+  int p,
+  const unsigned char *blimit,
+  const unsigned char *limit,
+  const unsigned char *thresh,
+  int count
+) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
+  unsigned char *const frame_origin = s - 5;
+  unsigned char *tiles_from[4];
+  unsigned char *tiles_to[4];
+
+  /* Forward pass: frame tiles (left, right, lower-left, lower-right) ... */
+  tiles_from[0] = frame_origin;
+  tiles_from[1] = frame_origin + 8;
+  tiles_from[2] = frame_origin + p*8;
+  tiles_from[3] = frame_origin + p*8 + 8;
+
+  /* ... land in the scratch buffer with the off-diagonal tiles swapped. */
+  tiles_to[0] = t_dst;
+  tiles_to[1] = t_dst + 16*8;
+  tiles_to[2] = t_dst + 8;
+  tiles_to[3] = t_dst + 16*8 + 8;
+
+  // 16x16->16x16 or 16x8->8x16
+  transpose(tiles_from, p, tiles_to, 16, (1 << count));
+
+  /* In the scratch buffer the edge is horizontal and sits at row 5. */
+  vp8_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
+                                           thresh, count);
+
+  /* Reverse pass: only the first 8 scratch rows are copied back. */
+  tiles_from[0] = t_dst;
+  tiles_from[1] = t_dst + 8;
+
+  tiles_to[0] = frame_origin;
+  tiles_to[1] = frame_origin + p*8;
+
+  // 16x8->8x16 or 8x8->8x8
+  transpose(tiles_from, 16, tiles_to, p, (1 << (count - 1)));
+}
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -366,14 +457,28 @@ void vp8_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
}
/* Vertical MB Filtering */
-void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, struct loop_filter_info *lfi) {
- vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ struct loop_filter_info *lfi) {
+ vp8_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+ /* TODO: write sse2 version with u,v interleaved */
if (u_ptr)
- vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+ vp8_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
}
+/* Vertical 8x8 block filtering.
+ *
+ * Applies the macroblock vertical-edge filter to the luma plane only, at the
+ * column 8 pixels to the right of y_ptr, using lfi->blim, lfi->lim and
+ * lfi->hev_thr with a count of 2. u_ptr, v_ptr and uv_stride are accepted
+ * for signature compatibility but are not used here.
+ */
+void vp8_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr, int y_stride, int uv_stride,
+                                struct loop_filter_info *lfi) {
+  unsigned char *const inner_edge = y_ptr + 8;
+
+  vp8_mbloop_filter_vertical_edge_c_sse2(inner_edge, y_stride, lfi->blim,
+                                         lfi->lim, lfi->hev_thr, 2);
+}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 5e58f8d26..69df04cd9 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -120,6 +120,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
ifeq ($(HAVE_SSE2),yes)
vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2
vp8/common/x86/loopfilter_x86.c.o: CFLAGS += -msse2
+vp8/common/loopfilter_filters.c.o: CFLAGS += -msse2
endif
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c