summary | refs | log | tree | commit | diff
path: root/vp8/common
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/common')
-rw-r--r-- vp8/common/x86/loopfilter_sse2.asm 44
-rw-r--r-- vp8/common/x86/postproc_mmx.c 1508
2 files changed, 22 insertions, 1530 deletions
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 4efff7eb5..295609c58 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -1395,8 +1395,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
neg rax
; calculate mask
- movdqu xmm1, [rsi+2*rax] ; p1
- movdqu xmm0, [rdi] ; q1
+ movdqa xmm1, [rsi+2*rax] ; p1
+ movdqa xmm0, [rdi] ; q1
movdqa xmm2, xmm1
movdqa xmm7, xmm0
movdqa xmm4, xmm0
@@ -1406,8 +1406,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm1, 1 ; abs(p1-q1)/2
- movdqu xmm5, [rsi+rax] ; p0
- movdqu xmm4, [rsi] ; q0
+ movdqa xmm5, [rsi+rax] ; p0
+ movdqa xmm4, [rsi] ; q0
movdqa xmm0, xmm4 ; q0
movdqa xmm6, xmm5 ; p0
psubusb xmm5, xmm4 ; p0-=q0
@@ -1449,7 +1449,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
psubsb xmm3, xmm0 ; q0-= q0 add
pxor xmm3, [GLOBAL(t80)] ; unoffset
- movdqu [rsi], xmm3 ; write back
+ movdqa [rsi], xmm3 ; write back
; now do +3 side
psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
@@ -1465,7 +1465,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddsb xmm6, xmm0 ; p0+= p0 add
pxor xmm6, [GLOBAL(t80)] ; unoffset
- movdqu [rsi+rax], xmm6 ; write back
+ movdqa [rsi+rax], xmm6 ; write back
; begin epilog
pop rdi
@@ -1507,17 +1507,17 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
lea rdx, [rsi + rax*4]
lea rcx, [rdx + rax]
- movdqu xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
- movdqu xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
- movdqu xmm2, [rdi] ; 13 12 11 10
- movdqu xmm3, [rcx] ; 53 52 51 50
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movd xmm2, [rdi] ; 13 12 11 10
+ movd xmm3, [rcx] ; 53 52 51 50
punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
- movdqu xmm4, [rsi + rax*2] ; 23 22 21 20
- movdqu xmm5, [rdx + rax*2] ; 63 62 61 60
- movdqu xmm6, [rdi + rax*2] ; 33 32 31 30
- movdqu xmm7, [rcx + rax*2] ; 73 72 71 70
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70
punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
@@ -1540,17 +1540,17 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
lea rdx, [rsi + rax*4]
lea rcx, [rdx + rax]
- movdqu xmm4, [rsi] ; 83 82 81 80
- movdqu xmm1, [rdx] ; c3 c2 c1 c0
- movdqu xmm6, [rdi] ; 93 92 91 90
- movdqu xmm3, [rcx] ; d3 d2 d1 d0
+ movd xmm4, [rsi] ; 83 82 81 80
+ movd xmm1, [rdx] ; c3 c2 c1 c0
+ movd xmm6, [rdi] ; 93 92 91 90
+ movd xmm3, [rcx] ; d3 d2 d1 d0
punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
- movdqu xmm0, [rsi + rax*2] ; a3 a2 a1 a0
- movdqu xmm5, [rdx + rax*2] ; e3 e2 e1 e0
- movdqu xmm2, [rdi + rax*2] ; b3 b2 b1 b0
- movdqu xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
diff --git a/vp8/common/x86/postproc_mmx.c b/vp8/common/x86/postproc_mmx.c
deleted file mode 100644
index 6b6321ace..000000000
--- a/vp8/common/x86/postproc_mmx.c
+++ /dev/null
@@ -1,1508 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include <stdlib.h>
-#include "vpx_scale/yv12config.h"
-#include "pragmas.h"
-
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT 7
-
-
-
-/* static constants */
-__declspec(align(16))
-const static short Blur[48] =
-{
-
- 16, 16, 16, 16, 16, 16, 16, 16,
- 16, 16, 16, 16, 16, 16, 16, 16,
- 64, 64, 64, 64, 64, 64, 64, 64,
- 16, 16, 16, 16, 16, 16, 16, 16,
- 16, 16, 16, 16, 16, 16, 16, 16,
- 0, 0, 0, 0, 0, 0, 0, 0,
-
-};
-#define RD __declspec(align(16)) __int64 rd = 0x0040004000400040;
-#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
-
-#ifndef RELOCATEABLE
-const static RD;
-const static R4D2;
-#endif
-
-
-/* external references */
-extern double vp8_gaussian(double sigma, double mu, double x);
-extern short vp8_rv[];
-extern int vp8_q2mbl(int x) ;
-
-
-
-void vp8_post_proc_down_and_across_mmx
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int rows,
- int cols,
- int flimit
-)
-{
-#ifdef RELOCATEABLE
- RD
- R4D2
-#endif
-
- __asm
- {
- push ebx
- lea ebx, Blur
- movd mm2, flimit
- punpcklwd mm2, mm2
- punpckldq mm2, mm2
-
- mov esi, src_ptr
- mov edi, dst_ptr
-
- mov ecx, DWORD PTR rows
- mov eax, src_pixels_per_line ;
- destination pitch?
- pxor mm0, mm0 ;
- mm0 = 00000000
-
- nextrow:
-
- xor edx, edx ;
-
- clear out edx for use as loop counter
- nextcol:
-
- pxor mm7, mm7 ;
-
- mm7 = 00000000
- movq mm6, [ebx + 32 ] ;
- mm6 = kernel 2 taps
- movq mm3, [esi] ;
- mm4 = r0 p0..p7
- punpcklbw mm3, mm0 ;
- mm3 = p0..p3
- movq mm1, mm3 ;
- mm1 = p0..p3
- pmullw mm3, mm6 ;
- mm3 *= kernel 2 modifiers
-
- movq mm6, [ebx + 48] ;
- mm6 = kernel 3 taps
- movq mm5, [esi + eax] ;
- mm4 = r1 p0..p7
- punpcklbw mm5, mm0 ;
- mm5 = r1 p0..p3
- pmullw mm6, mm5 ;
- mm6 *= p0..p3 * kernel 3 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm6
-
- ;
- thresholding
- movq mm7, mm1 ;
- mm7 = r0 p0..p3
- psubusw mm7, mm5 ;
- mm7 = r0 p0..p3 - r1 p0..p3
- psubusw mm5, mm1 ;
- mm5 = r1 p0..p3 - r0 p0..p3
- paddusw mm7, mm5 ;
- mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw mm7, mm2
-
- movq mm6, [ebx + 64 ] ;
- mm6 = kernel 4 modifiers
- movq mm5, [esi + 2*eax] ;
- mm4 = r2 p0..p7
- punpcklbw mm5, mm0 ;
- mm5 = r2 p0..p3
- pmullw mm6, mm5 ;
- mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = r0 p0..p3
- psubusw mm6, mm5 ;
- mm6 = r0 p0..p3 - r2 p0..p3
- psubusw mm5, mm1 ;
- mm5 = r2 p0..p3 - r2 p0..p3
- paddusw mm6, mm5 ;
- mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
-
- neg eax
- movq mm6, [ebx ] ;
- kernel 0 taps
- movq mm5, [esi+2*eax] ;
- mm4 = r-2 p0..p7
- punpcklbw mm5, mm0 ;
- mm5 = r-2 p0..p3
- pmullw mm6, mm5 ;
- mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = r0 p0..p3
- psubusw mm6, mm5 ;
- mm6 = p0..p3 - r-2 p0..p3
- psubusw mm5, mm1 ;
- mm5 = r-2 p0..p3 - p0..p3
- paddusw mm6, mm5 ;
- mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
- movq mm6, [ebx + 16] ;
- kernel 1 taps
- movq mm4, [esi+eax] ;
- mm4 = r-1 p0..p7
- punpcklbw mm4, mm0 ;
- mm4 = r-1 p0..p3
- pmullw mm6, mm4 ;
- mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = r0 p0..p3
- psubusw mm6, mm4 ;
- mm6 = p0..p3 - r-2 p0..p3
- psubusw mm4, mm1 ;
- mm5 = r-1 p0..p3 - p0..p3
- paddusw mm6, mm4 ;
- mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
-
- paddusw mm3, rd ;
- mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ;
- mm3 /= 128
-
- pand mm1, mm7 ;
- mm1 select vals > thresh from source
- pandn mm7, mm3 ;
- mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ;
- combination
-
- packuswb mm1, mm0 ;
- pack to bytes
-
- movd [edi], mm1 ;
- neg eax ;
- pitch is positive
-
-
- add esi, 4
- add edi, 4
- add edx, 4
-
- cmp edx, cols
- jl nextcol
- // done with the all cols, start the across filtering in place
- sub esi, edx
- sub edi, edx
-
-
- push eax
- xor edx, edx
- mov eax, [edi-4];
-
- acrossnextcol:
- pxor mm7, mm7 ;
- mm7 = 00000000
- movq mm6, [ebx + 32 ] ;
- movq mm4, [edi+edx] ;
- mm4 = p0..p7
- movq mm3, mm4 ;
- mm3 = p0..p7
- punpcklbw mm3, mm0 ;
- mm3 = p0..p3
- movq mm1, mm3 ;
- mm1 = p0..p3
- pmullw mm3, mm6 ;
- mm3 *= kernel 2 modifiers
-
- movq mm6, [ebx + 48]
- psrlq mm4, 8 ;
- mm4 = p1..p7
- movq mm5, mm4 ;
- mm5 = p1..p7
- punpcklbw mm5, mm0 ;
- mm5 = p1..p4
- pmullw mm6, mm5 ;
- mm6 *= p1..p4 * kernel 3 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm6
-
- ;
- thresholding
- movq mm7, mm1 ;
- mm7 = p0..p3
- psubusw mm7, mm5 ;
- mm7 = p0..p3 - p1..p4
- psubusw mm5, mm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw mm7, mm5 ;
- mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw mm7, mm2
-
- movq mm6, [ebx + 64 ]
- psrlq mm4, 8 ;
- mm4 = p2..p7
- movq mm5, mm4 ;
- mm5 = p2..p7
- punpcklbw mm5, mm0 ;
- mm5 = p2..p5
- pmullw mm6, mm5 ;
- mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = p0..p3
- psubusw mm6, mm5 ;
- mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
-
- movq mm6, [ebx ]
- movq mm4, [edi+edx-2] ;
- mm4 = p-2..p5
- movq mm5, mm4 ;
- mm5 = p-2..p5
- punpcklbw mm5, mm0 ;
- mm5 = p-2..p1
- pmullw mm6, mm5 ;
- mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = p0..p3
- psubusw mm6, mm5 ;
- mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
- movq mm6, [ebx + 16]
- psrlq mm4, 8 ;
- mm4 = p-1..p5
- punpcklbw mm4, mm0 ;
- mm4 = p-1..p2
- pmullw mm6, mm4 ;
- mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = p0..p3
- psubusw mm6, mm4 ;
- mm6 = p0..p3 - p1..p4
- psubusw mm4, mm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw mm6, mm4 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
- paddusw mm3, rd ;
- mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ;
- mm3 /= 128
-
- pand mm1, mm7 ;
- mm1 select vals > thresh from source
- pandn mm7, mm3 ;
- mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ;
- combination
-
- packuswb mm1, mm0 ;
- pack to bytes
- mov DWORD PTR [edi+edx-4], eax ;
- store previous four bytes
- movd eax, mm1
-
- add edx, 4
- cmp edx, cols
- jl acrossnextcol;
-
- mov DWORD PTR [edi+edx-4], eax
- pop eax
-
- // done with this rwo
- add esi, eax ;
- next line
- mov eax, dst_pixels_per_line ;
- destination pitch?
- add edi, eax ;
- next destination
- mov eax, src_pixels_per_line ;
- destination pitch?
-
- dec ecx ;
- decrement count
- jnz nextrow ;
- next row
- pop ebx
-
- }
-}
-
-
-
-void vp8_post_proc_down_and_across_xmm
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int rows,
- int cols,
- int flimit
-)
-{
-#ifdef RELOCATEABLE
- R4D2
-#endif
-
- __asm
- {
- movd xmm2, flimit
- punpcklwd xmm2, xmm2
- punpckldq xmm2, xmm2
- punpcklqdq xmm2, xmm2
-
- mov esi, src_ptr
- mov edi, dst_ptr
-
- mov ecx, DWORD PTR rows
- mov eax, src_pixels_per_line ;
- destination pitch?
- pxor xmm0, xmm0 ;
- mm0 = 00000000
-
- nextrow:
-
- xor edx, edx ;
-
- clear out edx for use as loop counter
- nextcol:
- movq xmm3, QWORD PTR [esi] ;
-
- mm4 = r0 p0..p7
- punpcklbw xmm3, xmm0 ;
- mm3 = p0..p3
- movdqa xmm1, xmm3 ;
- mm1 = p0..p3
- psllw xmm3, 2 ;
-
- movq xmm5, QWORD PTR [esi + eax] ;
- mm4 = r1 p0..p7
- punpcklbw xmm5, xmm0 ;
- mm5 = r1 p0..p3
- paddusw xmm3, xmm5 ;
- mm3 += mm6
-
- ;
- thresholding
- movdqa xmm7, xmm1 ;
- mm7 = r0 p0..p3
- psubusw xmm7, xmm5 ;
- mm7 = r0 p0..p3 - r1 p0..p3
- psubusw xmm5, xmm1 ;
- mm5 = r1 p0..p3 - r0 p0..p3
- paddusw xmm7, xmm5 ;
- mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw xmm7, xmm2
-
- movq xmm5, QWORD PTR [esi + 2*eax] ;
- mm4 = r2 p0..p7
- punpcklbw xmm5, xmm0 ;
- mm5 = r2 p0..p3
- paddusw xmm3, xmm5 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ;
- mm6 = r0 p0..p3 - r2 p0..p3
- psubusw xmm5, xmm1 ;
- mm5 = r2 p0..p3 - r2 p0..p3
- paddusw xmm6, xmm5 ;
- mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
-
- neg eax
- movq xmm5, QWORD PTR [esi+2*eax] ;
- mm4 = r-2 p0..p7
- punpcklbw xmm5, xmm0 ;
- mm5 = r-2 p0..p3
- paddusw xmm3, xmm5 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ;
- mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm5, xmm1 ;
- mm5 = r-2 p0..p3 - p0..p3
- paddusw xmm6, xmm5 ;
- mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
- movq xmm4, QWORD PTR [esi+eax] ;
- mm4 = r-1 p0..p7
- punpcklbw xmm4, xmm0 ;
- mm4 = r-1 p0..p3
- paddusw xmm3, xmm4 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = r0 p0..p3
- psubusw xmm6, xmm4 ;
- mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm4, xmm1 ;
- mm5 = r-1 p0..p3 - p0..p3
- paddusw xmm6, xmm4 ;
- mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
-
- paddusw xmm3, rd42 ;
- mm3 += round value
- psraw xmm3, 3 ;
- mm3 /= 8
-
- pand xmm1, xmm7 ;
- mm1 select vals > thresh from source
- pandn xmm7, xmm3 ;
- mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ;
- combination
-
- packuswb xmm1, xmm0 ;
- pack to bytes
- movq QWORD PTR [edi], xmm1 ;
-
- neg eax ;
- pitch is positive
- add esi, 8
- add edi, 8
-
- add edx, 8
- cmp edx, cols
-
- jl nextcol
-
- // done with the all cols, start the across filtering in place
- sub esi, edx
- sub edi, edx
-
- xor edx, edx
- movq mm0, QWORD PTR [edi-8];
-
- acrossnextcol:
- movq xmm7, QWORD PTR [edi +edx -2]
- movd xmm4, DWORD PTR [edi +edx +6]
-
- pslldq xmm4, 8
- por xmm4, xmm7
-
- movdqa xmm3, xmm4
- psrldq xmm3, 2
- punpcklbw xmm3, xmm0 ;
- mm3 = p0..p3
- movdqa xmm1, xmm3 ;
- mm1 = p0..p3
- psllw xmm3, 2
-
-
- movdqa xmm5, xmm4
- psrldq xmm5, 3
- punpcklbw xmm5, xmm0 ;
- mm5 = p1..p4
- paddusw xmm3, xmm5 ;
- mm3 += mm6
-
- ;
- thresholding
- movdqa xmm7, xmm1 ;
- mm7 = p0..p3
- psubusw xmm7, xmm5 ;
- mm7 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw xmm7, xmm5 ;
- mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm7, xmm2
-
- movdqa xmm5, xmm4
- psrldq xmm5, 4
- punpcklbw xmm5, xmm0 ;
- mm5 = p2..p5
- paddusw xmm3, xmm5 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = p0..p3
- psubusw xmm6, xmm5 ;
- mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
-
- movdqa xmm5, xmm4 ;
- mm5 = p-2..p5
- punpcklbw xmm5, xmm0 ;
- mm5 = p-2..p1
- paddusw xmm3, xmm5 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = p0..p3
- psubusw xmm6, xmm5 ;
- mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
- psrldq xmm4, 1 ;
- mm4 = p-1..p5
- punpcklbw xmm4, xmm0 ;
- mm4 = p-1..p2
- paddusw xmm3, xmm4 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = p0..p3
- psubusw xmm6, xmm4 ;
- mm6 = p0..p3 - p1..p4
- psubusw xmm4, xmm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm4 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
- paddusw xmm3, rd42 ;
- mm3 += round value
- psraw xmm3, 3 ;
- mm3 /= 8
-
- pand xmm1, xmm7 ;
- mm1 select vals > thresh from source
- pandn xmm7, xmm3 ;
- mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ;
- combination
-
- packuswb xmm1, xmm0 ;
- pack to bytes
- movq QWORD PTR [edi+edx-8], mm0 ;
- store previous four bytes
- movdq2q mm0, xmm1
-
- add edx, 8
- cmp edx, cols
- jl acrossnextcol;
-
- // last 8 pixels
- movq QWORD PTR [edi+edx-8], mm0
-
- // done with this rwo
- add esi, eax ;
- next line
- mov eax, dst_pixels_per_line ;
- destination pitch?
- add edi, eax ;
- next destination
- mov eax, src_pixels_per_line ;
- destination pitch?
-
- dec ecx ;
- decrement count
- jnz nextrow ;
- next row
- }
-}
-
-
-void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-{
- int c, i;
- __declspec(align(16))
- int flimit2[2];
- __declspec(align(16))
- unsigned char d[16][8];
-
- flimit = vp8_q2mbl(flimit);
-
- for (i = 0; i < 2; i++)
- flimit2[i] = flimit;
-
- rows += 8;
-
- for (c = 0; c < cols; c += 4)
- {
- unsigned char *s = &dst[c];
-
- __asm
- {
- mov esi, s ;
- pxor mm0, mm0 ;
-
- mov eax, pitch ;
- neg eax // eax = -pitch
-
- lea esi, [esi + eax*8]; // edi = s[-pitch*8]
- neg eax
-
-
- pxor mm5, mm5
- pxor mm6, mm6 ;
-
- pxor mm7, mm7 ;
- mov edi, esi
-
- mov ecx, 15 ;
-
- loop_initvar:
- movd mm1, DWORD PTR [edi];
- punpcklbw mm1, mm0 ;
-
- paddw mm5, mm1 ;
- pmullw mm1, mm1 ;
-
- movq mm2, mm1 ;
- punpcklwd mm1, mm0 ;
-
- punpckhwd mm2, mm0 ;
- paddd mm6, mm1 ;
-
- paddd mm7, mm2 ;
- lea edi, [edi+eax] ;
-
- dec ecx
- jne loop_initvar
- //save the var and sum
- xor edx, edx
- loop_row:
- movd mm1, DWORD PTR [esi] // [s-pitch*8]
- movd mm2, DWORD PTR [edi] // [s+pitch*7]
-
- punpcklbw mm1, mm0
- punpcklbw mm2, mm0
-
- paddw mm5, mm2
- psubw mm5, mm1
-
- pmullw mm2, mm2
- movq mm4, mm2
-
- punpcklwd mm2, mm0
- punpckhwd mm4, mm0
-
- paddd mm6, mm2
- paddd mm7, mm4
-
- pmullw mm1, mm1
- movq mm2, mm1
-
- punpcklwd mm1, mm0
- psubd mm6, mm1
-
- punpckhwd mm2, mm0
- psubd mm7, mm2
-
-
- movq mm3, mm6
- pslld mm3, 4
-
- psubd mm3, mm6
- movq mm1, mm5
-
- movq mm4, mm5
- pmullw mm1, mm1
-
- pmulhw mm4, mm4
- movq mm2, mm1
-
- punpcklwd mm1, mm4
- punpckhwd mm2, mm4
-
- movq mm4, mm7
- pslld mm4, 4
-
- psubd mm4, mm7
-
- psubd mm3, mm1
- psubd mm4, mm2
-
- psubd mm3, flimit2
- psubd mm4, flimit2
-
- psrad mm3, 31
- psrad mm4, 31
-
- packssdw mm3, mm4
- packsswb mm3, mm0
-
- movd mm1, DWORD PTR [esi+eax*8]
-
- movq mm2, mm1
- punpcklbw mm1, mm0
-
- paddw mm1, mm5
- mov ecx, edx
-
- and ecx, 127
- movq mm4, vp8_rv[ecx*2]
-
- paddw mm1, mm4
- //paddw xmm1, eight8s
- psraw mm1, 4
-
- packuswb mm1, mm0
- pand mm1, mm3
-
- pandn mm3, mm2
- por mm1, mm3
-
- and ecx, 15
- movd DWORD PTR d[ecx*4], mm1
-
- mov ecx, edx
- sub ecx, 8
-
- and ecx, 15
- movd mm1, DWORD PTR d[ecx*4]
-
- movd [esi], mm1
- lea esi, [esi+eax]
-
- lea edi, [edi+eax]
- add edx, 1
-
- cmp edx, rows
- jl loop_row
-
- }
-
- }
-}
-
-void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-{
- int c, i;
- __declspec(align(16))
- int flimit4[4];
- __declspec(align(16))
- unsigned char d[16][8];
-
- flimit = vp8_q2mbl(flimit);
-
- for (i = 0; i < 4; i++)
- flimit4[i] = flimit;
-
- rows += 8;
-
- for (c = 0; c < cols; c += 8)
- {
- unsigned char *s = &dst[c];
-
- __asm
- {
- mov esi, s ;
- pxor xmm0, xmm0 ;
-
- mov eax, pitch ;
- neg eax // eax = -pitch
-
- lea esi, [esi + eax*8]; // edi = s[-pitch*8]
- neg eax
-
-
- pxor xmm5, xmm5
- pxor xmm6, xmm6 ;
-
- pxor xmm7, xmm7 ;
- mov edi, esi
-
- mov ecx, 15 ;
-
- loop_initvar:
- movq xmm1, QWORD PTR [edi];
- punpcklbw xmm1, xmm0 ;
-
- paddw xmm5, xmm1 ;
- pmullw xmm1, xmm1 ;
-
- movdqa xmm2, xmm1 ;
- punpcklwd xmm1, xmm0 ;
-
- punpckhwd xmm2, xmm0 ;
- paddd xmm6, xmm1 ;
-
- paddd xmm7, xmm2 ;
- lea edi, [edi+eax] ;
-
- dec ecx
- jne loop_initvar
- //save the var and sum
- xor edx, edx
- loop_row:
- movq xmm1, QWORD PTR [esi] // [s-pitch*8]
- movq xmm2, QWORD PTR [edi] // [s+pitch*7]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- paddw xmm5, xmm2
- psubw xmm5, xmm1
-
- pmullw xmm2, xmm2
- movdqa xmm4, xmm2
-
- punpcklwd xmm2, xmm0
- punpckhwd xmm4, xmm0
-
- paddd xmm6, xmm2
- paddd xmm7, xmm4
-
- pmullw xmm1, xmm1
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm0
- psubd xmm6, xmm1
-
- punpckhwd xmm2, xmm0
- psubd xmm7, xmm2
-
-
- movdqa xmm3, xmm6
- pslld xmm3, 4
-
- psubd xmm3, xmm6
- movdqa xmm1, xmm5
-
- movdqa xmm4, xmm5
- pmullw xmm1, xmm1
-
- pmulhw xmm4, xmm4
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm4
- punpckhwd xmm2, xmm4
-
- movdqa xmm4, xmm7
- pslld xmm4, 4
-
- psubd xmm4, xmm7
-
- psubd xmm3, xmm1
- psubd xmm4, xmm2
-
- psubd xmm3, flimit4
- psubd xmm4, flimit4
-
- psrad xmm3, 31
- psrad xmm4, 31
-
- packssdw xmm3, xmm4
- packsswb xmm3, xmm0
-
- movq xmm1, QWORD PTR [esi+eax*8]
-
- movq xmm2, xmm1
- punpcklbw xmm1, xmm0
-
- paddw xmm1, xmm5
- mov ecx, edx
-
- and ecx, 127
- movdqu xmm4, vp8_rv[ecx*2]
-
- paddw xmm1, xmm4
- //paddw xmm1, eight8s
- psraw xmm1, 4
-
- packuswb xmm1, xmm0
- pand xmm1, xmm3
-
- pandn xmm3, xmm2
- por xmm1, xmm3
-
- and ecx, 15
- movq QWORD PTR d[ecx*8], xmm1
-
- mov ecx, edx
- sub ecx, 8
-
- and ecx, 15
- movq mm0, d[ecx*8]
-
- movq [esi], mm0
- lea esi, [esi+eax]
-
- lea edi, [edi+eax]
- add edx, 1
-
- cmp edx, rows
- jl loop_row
-
- }
-
- }
-}
-#if 0
-/****************************************************************************
- *
- * ROUTINE : plane_add_noise_wmt
- *
- * INPUTS : unsigned char *Start starting address of buffer to add gaussian
- * noise to
- * unsigned int Width width of plane
- * unsigned int Height height of plane
- * int Pitch distance between subsequent lines of frame
- * int q quantizer used to determine amount of noise
- * to add
- *
- * OUTPUTS : None.
- *
- * RETURNS : void.
- *
- * FUNCTION : adds gaussian noise to a plane of pixels
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
- unsigned int i;
-
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
- char char_dist[300];
- char Rand[2048];
- double sigma;
-// return;
- __asm emms
- sigma = a + .5 + .6 * (63 - q) / 63.0;
-
- // set up a lookup table of 256 entries that matches
- // a gaussian distribution with sigma determined by q.
- //
- {
- double i;
- int next, j;
-
- next = 0;
-
- for (i = -32; i < 32; i++)
- {
- double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
- int a = (int)(g + .5);
-
- if (a)
- {
- for (j = 0; j < a; j++)
- {
- char_dist[next+j] = (char) i;
- }
-
- next = next + j;
- }
-
- }
-
- for (next = next; next < 256; next++)
- char_dist[next] = 0;
-
- }
-
- for (i = 0; i < 2048; i++)
- {
- Rand[i] = char_dist[rand() & 0xff];
- }
-
- for (i = 0; i < 16; i++)
- {
- blackclamp[i] = -char_dist[0];
- whiteclamp[i] = -char_dist[0];
- bothclamp[i] = -2 * char_dist[0];
- }
-
- for (i = 0; i < Height; i++)
- {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = Rand + (rand() & 0xff);
-
- __asm
- {
- mov ecx, [Width]
- mov esi, Pos
- mov edi, Ref
- xor eax, eax
-
- nextset:
- movdqu xmm1, [esi+eax] // get the source
-
- psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
- paddusb xmm1, bothclamp
- psubusb xmm1, whiteclamp
-
- movdqu xmm2, [edi+eax] // get the noise for this line
- paddb xmm1, xmm2 // add it in
- movdqu [esi+eax], xmm1 // store the result
-
- add eax, 16 // move to the next line
-
- cmp eax, ecx
- jl nextset
-
-
- }
-
- }
-}
-#endif
-__declspec(align(16))
-static const int four8s[4] = { 8, 8, 8, 8};
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
-{
- int r, i;
- __declspec(align(16))
- int flimit4[4];
- unsigned char *s = src;
- int sumsq;
- int sum;
-
-
- flimit = vp8_q2mbl(flimit);
- flimit4[0] =
- flimit4[1] =
- flimit4[2] =
- flimit4[3] = flimit;
-
- for (r = 0; r < rows; r++)
- {
-
-
- sumsq = 0;
- sum = 0;
-
- for (i = -8; i <= 6; i++)
- {
- sumsq += s[i] * s[i];
- sum += s[i];
- }
-
- __asm
- {
- mov eax, sumsq
- movd xmm7, eax
-
- mov eax, sum
- movd xmm6, eax
-
- mov esi, s
- xor ecx, ecx
-
- mov edx, cols
- add edx, 8
- pxor mm0, mm0
- pxor mm1, mm1
-
- pxor xmm0, xmm0
- nextcol4:
-
- movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5
- movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10
-
- punpcklbw xmm1, xmm0 // expanding
- punpcklbw xmm2, xmm0 // expanding
-
- punpcklwd xmm1, xmm0 // expanding to dwords
- punpcklwd xmm2, xmm0 // expanding to dwords
-
- psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5
- paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2
-
- paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5
- pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5
-
- paddd xmm6, xmm2
- paddd xmm7, xmm1
-
- pshufd xmm6, xmm6, 0 // duplicate the last ones
- pshufd xmm7, xmm7, 0 // duplicate the last ones
-
- psrldq xmm1, 4 // 8--7 9--6 10--5 0000
- psrldq xmm2, 4 // 8--7 9--6 10--5 0000
-
- pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared
- pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared
-
- paddd xmm6, xmm4
- paddd xmm7, xmm3
-
- pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared
- pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared
- pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- movdqa xmm3, xmm6
- pmaddwd xmm3, xmm3
-
- movdqa xmm5, xmm7
- pslld xmm5, 4
-
- psubd xmm5, xmm7
- psubd xmm5, xmm3
-
- psubd xmm5, flimit4
- psrad xmm5, 31
-
- packssdw xmm5, xmm0
- packsswb xmm5, xmm0
-
- movd xmm1, DWORD PTR [esi+ecx]
- movq xmm2, xmm1
-
- punpcklbw xmm1, xmm0
- punpcklwd xmm1, xmm0
-
- paddd xmm1, xmm6
- paddd xmm1, four8s
-
- psrad xmm1, 4
- packssdw xmm1, xmm0
-
- packuswb xmm1, xmm0
- pand xmm1, xmm5
-
- pandn xmm5, xmm2
- por xmm5, xmm1
-
- movd [esi+ecx-8], mm0
- movq mm0, mm1
-
- movdq2q mm1, xmm5
- psrldq xmm7, 12
-
- psrldq xmm6, 12
- add ecx, 4
-
- cmp ecx, edx
- jl nextcol4
-
- }
- s += pitch;
- }
-}
-
-#if 0
-
-/****************************************************************************
- *
- * ROUTINE : plane_add_noise_mmx
- *
- * INPUTS : unsigned char *Start starting address of buffer to add gaussian
- * noise to
- * unsigned int Width width of plane
- * unsigned int Height height of plane
- * int Pitch distance between subsequent lines of frame
- * int q quantizer used to determine amount of noise
- * to add
- *
- * OUTPUTS : None.
- *
- * RETURNS : void.
- *
- * FUNCTION : adds gaussian noise to a plane of pixels
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
- unsigned int i;
- int Pitch4 = Pitch * 4;
- const int noise_amount = 2;
- const int noise_adder = 2 * noise_amount + 1;
-
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
-
- char char_dist[300];
- char Rand[2048];
-
- double sigma;
- __asm emms
- sigma = a + .5 + .6 * (63 - q) / 63.0;
-
- // set up a lookup table of 256 entries that matches
- // a gaussian distribution with sigma determined by q.
- //
- {
- double i, sum = 0;
- int next, j;
-
- next = 0;
-
- for (i = -32; i < 32; i++)
- {
- int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
-
- if (a)
- {
- for (j = 0; j < a; j++)
- {
- char_dist[next+j] = (char) i;
- }
-
- next = next + j;
- }
-
- }
-
- for (next = next; next < 256; next++)
- char_dist[next] = 0;
-
- }
-
- for (i = 0; i < 2048; i++)
- {
- Rand[i] = char_dist[rand() & 0xff];
- }
-
- for (i = 0; i < 16; i++)
- {
- blackclamp[i] = -char_dist[0];
- whiteclamp[i] = -char_dist[0];
- bothclamp[i] = -2 * char_dist[0];
- }
-
- for (i = 0; i < Height; i++)
- {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = Rand + (rand() & 0xff);
-
- __asm
- {
- mov ecx, [Width]
- mov esi, Pos
- mov edi, Ref
- xor eax, eax
-
- nextset:
- movq mm1, [esi+eax] // get the source
-
- psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
- paddusb mm1, bothclamp
- psubusb mm1, whiteclamp
-
- movq mm2, [edi+eax] // get the noise for this line
- paddb mm1, mm2 // add it in
- movq [esi+eax], mm1 // store the result
-
- add eax, 8 // move to the next line
-
- cmp eax, ecx
- jl nextset
-
-
- }
-
- }
-}
-#else
-extern char an[8][64][3072];
-extern int cd[8][64];
-
-void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
- unsigned int i;
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
-
-
- __asm emms
-
- for (i = 0; i < 16; i++)
- {
- blackclamp[i] = -cd[a][q];
- whiteclamp[i] = -cd[a][q];
- bothclamp[i] = -2 * cd[a][q];
- }
-
- for (i = 0; i < Height; i++)
- {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = an[a][q] + (rand() & 0xff);
-
- __asm
- {
- mov ecx, [Width]
- mov esi, Pos
- mov edi, Ref
- xor eax, eax
-
- nextset:
- movq mm1, [esi+eax] // get the source
-
- psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
- paddusb mm1, bothclamp
- psubusb mm1, whiteclamp
-
- movq mm2, [edi+eax] // get the noise for this line
- paddb mm1, mm2 // add it in
- movq [esi+eax], mm1 // store the result
-
- add eax, 8 // move to the next line
-
- cmp eax, ecx
- jl nextset
- }
- }
-}
-
-
-void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
- unsigned int i;
-
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
-
- __asm emms
-
- for (i = 0; i < 16; i++)
- {
- blackclamp[i] = -cd[a][q];
- whiteclamp[i] = -cd[a][q];
- bothclamp[i] = -2 * cd[a][q];
- }
-
- for (i = 0; i < Height; i++)
- {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = an[a][q] + (rand() & 0xff);
-
- __asm
- {
- mov ecx, [Width]
- mov esi, Pos
- mov edi, Ref
- xor eax, eax
-
- nextset:
- movdqu xmm1, [esi+eax] // get the source
-
- psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
- paddusb xmm1, bothclamp
- psubusb xmm1, whiteclamp
-
- movdqu xmm2, [edi+eax] // get the noise for this line
- paddb xmm1, xmm2 // add it in
- movdqu [esi+eax], xmm1 // store the result
-
- add eax, 16 // move to the next line
-
- cmp eax, ecx
- jl nextset
- }
- }
-}
-
-#endif