diff options
Diffstat (limited to 'vpx_scale')
46 files changed, 17885 insertions, 0 deletions
diff --git a/vpx_scale/arm/armv4/gen_scalers_armv4.asm b/vpx_scale/arm/armv4/gen_scalers_armv4.asm new file mode 100644 index 000000000..1c904edae --- /dev/null +++ b/vpx_scale/arm/armv4/gen_scalers_armv4.asm @@ -0,0 +1,773 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |horizontal_line_4_5_scale_armv4| + EXPORT |vertical_band_4_5_scale_armv4| + EXPORT |horizontal_line_2_3_scale_armv4| + EXPORT |vertical_band_2_3_scale_armv4| + EXPORT |horizontal_line_3_5_scale_armv4| + EXPORT |vertical_band_3_5_scale_armv4| + EXPORT |horizontal_line_3_4_scale_armv4| + EXPORT |vertical_band_3_4_scale_armv4| + EXPORT |horizontal_line_1_2_scale_armv4| + EXPORT |vertical_band_1_2_scale_armv4| + + AREA |.text|, CODE, READONLY ; name this block of code + +src RN r0 +srcw RN r1 +dest RN r2 +mask RN r12 +c51_205 RN r10 +c102_154 RN r11 +;/**************************************************************************** +; * +; * ROUTINE : horizontal_line_4_5_scale_armv4 +; * +; * INPUTS : const unsigned char *source : Pointer to source data. +; * unsigned int source_width : Stride of source. +; * unsigned char *dest : Pointer to destination data. +; * unsigned int dest_width : Stride of destination (NOT USED). +; * +; * OUTPUTS : None. +; * +; * RETURNS : void +; * +; * FUNCTION : Copies horizontal line of pixels from source to +; * destination scaling up by 4 to 5. +; * +; * SPECIAL NOTES : None. 
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+; r0 = UINT8 *source
+; r1 = UINT32 source_width
+; r2 = UINT8 *dest
+; r3 = UINT32 dest_width
+;)
+;
+; Every group of four source bytes a, b, c, d (plus e, the first byte of the
+; following group) produces five destination bytes.
+;
+; Two pixels are packed as lo | (hi << 16) and multiplied by a packed weight
+; pair (c51_205 = 51 << 16 | 205, c102_154 = 102 << 16 | 154). The upper
+; half-word of the 32-bit product is then lo * W_hi + hi * W_lo, and the
+; result byte is taken from bits 24..31.
+;
+; The source is read one word at a time (ldr); this presumably requires a
+; word-aligned source pointer - TODO confirm against callers.
+; The main loop counts srcw down by 4 and exits only on exactly zero, so
+; source_width has to be a non-zero multiple of 4.
+; NOTE(review): the loop as written handles source_width / 4 groups and the
+; tail handles one more group from the word loaded last; confirm the width
+; convention used by callers.
+; NOTE(review): the weighted sum sits in bits 16..31, so the "+128" rounding
+; term would correspond to #0x800000; the #0x8000 used here only rounds the
+; low half-word into bit 16. Confirm which was intended.
+|horizontal_line_4_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    mov     mask, #255                  ; mask for selection
+    ldr     c51_205, =0x3300cd
+    ldr     c102_154, =0x66009a
+
+    ldr     r3, [src], #4               ; first four pixels, src += 4
+
+hl45_loop
+
+    and     r4, r3, mask                ; a = src[0]
+    and     r5, mask, r3, lsr #8        ; b = src[1]
+    strb    r4, [dest], #1              ; des[0] = a
+
+    orr     r6, r4, r5, lsl #16         ; b | a
+    and     r7, mask, r3, lsr #16       ; c = src[2]
+    mul     r6, c51_205, r6             ; a * 51 + 205 * b
+
+    orr     r5, r5, r7, lsl #16         ; c | b
+    mul     r5, c102_154, r5            ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    and     r8, mask, r3, lsr #24       ; d = src[3]
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1              ; des[1]
+
+    orr     r7, r8, r7, lsl #16         ; c | d
+    mul     r7, c102_154, r7            ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    ldr     r3, [src], #4               ; next four pixels, src += 4
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1              ; des[2]
+
+    add     r7, r7, #0x8000
+    and     r9, mask, r3                ; e = src[4]
+    orr     r9, r9, r8, lsl #16         ; d | e
+    mul     r9, c51_205, r9             ; d * 205 + 51 * e
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1              ; des[3]
+
+    add     r9, r9, #0x8000
+    subs    srcw, srcw, #4
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1              ; des[4]
+
+    bne     hl45_loop
+
+    ; Tail group: same arithmetic on the word already held in r3, but no
+    ; pixel beyond d is read. The bytes are extracted with lsr, exactly as
+    ; in the loop; a left shift followed by the 0xff mask always gives 0.
+    and     r4, r3, mask                ; a = src[0]
+    and     r5, mask, r3, lsr #8        ; b = src[1]
+    strb    r4, [dest], #1              ; des[0] = a
+
+    orr     r6, r4, r5, lsl #16         ; b | a
+    mul     r6, c51_205, r6             ; a * 51 + 205 * b
+
+    and     r7, mask, r3, lsr #16       ; c = src[2]
+    orr     r5, r5, r7, lsl #16         ; c | b
+    mul     r5, c102_154, r5            ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    and     r8, mask, r3, lsr #24       ; d = src[3]
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1              ; des[1]
+
+    orr     r7, r8, r7, lsl #16         ; c | d
+    mul     r7, c102_154, r7            ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1              ; des[2]
+
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1              ; des[3]
+
+    ; Last output pixel repeats d. src has already been advanced past the
+    ; word in r3, so reading [src] here would fetch the byte after d.
+    strb    r8, [dest], #1              ; des[4] = d
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : 
vertical_band_4_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+; * height of the band scaled is 4-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+;
+; Works in place, one column per loop iteration. Rows 0..3 and row 5 of the
+; column are read; rows 1..4 are overwritten (lr starts at dest + dest_pitch
+; and advances by dest_pitch after every store).
+; Pixels are packed as lo | (hi << 16) and multiplied by a packed weight pair
+; so that the weighted sum ends up in the upper half-word of the product.
+; The loop exits only when r2 reaches exactly zero, so dest_width must be
+; non-zero.
+|vertical_band_4_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     c51_205, =0x3300cd
+    ldr     c102_154, =0x66009a
+
+vl45_loop
+    mov     r3, src
+    ldrb    r4, [r3], r1                ; a = des [0]
+    ldrb    r5, [r3], r1                ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1                ; c = des[dest_pitch*2]
+    add     lr, src, r1                 ; write pointer, starts at row 1
+
+    orr     r6, r4, r5, lsl #16         ; b | a
+    mul     r6, c51_205, r6             ; a * 51 + 205 * b
+
+    ldrb    r8, [r3], r1                ; d = des[dest_pitch*3]
+    orr     r5, r5, r7, lsl #16         ; c | b
+    mul     r5, c102_154, r5            ; b * 102 + 154 * c
+    add     r6, r6, #0x8000
+    orr     r7, r8, r7, lsl #16         ; c | d
+    mov     r6, r6, lsr #24
+    strb    r6, [lr], r1                ; -> des[dest_pitch]
+
+    ldrb    r9, [r3, r1]                ; e = des [dest_pitch * 5]
+    mul     r7, c102_154, r7            ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    orr     r9, r9, r8, lsl #16         ; d | e
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1                ; -> des[dest_pitch*2]
+
+    mul     r9, c51_205, r9             ; d * 205 + 51 * e
+    add     r7, r7, #0x8000
+    add     src, src, #1                ; next column
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1                ; -> des[dest_pitch*3]
+
+    add     r9, r9, #0x8000
+    subs    r2, r2, #1                  ; one column done
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1                ; -> des[dest_pitch*4]
+
+    bne     vl45_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_2_3_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data. 
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 2 to 3.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+;
+; Each pair of source bytes a, b (plus c, the first byte of the next pair)
+; produces three destination bytes:
+;   a, (a * 85 + b * 171 + 128) >> 8, (b * 171 + c * 85 + 128) >> 8
+; lr holds the weight 85 and r12 the weight 171.
+; The loop counts srcw down by 2 and exits only on exactly zero.
+; NOTE(review): the tail after the loop emits one more group starting at the
+; pixel left in r5; confirm the width convention used by callers.
+|horizontal_line_2_3_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+    ldr     lr, =85
+    ldr     r12, =171
+
+hl23_loop
+
+    ldrb    r3, [src], #1               ; a
+    ldrb    r4, [src], #1               ; b
+    ldrb    r5, [src]                   ; c (first pixel of the next pair, src not advanced)
+
+    strb    r3, [dest], #1
+    mul     r4, r12, r4                 ; b * 171
+    mla     r6, lr, r3, r4              ; a * 85 + b * 171
+    mla     r7, lr, r5, r4              ; c * 85 + b * 171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [dest], #1
+
+    add     r7, r7, #128
+    mov     r7, r7, lsr #8
+    strb    r7, [dest], #1
+
+    subs    srcw, srcw, #2
+    bne     hl23_loop
+
+    ; Tail group: r5 still holds the pixel at [src]; it is used as a.
+    ldrb    r4, [src, #1]               ; b
+    strb    r5, [dest], #1              ; first output = a
+    strb    r4, [dest, #1]              ; third output = b (dest now points at the second)
+
+    mul     r4, r12, r4                 ; b * 171
+    mla     r6, lr, r5, r4              ; a * 85 + b *171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [dest]                  ; second output
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_2_3_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+; * height of the band scaled is 2-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band. 
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+;
+; Works in place, one column per loop iteration. Rows 0, 1 and 3 of the
+; column are read; rows 1 and 2 are overwritten:
+;   row 1 = (a * 85 + b * 171 + 128) >> 8
+;   row 2 = (b * 171 + c * 85 + 128) >> 8
+; lr holds the weight 85 and r12 the weight 171.
+; The flags set by the early "subs r2" are consumed by the final bne; no
+; instruction in between updates them.
+|vertical_band_2_3_scale_armv4| PROC
+    stmdb   sp!, {r4 - r8, lr}
+    ldr     lr, =85
+    ldr     r12, =171
+    add     r3, r1, r1, lsl #1          ; 3 * dest_pitch
+
+vl23_loop
+    ldrb    r4, [src]                   ; a = des [0]
+    ldrb    r5, [src, r1]               ; b = des [dest_pitch]
+    ldrb    r7, [src, r3]               ; c = des [dest_pitch*3]
+    subs    r2, r2, #1                  ; one column done
+
+    mul     r5, r12, r5                 ; b * 171
+    mla     r6, lr, r4, r5              ; a * 85 + b * 171
+    mla     r8, lr, r7, r5              ; c * 85 + b * 171
+
+    add     r6, r6, #128
+    mov     r6, r6, lsr #8
+    strb    r6, [src, r1]               ; -> des[dest_pitch]
+
+    add     r8, r8, #128
+    mov     r8, r8, lsr #8
+    strb    r8, [src, r1, lsl #1]       ; -> des[dest_pitch*2]
+
+    add     src, src, #1                ; next column
+
+    bne     vl23_loop
+
+    ldmia   sp!, {r4 - r8, pc}
+    ENDP    ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 5.
+; *
+; * SPECIAL NOTES : None. 
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+;
+; Every group of three source bytes a, b, c (plus d, the first byte of the
+; following group) produces five destination bytes:
+;   a, a*102 + b*154, b*205 + c*51, b*51 + c*205, c*154 + d*102
+; (each weighted sum scaled back down by 256).
+;
+; Two pixels are packed as lo | (hi << 16) and multiplied by a packed weight
+; pair (c51_205 = 51 << 16 | 205, c102_154 = 102 << 16 | 154). The upper
+; half-word of the 32-bit product is then lo * W_hi + hi * W_lo, and the
+; result byte is taken from bits 24..31.
+;
+; NOTE(review): "subs srcw, #3 / bpl" keeps looping while the remaining count
+; is >= 0, i.e. also when it reaches exactly zero, and the tail then emits
+; one more group; confirm the width convention used by callers.
+|horizontal_line_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     c51_205, =0x3300cd
+    ldr     c102_154, =0x66009a
+
+    ldrb    r4, [src], #1               ; a = src[0]
+
+hl35_loop
+
+    ldrb    r8, [src], #1               ; b = src[1]
+    strb    r4, [dest], #1              ; des[0] = a
+
+    orr     r6, r4, r8, lsl #16         ; b | a
+    ldrb    r9, [src], #1               ; c = src[2]
+    mul     r6, c102_154, r6            ; a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16         ; b | c
+    mul     r5, c51_205, r5             ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    ldrb    r4, [src], #1               ; d = src[3], becomes a of the next group
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1              ; des[1]
+
+    orr     r7, r8, r9, lsl #16         ; c | b
+    mul     r7, c51_205, r7             ; c * 205 + 51 * b
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1              ; des[2]
+
+    orr     r9, r4, r9, lsl #16         ; c | d
+    mul     r9, c102_154, r9            ; c * 154 + 102 * d
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1              ; des[3]
+
+    add     r9, r9, #0x8000
+    subs    srcw, srcw, #3
+    mov     r9, r9, lsr #24
+    strb    r9, [dest], #1              ; des[4]
+
+    bpl     hl35_loop
+
+    ; Tail group: r4 already holds a. b must be loaded into r8 because every
+    ; following instruction reads b from r8; r5 is overwritten before it is
+    ; ever read.
+    ldrb    r8, [src], #1               ; b = src[1]
+    strb    r4, [dest], #1              ; des[0] = a
+
+    orr     r6, r4, r8, lsl #16         ; b | a
+    ldrb    r9, [src], #1               ; c = src[2]
+    mul     r6, c102_154, r6            ; a * 102 + 154 * b
+
+    orr     r5, r9, r8, lsl #16         ; b | c
+    mul     r5, c51_205, r5             ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    mov     r6, r6, lsr #24
+    strb    r6, [dest], #1              ; des[1]
+
+    orr     r7, r8, r9, lsl #16         ; c | b
+    mul     r7, c51_205, r7             ; c * 205 + 51 * b
+    add     r5, r5, #0x8000
+    mov     r5, r5, lsr #24
+    strb    r5, [dest], #1              ; des[2]
+
+    add     r7, r7, #0x8000
+    mov     r7, r7, lsr #24
+    strb    r7, [dest], #1              ; des[3]
+    strb    r9, [dest], #1              ; des[4] = c
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data. 
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+;
+; Works in place, one column per loop iteration. Rows 0..2 and row 5 of the
+; column are read; rows 1..4 are overwritten (lr starts at dest + dest_pitch
+; and advances by dest_pitch after every store).
+; r3 is first the read pointer and, once d has been loaded, a scratch
+; register for the packed pixel pairs.
+|vertical_band_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     c51_205, =0x3300cd
+    ldr     c102_154, =0x66009a
+
+vl35_loop
+    mov     r3, src
+    ldrb    r4, [r3], r1                ; a = des [0]
+    ldrb    r5, [r3], r1                ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1                ; c = des[dest_pitch*2]
+    add     lr, src, r1                 ; write pointer, starts at row 1
+
+    orr     r8, r4, r5, lsl #16         ; b | a
+    mul     r6, c102_154, r8            ; a * 102 + 154 * b
+
+    ldrb    r8, [r3, r1, lsl #1]        ; d = des[dest_pitch*5]
+    orr     r3, r7, r5, lsl #16         ; b | c
+    mul     r9, c51_205, r3             ; b * 205 + 51 * c
+    add     r6, r6, #0x8000
+    orr     r3, r5, r7, lsl #16         ; c | b
+    mov     r6, r6, lsr #24
+    strb    r6, [lr], r1                ; -> des[dest_pitch]
+
+    mul     r5, c51_205, r3             ; c * 205 + 51 * b
+    add     r9, r9, #0x8000
+    orr     r3, r8, r7, lsl #16         ; c | d
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1                ; -> des[dest_pitch*2]
+
+    mul     r7, c102_154, r3            ; c * 154 + 102 * d
+    add     r5, r5, #0x8000
+    add     src, src, #1                ; next column
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1                ; -> des[dest_pitch*3]
+
+    add     r7, r7, #0x8000
+    subs    r2, r2, #1                  ; one column done
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1                ; -> des[dest_pitch*4]
+
+
+    bne     vl35_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_4_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data. 
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 4.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+;
+; Every group of three source bytes a, b, c (plus a', the first byte of the
+; following group) produces four destination bytes:
+;   a, (a*64 + b*192 + 128) >> 8, (b + c + 1) >> 1, (c*192 + a'*64 + 128) >> 8
+; r10 = 64, r11 = 192 and r9 = 128 (rounding term).
+; NOTE(review): "subs srcw, #3 / bpl" keeps looping while the remaining count
+; is >= 0, i.e. also when it reaches exactly zero, and the tail then emits
+; one more group; confirm the width convention used by callers.
+|horizontal_line_3_4_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r10, =64
+    ldr     r11, =192
+    mov     r9, #128
+
+    ldrb    r4, [src], #1               ; a = src[0]
+
+hl34_loop
+
+    ldrb    r8, [src], #1               ; b = src[1]
+    ldrb    r7, [src], #1               ; c = src[2]
+    strb    r4, [dest], #1              ; des[0] = a
+
+    mla     r4, r10, r4, r9             ; a*64 + 128
+    mla     r4, r11, r8, r4             ; a*64 + b*192 + 128
+
+    add     r8, r8, #1                  ; b + 1
+    add     r8, r8, r7                  ; b + c + 1
+    mov     r8, r8, asr #1              ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8              ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1              ; des[1]
+
+    strb    r8, [dest], #1              ; des[2]
+
+    ldrb    r4, [src], #1               ; a' = src[3], becomes a of the next group
+
+    mla     r7, r11, r7, r9             ; c*192 + 128
+    mla     r7, r4, r10, r7             ; c*192 + a'*64 + 128
+
+    subs    srcw, srcw, #3
+
+    mov     r7, r7, asr #8              ; (c*192 + a'*64 + 128) >> 8
+    strb    r7, [dest], #1              ; des[3]
+
+    bpl     hl34_loop
+
+    ; Tail group: r4 already holds a; no pixel beyond c is read and the
+    ; last output repeats c.
+    ldrb    r8, [src], #1               ; b = src[1]
+    ldrb    r7, [src], #1               ; c = src[2]
+    strb    r4, [dest], #1              ; des[0] = a
+
+    mla     r4, r10, r4, r9             ; a*64 + 128
+    mla     r4, r11, r8, r4             ; a*64 + b*192 + 128
+    mov     r4, r4, asr #8              ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1              ; des[1]
+
+    add     r8, r8, #1                  ; b + 1
+    add     r8, r8, r7                  ; b + c + 1
+    mov     r8, r8, asr #1              ; (b + c + 1) >> 1
+    strb    r8, [dest], #1              ; des[2]
+    strb    r7, [dest], #1              ; des[3] = c
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_4_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data. 
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+;
+; Works in place, one column per loop iteration. Rows 0..2 and row 4 of the
+; column are read; rows 1..3 are overwritten:
+;   row 1 = (a*64 + b*192 + 128) >> 8
+;   row 2 = (b + c + 1) >> 1
+;   row 3 = (c*192 + a'*64 + 128) >> 8      (a' = row 4)
+; r10 = 64, r11 = 192 and r9 = 128 (rounding term).
+|vertical_band_3_4_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r10, =64
+    ldr     r11, =192
+    mov     r9, #128
+
+; ldr r1,[r1]
+vl34_loop
+    mov     r3, src
+    ldrb    r4, [r3], r1                ; a = des [0]
+    ldrb    r5, [r3], r1                ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1                ; c = des [dest_pitch*2]
+    add     lr, src, r1                 ; write pointer, starts at row 1
+
+    mla     r4, r10, r4, r9             ; a*64 + 128
+    mla     r4, r11, r5, r4             ; a*64 + b*192 + 128
+
+    add     r5, r5, #1                  ; b + 1
+    add     r5, r5, r7                  ; b + c + 1
+    mov     r5, r5, asr #1              ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8              ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [lr], r1                ; -> des[dest_pitch]
+
+    ldrb    r4, [r3, r1]                ; a' = des [dest_pitch*4]
+
+    strb    r5, [lr], r1                ; -> des[dest_pitch*2]
+
+    mla     r7, r11, r7, r9             ; c*192 + 128
+    mla     r7, r4, r10, r7             ; c*192 + a'*64 + 128
+    mov     r7, r7, asr #8              ; (c*192 + a'*64 + 128) >> 8
+
+    add     src, src, #1                ; next column
+    subs    r2, r2, #1                  ; one column done
+
+    strb    r7, [lr]                    ; -> des[dest_pitch*3]
+
+    bne     vl34_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_1_2_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 1 to 2.
+; *
+; * SPECIAL NOTES : None. 
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+;
+; Every source pixel a produces two destination bytes: a itself and the
+; rounded average (a + b + 1) >> 1 with its right-hand neighbour b. The last
+; source pixel has no neighbour and is simply written twice.
+; Both bytes of a pair are written with one half-word store (strh), with a
+; in the low byte; this presumably relies on a little-endian target and a
+; 2-byte aligned dest - TODO confirm.
+; The loop runs source_width - 1 times and exits only on exactly zero.
+|horizontal_line_1_2_scale_armv4| PROC
+    stmdb   sp!, {r4 - r5, lr}
+
+    sub     srcw, srcw, #1              ; last pixel is handled after the loop
+
+    ldrb    r3, [src], #1               ; a
+    ldrb    r4, [src], #1               ; b
+hl12_loop
+    subs    srcw, srcw, #1
+
+    add     r5, r3, r4
+    add     r5, r5, #1
+    mov     r5, r5, lsr #1              ; (a + b + 1) >> 1
+
+    orr     r5, r3, r5, lsl #8          ; a in the low byte, average in the high byte
+    strh    r5, [dest], #2
+
+    mov     r3, r4                      ; b becomes a of the next pair
+
+    ldrneb  r4, [src], #1               ; fetch the next b only if the loop continues
+    bne     hl12_loop
+
+    orr     r5, r4, r4, lsl #8          ; last pixel, duplicated
+    strh    r5, [dest]
+
+    ldmia   sp!, {r4 - r5, pc}
+    ENDP    ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_1_2_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+; * height of the band scaled is 1-pixel.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band. 
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+;
+; Works in place on four columns per loop iteration. One word is read from
+; row 0 and one from row 2 (dest + 2 * dest_pitch); the per-byte rounded
+; average (x + y + 1) >> 1 of the two words is written to row 1.
+; The even and odd bytes of each word are averaged separately in 16-bit
+; lanes selected by mask = 0x00ff00ff; lr = 0x00010001 adds the rounding 1
+; to both lanes, and the mask applied after the shift discards the bit that
+; crosses from the upper lane into the lower one.
+; Word accesses presumably require dest and dest_pitch to be multiples
+; of 4 - TODO confirm.
+; NOTE(review): "subs r2, #4 / bpl" also branches when the count reaches
+; exactly zero, so one more 4-column group than dest_width / 4 is processed;
+; confirm this is intended.
+|vertical_band_1_2_scale_armv4| PROC
+    stmdb   sp!, {r4 - r7, lr}
+
+    ldr     mask, =0xff00ff             ; mask for selection
+    ldr     lr, = 0x010001
+
+vl12_loop
+    mov     r3, src
+    ldr     r4, [r3], r1                ; four pixels of row 0, r3 -> row 1
+    ldr     r5, [r3, r1]                ; four pixels of row 2
+
+    add     src, src, #4                ; next four columns
+    subs    r2, r2, #4
+
+    and     r6, r4, mask                ; even bytes of row 0
+    and     r7, r5, mask                ; even bytes of row 2
+
+    add     r6, r7, r6
+    add     r6, r6, lr                  ; + 1 in each lane
+
+    and     r4, mask, r4, lsr #8        ; odd bytes of row 0
+    and     r5, mask, r5, lsr #8        ; odd bytes of row 2
+
+    mov     r6, r6, lsr #1
+    and     r6, r6, mask                ; averaged even bytes
+
+    add     r4, r5, r4
+    add     r4, r4, lr                  ; + 1 in each lane
+
+    mov     r4, r4, lsr #1
+    and     r4, r4, mask                ; averaged odd bytes
+
+    orr     r5, r6, r4, lsl #8          ; interleave even and odd results
+
+    str     r5, [r3]                    ; -> row 1
+
+    bpl     vl12_loop
+
+    ldmia   sp!, {r4 - r7, pc}
+    ENDP    ;|vertical_band_1_2_scale_armv4|
+
+    END
diff --git a/vpx_scale/arm/nds/yv12extend.c b/vpx_scale/arm/nds/yv12extend.c
new file mode 100644
index 000000000..56959cb18
--- /dev/null
+++ b/vpx_scale/arm/nds/yv12extend.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree. 
+ */
+
+
+/****************************************************************************
+*
+* Module Title : yv12extend.c
+*
+* Description :
+*
+***************************************************************************/
+
+/****************************************************************************
+* Header Files
+****************************************************************************/
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include <nitro.h>
+#include <nitro/mi.h>
+#include <nitro/itcm_begin.h>
+
+//---- DMA Number
+#define DMA_NO 3
+
+/****************************************************************************
+* Exports
+****************************************************************************/
+
+/****************************************************************************
+*
+* ROUTINE : extend_plane
+*
+* INPUTS : unsigned char *buffer : First visible pixel of the plane.
+* int plane_stride : Bytes between two rows of the plane.
+* int plane_height : Number of visible rows.
+* int plane_width : Number of visible pixels per row.
+* unsigned int Border : Border size in pixels on every side.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : Replicates the edge pixels of one plane into the border
+* that surrounds it.
+*
+* SPECIAL NOTES : Writes Border bytes to the left and right of every row
+* and Border rows of plane_stride bytes above and below
+* the plane; the caller's buffer must provide that room.
+*
+****************************************************************************/
+static void
+extend_plane(unsigned char *buffer, int plane_stride, int plane_height,
+             int plane_width, unsigned int Border)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+
+    // copy the left and right most columns out
+    src_ptr1 = buffer;
+    src_ptr2 = src_ptr1 + plane_width - 1;
+    dest_ptr1 = src_ptr1 - Border;
+    dest_ptr2 = src_ptr2 + 1;
+
+    for (i = 0; i < plane_height; i++)
+    {
+        mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border);
+        mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1 += plane_stride;
+        src_ptr2 += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    // Now copy the top and bottom source lines into each line of the respective borders.
+    // The rows copied start at the left border, so they already contain the
+    // columns replicated by the loop above.
+    src_ptr1 = buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride);
+        mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride);
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+
+/****************************************************************************
+*
+* ROUTINE : vp8_yv12_extend_frame_borders
+*
+* INPUTS : YV12_BUFFER_CONFIG *ybf : Frame whose borders are filled.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : Extends the Y, U and V planes of the frame into their
+* borders by replicating the edge pixels.
+*
+* SPECIAL NOTES : The U and V planes are processed with half the Y
+* stride, height, width and border.
+*
+****************************************************************************/
+void
+vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
+{
+    unsigned int Border = ybf->border;
+    int plane_stride = ybf->y_stride;
+    int plane_height = ybf->y_height;
+    int plane_width = ybf->y_width;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    extend_plane(ybf->y_buffer, plane_stride, plane_height, plane_width, Border);
+
+    plane_stride /= 2;
+    plane_height /= 2;
+    plane_width /= 2;
+    Border /= 2;
+
+    /***********/
+    /* U Plane */
+    /***********/
+    extend_plane(ybf->u_buffer, plane_stride, plane_height, plane_width, Border);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    extend_plane(ybf->v_buffer, plane_stride, plane_height, plane_width, Border);
+}
+
+
+
+/****************************************************************************
+*
+* ROUTINE : vp8_yv12_copy_frame
+*
+* INPUTS : YV12_BUFFER_CONFIG *src_ybc : Frame to copy from.
+* YV12_BUFFER_CONFIG *dst_ybc : Frame to copy to.
+*
+* OUTPUTS : None.
+*
+* RETURNS : void
+*
+* FUNCTION : Copies the source image into the destination image and
+* updates the destination's UMV borders.
+*
+* SPECIAL NOTES : The frames are assumed to be identical in size.
+* The whole allocation is copied in one block starting at
+* buffer_alloc, so borders are copied along with the
+* pixels. The size is derived from the source frame only;
+* presumably it matches the allocation size used when the
+* buffers were created - TODO confirm.
+*
+****************************************************************************/
+void
+vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+{
+    int yplane_size = (src_ybc->y_height + 2 * src_ybc->border) * (src_ybc->y_stride);
+    int mem_size = (yplane_size * 3 / 2) + (src_ybc->y_stride * 2);
+
+    mi_cpu_copy_fast(src_ybc->buffer_alloc, dst_ybc->buffer_alloc, mem_size);
+}
+
+#include <nitro/itcm_end.h>
diff 
--git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
new file mode 100644
index 000000000..26384c42c
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
@@ -0,0 +1,227 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_yv12_copy_frame_func_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    INCLUDE vpx_asm_offsets.asm
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+;
+; Copies the visible Y, U and V pixels of src_ybc into dst_ybc, two rows at a
+; time. Only the source height and width are read; the source and destination
+; strides are read separately. The chroma height, width and strides are
+; derived by halving the luma values.
+;
+; Each plane is copied in two passes: a main pass in chunks of 128 bytes per
+; row (64 for chroma) and, if the width is not a multiple of that, an "extra"
+; pass over the remaining columns in chunks of 16 bytes (8 for chroma).
+; NOTE(review): the main width loops decrement their counter before testing
+; it, so a width below 128 (64 for chroma) starts with a counter of zero and
+; does not terminate as intended; confirm the minimum width callers use.
+; NOTE(review): the extra loops exit only on exactly zero, so the remainder
+; has to be a multiple of 16 (8 for chroma); heights are halved with lsr, so
+; an odd last row is not copied.
+;
+; Stack scratch area (16 bytes): [sp] = src U, [sp, #4] = dst U,
+; [sp, #8] = src V, [sp, #12] = dst V.
+
+|vp8_yv12_copy_frame_func_neon| PROC
+    push    {r4 - r11, lr}
+    vpush   {d8 - d15}
+
+    sub     sp, sp, #16
+
+    ;Fetch the chroma plane pointers first; they are parked on the stack below
+    ldr     r8, [r0, #yv12_buffer_config_u_buffer]      ;src U
+    ldr     r9, [r1, #yv12_buffer_config_u_buffer]      ;dst U
+    ldr     r10, [r0, #yv12_buffer_config_v_buffer]     ;src V
+    ldr     r11, [r1, #yv12_buffer_config_v_buffer]     ;dst V
+
+    ;Copy Y plane
+    ldr     r4, [r0, #yv12_buffer_config_y_height]
+    ldr     r5, [r0, #yv12_buffer_config_y_width]
+    ldr     r6, [r0, #yv12_buffer_config_y_stride]      ;src stride
+    ldr     r7, [r1, #yv12_buffer_config_y_stride]      ;dst stride
+    ldr     r2, [r0, #yv12_buffer_config_y_buffer]      ;srcptr1
+    ldr     r3, [r1, #yv12_buffer_config_y_buffer]      ;dstptr1
+
+    str     r8, [sp]
+    str     r9, [sp, #4]
+    str     r10, [sp, #8]
+    str     r11, [sp, #12]
+
+    ; copy two rows at one time
+    mov     lr, r4, lsr #1
+
+cp_src_to_dst_height_loop
+    mov     r8, r2                      ; src row n
+    mov     r9, r3                      ; dst row n
+    add     r10, r2, r6                 ; src row n + 1
+    add     r11, r3, r7                 ; dst row n + 1
+    mov     r12, r5, lsr #7             ; number of 128-byte chunks per row
+
+cp_src_to_dst_width_loop
+    vld1.8  {q0, q1}, [r8]!
+    vld1.8  {q8, q9}, [r10]!
+    vld1.8  {q2, q3}, [r8]!
+    vld1.8  {q10, q11}, [r10]!
+    vld1.8  {q4, q5}, [r8]!
+    vld1.8  {q12, q13}, [r10]!
+    vld1.8  {q6, q7}, [r8]!
+    vld1.8  {q14, q15}, [r10]!
+
+    subs    r12, r12, #1
+
+    vst1.8  {q0, q1}, [r9]!
+    vst1.8  {q8, q9}, [r11]!
+    vst1.8  {q2, q3}, [r9]!
+    vst1.8  {q10, q11}, [r11]!
+    vst1.8  {q4, q5}, [r9]!
+    vst1.8  {q12, q13}, [r11]!
+    vst1.8  {q6, q7}, [r9]!
+    vst1.8  {q14, q15}, [r11]!
+
+    bne     cp_src_to_dst_width_loop
+
+    subs    lr, lr, #1
+    add     r2, r2, r6, lsl #1          ; advance two source rows
+    add     r3, r3, r7, lsl #1          ; advance two destination rows
+
+    bne     cp_src_to_dst_height_loop
+
+    ands    r10, r5, #0x7f              ;check to see if extra copy is needed
+    sub     r11, r5, r10                ; columns already copied
+    ldr     r2, [r0, #yv12_buffer_config_y_buffer]      ;srcptr1
+    ldr     r3, [r1, #yv12_buffer_config_y_buffer]      ;dstptr1
+    bne     extra_cp_src_to_dst_width   ; flags still come from the ands above
+end_of_cp_src_to_dst
+
+;Copy U & V planes
+    ldr     r2, [sp]                    ;srcptr1
+    ldr     r3, [sp, #4]                ;dstptr1
+    mov     r4, r4, lsr #1              ;src uv_height
+    mov     r5, r5, lsr #1              ;src uv_width
+    mov     r6, r6, lsr #1              ;src uv_stride
+    mov     r7, r7, lsr #1              ;dst uv_stride
+
+    mov     r1, #2                      ; two chroma planes to copy (dst_ybc is no longer needed)
+
+cp_uv_loop
+
+    ;copy two rows at one time
+    mov     lr, r4, lsr #1
+
+cp_src_to_dst_height_uv_loop
+    mov     r8, r2
+    mov     r9, r3
+    add     r10, r2, r6
+    add     r11, r3, r7
+    mov     r12, r5, lsr #6             ; number of 64-byte chunks per row
+
+cp_src_to_dst_width_uv_loop
+    vld1.8  {q0, q1}, [r8]!
+    vld1.8  {q8, q9}, [r10]!
+    vld1.8  {q2, q3}, [r8]!
+    vld1.8  {q10, q11}, [r10]!
+
+    subs    r12, r12, #1
+
+    vst1.8  {q0, q1}, [r9]!
+    vst1.8  {q8, q9}, [r11]!
+    vst1.8  {q2, q3}, [r9]!
+    vst1.8  {q10, q11}, [r11]!
+
+    bne     cp_src_to_dst_width_uv_loop
+
+    subs    lr, lr, #1
+    add     r2, r2, r6, lsl #1
+    add     r3, r3, r7, lsl #1
+
+    bne     cp_src_to_dst_height_uv_loop
+
+    ands    r10, r5, #0x3f              ;check to see if extra copy is needed
+    sub     r11, r5, r10                ; columns already copied
+    ldr     r2, [sp]                    ;srcptr1
+    ldr     r3, [sp, #4]                ;dstptr1
+    bne     extra_cp_src_to_dst_uv_width
+end_of_cp_src_to_dst_uv
+
+    subs    r1, r1, #1
+
+    addne   sp, sp, #8                  ; step over the U pair so [sp] is the V pair
+
+    ldrne   r2, [sp]                    ;srcptr1
+    ldrne   r3, [sp, #4]                ;dstptr1
+
+    bne     cp_uv_loop
+
+    add     sp, sp, #8                  ; release the rest of the 16-byte scratch area
+
+    vpop    {d8 - d15}
+    pop     {r4 - r11, pc}
+
+;=============================
+; Remaining Y columns: r10 = number of bytes left per row, r11 = offset of
+; the first remaining column. r0 is reused as the second source row pointer.
+extra_cp_src_to_dst_width
+    add     r2, r2, r11
+    add     r3, r3, r11
+    add     r0, r8, r6
+    add     r11, r9, r7
+
+    mov     lr, r4, lsr #1
+extra_cp_src_to_dst_height_loop
+    mov     r8, r2
+    mov     r9, r3
+    add     r0, r8, r6
+    add     r11, r9, r7
+
+    mov     r12, r10
+
+extra_cp_src_to_dst_width_loop
+    vld1.8  {q0}, [r8]!
+    vld1.8  {q1}, [r0]!
+
+    subs    r12, r12, #16
+
+    vst1.8  {q0}, [r9]!
+    vst1.8  {q1}, [r11]!
+    bne     extra_cp_src_to_dst_width_loop
+
+    subs    lr, lr, #1
+
+    add     r2, r2, r6, lsl #1
+    add     r3, r3, r7, lsl #1
+
+    bne     extra_cp_src_to_dst_height_loop
+
+    b       end_of_cp_src_to_dst
+
+;=================================
+; Remaining chroma columns: same scheme as above with 8-byte chunks.
+extra_cp_src_to_dst_uv_width
+    add     r2, r2, r11
+    add     r3, r3, r11
+    add     r0, r8, r6
+    add     r11, r9, r7
+
+    mov     lr, r4, lsr #1
+extra_cp_src_to_dst_height_uv_loop
+    mov     r8, r2
+    mov     r9, r3
+    add     r0, r8, r6
+    add     r11, r9, r7
+
+    mov     r12, r10
+
+extra_cp_src_to_dst_width_uv_loop
+    vld1.8  {d0}, [r8]!
+    vld1.8  {d1}, [r0]!
+
+    subs    r12, r12, #8
+
+    vst1.8  {d0}, [r9]!
+    vst1.8  {d1}, [r11]!
+    bne     extra_cp_src_to_dst_width_uv_loop
+
+    subs    lr, lr, #1
+
+    add     r2, r2, r6, lsl #1
+    add     r3, r3, r7, lsl #1
+
+    bne     extra_cp_src_to_dst_height_uv_loop
+
+    b       end_of_cp_src_to_dst_uv
+
+    ENDP
+    END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
new file mode 100644
index 000000000..a50ae60d7
--- /dev/null
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
@@ -0,0 +1,499 @@
+;
+; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_yv12_copy_frame_yonly_neon|
+    EXPORT |vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    INCLUDE vpx_asm_offsets.asm
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vpxyv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+; Note: this is VP8 function, which has border=32 and 16. Internal y_width and y_height
+; are always multiples of 16.
+
+|vp8_yv12_copy_frame_yonly_neon| PROC
+    push    {r4 - r11, lr}
+    vpush   {d8 - d15}
+
+    ldr     r4, [r0, #yv12_buffer_config_y_height]
+    ldr     r5, [r0, #yv12_buffer_config_y_width]
+    ldr     r6, [r0, #yv12_buffer_config_y_stride]
+    ldr     r7, [r1, #yv12_buffer_config_y_stride]
+    ldr     r2, [r0, #yv12_buffer_config_y_buffer]      ;srcptr1
+    ldr     r3, [r1, #yv12_buffer_config_y_buffer]      ;dstptr1
+
+    ; copy two rows at one time
+    mov     lr, r4, lsr #1
+
+cp_src_to_dst_height_loop
+    mov     r8, r2
+    mov     r9, r3
+    add     r10, r2, r6
+    add     r11, r3, r7
+    mov     r12, r5, lsr #7
+
+cp_src_to_dst_width_loop
+    vld1.8  {q0, q1}, [r8]!
+    vld1.8  {q8, q9}, [r10]!
+    vld1.8  {q2, q3}, [r8]!
+    vld1.8  {q10, q11}, [r10]!
+    vld1.8  {q4, q5}, [r8]!
+    vld1.8  {q12, q13}, [r10]!
+    vld1.8  {q6, q7}, [r8]!
+ vld1.8 {q14, q15}, [r10]! + + subs r12, r12, #1 + + vst1.8 {q0, q1}, [r9]! + vst1.8 {q8, q9}, [r11]! + vst1.8 {q2, q3}, [r9]! + vst1.8 {q10, q11}, [r11]! + vst1.8 {q4, q5}, [r9]! + vst1.8 {q12, q13}, [r11]! + vst1.8 {q6, q7}, [r9]! + vst1.8 {q14, q15}, [r11]! + + bne cp_src_to_dst_width_loop + + subs lr, lr, #1 + add r2, r2, r6, lsl #1 + add r3, r3, r7, lsl #1 + + bne cp_src_to_dst_height_loop + + ands r10, r5, #0x7f ;check to see if extra copy is needed + sub r11, r5, r10 + ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 + bne extra_cp_src_to_dst_width +end_of_cp_src_to_dst + + + ;vpxyv12_extend_frame_borders_yonly + mov r0, r1 + ;Not need to load y_width, since: y_width = y_stride - 2*border + ldr r3, [r0, #yv12_buffer_config_border] + ldr r1, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + ldr r4, [r0, #yv12_buffer_config_y_height] + ldr lr, [r0, #yv12_buffer_config_y_stride] + + cmp r3, #16 + beq b16_extend_frame_borders + +;======================= +b32_extend_frame_borders +;border = 32 +;======================= +;Border copy for Y plane +;copy the left and right most columns out + sub r5, r1, r3 ;destptr1 + add r6, r1, lr + sub r6, r6, r3, lsl #1 ;destptr2 + sub r2, r6, #1 ;srcptr2 + + ;Do four rows at one time + mov r12, r4, lsr #2 + +copy_left_right_y + vld1.8 {d0[], d1[]}, [r1], lr + vld1.8 {d4[], d5[]}, [r2], lr + vld1.8 {d8[], d9[]}, [r1], lr + vld1.8 {d12[], d13[]}, [r2], lr + vld1.8 {d16[], d17[]}, [r1], lr + vld1.8 {d20[], d21[]}, [r2], lr + vld1.8 {d24[], d25[]}, [r1], lr + vld1.8 {d28[], d29[]}, [r2], lr + + vmov q1, q0 + vmov q3, q2 + vmov q5, q4 + vmov q7, q6 + vmov q9, q8 + vmov q11, q10 + vmov q13, q12 + vmov q15, q14 + + subs r12, r12, #1 + + vst1.8 {q0, q1}, [r5], lr + vst1.8 {q2, q3}, [r6], lr + vst1.8 {q4, q5}, [r5], lr + vst1.8 {q6, q7}, [r6], lr + vst1.8 {q8, q9}, [r5], lr + vst1.8 {q10, q11}, [r6], lr + vst1.8 {q12, q13}, [r5], lr + vst1.8 {q14, q15}, [r6], lr + + bne 
copy_left_right_y + +;Now copy the top and bottom source lines into each line of the respective borders + ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + mul r8, r3, lr + + mov r12, lr, lsr #7 + + sub r6, r1, r3 ;destptr2 + sub r2, r6, lr ;srcptr2 + sub r1, r7, r3 ;srcptr1 + sub r5, r1, r8 ;destptr1 + +copy_top_bottom_y + vld1.8 {q0, q1}, [r1]! + vld1.8 {q8, q9}, [r2]! + vld1.8 {q2, q3}, [r1]! + vld1.8 {q10, q11}, [r2]! + vld1.8 {q4, q5}, [r1]! + vld1.8 {q12, q13}, [r2]! + vld1.8 {q6, q7}, [r1]! + vld1.8 {q14, q15}, [r2]! + + mov r7, r3 + +top_bottom_32 + subs r7, r7, #1 + + vst1.8 {q0, q1}, [r5]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q2, q3}, [r5]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q4, q5}, [r5]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q6, q7}, [r5]! + vst1.8 {q14, q15}, [r6]! + + add r5, r5, lr + sub r5, r5, #128 + add r6, r6, lr + sub r6, r6, #128 + + bne top_bottom_32 + + sub r5, r1, r8 + add r6, r2, lr + + subs r12, r12, #1 + bne copy_top_bottom_y + + mov r7, lr, lsr #4 ;check to see if extra copy is needed + ands r7, r7, #0x7 + bne extra_top_bottom_y +end_of_border_copy_y + + vpop {d8 - d15} + pop {r4 - r11, pc} + +;===================== +;extra copy part for Y +extra_top_bottom_y + vld1.8 {q0}, [r1]! + vld1.8 {q2}, [r2]! 
+ + mov r9, r3, lsr #3 + +extra_top_bottom_32 + subs r9, r9, #1 + + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + bne extra_top_bottom_32 + + sub r5, r1, r8 + add r6, r2, lr + subs r7, r7, #1 + bne extra_top_bottom_y + + b end_of_border_copy_y + + +;======================= +b16_extend_frame_borders +;border = 16 +;======================= +;Border copy for Y plane +;copy the left and right most columns out + sub r5, r1, r3 ;destptr1 + add r6, r1, lr + sub r6, r6, r3, lsl #1 ;destptr2 + sub r2, r6, #1 ;srcptr2 + + ;Do four rows at one time + mov r12, r4, lsr #2 + +copy_left_right_y_b16 + vld1.8 {d0[], d1[]}, [r1], lr + vld1.8 {d4[], d5[]}, [r2], lr + vld1.8 {d8[], d9[]}, [r1], lr + vld1.8 {d12[], d13[]}, [r2], lr + vld1.8 {d16[], d17[]}, [r1], lr + vld1.8 {d20[], d21[]}, [r2], lr + vld1.8 {d24[], d25[]}, [r1], lr + vld1.8 {d28[], d29[]}, [r2], lr + + subs r12, r12, #1 + + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q4}, [r5], lr + vst1.8 {q6}, [r6], lr + vst1.8 {q8}, [r5], lr + vst1.8 {q10}, [r6], lr + vst1.8 {q12}, [r5], lr + vst1.8 {q14}, [r6], lr + + bne copy_left_right_y_b16 + +;Now copy the top and bottom source lines into each line of the respective borders + ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + mul r8, r3, lr + + mov r12, lr, lsr #7 + + sub r6, r1, r3 ;destptr2 + sub r2, r6, lr ;srcptr2 + sub r1, r7, r3 ;srcptr1 + sub r5, r1, r8 ;destptr1 + +copy_top_bottom_y_b16 + vld1.8 {q0, q1}, [r1]! + vld1.8 {q8, q9}, [r2]! + vld1.8 {q2, q3}, [r1]! + vld1.8 {q10, q11}, [r2]! + vld1.8 {q4, q5}, [r1]! + vld1.8 {q12, q13}, [r2]! + vld1.8 {q6, q7}, [r1]! + vld1.8 {q14, q15}, [r2]! 
+ + mov r7, r3 + +top_bottom_16_b16 + subs r7, r7, #1 + + vst1.8 {q0, q1}, [r5]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q2, q3}, [r5]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q4, q5}, [r5]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q6, q7}, [r5]! + vst1.8 {q14, q15}, [r6]! + + add r5, r5, lr + sub r5, r5, #128 + add r6, r6, lr + sub r6, r6, #128 + + bne top_bottom_16_b16 + + sub r5, r1, r8 + add r6, r2, lr + + subs r12, r12, #1 + bne copy_top_bottom_y_b16 + + mov r7, lr, lsr #4 ;check to see if extra copy is needed + ands r7, r7, #0x7 + bne extra_top_bottom_y_b16 +end_of_border_copy_y_b16 + + vpop {d8 - d15} + pop {r4 - r11, pc} + +;===================== +;extra copy part for Y +extra_top_bottom_y_b16 + vld1.8 {q0}, [r1]! + vld1.8 {q2}, [r2]! + + mov r9, r3, lsr #3 + +extra_top_bottom_16_b16 + subs r9, r9, #1 + + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + bne extra_top_bottom_16_b16 + + sub r5, r1, r8 + add r6, r2, lr + subs r7, r7, #1 + bne extra_top_bottom_y_b16 + + b end_of_border_copy_y_b16 + +;============================= +extra_cp_src_to_dst_width + add r2, r2, r11 + add r3, r3, r11 + add r0, r8, r6 + add r11, r9, r7 + + mov lr, r4, lsr #1 +extra_cp_src_to_dst_height_loop + mov r8, r2 + mov r9, r3 + add r0, r8, r6 + add r11, r9, r7 + + mov r12, r10 + +extra_cp_src_to_dst_width_loop + vld1.8 {q0}, [r8]! + vld1.8 {q1}, [r0]! + + subs r12, r12, #16 + + vst1.8 {q0}, [r9]! + vst1.8 {q1}, [r11]! 
+ bne extra_cp_src_to_dst_width_loop + + subs lr, lr, #1 + + add r2, r2, r6, lsl #1 + add r3, r3, r7, lsl #1 + + bne extra_cp_src_to_dst_height_loop + + b end_of_cp_src_to_dst + + ENDP + +;=========================================================== +;In vp8cx_pick_filter_level(), call vp8_yv12_copy_frame_yonly +;without extend_frame_borders. +|vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon| PROC + push {r4 - r11, lr} + vpush {d8-d15} + + ldr r4, [r0, #yv12_buffer_config_y_height] + ldr r5, [r0, #yv12_buffer_config_y_width] + ldr r6, [r0, #yv12_buffer_config_y_stride] + ldr r7, [r1, #yv12_buffer_config_y_stride] + ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 + + ; copy two rows at one time + mov lr, r4, lsr #1 + +cp_src_to_dst_height_loop1 + mov r8, r2 + mov r9, r3 + add r10, r2, r6 + add r11, r3, r7 + mov r12, r5, lsr #7 + +cp_src_to_dst_width_loop1 + vld1.8 {q0, q1}, [r8]! + vld1.8 {q8, q9}, [r10]! + vld1.8 {q2, q3}, [r8]! + vld1.8 {q10, q11}, [r10]! + vld1.8 {q4, q5}, [r8]! + vld1.8 {q12, q13}, [r10]! + vld1.8 {q6, q7}, [r8]! + vld1.8 {q14, q15}, [r10]! + + subs r12, r12, #1 + + vst1.8 {q0, q1}, [r9]! + vst1.8 {q8, q9}, [r11]! + vst1.8 {q2, q3}, [r9]! + vst1.8 {q10, q11}, [r11]! + vst1.8 {q4, q5}, [r9]! + vst1.8 {q12, q13}, [r11]! + vst1.8 {q6, q7}, [r9]! + vst1.8 {q14, q15}, [r11]! 
+ + bne cp_src_to_dst_width_loop1 + + subs lr, lr, #1 + add r2, r2, r6, lsl #1 + add r3, r3, r7, lsl #1 + + bne cp_src_to_dst_height_loop1 + + ands r10, r5, #0x7f ;check to see if extra copy is needed + sub r11, r5, r10 + ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 + bne extra_cp_src_to_dst_width1 +end_of_cp_src_to_dst1 + + vpop {d8 - d15} + pop {r4-r11, pc} + +;============================= +extra_cp_src_to_dst_width1 + add r2, r2, r11 + add r3, r3, r11 + add r0, r8, r6 + add r11, r9, r7 + + mov lr, r4, lsr #1 +extra_cp_src_to_dst_height_loop1 + mov r8, r2 + mov r9, r3 + add r0, r8, r6 + add r11, r9, r7 + + mov r12, r10 + +extra_cp_src_to_dst_width_loop1 + vld1.8 {q0}, [r8]! + vld1.8 {q1}, [r0]! + + subs r12, r12, #16 + + vst1.8 {q0}, [r9]! + vst1.8 {q1}, [r11]! + bne extra_cp_src_to_dst_width_loop1 + + subs lr, lr, #1 + + add r2, r2, r6, lsl #1 + add r3, r3, r7, lsl #1 + + bne extra_cp_src_to_dst_height_loop1 + + b end_of_cp_src_to_dst1 + + ENDP + + END diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm new file mode 100644 index 000000000..c8923d5a5 --- /dev/null +++ b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm @@ -0,0 +1,257 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_yv12_copy_src_frame_func_neon| + ARM + REQUIRE8 + PRESERVE8 + + INCLUDE vpx_asm_offsets.asm + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;Note: This function is used to copy source data in src_buffer[i] at beginning of +;the encoding. 
The buffer has a width and height of cpi->oxcf.Width and cpi->oxcf.Height, +;which can be ANY numbers(NOT always multiples of 16 or 4). + +;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +|vp8_yv12_copy_src_frame_func_neon| PROC ;r0 = src_ybc, r1 = dst_ybc: copy Y, then U, then V (no border extension here) + push {r4 - r11, lr} + vpush {d8 - d15} + + ;Copy Y plane + ldr r4, [r0, #yv12_buffer_config_y_height] + ldr r5, [r0, #yv12_buffer_config_y_width] + ldr r6, [r0, #yv12_buffer_config_y_stride] + ldr r7, [r1, #yv12_buffer_config_y_stride] + ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 + + add r10, r2, r6 ;second row src + add r11, r3, r7 ;second row dst + mov r6, r6, lsl #1 + mov r7, r7, lsl #1 + sub r6, r6, r5 ;adjust stride + sub r7, r7, r5 ;r6/r7 = 2*stride - y_width: step from the end of a row to the start of the row two below + + ; copy two rows at one time + mov lr, r4, lsr #1 + +cp_src_to_dst_height_loop + mov r12, r5 ;r12 = bytes still to copy in each of the two rows + +cp_width_128_loop ;NOTE(review): 128 bytes are copied before r12 is tested and bhs is an unsigned test - looks unsafe when y_width < 128; confirm the caller guards this + vld1.8 {q0, q1}, [r2]! + vld1.8 {q4, q5}, [r10]! + vld1.8 {q2, q3}, [r2]! + vld1.8 {q6, q7}, [r10]! + vld1.8 {q8, q9}, [r2]! + vld1.8 {q12, q13}, [r10]! + vld1.8 {q10, q11}, [r2]! + vld1.8 {q14, q15}, [r10]! + sub r12, r12, #128 + cmp r12, #128 + vst1.8 {q0, q1}, [r3]! + vst1.8 {q4, q5}, [r11]! + vst1.8 {q2, q3}, [r3]! + vst1.8 {q6, q7}, [r11]! + vst1.8 {q8, q9}, [r3]! + vst1.8 {q12, q13}, [r11]! + vst1.8 {q10, q11}, [r3]! + vst1.8 {q14, q15}, [r11]! + bhs cp_width_128_loop + + cmp r12, #0 + beq cp_width_done + +cp_width_8_loop ;NOTE(review): entered for any r12 != 0; with r12 < 8 the sub goes negative and the unsigned bhs keeps looping - confirm the remainder is expected to be 0 or >= 8 here + vld1.8 {d0}, [r2]! + vld1.8 {d1}, [r10]! + sub r12, r12, #8 + cmp r12, #8 + vst1.8 {d0}, [r3]! + vst1.8 {d1}, [r11]! 
+ bhs cp_width_8_loop + + cmp r12, #0 + beq cp_width_done + +cp_width_1_loop + ldrb r8, [r2], #1 + subs r12, r12, #1 + strb r8, [r3], #1 + ldrb r8, [r10], #1 + strb r8, [r11], #1 + bne cp_width_1_loop + +cp_width_done + subs lr, lr, #1 + add r2, r2, r6 + add r3, r3, r7 + add r10, r10, r6 + add r11, r11, r7 + bne cp_src_to_dst_height_loop + +;copy last line for Y if y_height is odd + tst r4, #1 + beq cp_width_done_1 + mov r12, r5 + +cp_width_128_loop_1 + vld1.8 {q0, q1}, [r2]! + vld1.8 {q2, q3}, [r2]! + vld1.8 {q8, q9}, [r2]! + vld1.8 {q10, q11}, [r2]! + sub r12, r12, #128 + cmp r12, #128 + vst1.8 {q0, q1}, [r3]! + vst1.8 {q2, q3}, [r3]! + vst1.8 {q8, q9}, [r3]! + vst1.8 {q10, q11}, [r3]! + bhs cp_width_128_loop_1 + + cmp r12, #0 + beq cp_width_done_1 + +cp_width_8_loop_1 + vld1.8 {d0}, [r2]! + sub r12, r12, #8 + cmp r12, #8 + vst1.8 {d0}, [r3]! + bhs cp_width_8_loop_1 + + cmp r12, #0 + beq cp_width_done_1 + +cp_width_1_loop_1 + ldrb r8, [r2], #1 + subs r12, r12, #1 + strb r8, [r3], #1 + bne cp_width_1_loop_1 +cp_width_done_1 + +;Copy U & V planes + ldr r4, [r0, #yv12_buffer_config_uv_height] + ldr r5, [r0, #yv12_buffer_config_uv_width] + ldr r6, [r0, #yv12_buffer_config_uv_stride] + ldr r7, [r1, #yv12_buffer_config_uv_stride] + ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 + ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1 + + add r10, r2, r6 ;second row src + add r11, r3, r7 ;second row dst + mov r6, r6, lsl #1 + mov r7, r7, lsl #1 + sub r6, r6, r5 ;adjust stride + sub r7, r7, r5 + + mov r9, #2 ;r9 = planes left to copy (U first, then V) + +cp_uv_loop + ;copy two rows at one time + mov lr, r4, lsr #1 + +cp_src_to_dst_height_uv_loop + mov r12, r5 + +cp_width_uv_64_loop ;NOTE(review): same copy-before-test shape as the Y loop, with a 64-byte chunk + vld1.8 {q0, q1}, [r2]! + vld1.8 {q4, q5}, [r10]! + vld1.8 {q2, q3}, [r2]! + vld1.8 {q6, q7}, [r10]! + sub r12, r12, #64 + cmp r12, #64 + vst1.8 {q0, q1}, [r3]! + vst1.8 {q4, q5}, [r11]! + vst1.8 {q2, q3}, [r3]! + vst1.8 {q6, q7}, [r11]! 
+ bhs cp_width_uv_64_loop + + cmp r12, #0 + beq cp_width_uv_done + +cp_width_uv_8_loop + vld1.8 {d0}, [r2]! + vld1.8 {d1}, [r10]! + sub r12, r12, #8 + cmp r12, #8 + vst1.8 {d0}, [r3]! + vst1.8 {d1}, [r11]! + bhs cp_width_uv_8_loop + + cmp r12, #0 + beq cp_width_uv_done + +cp_width_uv_1_loop + ldrb r8, [r2], #1 + subs r12, r12, #1 + strb r8, [r3], #1 + ldrb r8, [r10], #1 + strb r8, [r11], #1 + bne cp_width_uv_1_loop + +cp_width_uv_done + subs lr, lr, #1 + add r2, r2, r6 + add r3, r3, r7 + add r10, r10, r6 + add r11, r11, r7 + bne cp_src_to_dst_height_uv_loop + +;copy last line for U & V if uv_height is odd + tst r4, #1 + beq cp_width_uv_done_1 + mov r12, r5 + +cp_width_uv_64_loop_1 + vld1.8 {q0, q1}, [r2]! + vld1.8 {q2, q3}, [r2]! + sub r12, r12, #64 + cmp r12, #64 + vst1.8 {q0, q1}, [r3]! + vst1.8 {q2, q3}, [r3]! + bhs cp_width_uv_64_loop_1 + + cmp r12, #0 + beq cp_width_uv_done_1 + +cp_width_uv_8_loop_1 + vld1.8 {d0}, [r2]! + sub r12, r12, #8 + cmp r12, #8 + vst1.8 {d0}, [r3]! + bhs cp_width_uv_8_loop_1 + + cmp r12, #0 + beq cp_width_uv_done_1 + +cp_width_uv_1_loop_1 + ldrb r8, [r2], #1 + subs r12, r12, #1 + strb r8, [r3], #1 + bne cp_width_uv_1_loop_1 +cp_width_uv_done_1 + + subs r9, r9, #1 ;a plane is still left: reload the V pointers below and go round again + ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 + ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1 + ldrne r10, [r0, #yv12_buffer_config_uv_stride] + ldrne r11, [r1, #yv12_buffer_config_uv_stride] + + addne r10, r2, r10 ;second row src + addne r11, r3, r11 ;second row dst + + bne cp_uv_loop + + vpop {d8 - d15} + pop {r4 - r11, pc} + + ENDP + END diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm new file mode 100644 index 000000000..8c9ce1962 --- /dev/null +++ b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm @@ -0,0 +1,587 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_yv12_extend_frame_borders_neon| + ARM + REQUIRE8 + PRESERVE8 + + INCLUDE vpx_asm_offsets.asm + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf); +;Note: this is VP8 function, which has border=32 and 16. Internal y_width and y_height +; are always multiples of 16. + +|vp8_yv12_extend_frame_borders_neon| PROC ;r0 = ybf: extend the Y, U and V borders in place + push {r4 - r10, lr} + vpush {d8 - d15} + + ;Not need to load y_width, since: y_width = y_stride - 2*border + ldr r3, [r0, #yv12_buffer_config_border] + ldr r1, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + ldr r4, [r0, #yv12_buffer_config_y_height] + ldr lr, [r0, #yv12_buffer_config_y_stride] + + cmp r3, #16 + beq b16_extend_frame_borders + +;======================= +b32_extend_frame_borders +;border = 32 +;======================= +;Border copy for Y plane +;copy the left and right most columns out + sub r5, r1, r3 ;destptr1 + add r6, r1, lr + sub r6, r6, r3, lsl #1 ;destptr2 + sub r2, r6, #1 ;srcptr2 + + ;Do four rows at one time + mov r12, r4, lsr #2 + +copy_left_right_y + vld1.8 {d0[], d1[]}, [r1], lr ;load one byte and replicate it to all 16 lanes of q0 + vld1.8 {d4[], d5[]}, [r2], lr + vld1.8 {d8[], d9[]}, [r1], lr + vld1.8 {d12[], d13[]}, [r2], lr + vld1.8 {d16[], d17[]}, [r1], lr + vld1.8 {d20[], d21[]}, [r2], lr + vld1.8 {d24[], d25[]}, [r1], lr + vld1.8 {d28[], d29[]}, [r2], lr + + vmov q1, q0 + vmov q3, q2 + vmov q5, q4 + vmov q7, q6 + vmov q9, q8 + vmov q11, q10 + vmov q13, q12 + vmov q15, q14 + + subs r12, r12, #1 + + vst1.8 {q0, q1}, [r5], lr + vst1.8 {q2, q3}, [r6], lr + vst1.8 {q4, q5}, [r5], lr + vst1.8 {q6, q7}, [r6], lr + vst1.8 {q8, q9}, [r5], lr + vst1.8 {q10, q11}, [r6], lr + vst1.8 {q12, q13}, [r5], lr + vst1.8 {q14, q15}, [r6], lr + + bne copy_left_right_y 
+ +;Now copy the top and bottom source lines into each line of the respective borders + ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + mul r8, r3, lr ;r8 = border * y_stride + + mov r12, lr, lsr #7 ;r12 = y_stride / 128 + + sub r6, r1, r3 ;destptr2 + sub r2, r6, lr ;srcptr2 + sub r1, r7, r3 ;srcptr1 + sub r5, r1, r8 ;destptr1 + +copy_top_bottom_y + vld1.8 {q0, q1}, [r1]! + vld1.8 {q8, q9}, [r2]! + vld1.8 {q2, q3}, [r1]! + vld1.8 {q10, q11}, [r2]! + vld1.8 {q4, q5}, [r1]! + vld1.8 {q12, q13}, [r2]! + vld1.8 {q6, q7}, [r1]! + vld1.8 {q14, q15}, [r2]! + + mov r7, r3 ;r7 = border rows to fill + +top_bottom_32 + subs r7, r7, #1 + + vst1.8 {q0, q1}, [r5]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q2, q3}, [r5]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q4, q5}, [r5]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q6, q7}, [r5]! + vst1.8 {q14, q15}, [r6]! + + add r5, r5, lr + sub r5, r5, #128 + add r6, r6, lr + sub r6, r6, #128 + + bne top_bottom_32 + + sub r5, r1, r8 + add r6, r2, lr + + subs r12, r12, #1 + bne copy_top_bottom_y + + mov r7, lr, lsr #4 ;check to see if extra copy is needed + ands r7, r7, #0x7 + bne extra_top_bottom_y +end_of_border_copy_y + +;Border copy for U, V planes + ldr r1, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 + mov lr, lr, lsr #1 ;uv_stride + mov r3, r3, lsr #1 ;border + mov r4, r4, lsr #1 ;uv_height + mov r8, r8, lsr #2 ;r8 = (border / 2) * (y_stride / 2) + + mov r10, #2 ;r10 = planes left (U first, then V) + +;copy the left and right most columns out +border_copy_uv + sub r5, r1, r3 ;destptr1 + add r6, r1, lr + sub r6, r6, r3, lsl #1 ;destptr2 + sub r2, r6, #1 ;srcptr2 + + mov r7, r1 + + ;Do eight rows at one time + mov r12, r4, lsr #3 + +copy_left_right_uv + vld1.8 {d0[], d1[]}, [r1], lr + vld1.8 {d2[], d3[]}, [r2], lr + vld1.8 {d4[], d5[]}, [r1], lr + vld1.8 {d6[], d7[]}, [r2], lr + vld1.8 {d8[], d9[]}, [r1], lr + vld1.8 {d10[], d11[]}, [r2], lr + vld1.8 {d12[], d13[]}, [r1], lr + vld1.8 {d14[], d15[]}, [r2], lr + vld1.8 {d16[], d17[]}, [r1], lr + vld1.8 {d18[], d19[]}, [r2], lr + vld1.8 {d20[], d21[]}, [r1], lr + vld1.8 {d22[], d23[]}, [r2], lr + vld1.8 {d24[], d25[]}, [r1], lr + vld1.8 
{d26[], d27[]}, [r2], lr + vld1.8 {d28[], d29[]}, [r1], lr + vld1.8 {d30[], d31[]}, [r2], lr + + subs r12, r12, #1 + + vst1.8 {q0}, [r5], lr + vst1.8 {q1}, [r6], lr + vst1.8 {q2}, [r5], lr + vst1.8 {q3}, [r6], lr + vst1.8 {q4}, [r5], lr + vst1.8 {q5}, [r6], lr + vst1.8 {q6}, [r5], lr + vst1.8 {q7}, [r6], lr + vst1.8 {q8}, [r5], lr + vst1.8 {q9}, [r6], lr + vst1.8 {q10}, [r5], lr + vst1.8 {q11}, [r6], lr + vst1.8 {q12}, [r5], lr + vst1.8 {q13}, [r6], lr + vst1.8 {q14}, [r5], lr + vst1.8 {q15}, [r6], lr + + bne copy_left_right_uv + +;Now copy the top and bottom source lines into each line of the respective borders + mov r12, lr, lsr #6 ;r12 = uv_stride / 64 + + sub r6, r1, r3 ;destptr2 + sub r2, r6, lr ;srcptr2 + sub r1, r7, r3 ;srcptr1 + sub r5, r1, r8 ;destptr1 + +copy_top_bottom_uv + vld1.8 {q0, q1}, [r1]! + vld1.8 {q8, q9}, [r2]! + vld1.8 {q2, q3}, [r1]! + vld1.8 {q10, q11}, [r2]! + + mov r7, r3 + +top_bottom_16 + subs r7, r7, #1 + + vst1.8 {q0, q1}, [r5]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q2, q3}, [r5]! + vst1.8 {q10, q11}, [r6]! + + add r5, r5, lr + sub r5, r5, #64 + add r6, r6, lr + sub r6, r6, #64 + + bne top_bottom_16 + + sub r5, r1, r8 + add r6, r2, lr + + subs r12, r12, #1 + bne copy_top_bottom_uv + + mov r7, lr, lsr #3 ;check to see if extra copy is needed + ands r7, r7, #0x7 + bne extra_top_bottom_uv + +end_of_border_copy_uv + subs r10, r10, #1 + ldrne r1, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 + bne border_copy_uv + + vpop {d8 - d15} + pop {r4 - r10, pc} + +;;;;;;;;;;;;;;;;;;;;;; +;extra copy part for Y +extra_top_bottom_y + vld1.8 {q0}, [r1]! + vld1.8 {q2}, [r2]! 
+ + mov r9, r3, lsr #3 ;r9 = border / 8: each pass below stores 8 rows + +extra_top_bottom_32 + subs r9, r9, #1 + + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + bne extra_top_bottom_32 + + sub r5, r1, r8 + add r6, r2, lr + subs r7, r7, #1 + bne extra_top_bottom_y + + b end_of_border_copy_y + +;extra copy part for UV +extra_top_bottom_uv + vld1.8 {d0}, [r1]! + vld1.8 {d8}, [r2]! + + mov r9, r3, lsr #3 + +extra_top_bottom_16 + subs r9, r9, #1 + + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + bne extra_top_bottom_16 + + sub r5, r1, r8 + add r6, r2, lr + subs r7, r7, #1 + bne extra_top_bottom_uv + + b end_of_border_copy_uv + + +;======================= +b16_extend_frame_borders +;border = 16 +;======================= +;Border copy for Y plane +;copy the left and right most columns out + sub r5, r1, r3 ;destptr1 + add r6, r1, lr + sub r6, r6, r3, lsl #1 ;destptr2 + sub r2, r6, #1 ;srcptr2 + + ;Do four rows at one time + mov r12, r4, lsr #2 + +copy_left_right_y_b16 + vld1.8 {d0[], d1[]}, [r1], lr + vld1.8 {d4[], d5[]}, [r2], lr + vld1.8 {d8[], d9[]}, [r1], lr + vld1.8 {d12[], d13[]}, [r2], lr + vld1.8 {d16[], d17[]}, [r1], lr + vld1.8 {d20[], d21[]}, [r2], lr + vld1.8 {d24[], d25[]}, [r1], lr + vld1.8 {d28[], d29[]}, [r2], lr + + subs r12, r12, #1 + + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q4}, [r5], lr + vst1.8 {q6}, [r6], lr + vst1.8 
{q8}, [r5], lr + vst1.8 {q10}, [r6], lr + vst1.8 {q12}, [r5], lr + vst1.8 {q14}, [r6], lr + + bne copy_left_right_y_b16 + +;Now copy the top and bottom source lines into each line of the respective borders + ldr r7, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 + mul r8, r3, lr ;r8 = border * y_stride + + mov r12, lr, lsr #7 ;r12 = y_stride / 128 + + sub r6, r1, r3 ;destptr2 + sub r2, r6, lr ;srcptr2 + sub r1, r7, r3 ;srcptr1 + sub r5, r1, r8 ;destptr1 + +copy_top_bottom_y_b16 + vld1.8 {q0, q1}, [r1]! + vld1.8 {q8, q9}, [r2]! + vld1.8 {q2, q3}, [r1]! + vld1.8 {q10, q11}, [r2]! + vld1.8 {q4, q5}, [r1]! + vld1.8 {q12, q13}, [r2]! + vld1.8 {q6, q7}, [r1]! + vld1.8 {q14, q15}, [r2]! + + mov r7, r3 + +top_bottom_16_b16 + subs r7, r7, #1 + + vst1.8 {q0, q1}, [r5]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q2, q3}, [r5]! + vst1.8 {q10, q11}, [r6]! + vst1.8 {q4, q5}, [r5]! + vst1.8 {q12, q13}, [r6]! + vst1.8 {q6, q7}, [r5]! + vst1.8 {q14, q15}, [r6]! + + add r5, r5, lr + sub r5, r5, #128 + add r6, r6, lr + sub r6, r6, #128 + + bne top_bottom_16_b16 + + sub r5, r1, r8 + add r6, r2, lr + + subs r12, r12, #1 + bne copy_top_bottom_y_b16 + + mov r7, lr, lsr #4 ;check to see if extra copy is needed + ands r7, r7, #0x7 + bne extra_top_bottom_y_b16 +end_of_border_copy_y_b16 + +;Border copy for U, V planes + ldr r1, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 + mov lr, lr, lsr #1 ;uv_stride + mov r3, r3, lsr #1 ;border + mov r4, r4, lsr #1 ;uv_height + mov r8, r8, lsr #2 ;r8 = (border / 2) * (y_stride / 2) + + mov r10, #2 ;r10 = planes left (U first, then V) + +;copy the left and right most columns out +border_copy_uv_b16 + sub r5, r1, r3 ;destptr1 + add r6, r1, lr + sub r6, r6, r3, lsl #1 ;destptr2 + sub r2, r6, #1 ;srcptr2 + + mov r7, r1 + + ;Do eight rows at one time + mov r12, r4, lsr #3 + +copy_left_right_uv_b16 + vld1.8 {d0[]}, [r1], lr + vld1.8 {d2[]}, [r2], lr + vld1.8 {d4[]}, [r1], lr + vld1.8 {d6[]}, [r2], lr + vld1.8 {d8[]}, [r1], lr + vld1.8 {d10[]}, [r2], lr + vld1.8 {d12[]}, [r1], lr + vld1.8 {d14[]}, [r2], lr + vld1.8 {d16[]}, [r1], lr + vld1.8 {d18[]}, [r2], lr + vld1.8 {d20[]}, [r1], lr + 
vld1.8 {d22[]}, [r2], lr + vld1.8 {d24[]}, [r1], lr + vld1.8 {d26[]}, [r2], lr + vld1.8 {d28[]}, [r1], lr + vld1.8 {d30[]}, [r2], lr + + subs r12, r12, #1 + + vst1.8 {d0}, [r5], lr + vst1.8 {d2}, [r6], lr + vst1.8 {d4}, [r5], lr + vst1.8 {d6}, [r6], lr + vst1.8 {d8}, [r5], lr + vst1.8 {d10}, [r6], lr + vst1.8 {d12}, [r5], lr + vst1.8 {d14}, [r6], lr + vst1.8 {d16}, [r5], lr + vst1.8 {d18}, [r6], lr + vst1.8 {d20}, [r5], lr + vst1.8 {d22}, [r6], lr + vst1.8 {d24}, [r5], lr + vst1.8 {d26}, [r6], lr + vst1.8 {d28}, [r5], lr + vst1.8 {d30}, [r6], lr + + bne copy_left_right_uv_b16 + +;Now copy the top and bottom source lines into each line of the respective borders + mov r12, lr, lsr #6 ;r12 = uv_stride / 64 + + sub r6, r1, r3 ;destptr2 + sub r2, r6, lr ;srcptr2 + sub r1, r7, r3 ;srcptr1 + sub r5, r1, r8 ;destptr1 + +copy_top_bottom_uv_b16 + vld1.8 {q0, q1}, [r1]! + vld1.8 {q8, q9}, [r2]! + vld1.8 {q2, q3}, [r1]! + vld1.8 {q10, q11}, [r2]! + + mov r7, r3 + +top_bottom_8_b16 + subs r7, r7, #1 + + vst1.8 {q0, q1}, [r5]! + vst1.8 {q8, q9}, [r6]! + vst1.8 {q2, q3}, [r5]! + vst1.8 {q10, q11}, [r6]! + + add r5, r5, lr + sub r5, r5, #64 + add r6, r6, lr + sub r6, r6, #64 + + bne top_bottom_8_b16 + + sub r5, r1, r8 + add r6, r2, lr + + subs r12, r12, #1 + bne copy_top_bottom_uv_b16 + + mov r7, lr, lsr #3 ;check to see if extra copy is needed + ands r7, r7, #0x7 + bne extra_top_bottom_uv_b16 + +end_of_border_copy_uv_b16 + subs r10, r10, #1 + ldrne r1, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 + bne border_copy_uv_b16 + + vpop {d8-d15} + pop {r4 - r10, pc} + +;;;;;;;;;;;;;;;;;;;;;; +;extra copy part for Y +extra_top_bottom_y_b16 + vld1.8 {q0}, [r1]! + vld1.8 {q2}, [r2]! 
+ + mov r9, r3, lsr #3 ;r9 = border / 8: each pass below stores 8 rows + +extra_top_bottom_16_b16 + subs r9, r9, #1 + + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + vst1.8 {q0}, [r5], lr + vst1.8 {q2}, [r6], lr + bne extra_top_bottom_16_b16 + + sub r5, r1, r8 + add r6, r2, lr + subs r7, r7, #1 + bne extra_top_bottom_y_b16 + + b end_of_border_copy_y_b16 + +;extra copy part for UV +extra_top_bottom_uv_b16 + vld1.8 {d0}, [r1]! + vld1.8 {d8}, [r2]! + + mov r9, r3, lsr #3 + +extra_top_bottom_8_b16 + subs r9, r9, #1 + + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + vst1.8 {d0}, [r5], lr + vst1.8 {d8}, [r6], lr + bne extra_top_bottom_8_b16 + + sub r5, r1, r8 + add r6, r2, lr + subs r7, r7, #1 + bne extra_top_bottom_uv_b16 + + b end_of_border_copy_uv_b16 + + ENDP + END diff --git a/vpx_scale/arm/scalesystemdependant.c b/vpx_scale/arm/scalesystemdependant.c new file mode 100644 index 000000000..3c355becc --- /dev/null +++ b/vpx_scale/arm/scalesystemdependant.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_scale/vpxscale.h" + +#ifdef HAVE_CONFIG_H +#include "vpx_config.h" +#endif + +void (*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf); +extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf); +extern void vp8_yv12_extend_frame_borders_neon(YV12_BUFFER_CONFIG *ybf); + +void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +/**************************************************************************** +* Imports +*****************************************************************************/ + +/**************************************************************************** + * + * ROUTINE : vp8_scale_machine_specific_config + * + * INPUTS : None. + * + * OUTPUTS : Sets vp8_yv12_extend_frame_borders_ptr, vp8_yv12_copy_frame_yonly_ptr and vp8_yv12_copy_frame_ptr. + * + * RETURNS : void + * + * FUNCTION : Selects the machine specific frame copy and border + * extension routines and stores them in the function pointers. + * + * SPECIAL NOTES : The choice is made at build time: the NEON routines when HAVE_ARMV7 is non-zero, the C routines otherwise. 
+ * + ****************************************************************************/ +void vp8_scale_machine_specific_config(void) +{ + /* + vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_armv4; + vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_armv4; + vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c; + vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_armv4; + vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_armv4; + vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c; + vp8_horizontal_line_3_4_scale = horizontal_line_3_4_scale_armv4; + vp8_vertical_band_3_4_scale = vertical_band_3_4_scale_armv4; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = horizontal_line_2_3_scale_armv4; + vp8_vertical_band_2_3_scale = vertical_band_2_3_scale_armv4; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_armv4; + vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_armv4; + vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c; + + vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c; + vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c; + vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c; + vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c; + vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c; + vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c; + vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c; + */ + +#if HAVE_ARMV7 /* NEON implementations */ + vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders_neon; + vp8_yv12_copy_frame_yonly_ptr = vp8_yv12_copy_frame_yonly_neon; + vp8_yv12_copy_frame_ptr = vp8_yv12_copy_frame_neon; +#else /* portable C implementations */ + vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders; + vp8_yv12_copy_frame_yonly_ptr = vp8_yv12_copy_frame_yonly; + 
vp8_yv12_copy_frame_ptr = vp8_yv12_copy_frame; +#endif + +} diff --git a/vpx_scale/arm/yv12extend_arm.c b/vpx_scale/arm/yv12extend_arm.c new file mode 100644 index 000000000..7c3f7cd07 --- /dev/null +++ b/vpx_scale/arm/yv12extend_arm.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/vpxscale.h" + +void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +void +vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) +{ + vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc); + //printf("Border:%d; plane_stride:%d; plane_height:%d; plane_width:%d\n",dst_ybc->border,dst_ybc->y_stride,dst_ybc->y_height,dst_ybc->y_width); + + vp8_yv12_extend_frame_borders_ptr(dst_ybc); +} diff --git a/vpx_scale/blackfin/yv12config.c b/vpx_scale/blackfin/yv12config.c new file mode 100644 index 000000000..7cb083fb9 --- /dev/null +++ b/vpx_scale/blackfin/yv12config.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +/**************************************************************************** + * + * Module Title : yv12config.c + * + * Description : + * + ***************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" + +#include <cdef_bf533.h> + +/**************************************************************************** +* Imports +****************************************************************************/ +void +extend_memset(void *dst, unsigned char value, unsigned int size); + +/**************************************************************************** + * + ****************************************************************************/ +int +vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) +{ + if (ybf) + { + if (ybf->buffer_alloc) + { + duck_free(ybf->buffer_alloc); + } + + ybf->buffer_alloc = 0; + } + else + { + return -1; + } + + return 0; +} + +/**************************************************************************** + * + ****************************************************************************/ +int +vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border) +{ +//NOTE: + + int yplane_size = (height + 2 * border) * (width + 2 * border); + int uvplane_size = (height / 2 + border) * (width / 2 + border); + + if (ybf) + { + vp8_yv12_de_alloc_frame_buffer(ybf); + + ybf->y_width = width; + ybf->y_height = height; + ybf->y_stride = width + 2 * border; + + ybf->uv_width = width / 2; + ybf->uv_height = height / 2; + ybf->uv_stride = ybf->uv_width + border; + + ybf->border = border; + + // Added 2 extra lines to framebuffer so that copy12x12 doesn't fail + // when we have a large motion vector in V on the last v block. 
+ // Note : We never use these pixels anyway so this doesn't hurt. + ybf->buffer_alloc = (unsigned char *) duck_memalign(32, (yplane_size * 3 / 2) + ybf->y_stride , 0); + + if (ybf->buffer_alloc == NULL) + return -1; + + ybf->y_buffer = ybf->buffer_alloc + border * ybf->y_stride + border; + ybf->u_buffer = ybf->buffer_alloc + yplane_size + border / 2 * ybf->uv_stride + border / 2; + ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + border / 2 * ybf->uv_stride + border / 2; + } + else + { + return -2; + } + + return 0; +} +/**************************************************************************** + * + ****************************************************************************/ +int +vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf) +{ + if (ybf) + { + if (ybf->buffer_alloc) + { + extend_memset(ybf->y_buffer, 0x0, ybf->y_stride *(ybf->y_height + 2 * ybf->border)); + extend_memset(ybf->u_buffer, 0x80, ybf->uv_stride *(ybf->uv_height + ybf->border)); + extend_memset(ybf->v_buffer, 0x80, ybf->uv_stride *(ybf->uv_height + ybf->border)); + } + + return 0; + } + + return -1; +} diff --git a/vpx_scale/blackfin/yv12extend.c b/vpx_scale/blackfin/yv12extend.c new file mode 100644 index 000000000..d5be4950d --- /dev/null +++ b/vpx_scale/blackfin/yv12extend.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +/**************************************************************************** + * + * Module Title : yv12extend.c + * + * Description : + * + ***************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include <cdef_bf533.h> + +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" + +/**************************************************************************** +* +****************************************************************************/ + + +/**************************************************************************** +* +****************************************************************************/ +void +extend_memset(void *dst, unsigned char value, unsigned int size) +{ +#if 0 + unsigned int quad_value; + + quad_value = (unsigned int) value; + quad_value |= (unsigned int) value << 8; + quad_value |= (unsigned int) value << 16; + quad_value |= (unsigned int) value << 24; +#else + unsigned short quad_value; + + quad_value = (unsigned int) value; + quad_value |= (unsigned int) value << 8; +#endif + + + if (size / 2 >= 64 * 1024) + printf("_Extend_memset__________ dma memset is broken\n"); + + *p_mdma_s1_start_addr = &quad_value; + *p_mdma_s1_x_count = size / 2; + *p_mdma_s1_x_modify = 0x0; + *p_mdma_d1_start_addr = dst; + *p_mdma_d1_x_count = size / 2; + *p_mdma_d1_x_modify = 2; + + *p_mdma_s1_config = DMAEN | WDSIZE_16; + asm("ssync;"); + + *p_mdma_d1_config = DI_EN | DMAEN | WNR | WDSIZE_16; + asm("ssync;"); + + while ((*p_mdma_d1_irq_status & DMA_DONE) == 0); + + *p_mdma_d1_irq_status |= DMA_DONE; +} + +/**************************************************************************** +* +****************************************************************************/ +void +extend_memcpy(void *dst, void *src, unsigned int size) +{ + if (size / 2 >= 64 * 
1024) + printf("_Extend_memcpy__________ dma memcpy is broken\n"); + + + if ((size & 0x3)) + printf("_)__________ size not a multiple of 4\n"); + +//32 bit dma here caused some data to be corrupted --- WHY ?????? + + *p_mdma_s1_start_addr = src; + *p_mdma_s1_x_count = size / 2; + *p_mdma_s1_x_modify = 2; + *p_mdma_d1_start_addr = dst; + *p_mdma_d1_x_count = size / 2; + *p_mdma_d1_x_modify = 2; + + *p_mdma_s1_config = DMAEN | WDSIZE_16; + asm("ssync;"); + + *p_mdma_d1_config = DI_EN | DMAEN | WNR | WDSIZE_16; + asm("ssync;"); + + while ((*p_mdma_d1_irq_status & DMA_DONE) == 0); + + *p_mdma_d1_irq_status |= DMA_DONE; +} + +/**************************************************************************** + * + ****************************************************************************/ +void +vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf) +{ +#if 1 + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + + unsigned int Border; + int plane_stride; + int plane_height; + int plane_width; + + unsigned int quad_sample; + unsigned int sample; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + plane_height = ybf->y_height; + plane_width = ybf->y_width; + + // copy the left and right most columns out + src_ptr1 = ybf->y_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + extend_memset(dest_ptr1, src_ptr1[0], Border); + extend_memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->y_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < 
(int)Border; i++) + { + extend_memcpy(dest_ptr1, src_ptr1, plane_stride); + dest_ptr1 += plane_stride; + } + + for (i = 0; i < (int)Border; i++) + { + extend_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr2 += plane_stride; + } + + plane_stride /= 2; + plane_height /= 2; + plane_width /= 2; + Border /= 2; + + /***********/ + /* U Plane */ + /***********/ + + // copy the left and right most columns out + src_ptr1 = ybf->u_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + extend_memset(dest_ptr1, src_ptr1[0], Border); + extend_memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->u_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + extend_memcpy(dest_ptr1, src_ptr1, plane_stride); + dest_ptr1 += plane_stride; + } + + for (i = 0; i < (int)(Border); i++) + { + extend_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr2 += plane_stride; + } + + /***********/ + /* V Plane */ + /***********/ + + // copy the left and right most columns out + src_ptr1 = ybf->v_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + extend_memset(dest_ptr1, src_ptr1[0], Border); + extend_memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->v_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * 
plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + extend_memcpy(dest_ptr1, src_ptr1, plane_stride); + dest_ptr1 += plane_stride; + } + + for (i = 0; i < (int)(Border); i++) + { + extend_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr2 += plane_stride; + } + +#endif +} +/**************************************************************************** + * + * ROUTINE : vp8_yv12_copy_frame + * + * INPUTS : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies the source image into the destination image and + * updates the destination's UMV borders. + * + * SPECIAL NOTES : The frames are assumed to be identical in size. + * + ****************************************************************************/ +void +vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) +{ +#if 1 + int row; + unsigned char *source, *dest; + + source = src_ybc->y_buffer; + dest = dst_ybc->y_buffer; + + for (row = 0; row < src_ybc->y_height; row++) + { + extend_memcpy(dest, source, src_ybc->y_width); + source += src_ybc->y_stride; + dest += dst_ybc->y_stride; + } + + source = src_ybc->u_buffer; + dest = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; row++) + { + extend_memcpy(dest, source, src_ybc->uv_width); + source += src_ybc->uv_stride; + dest += dst_ybc->uv_stride; + } + + source = src_ybc->v_buffer; + dest = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; row++) + { + extend_memcpy(dest, source, src_ybc->uv_width); + source += src_ybc->uv_stride; + dest += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders(dst_ybc); + +#else + int row; + char *source, *dest; + int height; + int width; + + height = src_ybc->y_height + (src_ybc->border * 2); + width = src_ybc->y_width + (src_ybc->border * 2); + source = src_ybc->y_buffer; + dest = dst_ybc->y_buffer; + + for (row = 0; row < height; row++) + { + 
extend_memcpy(dest, source, width); + source += src_ybc->y_stride; + dest += dst_ybc->y_stride; + } + + height = src_ybc->uv_height + (src_ybc->border); + width = src_ybc->uv_width + (src_ybc->border); + + source = src_ybc->u_buffer; + dest = dst_ybc->u_buffer; + + for (row = 0; row < height; row++) + { + extend_memcpy(dest, source, width); + source += src_ybc->uv_stride; + dest += dst_ybc->uv_stride; + } + + source = src_ybc->v_buffer; + dest = dst_ybc->v_buffer; + + for (row = 0; row < height; row++) + { + extend_memcpy(dest, source, width); + source += src_ybc->uv_stride; + dest += dst_ybc->uv_stride; + } + +#endif + +} diff --git a/vpx_scale/dm642/bicubic_scaler_c64.c b/vpx_scale/dm642/bicubic_scaler_c64.c new file mode 100644 index 000000000..9bd379725 --- /dev/null +++ b/vpx_scale/dm642/bicubic_scaler_c64.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include <float.h> +#include <math.h> +#include <stdio.h> +#include "vpx_mem/vpx_mem.h" +#include "vpxscale_arbitrary.h" + +extern BICUBIC_SCALER_STRUCT g_b_scaler; + +int bicubic_scale_c64(int in_width, int in_height, int in_stride, + int out_width, int out_height, int out_stride, + unsigned char *input_image, unsigned char *output_image) +{ + short *restrict l_w, * restrict l_h; + short *restrict c_w, * restrict c_h; + unsigned char *restrict ip, * restrict op, *restrict op_w; + unsigned char *restrict hbuf; + int h, w, lw, lh; + int phase_offset_w, phase_offset_h; + double coeff; + int max_phase; + + c_w = g_b_scaler.c_w; + c_h = g_b_scaler.c_h; + + op = output_image; + + l_w = g_b_scaler.l_w; + l_h = g_b_scaler.l_h; + + phase_offset_h = 0; + + for (h = 0; h < out_height; h++) + { + // select the row to work on + lh = l_h[h]; + ip = input_image + (in_stride * lh); + + coeff = _memd8_const(&c_h[phase_offset_h*4]); + + // vp8_filter the row vertically into an temporary buffer. + // If the phase offset == 0 then all the multiplication + // is going to result in the output equalling the input. + // So instead point the temporary buffer to the input. + // Also handle the boundry condition of not being able to + // filter that last lines. + if (phase_offset_h && (lh < in_height - 2)) + { + hbuf = g_b_scaler.hbuf; + + for (w = 0; w < in_width; w += 4) + { + int ip1, ip2, ip3, ip4; + int y13_12, y11_10, y23_22, y21_20, y33_32, y31_30, y43_42, y41_40; + int y10_20, y11_21, y12_22, y13_23, y30_40, y31_41, y32_42, y33_43; + int s1, s2, s3, s4; + + ip1 = _mem4_const(&ip[w - in_stride]); + ip2 = _mem4_const(&ip[w]); + ip3 = _mem4_const(&ip[w + in_stride]); + ip4 = _mem4_const(&ip[w + 2*in_stride]); + + // realignment of data. Unpack the data so that it is in short + // format instead of bytes. 
+ y13_12 = _unpkhu4(ip1); + y11_10 = _unpklu4(ip1); + y23_22 = _unpkhu4(ip2); + y21_20 = _unpklu4(ip2); + y33_32 = _unpkhu4(ip3); + y31_30 = _unpklu4(ip3); + y43_42 = _unpkhu4(ip4); + y41_40 = _unpklu4(ip4); + + // repack the data so that elements 1 and 2 are together. this + // lines up so that a dot product with the coefficients can be + // done. + y10_20 = _pack2(y11_10, y21_20); + y11_21 = _packh2(y11_10, y21_20); + y12_22 = _pack2(y13_12, y23_22); + y13_23 = _packh2(y13_12, y23_22); + + s1 = _dotp2(_hi(coeff), y10_20); + s2 = _dotp2(_hi(coeff), y11_21); + s3 = _dotp2(_hi(coeff), y12_22); + s4 = _dotp2(_hi(coeff), y13_23); + + y30_40 = _pack2(y31_30, y41_40); + y31_41 = _packh2(y31_30, y41_40); + y32_42 = _pack2(y33_32, y43_42); + y33_43 = _packh2(y33_32, y43_42); + + // now repack elements 3 and 4 together. + s1 += _dotp2(_lo(coeff), y30_40); + s2 += _dotp2(_lo(coeff), y31_41); + s3 += _dotp2(_lo(coeff), y32_42); + s4 += _dotp2(_lo(coeff), y33_43); + + s1 = s1 >> 12; + s2 = s2 >> 12; + s3 = s3 >> 12; + s4 = s4 >> 12; + + s1 = _pack2(s2, s1); + s2 = _pack2(s4, s3); + + _amem4(&hbuf[w]) = _spacku4(s2, s1); + } + } + else + hbuf = ip; + + // increase the phase offset for the next time around. + if (++phase_offset_h >= g_b_scaler.nh) + phase_offset_h = 0; + + op_w = op; + + // will never be able to interpolate first pixel, so just copy it + // over here. 
+ phase_offset_w = 1; + *op_w++ = hbuf[0]; + + if (1 >= g_b_scaler.nw) phase_offset_w = 0; + + max_phase = g_b_scaler.nw; + + for (w = 1; w < out_width; w++) + { + double coefficients; + int hbuf_high, hbuf_low, hbuf_both; + int sum_high, sum_low, sum; + + // get the index to use to expand the image + lw = l_w[w]; + coefficients = _amemd8_const(&c_w[phase_offset_w*4]); + hbuf_both = _mem4_const(&hbuf[lw-1]); + + hbuf_high = _unpkhu4(hbuf_both); + hbuf_low = _unpklu4(hbuf_both); + + sum_high = _dotp2(_hi(coefficients), hbuf_high); + sum_low = _dotp2(_lo(coefficients), hbuf_low); + + sum = (sum_high + sum_low) >> 12; + + if (++phase_offset_w >= max_phase) + phase_offset_w = 0; + + if ((lw + 2) >= in_width) + sum = hbuf[lw]; + + *op_w++ = sum; + } + + op += out_stride; + } + + return 0; +} + +void bicubic_scale_frame_c64(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + int new_width, int new_height) +{ + + dst->y_width = new_width; + dst->y_height = new_height; + dst->uv_width = new_width / 2; + dst->uv_height = new_height / 2; + + dst->y_stride = dst->y_width; + dst->uv_stride = dst->uv_width; + + bicubic_scale_c64(src->y_width, src->y_height, src->y_stride, + new_width, new_height, dst->y_stride, + src->y_buffer, dst->y_buffer); + + bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride, + new_width / 2, new_height / 2, dst->uv_stride, + src->u_buffer, dst->u_buffer); + + bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride, + new_width / 2, new_height / 2, dst->uv_stride, + src->v_buffer, dst->v_buffer); +} diff --git a/vpx_scale/dm642/gen_scalers_c64.c b/vpx_scale/dm642/gen_scalers_c64.c new file mode 100644 index 000000000..2126a7534 --- /dev/null +++ b/vpx_scale/dm642/gen_scalers_c64.c @@ -0,0 +1,607 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. 
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** + * + * Module Title : gen_scalers.c + * + * Description : Generic image scaling functions. + * + ***************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "vpx_scale/vpxscale.h" + +/**************************************************************************** +* Imports +****************************************************************************/ + +/**************************************************************************** + * + * ROUTINE : horizontal_line_4_5_scale_c4 + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 4 to 5. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static +void horizontal_line_4_5_scale_c64 +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + unsigned i; + unsigned int ba, cb, dc, ed; + unsigned char *restrict des = dest; + unsigned int *restrict src = (unsigned int *)source; + unsigned int const_51_205, const_102_154, + const_205_51, const_154_102; + + unsigned int src_current, src_next; + + (void) dest_width; + + // Constants that are to be used for the filtering. For + // best speed we are going to want to right shift by 16. 
+ // In the generic version they were shift by 8, so put + // an extra 8 in now so that 16 will come out later. + const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8); + const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8); + const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8); + const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8); + + // 5 points are needed to filter to give 5 output points. + // A load can pull up 4 at a time, and one needs to be + // "borrowed" from the next set of data. So instead of + // loading those 5 points each time, "steal" a point from + // the next set and only load up 4 each time through. + src_current = _mem4(src); + + for (i = 0; i < source_width - 4; i += 4) + { + src_next = _mem4(src++); + + // Reorder the data so that it is ready for the + // dot product. + ba = _unpklu4(src_current); + cb = _unpkhu4(_rotl(src_current, 8)); + dc = _unpkhu4(src_current); + ed = _unpkhu4(_shrmb(src_next, src_current)); + + // Use the dot product with round and shift. + des [0] = src_current & 0xff; + des [1] = _dotprsu2(ba, const_205_51); + des [2] = _dotprsu2(cb, const_154_102); + des [3] = _dotprsu2(dc, const_102_154); + des [4] = _dotprsu2(ed, const_51_205); + + des += 5; + + // reuse loaded vales next time around. + src_current = src_next; + } + + // vp8_filter the last set of points. Normally a point from the next set + // would be used, but there is no next set, so just fill. + ba = _unpklu4(src_current); + cb = _unpkhu4(_rotl(src_current, 8)); + dc = _unpkhu4(src_current); + + des [0] = src_current & 0xff; + des [1] = _dotprsu2(ba, const_205_51); + des [2] = _dotprsu2(cb, const_154_102); + des [3] = _dotprsu2(dc, const_102_154); + des [4] = src_current & 0xff; + +} +/**************************************************************************** + * + * ROUTINE : vertical_band_4_5_scale_c64 + * + * INPUTS : unsigned char *dest : Pointer to destination data. + * unsigned int dest_pitch : Stride of destination data. 
+ * unsigned int dest_width : Width of destination data. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The + * height of the band scaled is 4-pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. + * + ****************************************************************************/ +static +void vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + unsigned int i; + unsigned int a, b, c, d, e; + unsigned int ba, cb, dc, ed; + unsigned char *restrict src = dest; + unsigned char *restrict des = dest; + unsigned int const_51_205, const_102_154, + const_205_51, const_154_102; + + const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8); + const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8); + const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8); + const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8); + + // Force a loop unroll here so that there is not such a + // dependancy. + a = src [0]; + b = src [dest_pitch]; + c = src [dest_pitch*2]; + d = src [dest_pitch*3]; + e = src [dest_pitch*5]; + src ++; + + for (i = 0; i < dest_width; i++) + { + ba = _pack2(b, a); + cb = _pack2(c, b); + dc = _pack2(d, c); + ed = _pack2(e, d); + + a = src [0]; + b = src [dest_pitch]; + c = src [dest_pitch*2]; + d = src [dest_pitch*3]; + e = src [dest_pitch*5]; + src ++; + + des [dest_pitch] = _dotprsu2(ba, const_205_51); + des [dest_pitch*2] = _dotprsu2(cb, const_154_102); + des [dest_pitch*3] = _dotprsu2(dc, const_102_154); + des [dest_pitch*4] = _dotprsu2(ed, const_51_205); + + des ++; + } +} + +/**************************************************************************** + * + * ROUTINE : last_vertical_band_4_5_scale_c64 + * + * INPUTS : unsigned char *dest : Pointer to destination data. + * unsigned int dest_pitch : Stride of destination data. + * unsigned int dest_width : Width of destination data. 
+ * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Scales last vertical band of pixels by scale 4 to 5. The + * height of the band scaled is 4-pixels. + * + * SPECIAL NOTES : The routine does not have available the first line of + * the band below the current band, since this is the + * last band. + * + ****************************************************************************/ +static +void last_vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + unsigned int i; + unsigned int a, b, c, d; + unsigned int ba, cb, dc; + unsigned char *restrict src = dest; + unsigned char *restrict des = dest; + unsigned int const_102_154, const_205_51, const_154_102; + + const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8); + const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8); + const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8); + + a = src [0]; + b = src [dest_pitch]; + c = src [dest_pitch*2]; + d = src [dest_pitch*3]; + src ++; + + for (i = 0; i < dest_width; ++i) + { + ba = _pack2(b, a); + cb = _pack2(c, b); + dc = _pack2(d, c); + + a = src [0]; + b = src [dest_pitch]; + c = src [dest_pitch*2]; + d = src [dest_pitch*3]; + src ++; + + des [dest_pitch] = _dotprsu2(ba, const_205_51); + des [dest_pitch*2] = _dotprsu2(cb, const_154_102); + des [dest_pitch*3] = _dotprsu2(dc, const_102_154); + des [dest_pitch*4] = (unsigned char) d; + + des++; + } +} + +/**************************************************************************** + * + * ROUTINE : horizontal_line_3_5_scale_c64 + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 3 to 5. + * + * SPECIAL NOTES : None. 
+ * + * + ****************************************************************************/ +static +void horizontal_line_3_5_scale_c64 +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + unsigned int i; + unsigned int ba, cb, dc; + unsigned int src_current; + unsigned char *restrict des = dest; + unsigned char *restrict src = (unsigned char *)source; + unsigned int const_51_205, const_102_154, + const_205_51, const_154_102; + + (void) dest_width; + + const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8); + const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8); + const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8); + const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8); + + for (i = 0; i < source_width - 3; i += 3) + { + src_current = _mem4(src); + + // Reorder the data so that it is ready for the + // dot product. + ba = _unpklu4(src_current); + cb = _unpkhu4(_rotl(src_current, 8)); + dc = _unpkhu4(src_current); + + des [0] = src_current & 0xff; + des [1] = _dotprsu2(ba, const_154_102); + des [2] = _dotprsu2(cb, const_51_205); + des [3] = _dotprsu2(cb, const_205_51); + des [4] = _dotprsu2(dc, const_102_154); + + src += 3; + des += 5; + } + + src_current = _mem4(src); + + ba = _unpklu4(src_current); + cb = _unpkhu4(_rotl(src_current, 8)); + dc = _unpkhu4(src_current); + + + des [0] = src_current & 0xff; + des [1] = _dotprsu2(ba, const_154_102); + des [2] = _dotprsu2(cb, const_51_205); + des [3] = _dotprsu2(cb, const_205_51); + des [4] = dc & 0xff; + +} + +/**************************************************************************** + * + * ROUTINE : vertical_band_3_5_scale_c64 + * + * INPUTS : unsigned char *dest : Pointer to destination data. + * unsigned int dest_pitch : Stride of destination data. + * unsigned int dest_width : Width of destination data. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Scales vertical band of pixels by scale 3 to 5. 
The + * height of the band scaled is 3-pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. + * + ****************************************************************************/ +static +void vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + unsigned int i; + unsigned int a, b, c, d; + unsigned int ba, cb, dc; + unsigned char *restrict src = dest; + unsigned char *restrict des = dest; + unsigned int const_51_205, const_102_154, + const_205_51, const_154_102; + + const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8); + const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8); + const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8); + const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8); + + a = src [0]; + b = src [dest_pitch]; + c = src [dest_pitch*2]; + d = src [dest_pitch*5]; + src ++; + + for (i = 0; i < dest_width; i++) + { + ba = _pack2(b, a); + cb = _pack2(c, b); + dc = _pack2(d, c); + + a = src [0]; + b = src [dest_pitch]; + c = src [dest_pitch*2]; + d = src [dest_pitch*5]; + src ++; + + des [dest_pitch] = _dotprsu2(ba, const_154_102); + des [dest_pitch*2] = _dotprsu2(cb, const_51_205); + des [dest_pitch*3] = _dotprsu2(cb, const_205_51); + des [dest_pitch*4] = _dotprsu2(dc, const_102_154); + + des++; + } +} + +/**************************************************************************** + * + * ROUTINE : last_vertical_band_3_5_scale_c64 + * + * INPUTS : unsigned char *dest : Pointer to destination data. + * unsigned int dest_pitch : Stride of destination data. + * unsigned int dest_width : Width of destination data. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Scales last vertical band of pixels by scale 3 to 5. The + * height of the band scaled is 3-pixels. + * + * SPECIAL NOTES : The routine does not have available the first line of + * the band below the current band, since this is the + * last band. 
+ * + ****************************************************************************/ +static +void last_vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + unsigned int i; + unsigned int a, b, c; + unsigned int ba, cb; + unsigned char *restrict src = dest; + unsigned char *restrict des = dest; + unsigned int const_51_205, const_205_51, const_154_102; + + const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8); + const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8); + const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8); + + a = src [0]; + b = src [dest_pitch]; + c = src [dest_pitch*2]; + src ++; + + for (i = 0; i < dest_width; ++i) + { + ba = _pack2(b, a); + cb = _pack2(c, b); + + a = src [0]; + b = src [dest_pitch]; + c = src [dest_pitch*2]; + src ++; + + des [dest_pitch] = _dotprsu2(ba, const_154_102); + des [dest_pitch*2] = _dotprsu2(cb, const_51_205); + des [dest_pitch*3] = _dotprsu2(cb, const_205_51); + des [dest_pitch*4] = (unsigned char)(c) ; + + des++; + } +} + +/**************************************************************************** + * + * ROUTINE : horizontal_line_1_2_scale_c64 + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 1 to 2. + * + * SPECIAL NOTES : source width must be a multiple of 4. 
+ * + ****************************************************************************/ +void horizontal_line_1_2_scale_c64 +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + unsigned int i; + unsigned char *restrict des = dest; + unsigned char *restrict src = (unsigned char *)source; + unsigned int src7_4i, src4_1i, src3_0i; + unsigned int a4_0i, ahi, alo; + double src7_0d, src3_0d; + const unsigned int k01 = 0x01010101; + + for (i = 0; i < source_width / 4; i += 1) + { + // Load up the data from src. Here a wide load is + // used to get 8 bytes at once, only 5 will be used + // for the actual computation. + src7_0d = _memd8(src); + src3_0i = _lo(src7_0d); + src7_4i = _hi(src7_0d); + + // Need to average between points. Shift byte 5 into + // the lower word. This will result in bytes 5-1 + // averaged with 4-0. + src4_1i = _shrmb(src7_4i, src3_0i); + a4_0i = _avgu4(src4_1i, src3_0i); + + // Expand the data out. Could do an unpack, however + // all but the multiply units are getting pretty hard + // here the multiply unit can take some of the computations. + src3_0d = _mpyu4(src3_0i, k01); + + // The averages need to be unpacked so that they are in 16 + // bit form and will be able to be interleaved with the + // original data + ahi = _unpkhu4(a4_0i); + alo = _unpklu4(a4_0i); + + ahi = _swap4(ahi); + alo = _swap4(alo); + + // Mix the average result in with the orginal data. + ahi = _hi(src3_0d) | ahi; + alo = _lo(src3_0d) | alo; + + _memd8(des) = _itod(ahi, alo); + + des += 8; + src += 4; + } +} + + +/**************************************************************************** + * + * ROUTINE : vertical_band_1_2_scale_c64 + * + * INPUTS : unsigned char *dest : Pointer to destination data. + * unsigned int dest_pitch : Stride of destination data. + * unsigned int dest_width : Width of destination data. + * + * OUTPUTS : None. 
+ * + * RETURNS : void + * + * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The + * height of the band scaled is 1-pixel. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. + * Destination width must be a multiple of 4. Because the + * intput must be, therefore the output must be. + * + ****************************************************************************/ +static +void vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + unsigned int i; + unsigned int a, b; + unsigned int *restrict line_a = (unsigned int *)dest; + unsigned int *restrict line_b = (unsigned int *)(dest + (dest_pitch * 2)); + unsigned int *restrict des = (unsigned int *)(dest + dest_pitch); + + for (i = 0; i < dest_width / 4; i++) + { + a = _mem4(line_a++); + b = _mem4(line_b++); + + _mem4(des++) = _avgu4(a, b); + } +} + +/**************************************************************************** + * + * ROUTINE : last_vertical_band_1_2_scale_c64 + * + * INPUTS : unsigned char *dest : Pointer to destination data. + * unsigned int dest_pitch : Stride of destination data. + * unsigned int dest_width : Width of destination data. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Scales last vertical band of pixels by scale 1 to 2. The + * height of the band scaled is 1-pixel. + * + * SPECIAL NOTES : The routine does not have available the first line of + * the band below the current band, since this is the + * last band. Again, width must be a multiple of 4. 
+ * + ****************************************************************************/ +static +void last_vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + unsigned int i; + unsigned int *restrict src = (unsigned int *)dest; + unsigned int *restrict des = (unsigned int *)(dest + dest_pitch); + + for (i = 0; i < dest_width / 4; ++i) + { + _mem4(des++) = _mem4(src++); + } +} + +void +register_generic_scalers(void) +{ + vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_c64; + vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_c64; + vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_c64; + vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_c64; + vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_c64; + vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_c64; + vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_c64; + vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_c64; + vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_c64; +} diff --git a/vpx_scale/dm642/yv12extend.c b/vpx_scale/dm642/yv12extend.c new file mode 100644 index 000000000..ca25a5fce --- /dev/null +++ b/vpx_scale/dm642/yv12extend.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +/**************************************************************************** + * + * Module Title : yv12extend.c + * + * Description : + * + ***************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +//#include <stdlib.h> +#include "csl_dat.h" +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" + +/**************************************************************************** +* Exports +****************************************************************************/ +#define UINT8 unsigned char +#define UINT32 unsigned int + + +static inline +void copy_yleft_right_border( + UINT8 *restrict src_ptr1, + UINT8 *restrict src_ptr2, + UINT8 *restrict dest_ptr1, + UINT8 *restrict dest_ptr2, + UINT32 plane_height, + UINT32 plane_stride +) +{ + UINT32 left, right, left2, left4, right2, right4; + double dl, dr; + int i; + +#pragma MUST_ITERATE(16,16,16) + + for (i = 0; i < plane_height; i++) + { + left = src_ptr1[0]; + right = src_ptr2[0]; + + left2 = _pack2(left, left); + left4 = _packl4(left2, left2); + + right2 = _pack2(right, right); + right4 = _packl4(right2, right2); + + dl = _itod(left4, left4); + dr = _itod(right4, right4); + + _amemd8(&dest_ptr1[ 0]) = dl; + _amemd8(&dest_ptr2[ 0]) = dr; + + _amemd8(&dest_ptr1[ 8]) = dl; + _amemd8(&dest_ptr2[ 8]) = dr; + + _amemd8(&dest_ptr1[16]) = dl; + _amemd8(&dest_ptr2[16]) = dr; + + _amemd8(&dest_ptr1[24]) = dl; + _amemd8(&dest_ptr2[24]) = dr; + + _amemd8(&dest_ptr1[32]) = dl; + _amemd8(&dest_ptr2[32]) = dr; + + _amemd8(&dest_ptr1[40]) = dl; + _amemd8(&dest_ptr2[40]) = dr; + + + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } +} +/**************************************************************************** + * + * + 
****************************************************************************/ +static +void copy_uvleft_right_border( + UINT8 *restrict src_ptr1, + UINT8 *restrict src_ptr2, + UINT8 *restrict dest_ptr1, + UINT8 *restrict dest_ptr2, + UINT32 plane_height, + UINT32 plane_stride +) +{ + UINT32 left, right, left2, left4, right2, right4; + double dl, dr; + int i; + +#pragma MUST_ITERATE(8,8 ,8) + + for (i = 0; i < plane_height; i++) + { + left = src_ptr1[0]; + right = src_ptr2[0]; + + left2 = _pack2(left, left); + left4 = _packl4(left2, left2); + + right2 = _pack2(right, right); + right4 = _packl4(right2, right2); + + dl = _itod(left4, left4); + dr = _itod(right4, right4); + + _amemd8(&dest_ptr1[ 0]) = dl; + _amemd8(&dest_ptr2[ 0]) = dr; + + _amemd8(&dest_ptr1[ 8]) = dl; + _amemd8(&dest_ptr2[ 8]) = dr; + + _amemd8(&dest_ptr1[16]) = dl; + _amemd8(&dest_ptr2[16]) = dr; + + + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } +} +/**************************************************************************** + * + ****************************************************************************/ +void +vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf) +{ + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + + unsigned int Border; + int plane_stride; + int plane_height; + int plane_width; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + plane_height = ybf->y_height; + plane_width = ybf->y_width; + +#if 1 + // copy the left and right most columns out + src_ptr1 = ybf->y_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + copy_yleft_right_border(src_ptr1, src_ptr2, dest_ptr1, dest_ptr2, plane_height, plane_stride); +#endif + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->y_buffer - Border; + src_ptr2 = 
src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)Border; i++) + { + vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); + vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + plane_stride /= 2; + plane_height /= 2; + plane_width /= 2; + Border /= 2; + + /***********/ + /* U Plane */ + /***********/ +#if 1 + // copy the left and right most columns out + src_ptr1 = ybf->u_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + copy_uvleft_right_border(src_ptr1, src_ptr2, dest_ptr1, dest_ptr2, plane_height, plane_stride); + + +#endif + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->u_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); + vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + /***********/ + /* V Plane */ + /***********/ +#if 1 + // copy the left and right most columns out + src_ptr1 = ybf->v_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + copy_uvleft_right_border(src_ptr1, src_ptr2, dest_ptr1, dest_ptr2, plane_height, plane_stride); + +#endif + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->v_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); + 
vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } +} +/**************************************************************************** + * + ****************************************************************************/ +void +vpxyv12_extend_frame_tbborders(YV12_BUFFER_CONFIG *ybf) +{ + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + int tid1, tid2; + + unsigned int Border; + int plane_stride; + int plane_height; + int plane_width; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + plane_height = ybf->y_height; + plane_width = ybf->y_width; + + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->y_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + + for (i = 0; i < (int)Border; i++) + { + dat_copy(src_ptr1, dest_ptr1, plane_stride); + dat_copy(src_ptr2, dest_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + plane_stride /= 2; + plane_height /= 2; + plane_width /= 2; + Border /= 2; + + /***********/ + /* U Plane */ + /***********/ + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->u_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + dat_copy(src_ptr1, dest_ptr1, plane_stride); + dat_copy(src_ptr2, dest_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + /***********/ + /* V Plane */ + /***********/ + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->v_buffer - Border; + src_ptr2 = src_ptr1 + 
(plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + tid1 = dat_copy(src_ptr1, dest_ptr1, plane_stride); + tid2 = dat_copy(src_ptr2, dest_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + dat_wait(tid1); + dat_wait(tid2); +} + +/**************************************************************************** + * + * ROUTINE : vp8_yv12_copy_frame + * + * INPUTS : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies the source image into the destination image and + * updates the destination's UMV borders. Because the + * borders have been update prior to this so the whole frame + * is copied, borders and all. This is also to circumvent + * using copy_left_right Border functions when copying data + * between L2 and main memory. When that occurs a cache + * clean needs to be done, which would require invalidating + * an entire frame. + * + * SPECIAL NOTES : The frames are assumed to be identical in size. 
+ * + ****************************************************************************/ +void +vpxyv12_copy_frame_dma(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) +{ + int yheight, uv_height; + int ystride, uv_stride; + int border; + int yoffset, uvoffset; + + border = src_ybc->border; + yheight = src_ybc->y_height; + uv_height = src_ybc->uv_height; + + ystride = src_ybc->y_stride; + uv_stride = src_ybc->uv_stride; + + yoffset = border * (ystride + 1); + uvoffset = border / 2 * (uv_stride + 1); + + dat_copy2d(DAT_2D2D, + src_ybc->y_buffer - yoffset, + dst_ybc->y_buffer - yoffset, + ystride, + yheight + 2 * border, + ystride); + dat_copy2d(DAT_2D2D, + src_ybc->u_buffer - uvoffset, + dst_ybc->u_buffer - uvoffset, + uv_stride, + uv_height + border, + uv_stride); + dat_copy2d(DAT_2D2D, + src_ybc->v_buffer - uvoffset, + dst_ybc->v_buffer - uvoffset, + uv_stride, + uv_height + border, + uv_stride); + +} + + +/**************************************************************************** + * + * ROUTINE : vp8_yv12_copy_frame + * + * INPUTS : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies the source image into the destination image and + * updates the destination's UMV borders. + * + * SPECIAL NOTES : The frames are assumed to be identical in size. 
//
// ****************************************************************************/
/*
 * Copies the visible area of each plane row by row (width bytes per row,
 * using each buffer's own stride), then rebuilds the destination's borders
 * with vp8_yv12_extend_frame_borders().  Plane dimensions come from src_ybc.
 */
void
vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
{
    int row;
    unsigned char *source, *dest;

    /* Y plane */
    source = src_ybc->y_buffer;
    dest = dst_ybc->y_buffer;

    for (row = 0; row < src_ybc->y_height; row++)
    {
        vpx_memcpy(dest, source, src_ybc->y_width);
        source += src_ybc->y_stride;
        dest   += dst_ybc->y_stride;
    }

    /* U plane */
    source = src_ybc->u_buffer;
    dest = dst_ybc->u_buffer;

    for (row = 0; row < src_ybc->uv_height; row++)
    {
        vpx_memcpy(dest, source, src_ybc->uv_width);
        source += src_ybc->uv_stride;
        dest   += dst_ybc->uv_stride;
    }

    /* V plane */
    source = src_ybc->v_buffer;
    dest = dst_ybc->v_buffer;

    for (row = 0; row < src_ybc->uv_height; row++)
    {
        vpx_memcpy(dest, source, src_ybc->uv_width);
        source += src_ybc->uv_stride;
        dest   += dst_ybc->uv_stride;
    }

    vp8_yv12_extend_frame_borders(dst_ybc);
}
diff --git a/vpx_scale/generic/bicubic_scaler.c b/vpx_scale/generic/bicubic_scaler.c
new file mode 100644
index 000000000..e3c2b4a80
--- /dev/null
+++ b/vpx_scale/generic/bicubic_scaler.c
@@ -0,0 +1,601 @@
/*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license and patent
 *  grant that can be found in the LICENSE file in the root of the source
 *  tree. All contributing project authors may be found in the AUTHORS
 *  file in the root of the source tree.
 */


#include <float.h>
#include <math.h>
#include <stdio.h>
#include "vpx_mem/vpx_mem.h"
#include "vpxscale_arbitrary.h"

#define FIXED_POINT

#define MAX_IN_WIDTH        800
#define MAX_IN_HEIGHT       600
#define MAX_OUT_WIDTH       800
#define MAX_OUT_HEIGHT      600
#define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
                             MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

/* Scaler state shared by every function in this file. */
BICUBIC_SCALER_STRUCT g_b_scaler;
/* Non-zero until g_b_scaler has been zero-initialised once. */
static int g_first_time = 1;

/* Scratch rows; bicubic_coefficient_setup() points g_b_scaler at them. */
#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf, 32);
unsigned char g_hbuf[MAX_OUT_DIMENSION];

#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf_uv, 32);
unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];


/* Cubic sharpness parameter.  The fixed-point form stores its magnitude
 * (0.6 in Q16); the float form stores the signed value (-0.6), which is why
 * the signs in the fixed and float polynomials below are mirrored. */
#ifdef FIXED_POINT
static int a_i = 0.6 * 65536;
#else
static float a = -0.6;
#endif

#ifdef FIXED_POINT
// C0 = a*t^3 - a*t^2
//
// t is the phase in Q16; the result is in Q12.  The intermediate values are
// deliberately held in unsigned short.
static INLINE short c0_fixed(unsigned int t)
{
    // put t in Q16 notation
    unsigned short v1, v2;

    // Q16  (a*t^2)
    v1 = (a_i * t) >> 16;
    v1 = (v1 * t) >> 16;

    // Q16  (a*t^3)
    v2 = (a_i * t) >> 16;
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q12
    return -((v1 - v2) >> 4);
}

// C1 = a*t + (3-2*a)*t^2 - (2-a)*t^3
//
// t is the phase in Q16; the result is in Q12.
static INLINE short c1_fixed(unsigned int t)
{
    unsigned short v1, v2, v3;
    unsigned short two, three;

    // Q16  (a*t)
    v1 = (a_i * t) >> 16;

    // Q13  ((2-a)*t^3)
    two = 2 << 13;
    v2 = two - (a_i >> 3);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q13  ((3-2*a)*t^2)
    three = 3 << 13;
    v3 = three - (2 * (a_i >> 3));
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return (((v1 >> 3) - v2 + v3) >> 1);

}

// C2 = 1 - (3-a)*t^2 + (2-a)*t^3
//
// t is the phase in Q16; the result is in Q12.
static INLINE short c2_fixed(unsigned int t)
{
    unsigned short v1, v2, v3;
    unsigned short two, three;

    // Q13  (1)
    v1 = 1 << 13;

    // Q13  ((3-a)*t^2)
    three = 3 << 13;
    v2 = three - (a_i >> 3);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q13  ((2-a)*t^3)
    two = 2 << 13;
    v3 = two - (a_i >> 3);
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return (v1 - v2 + v3) >> 1;
}

// C3 = a*t - 2*a*t^2 + a*t^3
//
// t is the phase in Q16; the result is in Q12.
static INLINE short c3_fixed(unsigned int t)
{
    int v1, v2, v3;

    // Q16  (a*t)
    v1 = (a_i * t) >> 16;

    // Q15  (2*a*t^2)
    v2 = 2 * (a_i >> 1);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q16  (a*t^3)
    v3 = (a_i * t) >> 16;
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3);
}
#else
// C0 = -a*t^3 + a*t^2
//
float C0(float t)
{
    return -a * t * t * t + a * t * t;
}

// C1 = -a*t + (2*a+3)*t^2 - (a+2)*t^3
//
float C1(float t)
{
    return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
}

// C2 = 1 - (a+3)*t^2 + (a+2)*t^3
//
float C2(float t)
{
    return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
}

// C3 = a*t - 2*a*t^2 + a*t^3
//
float C3(float t)
{
    return a * t * t * t - 2.0f * a * t * t + a * t;
}
#endif

#if 0
// Disabled self-check: counts the phases (out of 10000) at which a
// fixed-point coefficient differs from the float one by more than 1 in Q12,
// or at which the four coefficients do not sum to roughly 4096.
int compare_real_fixed()
{
    int i, errors = 0;
    float mult = 1.0 / 10000.0;
    unsigned int fixed_mult = mult * 4294967296;//65536;
    unsigned int phase_offset_int;
    float phase_offset_real;

    for (i = 0; i < 10000; i++)
    {
        int fixed0, fixed1, fixed2, fixed3, fixed_total;
        int real0, real1, real2, real3, real_total;

        phase_offset_real = (float)i * mult;
        phase_offset_int = (fixed_mult * i) >> 16;
//      phase_offset_int = phase_offset_real * 65536;

        fixed0 = c0_fixed(phase_offset_int);
        real0 = C0(phase_offset_real) * 4096.0;

        if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
            errors++;

        fixed1 = c1_fixed(phase_offset_int);
        real1 = C1(phase_offset_real) * 4096.0;

        if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
            errors++;

        fixed2 = c2_fixed(phase_offset_int);
        real2 = C2(phase_offset_real) * 4096.0;

        if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
            errors++;

        fixed3 = c3_fixed(phase_offset_int);
        real3 = C3(phase_offset_real) * 4096.0;

        if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
            errors++;

        fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
        real_total = real0 + real1 + real2 + real3;

        if ((fixed_total > 4097) || (fixed_total < 4094))
            errors ++;

        if ((real_total > 4097) || (real_total < 4095))
            errors ++;
    }

    return errors;
}
#endif

// Find the greatest common divisor of two integers.  The method used here is
// slow compared to Euclid's algorithm, but does not require any division.
int gcd(int a, int b)
{
    // Problem with this algorithm is that if a or b = 0 this function
    // will never exit.  Don't want to return 0 because any computation
    // that was based on a common denominator and tried to reduce by
    // dividing by 0 would fail.  Best solution that could be thought of
    // would be to fail by returning a 1;
    if (a <= 0 || b <= 0)
        return 1;

    // Repeatedly subtract the smaller value from the larger; the two are
    // swapped whenever b is not the larger one.
    while (a != b)
    {
        if (b > a)
            b = b - a;
        else
        {
            int tmp = a;//swap large and
            a = b;      //small
            b = tmp;
        }
    }

    return b;
}

// Zeroes the shared scaler state and records that this has been done.
void bicubic_coefficient_init()
{
    vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    g_first_time = 0;
}

// Frees the six coefficient / line-index tables and zeroes the scaler state.
// Does nothing if the state was never initialised.
void bicubic_coefficient_destroy()
{
    if (!g_first_time)
    {
        if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);

        if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);

        if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);

        if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);

        if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);

        if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);

        vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    }
}

// Create the coefficients that will be used for the cubic interpolation.
// Because scaling does not have to be equal in the vertical and horizontal
// regimes the phase offsets will be different.  There are 4 coefficients
// for each point, two on each side.  The layout is that there are the
// 4 coefficients for each phase in the array and then the next phase.
+int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height) +{ + int i; +#ifdef FIXED_POINT + int phase_offset_int; + unsigned int fixed_mult; + int product_val = 0; +#else + float phase_offset; +#endif + int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv; + + if (g_first_time) + bicubic_coefficient_init(); + + + // check to see if the coefficents have already been set up correctly + if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height) + && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height)) + return 0; + + g_b_scaler.in_width = in_width; + g_b_scaler.in_height = in_height; + g_b_scaler.out_width = out_width; + g_b_scaler.out_height = out_height; + + // Don't want to allow crazy scaling, just try and prevent a catastrophic + // failure here. Want to fail after setting the member functions so if + // if the scaler is called the member functions will not scale. + if (out_width <= 0 || out_height <= 0) + return -1; + + // reduce in/out width and height ratios using the gcd + gcd_w = gcd(out_width, in_width); + gcd_h = gcd(out_height, in_height); + gcd_h_uv = gcd(out_height, in_height / 2); + + // the numerator width and height are to be saved in + // globals so they can be used during the scaling process + // without having to be recalculated. 
+ g_b_scaler.nw = out_width / gcd_w; + d_w = in_width / gcd_w; + + g_b_scaler.nh = out_height / gcd_h; + d_h = in_height / gcd_h; + + g_b_scaler.nh_uv = out_height / gcd_h_uv; + d_h_uv = (in_height / 2) / gcd_h_uv; + + // allocate memory for the coefficents + if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w); + + if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h); + + if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv); + + g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2); + g_b_scaler.l_h = (short *)vpx_memalign(32, out_height * 2); + g_b_scaler.l_h_uv = (short *)vpx_memalign(32, out_height * 2); + + if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w); + + if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h); + + if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv); + + g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2); + g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2); + g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2); + + g_b_scaler.hbuf = g_hbuf; + g_b_scaler.hbuf_uv = g_hbuf_uv; + + // Set up polyphase filter taps. This needs to be done before + // the scaling because of the floating point math required. The + // coefficients are multiplied by 2^12 so that fixed point math + // can be used in the main scaling loop. 
+#ifdef FIXED_POINT + fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296; + + product_val = 0; + + for (i = 0; i < g_b_scaler.nw; i++) + { + if (product_val > g_b_scaler.nw) + product_val -= g_b_scaler.nw; + + phase_offset_int = (fixed_mult * product_val) >> 16; + + g_b_scaler.c_w[i*4] = c3_fixed(phase_offset_int); + g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int); + g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int); + g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int); + + product_val += d_w; + } + + + fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296; + + product_val = 0; + + for (i = 0; i < g_b_scaler.nh; i++) + { + if (product_val > g_b_scaler.nh) + product_val -= g_b_scaler.nh; + + phase_offset_int = (fixed_mult * product_val) >> 16; + + g_b_scaler.c_h[i*4] = c0_fixed(phase_offset_int); + g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int); + g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int); + g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int); + + product_val += d_h; + } + + fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296; + + product_val = 0; + + for (i = 0; i < g_b_scaler.nh_uv; i++) + { + if (product_val > g_b_scaler.nh_uv) + product_val -= g_b_scaler.nh_uv; + + phase_offset_int = (fixed_mult * product_val) >> 16; + + g_b_scaler.c_h_uv[i*4] = c0_fixed(phase_offset_int); + g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int); + g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int); + g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int); + + product_val += d_h_uv; + } + +#else + + for (i = 0; i < g_nw; i++) + { + phase_offset = (float)((i * d_w) % g_nw) / (float)g_nw; + g_c_w[i*4] = (C3(phase_offset) * 4096.0); + g_c_w[i*4+1] = (C2(phase_offset) * 4096.0); + g_c_w[i*4+2] = (C1(phase_offset) * 4096.0); + g_c_w[i*4+3] = (C0(phase_offset) * 4096.0); + } + + for (i = 0; i < g_nh; i++) + { + phase_offset = (float)((i * d_h) % g_nh) / (float)g_nh; + g_c_h[i*4] = (C0(phase_offset) * 4096.0); + g_c_h[i*4+1] = (C1(phase_offset) * 4096.0); + 
g_c_h[i*4+2] = (C2(phase_offset) * 4096.0); + g_c_h[i*4+3] = (C3(phase_offset) * 4096.0); + } + + for (i = 0; i < g_nh_uv; i++) + { + phase_offset = (float)((i * d_h_uv) % g_nh_uv) / (float)g_nh_uv; + g_c_h_uv[i*4] = (C0(phase_offset) * 4096.0); + g_c_h_uv[i*4+1] = (C1(phase_offset) * 4096.0); + g_c_h_uv[i*4+2] = (C2(phase_offset) * 4096.0); + g_c_h_uv[i*4+3] = (C3(phase_offset) * 4096.0); + } + +#endif + + // Create an array that corresponds input lines to output lines. + // This doesn't require floating point math, but it does require + // a division and because hardware division is not present that + // is a call. + for (i = 0; i < out_width; i++) + { + g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw; + + if ((g_b_scaler.l_w[i] + 2) <= in_width) + g_b_scaler.max_usable_out_width = i; + + } + + for (i = 0; i < out_height + 1; i++) + { + g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh; + g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv; + } + + return 0; +} + +int bicubic_scale(int in_width, int in_height, int in_stride, + int out_width, int out_height, int out_stride, + unsigned char *input_image, unsigned char *output_image) +{ + short *RESTRICT l_w, * RESTRICT l_h; + short *RESTRICT c_w, * RESTRICT c_h; + unsigned char *RESTRICT ip, * RESTRICT op; + unsigned char *RESTRICT hbuf; + int h, w, lw, lh; + int temp_sum; + int phase_offset_w, phase_offset_h; + + c_w = g_b_scaler.c_w; + c_h = g_b_scaler.c_h; + + op = output_image; + + l_w = g_b_scaler.l_w; + l_h = g_b_scaler.l_h; + + phase_offset_h = 0; + + for (h = 0; h < out_height; h++) + { + // select the row to work on + lh = l_h[h]; + ip = input_image + (in_stride * lh); + + // vp8_filter the row vertically into an temporary buffer. + // If the phase offset == 0 then all the multiplication + // is going to result in the output equalling the input. + // So instead point the temporary buffer to the input. + // Also handle the boundry condition of not being able to + // filter that last lines. 
+ if (phase_offset_h && (lh < in_height - 2)) + { + hbuf = g_b_scaler.hbuf; + + for (w = 0; w < in_width; w++) + { + temp_sum = c_h[phase_offset_h*4+3] * ip[w - in_stride]; + temp_sum += c_h[phase_offset_h*4+2] * ip[w]; + temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride]; + temp_sum += c_h[phase_offset_h*4] * ip[w + 2*in_stride]; + + hbuf[w] = temp_sum >> 12; + } + } + else + hbuf = ip; + + // increase the phase offset for the next time around. + if (++phase_offset_h >= g_b_scaler.nh) + phase_offset_h = 0; + + // now filter and expand it horizontally into the final + // output buffer + phase_offset_w = 0; + + for (w = 0; w < out_width; w++) + { + // get the index to use to expand the image + lw = l_w[w]; + + temp_sum = c_w[phase_offset_w*4] * hbuf[lw - 1]; + temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw]; + temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1]; + temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2]; + temp_sum = temp_sum >> 12; + + if (++phase_offset_w >= g_b_scaler.nw) + phase_offset_w = 0; + + // boundry conditions + if ((lw + 2) >= in_width) + temp_sum = hbuf[lw]; + + if (lw == 0) + temp_sum = hbuf[0]; + + op[w] = temp_sum; + } + + op += out_stride; + } + + return 0; +} + +void bicubic_scale_frame_reset() +{ + g_b_scaler.out_width = 0; + g_b_scaler.out_height = 0; +} + +void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + int new_width, int new_height) +{ + + dst->y_width = new_width; + dst->y_height = new_height; + dst->uv_width = new_width / 2; + dst->uv_height = new_height / 2; + + dst->y_stride = dst->y_width; + dst->uv_stride = dst->uv_width; + + bicubic_scale(src->y_width, src->y_height, src->y_stride, + new_width, new_height, dst->y_stride, + src->y_buffer, dst->y_buffer); + + bicubic_scale(src->uv_width, src->uv_height, src->uv_stride, + new_width / 2, new_height / 2, dst->uv_stride, + src->u_buffer, dst->u_buffer); + + bicubic_scale(src->uv_width, src->uv_height, src->uv_stride, + new_width / 2, new_height / 2, 
dst->uv_stride, + src->v_buffer, dst->v_buffer); +} diff --git a/vpx_scale/generic/gen_scalers.c b/vpx_scale/generic/gen_scalers.c new file mode 100644 index 000000000..a5e545f70 --- /dev/null +++ b/vpx_scale/generic/gen_scalers.c @@ -0,0 +1,954 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_scale/vpxscale.h" +#include "vpx_mem/vpx_mem.h" +/**************************************************************************** +* Imports +****************************************************************************/ + +/**************************************************************************** + * + * ROUTINE : vp8cx_horizontal_line_4_5_scale_c + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 4 to 5. + * + * SPECIAL NOTES : None. 
/*
 *  vp8cx_horizontal_line_4_5_scale_c
 *
 *  Expands one row of pixels horizontally by 4:5. Each group of four source
 *  pixels yields five destination pixels: the first is a copy, the others
 *  are 8-bit fixed-point blends of neighbouring source pixels (the two
 *  weights sum to 256 and 128 is added for rounding before the shift).
 *
 *  source        : first pixel of the source row.
 *  source_width  : number of source pixels; consumed in groups of four.
 *  dest          : receives five pixels for every four consumed.
 *  dest_width    : unused.
 *
 *  Every group but the last also reads the first pixel of the following
 *  group. The last group has no right-hand neighbour, so its fifth output
 *  is a plain copy of its fourth input.
 *
 *  Rows narrower than one group (source_width < 4) are left untouched:
 *  "source_width - 4" is unsigned and would otherwise wrap around, sending
 *  the loop far beyond both buffers.
 ****************************************************************************/
void vp8cx_horizontal_line_4_5_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int a, b, c;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    /* Guard against unsigned wrap-around in the loop bound below. */
    if (source_width < 4)
        return;

    for (i = 0; i < source_width - 4; i += 4)
    {
        a = src[0];
        b = src[1];
        des [0] = (unsigned char) a;
        des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
        c = src[2] * 154;               /* third pixel, already weighted */
        a = src[3];
        des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
        des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
        b = src[4];                     /* first pixel of the next group */
        des [4] = (unsigned char)((a * 205 + 51 * b + 128) >> 8);

        src += 4;
        des += 5;
    }

    /* Last group: nothing to the right, so the fifth pixel is a copy. */
    a = src[0];
    b = src[1];
    des [0] = (unsigned char)(a);
    des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
    c = src[2] * 154;
    a = src[3];
    des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
    des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
    des [4] = (unsigned char)(a);

}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_vertical_band_4_5_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
 *                  height of the band scaled is 4-pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band.
 *
 ****************************************************************************/
/*
 *  vp8cx_vertical_band_4_5_scale_c
 *
 *  Works in place, one column at a time, over the first dest_width columns.
 *  Rows 1 to 4 of each column are rewritten as 8-bit fixed-point blends
 *  (weights sum to 256, 128 added for rounding). Row 4 is blended from the
 *  original row 3 and row 5, so the line at dest_pitch * 5 must already
 *  hold valid data when this routine is called.
 ****************************************************************************/
void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        unsigned int top, mid, low_weighted, bot, next;

        top = px[0];
        mid = px[dest_pitch];
        px[dest_pitch] = (unsigned char)((top * 51 + mid * 205 + 128) >> 8);

        low_weighted = px[dest_pitch * 2] * 154;
        bot = px[dest_pitch * 3];
        px[dest_pitch * 2] = (unsigned char)((mid * 102 + low_weighted + 128) >> 8);
        px[dest_pitch * 3] = (unsigned char)((low_weighted + bot * 102 + 128) >> 8);

        /* Row 5 is the first line of the band below this one. */
        next = px[dest_pitch * 5];
        px[dest_pitch * 4] = (unsigned char)((bot * 205 + next * 51 + 128) >> 8);
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_last_vertical_band_4_5_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales last vertical band of pixels by scale 4 to 5. The
 *                  height of the band scaled is 4-pixels.
 *
 *  SPECIAL NOTES : The routine does not have available the first line of
 *                  the band below the current band, since this is the
 *                  last band.
 *
 ****************************************************************************/
/*
 *  vp8cx_last_vertical_band_4_5_scale_c
 *
 *  In-place 4:5 vertical expansion of the final band. Rows 1 to 3 of each
 *  column are blended exactly as in the ordinary band routine; row 4 is a
 *  plain copy of the original row 3 because no following band exists.
 ****************************************************************************/
void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        unsigned int top, mid, low_weighted, bot;

        top = px[0];
        mid = px[dest_pitch];
        px[dest_pitch] = (unsigned char)((top * 51 + mid * 205 + 128) >> 8);

        low_weighted = px[dest_pitch * 2] * 154;
        bot = px[dest_pitch * 3];
        px[dest_pitch * 2] = (unsigned char)((mid * 102 + low_weighted + 128) >> 8);
        px[dest_pitch * 3] = (unsigned char)((low_weighted + bot * 102 + 128) >> 8);

        /* No line below to interpolate with: repeat the original row 3. */
        px[dest_pitch * 4] = (unsigned char) bot;
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_2_3_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling up by 2 to 3.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
/*
 *  vp8cx_horizontal_line_2_3_scale_c
 *
 *  Expands one row of pixels horizontally by 2:3. Each pair of source
 *  pixels yields three destination pixels: a copy of the first, then two
 *  8-bit fixed-point blends (weights 85/171, 128 added for rounding).
 *
 *  source        : first pixel of the source row.
 *  source_width  : number of source pixels; consumed in pairs.
 *  dest          : receives three pixels for every two consumed.
 *  dest_width    : unused.
 *
 *  Every pair but the last also reads the first pixel of the following
 *  pair; the last pair ends with a plain copy of its second pixel.
 *
 *  Rows narrower than one pair (source_width < 2) are left untouched:
 *  "source_width - 2" is unsigned and would otherwise wrap around.
 ****************************************************************************/
void vp8cx_horizontal_line_2_3_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int a, b, c;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    /* Guard against unsigned wrap-around in the loop bound below. */
    if (source_width < 2)
        return;

    for (i = 0; i < source_width - 2; i += 2)
    {
        a = src[0];
        b = src[1];
        c = src[2];                     /* first pixel of the next pair */

        des [0] = (unsigned char)(a);
        des [1] = (unsigned char)((a * 85 + 171 * b + 128) >> 8);
        des [2] = (unsigned char)((b * 171 + 85 * c + 128) >> 8);

        src += 2;
        des += 3;
    }

    /* Last pair: nothing to the right, so the third pixel is a copy. */
    a = src[0];
    b = src[1];
    des [0] = (unsigned char)(a);
    des [1] = (unsigned char)((a * 85 + 171 * b + 128) >> 8);
    des [2] = (unsigned char)(b);
}


/****************************************************************************
 *
 *  ROUTINE       : vp8cx_vertical_band_2_3_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
 *                  height of the band scaled is 2-pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band.
 *
 ****************************************************************************/
/*
 *  vp8cx_vertical_band_2_3_scale_c
 *
 *  In-place 2:3 vertical expansion of one band over dest_width columns.
 *  Rows 1 and 2 of each column are rewritten as 85/171 blends of rows 0, 1
 *  and 3; row 3 is the first line of the band below and must already hold
 *  valid data.
 ****************************************************************************/
void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        const unsigned int top = px[0];
        const unsigned int mid = px[dest_pitch];
        const unsigned int next = px[dest_pitch * 3];

        px[dest_pitch] = (unsigned char)((top * 85 + mid * 171 + 128) >> 8);
        px[dest_pitch * 2] = (unsigned char)((mid * 171 + next * 85 + 128) >> 8);
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_last_vertical_band_2_3_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales last vertical band of pixels by scale 2 to 3. The
 *                  height of the band scaled is 2-pixels.
 *
 *  SPECIAL NOTES : The routine does not have available the first line of
 *                  the band below the current band, since this is the
 *                  last band, so the final output line repeats the
 *                  original second line.
 *
 ****************************************************************************/
void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        const unsigned int top = px[0];
        const unsigned int mid = px[dest_pitch];

        px[dest_pitch] = (unsigned char)((top * 85 + mid * 171 + 128) >> 8);
        px[dest_pitch * 2] = (unsigned char)(mid);
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *
 ****************************************************************************/
/*
 *  vp8cx_horizontal_line_3_5_scale_c
 *
 *  Expands one row of pixels horizontally by 3:5. Each group of three
 *  source pixels yields five destination pixels: a copy of the first, then
 *  four 8-bit fixed-point blends (weights sum to 256, 128 added for
 *  rounding).
 *
 *  source        : first pixel of the source row.
 *  source_width  : number of source pixels; consumed in groups of three.
 *  dest          : receives five pixels for every three consumed.
 *  dest_width    : unused.
 *
 *  Every group but the last also reads the first pixel of the following
 *  group; the last group ends with a plain copy of its third pixel.
 *
 *  Rows narrower than one group (source_width < 3) are left untouched:
 *  "source_width - 3" is unsigned and would otherwise wrap around.
 ****************************************************************************/
void vp8cx_horizontal_line_3_5_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int a, b, c;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    /* Guard against unsigned wrap-around in the loop bound below. */
    if (source_width < 3)
        return;

    for (i = 0; i < source_width - 3; i += 3)
    {
        a = src[0];
        b = src[1];
        des [0] = (unsigned char)(a);
        des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);

        c = src[2] ;
        des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
        des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);

        a = src[3];                     /* first pixel of the next group */
        des [4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);

        src += 3;
        des += 5;
    }

    /* Last group: nothing to the right, so the fifth pixel is a copy. */
    a = src[0];
    b = src[1];
    des [0] = (unsigned char)(a);

    des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
    c = src[2] ;
    des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
    des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);

    des [4] = (unsigned char)(c);
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
 *                  height of the band scaled is 3-pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band.
 *
 ****************************************************************************/
/*
 *  vp8cx_vertical_band_3_5_scale_c
 *
 *  In-place 3:5 vertical expansion of one band over dest_width columns.
 *  Rows 1 to 4 of each column are rewritten as 8-bit fixed-point blends
 *  (weights sum to 256, 128 added for rounding). Row 4 is blended from the
 *  original row 2 and row 5, so the line at dest_pitch * 5 must already
 *  hold valid data.
 ****************************************************************************/
void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        unsigned int top, mid, low, next;

        top = px[0];
        mid = px[dest_pitch];
        px[dest_pitch] = (unsigned char)((top * 102 + mid * 154 + 128) >> 8);

        low = px[dest_pitch * 2];
        px[dest_pitch * 2] = (unsigned char)((mid * 205 + low * 51 + 128) >> 8);
        px[dest_pitch * 3] = (unsigned char)((mid * 51 + low * 205 + 128) >> 8);

        /* Row 5 is the first line of the band below this one. */
        next = px[dest_pitch * 5];
        px[dest_pitch * 4] = (unsigned char)((low * 154 + next * 102 + 128) >> 8);
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_last_vertical_band_3_5_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales last vertical band of pixels by scale 3 to 5. The
 *                  height of the band scaled is 3-pixels.
 *
 *  SPECIAL NOTES : The routine does not have available the first line of
 *                  the band below the current band, since this is the
 *                  last band.
 *
 ****************************************************************************/
/*
 *  vp8cx_last_vertical_band_3_5_scale_c
 *
 *  In-place 3:5 vertical expansion of the final band. Rows 1 to 3 of each
 *  column are blended exactly as in the ordinary band routine; row 4 is a
 *  plain copy of the original row 2 because no following band exists.
 ****************************************************************************/
void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        unsigned int top, mid, low;

        top = px[0];
        mid = px[dest_pitch];
        px[dest_pitch] = (unsigned char)((top * 102 + mid * 154 + 128) >> 8);

        low = px[dest_pitch * 2];
        px[dest_pitch * 2] = (unsigned char)((mid * 205 + low * 51 + 128) >> 8);
        px[dest_pitch * 3] = (unsigned char)((mid * 51 + low * 205 + 128) >> 8);

        /* No line below to interpolate with: repeat the original row 2. */
        px[dest_pitch * 4] = (unsigned char)(low);
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_3_4_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling up by 3 to 4.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
/*
 *  vp8cx_horizontal_line_3_4_scale_c
 *
 *  Expands one row of pixels horizontally by 3:4. Each group of three
 *  source pixels yields four destination pixels: a copy of the first, a
 *  64/192 blend, a rounded average, and a 192/64 blend with the first pixel
 *  of the following group.
 *
 *  source        : first pixel of the source row.
 *  source_width  : number of source pixels; consumed in groups of three.
 *  dest          : receives four pixels for every three consumed.
 *  dest_width    : unused.
 *
 *  The last group has no right-hand neighbour, so its fourth output is a
 *  plain copy of its third input.
 *
 *  Rows narrower than one group (source_width < 3) are left untouched:
 *  "source_width - 3" is unsigned and would otherwise wrap around.
 ****************************************************************************/
void vp8cx_horizontal_line_3_4_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int a, b, c;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    /* Guard against unsigned wrap-around in the loop bound below. */
    if (source_width < 3)
        return;

    for (i = 0; i < source_width - 3; i += 3)
    {
        a = src[0];
        b = src[1];
        des [0] = (unsigned char)(a);
        des [1] = (unsigned char)((a * 64 + b * 192 + 128) >> 8);

        c = src[2];
        des [2] = (unsigned char)((b + c + 1) >> 1);

        a = src[3];                     /* first pixel of the next group */
        des [3] = (unsigned char)((c * 192 + a * 64 + 128) >> 8);

        src += 3;
        des += 4;
    }

    /* Last group: nothing to the right, so the fourth pixel is a copy. */
    a = src[0];
    b = src[1];
    des [0] = (unsigned char)(a);
    des [1] = (unsigned char)((a * 64 + b * 192 + 128) >> 8);

    c = src[2] ;
    des [2] = (unsigned char)((b + c + 1) >> 1);
    des [3] = (unsigned char)(c);
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_vertical_band_3_4_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
 *                  height of the band scaled is 3-pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band.
 *
 ****************************************************************************/
/*
 *  vp8cx_vertical_band_3_4_scale_c
 *
 *  In-place 3:4 vertical expansion of one band over dest_width columns.
 *  Row 1 becomes a 64/192 blend of rows 0 and 1, row 2 the rounded average
 *  of the original rows 1 and 2, and row 3 a 192/64 blend of the original
 *  row 2 with row 4. Row 4 is the first line of the band below and must
 *  already hold valid data.
 ****************************************************************************/
void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int i;
    unsigned int a, b, c;
    unsigned char *des = dest;

    for (i = 0; i < dest_width; i++)
    {
        a = des [0];
        b = des [dest_pitch];
        des [dest_pitch] = (unsigned char)((a * 64 + b * 192 + 128) >> 8);

        c = des[dest_pitch*2];
        des [dest_pitch*2] = (unsigned char)((b + c + 1) >> 1);

        // First line in next band...
        a = des [dest_pitch*4];
        des [dest_pitch*3] = (unsigned char)((c * 192 + a * 64 + 128) >> 8);

        des++;
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_last_vertical_band_3_4_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales last vertical band of pixels by scale 3 to 4. The
 *                  height of the band scaled is 3-pixels.
 *
 *  SPECIAL NOTES : The routine does not have available the first line of
 *                  the band below the current band, since this is the
 *                  last band.
 *
 ****************************************************************************/
void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int i;
    unsigned int a, b, c;
    unsigned char *des = dest;

    for (i = 0; i < dest_width; ++i)
    {
        a = des [0];
        b = des [dest_pitch];

        des [dest_pitch] = (unsigned char)((a * 64 + b * 192 + 128) >> 8);

        c = des[dest_pitch*2];
        des [dest_pitch*2] = (unsigned char)((b + c + 1) >> 1);

        // No other line for interpolation of this line, so ..
        des [dest_pitch*3] = (unsigned char)(c);

        des++;
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Number of source pixels.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling up by 1 to 2. Each source pixel is
 *                  followed by the rounded average of itself and its
 *                  right-hand neighbour; the last pixel is simply doubled.
 *
 *  SPECIAL NOTES : An empty row (source_width == 0) is left untouched;
 *                  "source_width - 1" is unsigned and would otherwise wrap
 *                  around and run the loop far past both buffers.
 *
 ****************************************************************************/
void vp8cx_horizontal_line_1_2_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int a, b;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    /* Guard against unsigned wrap-around in the loop bound below. */
    if (source_width == 0)
        return;

    for (i = 0; i < source_width - 1; i += 1)
    {
        a = src[0];
        b = src[1];
        des [0] = (unsigned char)(a);
        des [1] = (unsigned char)((a + b + 1) >> 1);
        src += 1;
        des += 2;
    }

    /* Last pixel: nothing to the right, so it is written twice. */
    a = src[0];
    des [0] = (unsigned char)(a);
    des [1] = (unsigned char)(a);
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
 *                  height of the band scaled is 1-pixel.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band.
 *
 ****************************************************************************/
/*
 *  vp8cx_vertical_band_1_2_scale_c
 *
 *  In-place 1:2 vertical expansion of one band over dest_width columns.
 *  Row 1 of each column becomes the rounded average of row 0 and row 2;
 *  row 2 is the first line of the band below and must already hold valid
 *  data.
 ****************************************************************************/
void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        const unsigned int above = px[0];
        const unsigned int below = px[dest_pitch * 2];

        px[dest_pitch] = (unsigned char)((above + below + 1) >> 1);
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_last_vertical_band_1_2_scale_c
 *
 *  INPUTS        : unsigned char *dest : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales last vertical band of pixels by scale 1 to 2. The
 *                  height of the band scaled is 1-pixel.
 *
 *  SPECIAL NOTES : The routine does not have available the first line of
 *                  the band below the current band, since this is the
 *                  last band, so row 0 is simply repeated in row 1.
 *
 ****************************************************************************/
void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;

        px[dest_pitch] = px[0];
    }
}





/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_5_4_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling down by 5 to 4.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
/*
 *  vp8cx_horizontal_line_5_4_scale_c
 *
 *  Shrinks one row of pixels horizontally by 5:4. Each group of five source
 *  pixels yields four destination pixels: a copy of the first, then three
 *  8-bit fixed-point blends of adjacent pixels (weights sum to 256, 128
 *  added for rounding). dest_width is unused.
 *
 *  Each iteration reads five pixels, so the caller is expected to pass a
 *  row whose readable length covers whole groups of five.
 ****************************************************************************/
void vp8cx_horizontal_line_5_4_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    const unsigned char *in = source;
    unsigned char *out = dest;
    unsigned int pos;

    (void) dest_width;

    for (pos = 0; pos < source_width; pos += 5, in += 5, out += 4)
    {
        const unsigned int p0 = in[0];
        const unsigned int p1 = in[1];
        const unsigned int p2 = in[2];
        const unsigned int p3 = in[3];
        const unsigned int p4 = in[4];

        out[0] = (unsigned char) p0;
        out[1] = (unsigned char)((p1 * 192 + p2 * 64 + 128) >> 8);
        out[2] = (unsigned char)((p2 * 128 + p3 * 128 + 128) >> 8);
        out[3] = (unsigned char)((p3 * 64 + p4 * 192 + 128) >> 8);
    }
}




/*
 *  vp8cx_vertical_band_5_4_scale_c
 *
 *  Shrinks a band of five source lines into four destination lines, column
 *  by column over dest_width columns, using the same weights as the
 *  horizontal 5:4 routine. Source lines are src_pitch bytes apart and
 *  destination lines dest_pitch bytes apart.
 */
void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        const unsigned char *in = source + col;
        unsigned char *out = dest + col;
        const unsigned int r0 = in[0 * src_pitch];
        const unsigned int r1 = in[1 * src_pitch];
        const unsigned int r2 = in[2 * src_pitch];
        const unsigned int r3 = in[3 * src_pitch];
        const unsigned int r4 = in[4 * src_pitch];

        out[0 * dest_pitch] = (unsigned char) r0;
        out[1 * dest_pitch] = (unsigned char)((r1 * 192 + r2 * 64 + 128) >> 8);
        out[2 * dest_pitch] = (unsigned char)((r2 * 128 + r3 * 128 + 128) >> 8);
        out[3 * dest_pitch] = (unsigned char)((r3 * 64 + r4 * 192 + 128) >> 8);
    }
}


/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_5_3_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 ****************************************************************************/
/*
 *  vp8cx_horizontal_line_5_3_scale_c
 *
 *  Shrinks one row of pixels horizontally by 5:3. Each group of five source
 *  pixels yields three destination pixels: a copy of the first, an 85/171
 *  blend of the second and third, and a 171/85 blend of the fourth and
 *  fifth (128 added for rounding). dest_width is unused.
 *
 *  Each iteration reads five pixels, so the caller is expected to pass a
 *  row whose readable length covers whole groups of five.
 ****************************************************************************/
void vp8cx_horizontal_line_5_3_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    const unsigned char *in = source;
    unsigned char *out = dest;
    unsigned int pos;

    (void) dest_width;

    for (pos = 0; pos < source_width; pos += 5, in += 5, out += 3)
    {
        const unsigned int p0 = in[0];
        const unsigned int p1 = in[1];
        const unsigned int p2 = in[2];
        const unsigned int p3 = in[3];
        const unsigned int p4 = in[4];

        out[0] = (unsigned char) p0;
        out[1] = (unsigned char)((p1 * 85 + p2 * 171 + 128) >> 8);
        out[2] = (unsigned char)((p3 * 171 + p4 * 85 + 128) >> 8);
    }

}

/*
 *  vp8cx_vertical_band_5_3_scale_c
 *
 *  Shrinks a band of five source lines into three destination lines, column
 *  by column over dest_width columns, using the same weights as the
 *  horizontal 5:3 routine. Source lines are src_pitch bytes apart and
 *  destination lines dest_pitch bytes apart.
 */
void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        const unsigned char *in = source + col;
        unsigned char *out = dest + col;
        const unsigned int r0 = in[0 * src_pitch];
        const unsigned int r1 = in[1 * src_pitch];
        const unsigned int r2 = in[2 * src_pitch];
        const unsigned int r3 = in[3 * src_pitch];
        const unsigned int r4 = in[4 * src_pitch];

        out[0 * dest_pitch] = (unsigned char) r0;
        out[1 * dest_pitch] = (unsigned char)((r1 * 85 + r2 * 171 + 128) >> 8);
        out[2 * dest_pitch] = (unsigned char)((r3 * 171 + r4 * 85 + 128) >> 8);
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_2_1_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 ****************************************************************************/
/*
 *  vp8cx_horizontal_line_2_1_scale_c
 *
 *  Shrinks one row of pixels horizontally by 2:1 using point sampling:
 *  every second source pixel, starting with the first, is copied to the
 *  destination. dest_width is unused.
 ****************************************************************************/
void vp8cx_horizontal_line_2_1_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int a;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    for (i = 0; i < source_width; i += 2)
    {
        a = src[0];
        des [0] = (unsigned char)(a);
        src += 2;
        des += 1;
    }



}

/*
 *  vp8cx_vertical_band_2_1_scale_c
 *
 *  2:1 vertical shrink by point sampling: copies dest_width bytes of the
 *  given source line to the destination line. Both pitch arguments are
 *  unused.
 */
void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    (void) dest_pitch;
    (void) src_pitch;
    vpx_memcpy(dest, source, dest_width);
}

/*
 *  vp8cx_vertical_band_2_1_scale_i_c
 *
 *  2:1 vertical shrink with interpolation. Each output pixel is a 3/10/3
 *  weighted sum (divided by 16, with 8 added for rounding) of the pixel in
 *  the line above, the pixel in the given source line and the pixel in the
 *  line below. The caller must therefore supply a source line that has a
 *  readable line at -src_pitch and at +src_pitch. dest_pitch is unused.
 *
 *  The index of the line above is computed in signed arithmetic. With an
 *  unsigned loop counter, "i - (int) src_pitch" is evaluated as unsigned
 *  int and wraps to a value close to UINT_MAX; on targets whose pointers
 *  are wider than unsigned int that addresses memory roughly 4 GiB past
 *  the source line instead of the line above it.
 */
void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int i;
    int temp;

    (void) dest_pitch;

    for (i = 0; i < dest_width; i++)
    {
        temp = 8;
        temp += source[(int) i - (int) src_pitch] * 3;
        temp += source[i] * 10;
        temp += source[i+src_pitch] * 3;
        temp >>= 4 ;
        dest[i] = (unsigned char)(temp);
    }

}

/*
 * diff --git a/vpx_scale/generic/scalesystemdependant.c b/vpx_scale/generic/scalesystemdependant.c
 * new file mode 100644
 * index 000000000..28f5c7252
 * --- /dev/null
 * +++ b/vpx_scale/generic/scalesystemdependant.c
 * @@ -0,0 +1,79 @@
 */
/*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license and patent
 *  grant that can be found in the LICENSE file in the root of the source
 *  tree. All contributing project authors may be found in the AUTHORS
 *  file in the root of the source tree.
 */
+ */ + + +#include "vpx_scale/vpxscale.h" + +#ifdef HAVE_CONFIG_H +#include "vpx_config.h" +#endif + +void (*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf); +extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf); + +void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +/**************************************************************************** +* Imports +*****************************************************************************/ + +/**************************************************************************** + * + * ROUTINE : vp8_scale_machine_specific_config + * + * INPUTS : UINT32 Version : Codec version number. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Checks for machine specifc features such as MMX support + * sets appropriate flags and function pointers. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +void vp8_scale_machine_specific_config() +{ +#if CONFIG_SPATIAL_RESAMPLING + vp8_horizontal_line_1_2_scale = vp8cx_horizontal_line_1_2_scale_c; + vp8_vertical_band_1_2_scale = vp8cx_vertical_band_1_2_scale_c; + vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c; + vp8_horizontal_line_3_5_scale = vp8cx_horizontal_line_3_5_scale_c; + vp8_vertical_band_3_5_scale = vp8cx_vertical_band_3_5_scale_c; + vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c; + vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; + vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; + vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + vp8_horizontal_line_4_5_scale = vp8cx_horizontal_line_4_5_scale_c; + vp8_vertical_band_4_5_scale = vp8cx_vertical_band_4_5_scale_c; + vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c; + + + vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c; + vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c; + vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c; + vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c; + vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c; + vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c; + vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c; +#endif + + vp8_yv12_extend_frame_borders_ptr = vp8_yv12_extend_frame_borders; + vp8_yv12_copy_frame_yonly_ptr = vp8_yv12_copy_frame_yonly; + vp8_yv12_copy_frame_ptr = vp8_yv12_copy_frame; + +} diff --git a/vpx_scale/generic/vpxscale.c b/vpx_scale/generic/vpxscale.c new file mode 100644 index 000000000..206cd5512 --- 
/dev/null +++ b/vpx_scale/generic/vpxscale.c @@ -0,0 +1,1088 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** + * + * Module Title : vpxscale.c + * + * Description : Image scaling functions. + * + ***************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" +#include "vpx_scale/scale_mode.h" + +/**************************************************************************** +* Exports (scaler entry points; every pointer is initialised to 0 here) +****************************************************************************/ +#ifndef VPX_NO_GLOBALS +void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_last_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_last_vertical_band_3_4_scale)(unsigned char *dest, unsigned 
int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_horizontal_line_1_2_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0; +void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0; +void (*vp8_horizontal_line_3_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0; +void (*vp8_horizontal_line_2_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0; +void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0; +void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; + +void (*vp8_vertical_band_5_4_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_vertical_band_5_3_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_vertical_band_2_1_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_vertical_band_2_1_scale_i)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) = 0; +void (*vp8_horizontal_line_2_1_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0; +void (*vp8_horizontal_line_5_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0; +void (*vp8_horizontal_line_5_4_scale)(const unsigned 
char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width) = 0; +#else +# include "vpxscale_nofp.h" +#endif + +typedef struct +{ + int expanded_frame_width; + int expanded_frame_height; + + int HScale; + int HRatio; + int VScale; + int VRatio; + + YV12_BUFFER_CONFIG *src_yuv_config; + YV12_BUFFER_CONFIG *dst_yuv_config; + +} SCALE_VARS; + +/**************************************************************************** + * + * ROUTINE : horizontal_line_copy + * + * INPUTS : source, source_width : Line to copy and its length in bytes. + * dest : Destination line. dest_width : (UNUSED). + * + * OUTPUTS : None. + * + * RETURNS : None + * + * FUNCTION : 1 to 1 scaling of a horizontal line of pixels (plain copy) + * + * SPECIAL NOTES : None. + * + * ERRORS : None. + * + ****************************************************************************/ +static +void horizontal_line_copy( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + + duck_memcpy(dest, source, source_width); +} +/**************************************************************************** + * + * ROUTINE : null_scale + * + * INPUTS : dest, dest_pitch, dest_width : (all UNUSED). + * + * + * OUTPUTS : None. + * + * RETURNS : None + * + * FUNCTION : 1 to 1 scaling for a vertical band (deliberately a no-op) + * + * SPECIAL NOTES : None. + * + * ERRORS : None. + * + ****************************************************************************/ +static +void null_scale( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + (void) dest; + (void) dest_pitch; + (void) dest_width; + + return; +} + +/**************************************************************************** + * + * ROUTINE : scale1d_2t1_i + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on in source. + * unsigned int source_scale : Scale for source (UNUSED). + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on in destination. + * unsigned int dest_scale : Scale for destination (UNUSED). + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-to-1 interpolated scaling. + * + * SPECIAL NOTES : 3-10-3 filter with rounding; dest[0] is a plain copy of source[0]. + * + ****************************************************************************/ +static +void scale1d_2t1_i +( + const unsigned char *source, + int source_step, + unsigned int source_scale, + unsigned int source_length, + unsigned char *dest, + int dest_step, + unsigned int dest_scale, + unsigned int dest_length +) +{ + unsigned int i, j; + unsigned int temp; + int source_pitch = source_step; /* distance between neighbouring source samples */ + (void) source_length; + (void) source_scale; + (void) dest_scale; + + source_step *= 2; /* one output per two source samples */ + dest[0] = source[0]; + + for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step) + { /* (3*prev + 10*cur + 3*next + 8) >> 4; the final pass reads one source sample beyond the last sampled position */ + temp = 8; + temp += 3 * source[j-source_pitch]; + temp += 10 * source[j]; + temp += 3 * source[j+source_pitch]; + temp >>= 4; + dest[i] = (char)(temp); + } +} + +/**************************************************************************** + * + * ROUTINE : scale1d_2t1_ps + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on in source. + * unsigned int source_scale : Scale for source (UNUSED). + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on in destination. + * unsigned int dest_scale : Scale for destination (UNUSED). + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-to-1 point subsampled scaling. + * + * SPECIAL NOTES : None.
+ * + ****************************************************************************/ +static +void scale1d_2t1_ps +( + const unsigned char *source, + int source_step, + unsigned int source_scale, + unsigned int source_length, + unsigned char *dest, + int dest_step, + unsigned int dest_scale, + unsigned int dest_length +) +{ + unsigned int i, j; + + (void) source_length; + (void) source_scale; + (void) dest_scale; + + source_step *= 2; /* keep every second source sample, no filtering */ + j = 0; + + for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step) + dest[i] = source[j]; +} +/**************************************************************************** + * + * ROUTINE : scale1d_c + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on in source. + * unsigned int source_scale : Scale for source. + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on in destination. + * unsigned int dest_scale : Scale for destination. + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs linear interpolation in one dimension. + * + * SPECIAL NOTES : The left and right weights always sum to dest_scale. + * + ****************************************************************************/ +static +void scale1d_c +( + const unsigned char *source, + int source_step, + unsigned int source_scale, + unsigned int source_length, + unsigned char *dest, + int dest_step, + unsigned int dest_scale, + unsigned int dest_length +) +{ + unsigned int i; + unsigned int round_value = dest_scale / 2; + unsigned int left_modifier = dest_scale; + unsigned int right_modifier = 0; + unsigned char left_pixel = *source; + unsigned char right_pixel = *(source + source_step); /* read unconditionally, whatever dest_length is */ + + (void) source_length; + + // These asserts are needed if there are boundary issues...
+ //assert ( dest_scale > source_scale ); + //assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale ); + + for (i = 0; i < dest_length * dest_step; i += dest_step) + { + dest[i] = (char)((left_modifier * left_pixel + right_modifier * right_pixel + round_value) / dest_scale); + + right_modifier += source_scale; + + while (right_modifier > dest_scale) + { /* the output position moved past right_pixel: slide the source pair forward by one sample */ + right_modifier -= dest_scale; + source += source_step; + left_pixel = *source; + right_pixel = *(source + source_step); + } + + left_modifier = dest_scale - right_modifier; + } +} + +/**************************************************************************** + * + * ROUTINE : Scale2D + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_pitch : Stride of source image. + * unsigned int source_width : Width of input image. + * unsigned int source_height : Height of input image. + * unsigned char *dest : Pointer to output data array. + * int dest_pitch : Stride of destination image. + * unsigned int dest_width : Width of destination image. + * unsigned int dest_height : Height of destination image. + * unsigned char *temp_area : Pointer to temp work area. + * unsigned char temp_area_height : Height of temp work area. + * unsigned int hscale : Horizontal scale factor numerator. + * unsigned int hratio : Horizontal scale factor denominator. + * unsigned int vscale : Vertical scale factor numerator. + * unsigned int vratio : Vertical scale factor denominator. + * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching.
+ * + ****************************************************************************/ +static +void Scale2D +( + //const + unsigned char *source, + int source_pitch, + unsigned int source_width, + unsigned int source_height, + unsigned char *dest, + int dest_pitch, + unsigned int dest_width, + unsigned int dest_height, + unsigned char *temp_area, + unsigned char temp_area_height, + unsigned int hscale, + unsigned int hratio, + unsigned int vscale, + unsigned int vratio, + unsigned int interlaced +) +{ + //unsigned + int i, j, k; + int bands; + int dest_band_height; + int source_band_height; + + typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length, + unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length); + + Scale1D Scale1Dv = scale1d_c; + Scale1D Scale1Dh = scale1d_c; /* generic fallbacks, used only when no fixed-ratio scaler matches */ + + void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL; + void (*vert_band_scale)(unsigned char *, unsigned int, unsigned char *, unsigned int, unsigned int) = NULL; + + int ratio_scalable = 1; + int interpolation = 0; + + unsigned char *source_base; // = (unsigned char *) ((source_pitch >= 0) ? 
source : (source + ((source_height-1) * source_pitch))); + unsigned char *line_src; + + + source_base = (unsigned char *)source; + + if (source_pitch < 0) + { + int offset; + + offset = (source_height - 1); + offset *= source_pitch; + + source_base += offset; + } + + // find out the ratio for each direction + switch (hratio * 10 / hscale) + { + case 8: + // 5-to-4 scale (output is 4/5 of input) in width direction + horiz_line_scale = vp8_horizontal_line_5_4_scale; + break; + case 6: + // 5-to-3 scale (output is 3/5 of input) in width direction + horiz_line_scale = vp8_horizontal_line_5_3_scale; + break; + case 5: + // 2-to-1 scale (output is 1/2 of input) in width direction + horiz_line_scale = vp8_horizontal_line_2_1_scale; + break; + default: + // The ratio is not acceptable now + // throw("The ratio is not acceptable for now!"); + ratio_scalable = 0; + break; + } + + switch (vratio * 10 / vscale) + { + case 8: + // 5-to-4 scale in vertical direction + vert_band_scale = vp8_vertical_band_5_4_scale; + source_band_height = 5; + dest_band_height = 4; + break; + case 6: + // 5-to-3 scale in vertical direction + vert_band_scale = vp8_vertical_band_5_3_scale; + source_band_height = 5; + dest_band_height = 3; + break; + case 5: + // 2-to-1 scale in vertical direction + + if (interlaced) + { + //if the content is interlaced, point sampling is used + vert_band_scale = vp8_vertical_band_2_1_scale; + } + else + { + + interpolation = 1; + //if the content is progressive, interpolate between rows + vert_band_scale = vp8_vertical_band_2_1_scale_i; + + } + + source_band_height = 2; + dest_band_height = 1; + break; + default: + // The ratio is not acceptable now + // throw("The ratio is not acceptable for now!"); + ratio_scalable = 0; + break; + } + + if (ratio_scalable) + { + if (source_height == dest_height) + { + // for each row of the image + for (k = 0; k < (int)dest_height; k++) + { + horiz_line_scale(source, source_width, dest, dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (interpolation) + { + if (source < source_base) + source = 
source_base; + + horiz_line_scale(source, source_width, temp_area, dest_width); + } + + for (k = 0; k < (int)(dest_height + dest_band_height - 1) / dest_band_height; k++) + { + // scale one band horizontally + for (i = 0; i < source_band_height; i++) + { + // Trap case where we could read off the base of the source buffer + + line_src = (unsigned char *)source + i * source_pitch; + + if (line_src < source_base) + line_src = source_base; + + horiz_line_scale(line_src, source_width, + temp_area + (i + 1)*dest_pitch, dest_width); + } + + // Vertical scaling reads the temp_area rows and writes into dest + vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width); + + if (interpolation) + vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width); + + // Next band... + source += (unsigned long) source_band_height * source_pitch; + dest += (unsigned long) dest_band_height * dest_pitch; + } + + return; + } + + if (hscale == 2 && hratio == 1) + Scale1Dh = scale1d_2t1_ps; + + if (vscale == 2 && vratio == 1) + { + if (interlaced) + Scale1Dv = scale1d_2t1_ps; + else + Scale1Dv = scale1d_2t1_i; + } + + if (source_height == dest_height) + { + // for each row of the image + for (k = 0; k < (int)dest_height; k++) + { + Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (dest_height > source_height) + { + dest_band_height = temp_area_height - 1; + source_band_height = dest_band_height * source_height / dest_height; + } + else + { + source_band_height = temp_area_height - 1; + dest_band_height = source_band_height * vratio / vscale; + } + + // first row needs to be done so that we can stay one row ahead for vertical zoom + Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width); + + // for each band of the image + bands = (dest_height + dest_band_height - 1) / dest_band_height; + + for (k = 0; k < bands; k++) + { + // scale one band horizontally + 
for (i = 1; i < source_band_height + 1; i++) + { + if (k * source_band_height + i < (int) source_height) + { + Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1, + temp_area + i * dest_pitch, 1, hratio, dest_width); + } + else // Duplicate the last row + { + // past the end of the source: repeat the previous temp_area row in row i + duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch); + } + } + + // scale one band vertically + for (j = 0; j < (int)dest_width; j++) + { + Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1, + &dest[j], dest_pitch, vratio, dest_band_height); + } + + // carry the last row of this band over as row 0 of the next band + duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch); + + // move to the next band + source += source_band_height * source_pitch; + dest += dest_band_height * dest_pitch; + } +} + +/**************************************************************************** + * + * ROUTINE : vp8_scale_frame + * + * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be scaled. + * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold scaled frame. + * unsigned char *temp_area : Pointer to temp work area. + * unsigned char temp_height : Height of temp work area. + * unsigned int hscale : Horizontal divisor (scaled width = width * hratio / hscale). + * unsigned int hratio : Horizontal multiplier. + * unsigned int vscale : Vertical divisor (scaled height = height * vratio / vscale). + * unsigned int vratio : Vertical multiplier. + * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching. 
+ * + ****************************************************************************/ +void vp8_scale_frame +( + YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, + unsigned char temp_height, + unsigned int hscale, + unsigned int hratio, + unsigned int vscale, + unsigned int vratio, + unsigned int interlaced +) +{ + int i; + /* scaled luma size: ceil(dimension * ratio / scale) */ int dw = (hscale - 1 + src->y_width * hratio) / hscale; + int dh = (vscale - 1 + src->y_height * vratio) / vscale; + + // scale the Y plane into the top-left dw x dh corner of dst + Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height, + (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh, + temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced); + + /* pad the rest of each scaled row with the pixel at column dw - 2 */ if (dw < (int)dst->y_width) + for (i = 0; i < dh; i++) + duck_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i*dst->y_stride+dw-2], dst->y_width - dw + 1); + + /* pad the remaining rows with copies of row dh - 2; copy exactly y_width bytes so the last row cannot run past the plane */ if (dh < (int)dst->y_height) + for (i = dh - 1; i < (int)dst->y_height; i++) + duck_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width); + + Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height, + (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2, + temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced); + + if (dw / 2 < (int)dst->uv_width) + for (i = 0; i < dst->uv_height; i++) + duck_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1); + + /* chroma rows run to uv_height, which can exceed y_height / 2 (vp8_yv12_alloc_frame_buffer rounds it up) */ if (dh / 2 < (int)dst->uv_height) + for (i = dh / 2 - 1; i < (int)dst->uv_height; i++) + duck_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width); + + Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height, + (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2, + temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced); + + if (dw / 2 < 
(int)dst->uv_width) + for (i = 0; i < dst->uv_height; i++) + duck_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1); + + /* same bottom padding as the U plane, bounded by uv_height */ if (dh / 2 < (int) dst->uv_height) + for (i = dh / 2 - 1; i < (int)dst->uv_height; i++) + duck_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width); +} +/**************************************************************************** + * + * ROUTINE : any_ratio_2d_scale + * + * INPUTS : SCALE_VARS *si : Scale factors (HScale, HRatio, VScale, VRatio are read). + * const unsigned char *source : Pointer to source image. + * int source_pitch : Stride of source image (may be negative). + * unsigned int source_width : Width of source image. + * unsigned int source_height : Height of source image (locates the buffer base when the pitch is negative). + * unsigned char *dest : Pointer to destination image. + * unsigned int dest_pitch : Stride of destination image. + * unsigned int dest_width : Width of destination image. + * unsigned int dest_height : Height of destination image. + * + * OUTPUTS : None. + * + * RETURNS : int: 1 if image scaled, 0 if image could not be scaled. + * + * FUNCTION : Scale the image with changing aspect ratio. + * + * SPECIAL NOTES : This scaling is a bi-linear scaling. Need to re-work the + * whole function for new scaling algorithm. 
+ * + ****************************************************************************/ +static +int any_ratio_2d_scale +( + SCALE_VARS *si, + const unsigned char *source, + int source_pitch, + unsigned int source_width, + unsigned int source_height, + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width, + unsigned int dest_height +) +{ + unsigned int i, k; + unsigned int src_band_height = 0; + unsigned int dest_band_height = 0; + + // suggested scale factors + int hs = si->HScale; + int hr = si->HRatio; + int vs = si->VScale; + int vr = si->VRatio; + + // assume the ratios are scalable instead of should be centered + int ratio_scalable = 1; + + /* lowest valid source address; the (int) cast keeps a negative pitch from turning into a huge unsigned offset */ const unsigned char *source_base = ((source_pitch >= 0) ? source : (source + ((int)(source_height - 1) * source_pitch))); + const unsigned char *line_src; + + void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL; + void (*vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL; + void (*last_vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL; + + (void) si; + + // find out the ratio for each direction + switch (hr * 30 / hs) + { + case 24: + // 4-5 Scale in Width direction + horiz_line_scale = vp8_horizontal_line_4_5_scale; + break; + case 22: + // 3-4 Scale in Width direction + horiz_line_scale = vp8_horizontal_line_3_4_scale; + break; + + case 20: + // 2-3 Scale in Width direction + horiz_line_scale = vp8_horizontal_line_2_3_scale; + break; + case 18: + // 3-5 Scale in Width direction + horiz_line_scale = vp8_horizontal_line_3_5_scale; + break; + case 15: + // 1-2 Scale in Width direction + horiz_line_scale = vp8_horizontal_line_1_2_scale; + break; + case 30: + // no scale in Width direction + horiz_line_scale = horizontal_line_copy; + break; + default: + // The ratio is not acceptable now + // throw("The ratio is not acceptable for now!"); + ratio_scalable = 0; + break; + } + + switch (vr * 30 / vs) + { + case 24: + // 4-5 Scale in 
vertical direction + vert_band_scale = vp8_vertical_band_4_5_scale; + last_vert_band_scale = vp8_last_vertical_band_4_5_scale; + src_band_height = 4; + dest_band_height = 5; + break; + case 22: + // 3-4 Scale in vertical direction + vert_band_scale = vp8_vertical_band_3_4_scale; + last_vert_band_scale = vp8_last_vertical_band_3_4_scale; + src_band_height = 3; + dest_band_height = 4; + break; + case 20: + // 2-3 Scale in vertical direction + vert_band_scale = vp8_vertical_band_2_3_scale; + last_vert_band_scale = vp8_last_vertical_band_2_3_scale; + src_band_height = 2; + dest_band_height = 3; + break; + case 18: + // 3-5 Scale in vertical direction + vert_band_scale = vp8_vertical_band_3_5_scale; + last_vert_band_scale = vp8_last_vertical_band_3_5_scale; + src_band_height = 3; + dest_band_height = 5; + break; + case 15: + // 1-2 Scale in vertical direction + vert_band_scale = vp8_vertical_band_1_2_scale; + last_vert_band_scale = vp8_last_vertical_band_1_2_scale; + src_band_height = 1; + dest_band_height = 2; + break; + case 30: + // no scale in vertical direction + vert_band_scale = null_scale; + last_vert_band_scale = null_scale; + src_band_height = 4; + dest_band_height = 4; + break; + default: + // The ratio is not acceptable now + // throw("The ratio is not acceptable for now!"); + ratio_scalable = 0; + break; + } + + if (ratio_scalable == 0) + return ratio_scalable; + + horiz_line_scale(source, source_width, dest, dest_width); + + // except last band + for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height - 1; k++) + { + // scale one band horizontally + for (i = 1; i < src_band_height; i++) + { + // Trap case where we could read off the base of the source buffer + line_src = source + (int)i * source_pitch; + + if (line_src < source_base) + line_src = source_base; + + horiz_line_scale(line_src, source_width, + dest + i * dest_pitch, dest_width); + } + + // first line of next band + // Trap case where we could read off the base of the source buffer + 
line_src = source + (int)src_band_height * source_pitch; + + if (line_src < source_base) + line_src = source_base; + + horiz_line_scale(line_src, source_width, + dest + dest_band_height * dest_pitch, + dest_width); + + // Vertical scaling is in place + vert_band_scale(dest, dest_pitch, dest_width); + + // Next band... + source += (int)src_band_height * source_pitch; + dest += dest_band_height * dest_pitch; + } + + // scale one band horizontally + for (i = 1; i < src_band_height; i++) + { + // Trap case where we could read off the base of the source buffer + line_src = source + (int)i * source_pitch; + + if (line_src < source_base) + line_src = source_base; + + horiz_line_scale(line_src, source_width, + dest + i * dest_pitch, + dest_width); + } + + // Vertical scaling is in place + last_vert_band_scale(dest, dest_pitch, dest_width); + + return ratio_scalable; +} + +/**************************************************************************** + * + * ROUTINE : any_ratio_frame_scale + * + * INPUTS : SCALE_VARS *scale_vars : Scale factors, expanded frame size and + * the source / destination frame buffers. + * int YOffset : Offset from start of buffer to Y samples. + * int UVOffset : Offset from start of buffer to UV samples. + * + * OUTPUTS : None. + * + * RETURNS : int: 1 if image scaled, 0 if image could not be scaled. + * + * FUNCTION : Scale the image with changing aspect ratio. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +static +int any_ratio_frame_scale(SCALE_VARS *scale_vars, int YOffset, int UVOffset) +{ + int i; + int ew; + int eh; + + // suggested scale factors + int hs = scale_vars->HScale; + int hr = scale_vars->HRatio; + int vs = scale_vars->VScale; + int vr = scale_vars->VRatio; + + int ratio_scalable = 1; + + /* source size needed to produce the expanded frame, rounded up */ int sw = (scale_vars->expanded_frame_width * hr + hs - 1) / hs; + int sh = (scale_vars->expanded_frame_height * vr + vs - 1) / vs; + int dw = scale_vars->expanded_frame_width; + int dh = scale_vars->expanded_frame_height; + YV12_BUFFER_CONFIG *src_yuv_config = scale_vars->src_yuv_config; + YV12_BUFFER_CONFIG *dst_yuv_config = scale_vars->dst_yuv_config; + + /* ew x eh: source size rounded up to a multiple of 3 (or 8) and then scaled by hs/hr, vs/vr */ if (hr == 3) + ew = (sw + 2) / 3 * 3 * hs / hr; + else + ew = (sw + 7) / 8 * 8 * hs / hr; + + if (vr == 3) + eh = (sh + 2) / 3 * 3 * vs / vr; + else + eh = (sh + 7) / 8 * 8 * vs / vr; + + ratio_scalable = any_ratio_2d_scale(scale_vars, + (const unsigned char *)src_yuv_config->y_buffer, + src_yuv_config->y_stride, sw, sh, + (unsigned char *) dst_yuv_config->y_buffer + YOffset, + dst_yuv_config->y_stride, dw, dh); + + /* zero the luma columns from dw up to ew ... */ for (i = 0; i < eh; i++) + duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride + dw, 0, ew - dw); + + /* ... and the luma rows from dh up to eh */ for (i = dh; i < eh; i++) + duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride, 0, ew); + + if (ratio_scalable == 0) + return ratio_scalable; + + /* chroma planes are half size, rounded up */ sw = (sw + 1) >> 1; + sh = (sh + 1) >> 1; + dw = (dw + 1) >> 1; + dh = (dh + 1) >> 1; + + /* use the real chroma stride: y_stride / 2 differs from uv_stride when the frame width is odd */ any_ratio_2d_scale(scale_vars, + (const unsigned char *)src_yuv_config->u_buffer, + src_yuv_config->uv_stride, sw, sh, + (unsigned char *)dst_yuv_config->u_buffer + UVOffset, + dst_yuv_config->uv_stride, dw, dh); + + any_ratio_2d_scale(scale_vars, + (const unsigned char *)src_yuv_config->v_buffer, + src_yuv_config->uv_stride, sw, sh, + (unsigned char *)dst_yuv_config->v_buffer + UVOffset, + dst_yuv_config->uv_stride, dw, dh); + + 
return ratio_scalable; +} + +/**************************************************************************** + * + * ROUTINE : center_image + * + * INPUTS : src_yuv_config : Frame to copy; dst_yuv_config : Frame to copy into. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Centers the image without scaling in the output buffer. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static void +center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_config) +{ + int i; + int row_offset, col_offset; + unsigned char *src_data_pointer; + unsigned char *dst_data_pointer; + + // center values + row_offset = (dst_yuv_config->y_height - src_yuv_config->y_height) / 2; + col_offset = (dst_yuv_config->y_width - src_yuv_config->y_width) / 2; + + // Y's + src_data_pointer = src_yuv_config->y_buffer; + dst_data_pointer = (unsigned char *)dst_yuv_config->y_buffer + (row_offset * dst_yuv_config->y_stride) + col_offset; + + for (i = 0; i < src_yuv_config->y_height; i++) + { + duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->y_width); + dst_data_pointer += dst_yuv_config->y_stride; + src_data_pointer += src_yuv_config->y_stride; + } + + /* chroma offsets are half the luma offsets */ row_offset /= 2; + col_offset /= 2; + + // U's + src_data_pointer = src_yuv_config->u_buffer; + dst_data_pointer = (unsigned char *)dst_yuv_config->u_buffer + (row_offset * dst_yuv_config->uv_stride) + col_offset; + + for (i = 0; i < src_yuv_config->uv_height; i++) + { + duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->uv_width); + dst_data_pointer += dst_yuv_config->uv_stride; + src_data_pointer += src_yuv_config->uv_stride; + } + + // V's + src_data_pointer = src_yuv_config->v_buffer; + dst_data_pointer = (unsigned char *)dst_yuv_config->v_buffer + (row_offset * dst_yuv_config->uv_stride) + col_offset; + + for (i = 0; i < src_yuv_config->uv_height; i++) + { + duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->uv_width); + 
dst_data_pointer += dst_yuv_config->uv_stride; + src_data_pointer += src_yuv_config->uv_stride; + } +} + +/**************************************************************************** + * + * ROUTINE : vp8_yv12_scale_or_center + * + * INPUTS : src_yuv_config, dst_yuv_config : Source and destination frames. + * expanded_frame_width / _height : Size of the scaled image. + * scaling_mode : SCALE_TO_FIT, MAINTAIN_ASPECT_RATIO or CENTER. + * HScale, HRatio, VScale, VRatio : Scale factors. + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Decides to scale or center image in scale buffer for blit + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void +vp8_yv12_scale_or_center +( + YV12_BUFFER_CONFIG *src_yuv_config, + YV12_BUFFER_CONFIG *dst_yuv_config, + int expanded_frame_width, + int expanded_frame_height, + int scaling_mode, + int HScale, + int HRatio, + int VScale, + int VRatio +) +{ +// if ( ppi->post_processing_level ) + // update_umvborder ( ppi, frame_buffer ); + + + switch (scaling_mode) + { + case SCALE_TO_FIT: + case MAINTAIN_ASPECT_RATIO: + { + SCALE_VARS scale_vars; + // center values +#if 1 + int row = (dst_yuv_config->y_height - expanded_frame_height) / 2; + int col = (dst_yuv_config->y_width - expanded_frame_width) / 2; +// int YOffset = row * dst_yuv_config->y_width + col; +// int UVOffset = (row>>1) * dst_yuv_config->uv_width + (col>>1); + int YOffset = row * dst_yuv_config->y_stride + col; + int UVOffset = (row >> 1) * dst_yuv_config->uv_stride + (col >> 1); +#else + int row = (src_yuv_config->y_height - expanded_frame_height) / 2; + int col = (src_yuv_config->y_width - expanded_frame_width) / 2; + int YOffset = row * src_yuv_config->y_width + col; + int UVOffset = (row >> 1) * src_yuv_config->uv_width + (col >> 1); +#endif + + scale_vars.dst_yuv_config = dst_yuv_config; + scale_vars.src_yuv_config = src_yuv_config; + scale_vars.HScale = HScale; + scale_vars.HRatio = HRatio; + scale_vars.VScale = VScale; + scale_vars.VRatio = VRatio; + scale_vars.expanded_frame_width = expanded_frame_width; + scale_vars.expanded_frame_height = expanded_frame_height; + + // 
perform center and scale + any_ratio_frame_scale(&scale_vars, YOffset, UVOffset); + + break; + } + case CENTER: + center_image(src_yuv_config, dst_yuv_config); + break; + + default: + break; + } +} diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c new file mode 100644 index 000000000..04617be51 --- /dev/null +++ b/vpx_scale/generic/yv12config.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" + +/**************************************************************************** +* Exports +****************************************************************************/ + +/**************************************************************************** + * Frees ybf->buffer_alloc and clears the pointer; returns 0, or -1 if ybf is NULL. + ****************************************************************************/ +int +vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) +{ + if (ybf) + { + if (ybf->buffer_alloc) + { + duck_free(ybf->buffer_alloc); + } + + ybf->buffer_alloc = 0; + } + else + { + return -1; + } + + return 0; +} + +/**************************************************************************** + * (Re)allocates one buffer for the Y, U and V planes; returns 0, -1 if allocation fails, -2 if ybf is NULL. + ****************************************************************************/ +int +vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border) +{ +//NOTE: + + int yplane_size = (height + 2 * border) * (width + 2 * border); + int uvplane_size = ((1 + height) / 2 + border) * ((1 + width) / 2 + border); + + if (ybf) + { + vp8_yv12_de_alloc_frame_buffer(ybf); + + ybf->y_width = width; + ybf->y_height = height; + ybf->y_stride = width + 2 * border; + + ybf->uv_width = (1 + width) / 2; + ybf->uv_height = (1 + height) / 2; + 
ybf->uv_stride = ybf->uv_width + border; + + ybf->border = border; + ybf->frame_size = yplane_size + 2 * uvplane_size; + + // Added 2 extra lines to framebuffer so that copy12x12 doesn't fail + // when we have a large motion vector in V on the last v block. + // Note : We never use these pixels anyway so this doesn't hurt. + ybf->buffer_alloc = (unsigned char *) duck_memalign(32, ybf->frame_size + (ybf->y_stride * 2) + 32, 0); + + if (ybf->buffer_alloc == NULL) + return -1; + + ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border; + + /* round the Y plane size up to a multiple of 16 before placing the chroma planes */ if (yplane_size & 0xf) + yplane_size += 16 - (yplane_size & 0xf); + + ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2; + ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * ybf->uv_stride) + border / 2; + } + else + { + return -2; + } + + return 0; +} + +/**************************************************************************** + * Fills the Y plane with 0x00 and the U/V planes with 0x80; returns 0, or -1 if ybf is NULL. + ****************************************************************************/ +int +vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf) +{ + if (ybf) + { + if (ybf->buffer_alloc) + { + duck_memset(ybf->y_buffer, 0x0, ybf->y_stride * ybf->y_height); + duck_memset(ybf->u_buffer, 0x80, ybf->uv_stride * ybf->uv_height); + duck_memset(ybf->v_buffer, 0x80, ybf->uv_stride * ybf->uv_height); + } + + return 0; + } + + return -1; +} diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c new file mode 100644 index 000000000..4906625c8 --- /dev/null +++ b/vpx_scale/generic/yv12extend.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/vpxscale.h" + +/**************************************************************************** +* Exports +****************************************************************************/ + +/**************************************************************************** + * + ****************************************************************************/ +void +vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf) +{ + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + + unsigned int Border; + int plane_stride; + int plane_height; + int plane_width; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + plane_height = ybf->y_height; + plane_width = ybf->y_width; + + // copy the left and right most columns out + src_ptr1 = ybf->y_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + vpx_memset(dest_ptr1, src_ptr1[0], Border); + vpx_memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->y_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)Border; i++) + { + vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); + vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + + /***********/ + /* U Plane */ + /***********/ + plane_stride = ybf->uv_stride; + plane_height = ybf->uv_height; + plane_width = ybf->uv_width; + Border /= 2; + + // copy the left and right most columns out + src_ptr1 = 
ybf->u_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + vpx_memset(dest_ptr1, src_ptr1[0], Border); + vpx_memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->u_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); + vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + /***********/ + /* V Plane */ + /***********/ + + // copy the left and right most columns out + src_ptr1 = ybf->v_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + vpx_memset(dest_ptr1, src_ptr1[0], Border); + vpx_memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->v_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); + vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } +} + + +void +vp8_yv12_extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf) +{ + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + + 
unsigned int Border; + int plane_stride; + int plane_height; + int plane_width; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + plane_height = ybf->y_height; + plane_width = ybf->y_width; + + // copy the left and right most columns out + src_ptr1 = ybf->y_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + vpx_memset(dest_ptr1, src_ptr1[0], Border); + vpx_memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->y_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)Border; i++) + { + vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); + vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + plane_stride /= 2; + plane_height /= 2; + plane_width /= 2; + Border /= 2; + +} + + + +/**************************************************************************** + * + * ROUTINE : vp8_yv12_copy_frame + * + * INPUTS : YV12_BUFFER_CONFIG *src_ybc, *dst_ybc : Source and destination frames. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies the source image into the destination image and + * updates the destination's UMV borders. + * + * SPECIAL NOTES : The frames are assumed to be identical in size.
+ * + ****************************************************************************/ +void +vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) +{ + int row; + unsigned char *source, *dest; + + source = src_ybc->y_buffer; + dest = dst_ybc->y_buffer; + + for (row = 0; row < src_ybc->y_height; row++) + { + vpx_memcpy(dest, source, src_ybc->y_width); + source += src_ybc->y_stride; + dest += dst_ybc->y_stride; + } + + source = src_ybc->u_buffer; + dest = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; row++) + { + vpx_memcpy(dest, source, src_ybc->uv_width); + source += src_ybc->uv_stride; + dest += dst_ybc->uv_stride; + } + + source = src_ybc->v_buffer; + dest = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; row++) + { + vpx_memcpy(dest, source, src_ybc->uv_width); + source += src_ybc->uv_stride; + dest += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_ptr(dst_ybc); +} + +void +vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) +{ + int row; + unsigned char *source, *dest; + + + source = src_ybc->y_buffer; + dest = dst_ybc->y_buffer; + + for (row = 0; row < src_ybc->y_height; row++) + { + vpx_memcpy(dest, source, src_ybc->y_width); + source += src_ybc->y_stride; + dest += dst_ybc->y_stride; + } + + vp8_yv12_extend_frame_borders_yonly(dst_ybc); +} diff --git a/vpx_scale/include/arm/vpxscale_nofp.h b/vpx_scale/include/arm/vpxscale_nofp.h new file mode 100644 index 000000000..d6181d207 --- /dev/null +++ b/vpx_scale/include/arm/vpxscale_nofp.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + +void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int 
dest_width); +void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); + +void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_3_4_scale_armv4(unsigned char 
*dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + +#define vp8_vertical_band_4_5_scale vertical_band_4_5_scale_armv4 +#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c +#define vp8_vertical_band_2_3_scale vertical_band_2_3_scale_armv4 +#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c +#define vp8_vertical_band_3_5_scale vertical_band_3_5_scale_armv4 +#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c +#define vp8_vertical_band_3_4_scale vertical_band_3_4_scale_armv4 +#define vp8_last_vertical_band_3_4_scale vp8cx_last_vertical_band_3_4_scale_c +#define vp8_horizontal_line_1_2_scale horizontal_line_1_2_scale_armv4 +#define vp8_horizontal_line_3_5_scale horizontal_line_3_5_scale_armv4 +#define vp8_horizontal_line_3_4_scale horizontal_line_3_4_scale_armv4 +#define vp8_horizontal_line_4_5_scale horizontal_line_4_5_scale_armv4 +#define vp8_horizontal_line_2_3_scale horizontal_line_2_3_scale_armv4 +#define vp8_vertical_band_1_2_scale vertical_band_1_2_scale_armv4 +#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c +#define vp8_vertical_band_5_4_scale vp8cx_vertical_band_5_4_scale_c +#define vp8_vertical_band_5_3_scale vp8cx_vertical_band_5_3_scale_c +#define vp8_vertical_band_2_1_scale vp8cx_vertical_band_2_1_scale_c +#define vp8_vertical_band_2_1_scale_i vp8cx_vertical_band_2_1_scale_i_c +#define vp8_horizontal_line_2_1_scale vp8cx_horizontal_line_2_1_scale_c +#define vp8_horizontal_line_5_3_scale vp8cx_horizontal_line_5_3_scale_c +#define vp8_horizontal_line_5_4_scale vp8cx_horizontal_line_5_4_scale_c diff --git a/vpx_scale/include/generic/vpxscale_arbitrary.h b/vpx_scale/include/generic/vpxscale_arbitrary.h new file mode 100644 index 000000000..2b50f24cf --- /dev/null +++ b/vpx_scale/include/generic/vpxscale_arbitrary.h @@ -0,0 +1,55 @@ +/* + 
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef __VPX_SCALE_ARBITRARY_H__ +#define __VPX_SCALE_ARBITRARY_H__ + +#include "vpx_scale/yv12config.h" + +typedef struct +{ + int in_width; + int in_height; + + int out_width; + int out_height; + int max_usable_out_width; + + // numerator for the width and height + int nw; + int nh; + int nh_uv; + + // output to input correspondence array + short *l_w; + short *l_h; + short *l_h_uv; + + // polyphase coefficients + short *c_w; + short *c_h; + short *c_h_uv; + + // buffer for horizontal filtering. + unsigned char *hbuf; + unsigned char *hbuf_uv; +} BICUBIC_SCALER_STRUCT; + +int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height); +int bicubic_scale(int in_width, int in_height, int in_stride, + int out_width, int out_height, int out_stride, + unsigned char *input_image, unsigned char *output_image); +void bicubic_scale_frame_reset(); +void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + int new_width, int new_height); +void bicubic_coefficient_init(); +void bicubic_coefficient_destroy(); + +#endif /* __VPX_SCALE_ARBITRARY_H__ */ diff --git a/vpx_scale/include/generic/vpxscale_depricated.h b/vpx_scale/include/generic/vpxscale_depricated.h new file mode 100644 index 000000000..015eed0fc --- /dev/null +++ b/vpx_scale/include/generic/vpxscale_depricated.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree.
All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : vpxscale_depricated.h +* +* Description : Scaler function-pointer interface +* +****************************************************************************/ +#ifndef VPXSCALE_H +#define VPXSCALE_H + +extern void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_horizontal_line_1_2_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + +extern void dmachine_specific_config(int mmx_enabled, int xmm_enabled, int wmt_enabled); + +#endif diff --git a/vpx_scale/include/generic/vpxscale_nofp.h b/vpx_scale/include/generic/vpxscale_nofp.h new file mode 100644 index 000000000..c4d5f4c6f --- /dev/null +++ b/vpx_scale/include/generic/vpxscale_nofp.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + +void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned 
char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); + +#define vp8_vertical_band_4_5_scale vp8cx_vertical_band_4_5_scale_c +#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c +#define vp8_vertical_band_2_3_scale vp8cx_vertical_band_2_3_scale_c +#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c +#define vp8_vertical_band_3_5_scale vp8cx_vertical_band_3_5_scale_c +#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c +#define vp8_horizontal_line_1_2_scale vp8cx_horizontal_line_1_2_scale_c +#define vp8_horizontal_line_3_5_scale vp8cx_horizontal_line_3_5_scale_c +#define vp8_horizontal_line_4_5_scale vp8cx_horizontal_line_4_5_scale_c +#define vp8_horizontal_line_2_3_scale vp8cx_horizontal_line_2_3_scale_c +#define vp8_vertical_band_1_2_scale vp8cx_vertical_band_1_2_scale_c +#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c +#define vp8_vertical_band_5_4_scale vp8cx_vertical_band_5_4_scale_c +#define vp8_vertical_band_5_3_scale vp8cx_vertical_band_5_3_scale_c +#define vp8_vertical_band_2_1_scale vp8cx_vertical_band_2_1_scale_c +#define vp8_vertical_band_2_1_scale_i vp8cx_vertical_band_2_1_scale_i_c +#define 
vp8_horizontal_line_2_1_scale vp8cx_horizontal_line_2_1_scale_c +#define vp8_horizontal_line_5_3_scale vp8cx_horizontal_line_5_3_scale_c +#define vp8_horizontal_line_5_4_scale vp8cx_horizontal_line_5_4_scale_c diff --git a/vpx_scale/include/leapster/vpxscale.h b/vpx_scale/include/leapster/vpxscale.h new file mode 100644 index 000000000..f70029cae --- /dev/null +++ b/vpx_scale/include/leapster/vpxscale.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : vpxscale.h +* +* Description : Scaler function-pointer types and table +* +****************************************************************************/ +#ifndef VPXSCALE_H +#define VPXSCALE_H + + +// fwg 2004-10-14 +typedef void (*vpxvertical_band_4_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +typedef void (*vpxlast_vertical_band_4_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +typedef void (*vpxvertical_band_3_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +typedef void (*vpxlast_vertical_band_3_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +typedef void (*vpxhorizontal_line_1_2_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +typedef void (*vpxhorizontal_line_3_5_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +typedef void (*vpxhorizontal_line_4_5_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int
dest_width); +typedef void (*vpxvertical_band_1_2_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +typedef void (*vpxlast_vertical_band_1_2_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + + +typedef struct vpxglobal_scalling_ptrs_t +{ + vpxvertical_band_4_5_scale_lf vpxvertical_band_4_5_scale_t; + vpxlast_vertical_band_4_5_scale_lf vpxlast_vertical_band_4_5_scale_t; + vpxvertical_band_3_5_scale_lf vpxvertical_band_3_5_scale_t; + vpxlast_vertical_band_3_5_scale_lf vpxlast_vertical_band_3_5_scale_t; + vpxhorizontal_line_1_2_scale_lf vpxhorizontal_line_1_2_scale_t; + vpxhorizontal_line_3_5_scale_lf vpxhorizontal_line_3_5_scale_t; + vpxhorizontal_line_4_5_scale_lf vpxhorizontal_line_4_5_scale_t; + vpxvertical_band_1_2_scale_lf vpxvertical_band_1_2_scale_t; + vpxlast_vertical_band_1_2_scale_lf vpxlast_vertical_band_1_2_scale_t; +} vpxglobal_scalling_ptrs; + +extern struct vpxglobal_scalling_ptrs_t *g_scaling_ptrs; + +/* +extern void (*vp8_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +extern void (*vp8_last_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +extern void (*vp8_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +extern void (*vp8_last_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +extern void (*vp8_horizontal_line_1_2_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width); +extern void (*vp8_horizontal_line_3_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width); +extern void (*vp8_horizontal_line_4_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width); +extern void (*vp8_vertical_band_1_2_scale)(unsigned char * dest,unsigned int 
dest_pitch,unsigned int dest_width); +extern void (*vp8_last_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +*/ + +#endif diff --git a/vpx_scale/include/symbian/vpxscale_nofp.h b/vpx_scale/include/symbian/vpxscale_nofp.h new file mode 100644 index 000000000..d6181d207 --- /dev/null +++ b/vpx_scale/include/symbian/vpxscale_nofp.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, 
unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + +void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); + +void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_3_5_scale_armv4(const unsigned char *source, 
unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + +#define vp8_vertical_band_4_5_scale vertical_band_4_5_scale_armv4 +#define vp8_last_vertical_band_4_5_scale vp8cx_last_vertical_band_4_5_scale_c +#define vp8_vertical_band_2_3_scale vertical_band_2_3_scale_armv4 +#define vp8_last_vertical_band_2_3_scale vp8cx_last_vertical_band_2_3_scale_c +#define vp8_vertical_band_3_5_scale vertical_band_3_5_scale_armv4 +#define vp8_last_vertical_band_3_5_scale vp8cx_last_vertical_band_3_5_scale_c +#define vp8_vertical_band_3_4_scale vertical_band_3_4_scale_armv4 +#define vp8_last_vertical_band_3_4_scale vp8cx_last_vertical_band_3_4_scale_c +#define vp8_horizontal_line_1_2_scale horizontal_line_1_2_scale_armv4 +#define vp8_horizontal_line_3_5_scale horizontal_line_3_5_scale_armv4 +#define vp8_horizontal_line_3_4_scale horizontal_line_3_4_scale_armv4 +#define vp8_horizontal_line_4_5_scale horizontal_line_4_5_scale_armv4 +#define vp8_horizontal_line_2_3_scale horizontal_line_2_3_scale_armv4 +#define vp8_vertical_band_1_2_scale vertical_band_1_2_scale_armv4 +#define vp8_last_vertical_band_1_2_scale vp8cx_last_vertical_band_1_2_scale_c +#define 
vp8_vertical_band_5_4_scale vp8cx_vertical_band_5_4_scale_c +#define vp8_vertical_band_5_3_scale vp8cx_vertical_band_5_3_scale_c +#define vp8_vertical_band_2_1_scale vp8cx_vertical_band_2_1_scale_c +#define vp8_vertical_band_2_1_scale_i vp8cx_vertical_band_2_1_scale_i_c +#define vp8_horizontal_line_2_1_scale vp8cx_horizontal_line_2_1_scale_c +#define vp8_horizontal_line_5_3_scale vp8cx_horizontal_line_5_3_scale_c +#define vp8_horizontal_line_5_4_scale vp8cx_horizontal_line_5_4_scale_c diff --git a/vpx_scale/include/vpxscale_nofp.h b/vpx_scale/include/vpxscale_nofp.h new file mode 100644 index 000000000..f6482f944 --- /dev/null +++ b/vpx_scale/include/vpxscale_nofp.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#if defined(__S60_V20__) || (defined(__SYMBIAN32__) && !defined(__WINS__)) +#include "symbian/vpxscale_nofp.h" /* '/' separators resolve on every host; '\' does not */ +#else +#include "generic/vpxscale_nofp.h" +#endif /* Symbian vs. generic header selection */ diff --git a/vpx_scale/intel_linux/scaleopt.c b/vpx_scale/intel_linux/scaleopt.c new file mode 100644 index 000000000..6555600e9 --- /dev/null +++ b/vpx_scale/intel_linux/scaleopt.c @@ -0,0 +1,1852 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree.
+ */ + + +/**************************************************************************** +* +* Module Title : scaleopt.c +* +* Description : Optimized scaling functions +* +****************************************************************************/ +#include "pragmas.h" + +/**************************************************************************** +* Module Statics +****************************************************************************/ +#if 0 +__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 }; +__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 }; +__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 }; +__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 }; +__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; +__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; +__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; +__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; +__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; +__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; +__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; +#endif + +#include "vpx_scale/vpxscale.h" +#include "vpx_mem/vpx_mem.h" + +/**************************************************************************** + * + * ROUTINE : horizontal_line_3_5_scale_mmx + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Width of the source line in pixels. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Not used. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : The exit path loads its last 3-pixel group with a 4-byte read; the extra byte is fetched but discarded.
+ * + ****************************************************************************/ +static +void horizontal_line_3_5_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + __declspec(align(16)) unsigned short const35_2[] = { 154, 51, 205, 102 }; + __declspec(align(16)) unsigned short const35_1[] = { 102, 205, 51, 154 }; + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + + (void) dest_width; + + __asm + { + + push ebx + + mov esi, source + mov edi, dest + + mov ecx, source_width + lea edx, [esi+ecx-3]; + + movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx + movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx + + movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx + pxor mm7, mm7 // clear mm7 + + horiz_line_3_5_loop: + + mov eax, DWORD PTR [esi] // eax = 00 01 02 03 + mov ebx, eax + + and ebx, 0xffff00 // ebx = xx 01 02 xx + mov ecx, eax // ecx = 00 01 02 03 + + and eax, 0xffff0000 // eax = xx xx 02 03 + xor ecx, eax // ecx = 00 01 xx xx + + shr ebx, 8 // ebx = 01 02 xx xx + or eax, ebx // eax = 01 02 02 03 + + shl ebx, 16 // ebx = xx xx 01 02 + movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx + + or ebx, ecx // ebx = 00 01 01 02 + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx + + movd mm0, ebx // mm0 = 00 01 01 02 + pmullw mm1, mm6 // + + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx + pmullw mm0, mm5 // + + mov [edi], ebx // write output 00 xx xx xx + add esi, 3 + + add edi, 5 + paddw mm0, mm1 + + paddw mm0, mm4 + psrlw mm0, 8 + + cmp esi, edx + packuswb mm0, mm7 + + movd DWORD Ptr [edi-4], mm0 + jl horiz_line_3_5_loop + +//Exit: + mov eax, DWORD PTR [esi] // eax = 00 01 02 03 + mov ebx, eax + + and ebx, 0xffff00 // ebx = xx 01 02 xx + mov ecx, eax // ecx = 00 01 02 03 + + and eax, 0xffff0000 // eax = xx xx 02 03 + xor ecx, eax // ecx = 00 01 xx xx + + shr ebx, 8 // ebx = 01 02 xx xx + or eax, ebx // eax = 01 02 02 03 + + shl eax, 8 // eax =
xx 01 02 02 + and eax, 0xffff0000 // eax = xx xx 02 02 + + or eax, ebx // eax = 01 02 02 02 + + shl ebx, 16 // ebx = xx xx 01 02 + movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx + + or ebx, ecx // ebx = 00 01 01 02 + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx + + movd mm0, ebx // mm0 = 00 01 01 02 + pmullw mm1, mm6 // + + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx + pmullw mm0, mm5 // + + mov [edi], ebx // write output 00 xx xx xx + paddw mm0, mm1 + + paddw mm0, mm4 + psrlw mm0, 8 + + packuswb mm0, mm7 + movd DWORD Ptr [edi+1], mm0 + + pop ebx + + } + + /* + const unsigned char *src = source; + unsigned char *des = dest; + unsigned int a, b, c ; + unsigned int i; + (void) dest_width; + + for ( i=0; i<source_width-3; i+=3 ) + { + a = src[0]; + b = src[1]; + des [0] = (UINT8) (a); + // 2 * left + 3 * right /5 + des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8); + c = src[2] ; + // 4 * left + 1 * right /5 + des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8); + // 1 * left + 4 * right /5 + des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8); + + a = src[3]; + // 3 * left + 2 * right /5 + des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8); + + src += 3; + des += 5; + } + + a = src[0]; + b = src[1]; + des [0] = (UINT8) (a); + // 2 * left + 3 * right /5 + des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8); + c = src[2] ; + // 4 * left + 1 * right /5 + des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8); + // 1 * left + 4 * right /5 + des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8); + + des [4] = (UINT8) (c); + */ +} + + +/**************************************************************************** + * + * ROUTINE : horizontal_line_4_5_scale_mmx + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Width of the source line in pixels. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Not used. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : The MMX loop consumes 8 source pixels and emits 10 output pixels per pass.
+ * + ****************************************************************************/ +static +void horizontal_line_4_5_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102, 51 }; + __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 }; + __declspec(align(16)) unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; + + (void)dest_width; + + __asm + { + + mov esi, source + mov edi, dest + + mov ecx, source_width + lea edx, [esi+ecx-8]; + + movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx + movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx + + movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx + pxor mm7, mm7 // clear mm7 + + horiz_line_4_5_loop: + + movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07 + movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08 + + movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 + movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08 + + movd DWORD PTR [edi], mm0 // write output 00 xx xx xx + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx + + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx + pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 + + pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 + punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx + + movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx + pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 + + punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx + pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51 + + paddw mm0, mm1 // sum the two weighted taps + paddw mm0, mm4 // add round values + + psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx + packuswb mm0, mm7 + + movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 + add edi, 10 + + add esi, 8 + paddw mm2, mm3 // sum the two weighted taps + + paddw mm2, mm4 // add round values + cmp esi, edx + + psrlw mm2, 8 + packuswb mm2, mm7 +
+ movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09 + jl horiz_line_4_5_loop + +//Exit: + movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07 + movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07 + + movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 + psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00 + + movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00 + pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00 + + psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07 + por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07 + + movq mm3, mm1 + + movd DWORD PTR [edi], mm0 // write output 00 xx xx xx + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx + + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx + pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 + + pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 + punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx + + movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx + pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 + + punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx + pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51 + + paddw mm0, mm1 // sum the two weighted taps + paddw mm0, mm4 // add round values + + psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx + packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx + + movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04 + paddw mm2, mm3 // sum the two weighted taps + + paddw mm2, mm4 // add round values + psrlw mm2, 8 + + packuswb mm2, mm7 + movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09 + + + } + /* + const unsigned char *src = source; + unsigned char *des = dest; + unsigned int a, b, c ; + unsigned i; + (void) dest_width; + + for ( i=0; i<source_width-4; i+=4 ) + { + a = src[0]; + b = src[1]; + des [0] = (UINT8) a; + des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8); + c = src[2] * 154; + a = src[3]; + des [2] = (UINT8) (( b * 102 + c + 128) >> 8); + des [3] = (UINT8) (( c + 102 * a + 128) >> 8); + b = src[4]; + des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8); + + src += 4; + des += 5; + } + + a = src[0]; + b = src[1]; + des [0] = (UINT8) (a); + des
[1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8); + c = src[2] * 154; + a = src[3]; + des [2] = (UINT8) (( b * 102 + c + 128) >> 8); + des [3] = (UINT8) (( c + 102 * a + 128) >> 8); + des [4] = (UINT8) (a); + */ +} + +/**************************************************************************** + * + * ROUTINE : vertical_band_4_5_scale_mmx + * + * INPUTS : unsigned char *dest : Pointer to the first row of the band (scaled in place). + * unsigned int dest_pitch : Byte offset between consecutive rows. + * unsigned int dest_width : Bytes per row to process, 8 at a time. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. The function also has a "C" only + * version. + * + ****************************************************************************/ +static +void vertical_band_4_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + + __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; + __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; + __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; + __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + + __asm + { + + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea edi, [esi+ecx*2] // two lines below + add edi, ecx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + vs_4_5_loop: + + movq mm0, QWORD ptr [esi] // src[0]; + movq mm1, QWORD ptr [esi+ecx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, one_fifth // mm5 = 1/5 + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 1/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 1/5 + movq mm6, four_fifths // constant 4/5 + + movq
mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 4/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 4/5 + paddw mm0, mm4 // a * 1/5 + b * 4/5 + + paddw mm2, mm5 // a * 1/5 + b * 4/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [esi+ecx], mm0 // write des[1] + movq mm0, [esi+ecx*2] // mm0 = src[2] + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm5, two_fifths // mm5 = 2/5 + movq mm2, mm0 // make a copy + + pmullw mm1, mm5 // b * 2/5 + movq mm6, three_fifths // mm6 = 3/5 + + + punpcklbw mm0, mm7 // unpack low to word + pmullw mm3, mm5 // b * 2/5 + + movq mm4, mm0 // make copy of c + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm4, mm6 // c * 3/5 + movq mm5, mm2 // copy of high c + + pmullw mm5, mm6 // c * 3/5 + paddw mm1, mm4 // b * 2/5 + c * 3/5 + + paddw mm3, mm5 // b * 2/5 + c * 3/5 + paddw mm1, round_values // + 128 + + paddw mm3, round_values // + 128 + psrlw mm1, 8 + + psrlw mm3, 8 + packuswb mm1, mm3 // des[2] + + movq QWORD ptr [esi+ecx*2], mm1 // write des[2] + movq mm1, [edi] // mm1=Src[3]; + + // mm0, mm2 --- Src[2] + // mm1 --- Src[3] + // mm6 --- 3/5 + // mm7 for unpacking + + pmullw mm0, mm6 // c * 3/5 + movq mm5, two_fifths // mm5 = 2/5 + + movq mm3, mm1 // make a copy + pmullw mm2, mm6 // c * 3/5 + + punpcklbw mm1, mm7 // unpack low + movq mm4, mm1 // make a copy + + punpckhbw mm3, mm7 // unpack high + pmullw mm4, mm5 // d * 2/5 + + movq mm6, mm3 // make a copy + pmullw mm6, mm5 // d * 2/5 + + paddw mm0, mm4 // c * 3/5 + d * 2/5 + paddw mm2, mm6 // c * 3/5 + d * 2/5 + + paddw mm0, round_values // + 128 + paddw mm2, round_values // + 128 + + psrlw mm0, 8 + psrlw mm2, 8 + + packuswb mm0, mm2 // des[3] + movq QWORD ptr [edi], mm0 // write des[3] + + // mm1, mm3 --- Src[3] + // mm7 -- cleared for unpacking + + movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group + + movq mm5, four_fifths
// mm5 = 4/5 + pmullw mm1, mm5 // d * 4/5 + + movq mm6, one_fifth // mm6 = 1/5 + movq mm2, mm0 // make a copy + + pmullw mm3, mm5 // d * 4/5 + punpcklbw mm0, mm7 // unpack low + + pmullw mm0, mm6 // an * 1/5 + punpckhbw mm2, mm7 // unpack high + + paddw mm1, mm0 // d * 4/5 + an * 1/5 + pmullw mm2, mm6 // an * 1/5 + + paddw mm3, mm2 // d * 4/5 + an * 1/5 + paddw mm1, round_values // + 128 + + paddw mm3, round_values // + 128 + psrlw mm1, 8 + + psrlw mm3, 8 + packuswb mm1, mm3 // des[4] + + movq QWORD ptr [edi+ecx], mm1 // write des[4] + + add edi, 8 + add esi, 8 + + sub edx, 8 + jg vs_4_5_loop + } +} + +/**************************************************************************** + * + * ROUTINE : last_vertical_band_4_5_scale_mmx + * + * INPUTS : unsigned char *dest : Pointer to the first row of the band (scaled in place). + * unsigned int dest_pitch : Byte offset between consecutive rows. + * unsigned int dest_width : Bytes per row to process, 8 at a time. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image. + * + * SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, no row below the + * band is read; des[4] is a copy of Src[3]. The function + * also has a "C" only version.
+ * + ****************************************************************************/ +static +void last_vertical_band_4_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; + __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; + __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; + __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + + __asm + { + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea edi, [esi+ecx*2] // two lines below + add edi, ecx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + last_vs_4_5_loop: + + movq mm0, QWORD ptr [esi] // src[0]; + movq mm1, QWORD ptr [esi+ecx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, one_fifth // mm5 = 1/5 + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 1/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 1/5 + movq mm6, four_fifths // constant 4/5 + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 4/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 4/5 + paddw mm0, mm4 // a * 1/5 + b * 4/5 + + paddw mm2, mm5 // a * 1/5 + b * 4/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [esi+ecx], mm0 // write des[1] + movq mm0, [esi+ecx*2] // mm0 = src[2] + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm5, two_fifths // mm5 = 2/5 + movq mm2, mm0 // make a copy + + pmullw mm1, mm5 // b * 2/5 + movq mm6, three_fifths // mm6 = 3/5 + + + punpcklbw mm0, mm7 // unpack low to
word + pmullw mm3, mm5 // b * 2/5 + + movq mm4, mm0 // make copy of c + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm4, mm6 // c * 3/5 + movq mm5, mm2 // copy of high c + + pmullw mm5, mm6 // c * 3/5 + paddw mm1, mm4 // b * 2/5 + c * 3/5 + + paddw mm3, mm5 // b * 2/5 + c * 3/5 + paddw mm1, round_values // + 128 + + paddw mm3, round_values // + 128 + psrlw mm1, 8 + + psrlw mm3, 8 + packuswb mm1, mm3 // des[2] + + movq QWORD ptr [esi+ecx*2], mm1 // write des[2] + movq mm1, [edi] // mm1=Src[3]; + + movq QWORD ptr [edi+ecx], mm1 // write des[4] = unmodified Src[3] + + // mm0, mm2 --- Src[2] + // mm1 --- Src[3] + // mm6 --- 3/5 + // mm7 for unpacking + + pmullw mm0, mm6 // c * 3/5 + movq mm5, two_fifths // mm5 = 2/5 + + movq mm3, mm1 // make a copy + pmullw mm2, mm6 // c * 3/5 + + punpcklbw mm1, mm7 // unpack low + movq mm4, mm1 // make a copy + + punpckhbw mm3, mm7 // unpack high + pmullw mm4, mm5 // d * 2/5 + + movq mm6, mm3 // make a copy + pmullw mm6, mm5 // d * 2/5 + + paddw mm0, mm4 // c * 3/5 + d * 2/5 + paddw mm2, mm6 // c * 3/5 + d * 2/5 + + paddw mm0, round_values // + 128 + paddw mm2, round_values // + 128 + + psrlw mm0, 8 + psrlw mm2, 8 + + packuswb mm0, mm2 // des[3] + movq QWORD ptr [edi], mm0 // write des[3] + + // mm1, mm3 --- Src[3] + // mm7 -- cleared for unpacking + add edi, 8 + add esi, 8 + + sub edx, 8 + jg last_vs_4_5_loop + } +} + +/**************************************************************************** + * + * ROUTINE : vertical_band_3_5_scale_mmx + * + * INPUTS : unsigned char *dest : Pointer to the first row of the band (scaled in place). + * unsigned int dest_pitch : Byte offset between consecutive rows. + * unsigned int dest_width : Bytes per row to process, 8 at a time. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. The function also has a "C" only + * version.
+ * + ****************************************************************************/ +static +void vertical_band_3_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; + __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; + __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; + __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + + __asm + { + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea edi, [esi+ecx*2] // two lines below + add edi, ecx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + vs_3_5_loop: + + movq mm0, QWORD ptr [esi] // src[0]; + movq mm1, QWORD ptr [esi+ecx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, two_fifths // mm5 = 2/5 + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 2/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 2/5 + movq mm6, three_fifths // mm6 = 3/5 + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 3/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 3/5 + paddw mm0, mm4 // a * 2/5 + b * 3/5 + + paddw mm2, mm5 // a * 2/5 + b * 3/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [esi+ecx], mm0 // write des[1] + movq mm0, [esi+ecx*2] // mm0 = src[2] + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm4, mm1 // b low + pmullw mm1, four_fifths // b * 4/5 low + + movq mm5, mm3 // b high + pmullw mm3, four_fifths // b * 4/5 high + + movq
mm2, mm0 // c + pmullw mm4, one_fifth // b * 1/5 + + punpcklbw mm0, mm7 // c low + pmullw mm5, one_fifth // b * 1/5 + + movq mm6, mm0 // make copy of c low + punpckhbw mm2, mm7 // c high + + pmullw mm6, one_fifth // c * 1/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, one_fifth // c * 1/5 high + paddw mm1, mm6 // b * 4/5 + c * 1/5 low + + paddw mm3, mm7 // b * 4/5 + c * 1/5 high + movq mm6, mm0 // make copy of c low + + pmullw mm6, four_fifths // c * 4/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, four_fifths // c * 4/5 high + + paddw mm4, mm6 // b * 1/5 + c * 4/5 low + paddw mm5, mm7 // b * 1/5 + c * 4/5 high + + paddw mm1, round_values // + 128 + paddw mm3, round_values // + 128 + + psrlw mm1, 8 + psrlw mm3, 8 + + packuswb mm1, mm3 // des[2] + movq QWORD ptr [esi+ecx*2], mm1 // write des[2] + + paddw mm4, round_values // + 128 + paddw mm5, round_values // + 128 + + psrlw mm4, 8 + psrlw mm5, 8 + + packuswb mm4, mm5 // des[3] + movq QWORD ptr [edi], mm4 // write des[3] + + // mm0, mm2 --- Src[2] (unpacked low / high) + + pxor mm7, mm7 // clear mm7 for unpacking + movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group + + movq mm5, three_fifths // mm5 = 3/5 + pmullw mm0, mm5 // c * 3/5 + + movq mm6, two_fifths // mm6 = 2/5 + movq mm3, mm1 // make a copy + + pmullw mm2, mm5 // c * 3/5 + punpcklbw mm1, mm7 // unpack low + + pmullw mm1, mm6 // an * 2/5 + punpckhbw mm3, mm7 // unpack high + + paddw mm0, mm1 // c * 3/5 + an * 2/5 + pmullw mm3, mm6 // an * 2/5 + + paddw mm2, mm3 // c * 3/5 + an * 2/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des[4] + + movq QWORD ptr [edi+ecx], mm0 // write des[4] + + add edi, 8 + add esi, 8 + + sub edx, 8 + jg vs_3_5_loop + } +} + +/**************************************************************************** + * + * ROUTINE : last_vertical_band_3_5_scale_mmx + * + * INPUTS : unsigned char *dest : Pointer to the first row of the band (scaled in place). + * unsigned int dest_pitch : Byte offset between consecutive rows. + * unsigned int
dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. The function also has an "C" only + * version. + * + ****************************************************************************/ +static +void last_vertical_band_3_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __declspec(align(16)) unsigned short one_fifth[] = { 51, 51, 51, 51 }; + __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 }; + __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 }; + __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 }; + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + __asm + { + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea edi, [esi+ecx*2] // tow lines below + add edi, ecx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + + last_vs_3_5_loop: + + movq mm0, QWORD ptr [esi] // src[0]; + movq mm1, QWORD ptr [esi+ecx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, two_fifths // mm5 = 2/5 + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 2/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 2/5 + movq mm6, three_fifths // mm6 = 3/5 + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 3/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 3/5 + paddw mm0, mm4 // a * 2/5 + b * 3/5 + + paddw mm2, mm5 // a * 2/5 + b * 3/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr 
[esi+ecx], mm0 // write des[1] + movq mm0, [esi+ecx*2] // mm0 = src[2] + + + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm4, mm1 // b low + pmullw mm1, four_fifths // b * 4/5 low + + movq QWORD ptr [edi+ecx], mm0 // write des[4] + + movq mm5, mm3 // b high + pmullw mm3, four_fifths // b * 4/5 high + + movq mm2, mm0 // c + pmullw mm4, one_fifth // b * 1/5 + + punpcklbw mm0, mm7 // c low + pmullw mm5, one_fifth // b * 1/5 + + movq mm6, mm0 // make copy of c low + punpckhbw mm2, mm7 // c high + + pmullw mm6, one_fifth // c * 1/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, one_fifth // c * 1/5 high + paddw mm1, mm6 // b * 4/5 + c * 1/5 low + + paddw mm3, mm7 // b * 4/5 + c * 1/5 high + movq mm6, mm0 // make copy of c low + + pmullw mm6, four_fifths // c * 4/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, four_fifths // c * 4/5 high + + paddw mm4, mm6 // b * 1/5 + c * 4/5 low + paddw mm5, mm7 // b * 1/5 + c * 4/5 high + + paddw mm1, round_values // + 128 + paddw mm3, round_values // + 128 + + psrlw mm1, 8 + psrlw mm3, 8 + + packuswb mm1, mm3 // des[2] + movq QWORD ptr [esi+ecx*2], mm1 // write des[2] + + paddw mm4, round_values // + 128 + paddw mm5, round_values // + 128 + + psrlw mm4, 8 + psrlw mm5, 8 + + packuswb mm4, mm5 // des[3] + movq QWORD ptr [edi], mm4 // write des[3] + + // mm0, mm2 --- Src[3] + + add edi, 8 + add esi, 8 + + sub edx, 8 + jg last_vs_3_5_loop + } +} + +/**************************************************************************** + * + * ROUTINE : vertical_band_1_2_scale_mmx + * + * INPUTS : unsigned char *dest : + * unsigned int dest_pitch : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of a band of pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. The function also has an "C" only + * version. 
+ * + ****************************************************************************/ +static +void vertical_band_1_2_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1}; + + __asm + { + + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + vs_1_2_loop: + + movq mm0, [esi] // get Src[0] + movq mm1, [esi + ecx * 2] // get Src[1] (two rows down; the row between is the output) + + movq mm2, mm0 // make copy before unpack + movq mm3, mm1 // make copy before unpack + + punpcklbw mm0, mm7 // low Src[0] + movq mm6, four_ones // mm6= 1, 1, 1, 1 + + punpcklbw mm1, mm7 // low Src[1] + paddw mm0, mm1 // low (a + b) + + punpckhbw mm2, mm7 // high Src[0] + paddw mm0, mm6 // low (a + b + 1) + + punpckhbw mm3, mm7 // high Src[1] + paddw mm2, mm3 // high (a + b ) + + psraw mm0, 1 // low (a + b +1 )/2 + paddw mm2, mm6 // high (a + b + 1) + + psraw mm2, 1 // high (a + b + 1)/2 + packuswb mm0, mm2 // pack results + + movq [esi+ecx], mm0 // write out eight bytes + add esi, 8 + + sub edx, 8 + jg vs_1_2_loop + } + +} + +/**************************************************************************** + * + * ROUTINE : last_vertical_band_1_2_scale_mmx + * + * INPUTS : unsigned char *dest : Pointer to the source row (copied in place). + * unsigned int dest_pitch : Byte offset between consecutive rows. + * unsigned int dest_width : Bytes per row to process, 8 at a time. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of the last band: the source row is duplicated. + * + * SPECIAL NOTES : Unlike vertical_band_1_2_scale_mmx, no row below the + * band is read; the row at dest is copied to the row + * one pitch below it.
+ * + ****************************************************************************/ +static +void last_vertical_band_1_2_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + mov edx, dest_width // Loop counter + + last_vs_1_2_loop: + + movq mm0, [esi] // get Src[0] + movq [esi+ecx], mm0 // write out eight bytes + + add esi, 8 + sub edx, 8 + + jg last_vs_1_2_loop + } +} + +/**************************************************************************** + * + * ROUTINE : horizontal_line_1_2_scale + * + * INPUTS : const unsigned char *source : + * unsigned int source_width : + * unsigned char *dest : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static +void horizontal_line_1_2_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1}; + + (void) dest_width; + + __asm + { + mov esi, source + mov edi, dest + + pxor mm7, mm7 + movq mm6, four_ones + + mov ecx, source_width + + hs_1_2_loop: + + movq mm0, [esi] + movq mm1, [esi+1] + + movq mm2, mm0 + movq mm3, mm1 + + movq mm4, mm0 + punpcklbw mm0, mm7 + + punpcklbw mm1, mm7 + paddw mm0, mm1 + + paddw mm0, mm6 + punpckhbw mm2, mm7 + + punpckhbw mm3, mm7 + paddw mm2, mm3 + + paddw mm2, mm6 + psraw mm0, 1 + + psraw mm2, 1 + packuswb mm0, mm2 + + movq mm2, mm4 + punpcklbw mm2, mm0 + + movq [edi], mm2 + punpckhbw mm4, mm0 + + movq [edi+8], mm4 + add esi, 8 + + add edi, 16 + sub ecx, 8 + + cmp ecx, 8 + jg hs_1_2_loop + +// last eight pixel + + movq mm0, [esi] + movq mm1, mm0 + + movq mm2, mm0 + movq mm3, mm1 + + psrlq mm1, 8 + 
psrlq mm3, 56 + + psllq mm3, 56 + por mm1, mm3 + + movq mm3, mm1 + movq mm4, mm0 + + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + + paddw mm0, mm1 + paddw mm0, mm6 + + punpckhbw mm2, mm7 + punpckhbw mm3, mm7 + + paddw mm2, mm3 + paddw mm2, mm6 + + psraw mm0, 1 + psraw mm2, 1 + + packuswb mm0, mm2 + movq mm2, mm4 + + punpcklbw mm2, mm0 + movq [edi], mm2 + + punpckhbw mm4, mm0 + movq [edi+8], mm4 + } +} + + + + + + +/**************************************************************************** + * + * ROUTINE : horizontal_line_5_4_scale_mmx + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 4 to 5. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static +void horizontal_line_5_4_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + + __declspec(align(16)) const unsigned short const54_2[] = { 0, 64, 128, 192 }; + __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128, 64 }; + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + /* + unsigned i; + unsigned int a, b, c, d, e; + unsigned char *des = dest; + const unsigned char *src = source; + + (void) dest_width; + + for ( i=0; i<source_width; i+=5 ) + { + a = src[0]; + b = src[1]; + c = src[2]; + d = src[3]; + e = src[4]; + + des[0] = a; + des[1] = ((b*192 + c* 64 + 128)>>8); + des[2] = ((c*128 + d*128 + 128)>>8); + des[3] = ((d* 64 + e*192 + 128)>>8); + + src += 5; + des += 4; + } + */ + __asm + { + + mov esi, source ; + mov edi, dest ; + + mov ecx, source_width ; + movq mm5, const54_1 ; + + pxor mm7, mm7 ; + 
movq mm6, const54_2 ; + + movq mm4, round_values ; + lea edx, [esi+ecx] ; + horizontal_line_5_4_loop: + + movq mm0, QWORD PTR [esi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psrlq mm0, 8 ; + 01 02 03 04 05 06 07 xx + punpcklbw mm1, mm7 ; + xx 00 xx 01 xx 02 xx 03 + + punpcklbw mm0, mm7 ; + xx 01 xx 02 xx 03 xx 04 + pmullw mm1, mm5 + + pmullw mm0, mm6 + add esi, 5 + + add edi, 4 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw mm1, 8 + + cmp esi, edx + packuswb mm1, mm7 + + movd DWORD PTR [edi-4], mm1 + + jl horizontal_line_5_4_loop + + } + +} + +static +void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + + __declspec(align(16)) const unsigned short one_fourths[] = { 64, 64, 64, 64 }; + __declspec(align(16)) const unsigned short two_fourths[] = { 128, 128, 128, 128 }; + __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 }; + + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + __asm + { + push ebx + + mov esi, source // Get the source and destination pointer + mov ecx, src_pitch // Get the pitch size + + mov edi, dest // tow lines below + pxor mm7, mm7 // clear out mm7 + + mov edx, dest_pitch // Loop counter + mov ebx, dest_width + + vs_5_4_loop: + + movd mm0, DWORD ptr [esi] // src[0]; + movd mm1, DWORD ptr [esi+ecx] // src[1]; + + movd mm2, DWORD ptr [esi+ecx*2] + lea eax, [esi+ecx*2] // + + punpcklbw mm1, mm7 + punpcklbw mm2, mm7 + + movq mm3, mm2 + pmullw mm1, three_fourths + + pmullw mm2, one_fourths + movd mm4, [eax+ecx] + + pmullw mm3, two_fourths + punpcklbw mm4, mm7 + + movq mm5, mm4 + pmullw mm4, two_fourths + + paddw mm1, mm2 + movd mm6, [eax+ecx*2] + + pmullw mm5, one_fourths + paddw mm1, round_values; + + paddw mm3, mm4 + psrlw mm1, 8 + + punpcklbw mm6, mm7 + paddw mm3, round_values + + pmullw mm6, three_fourths + psrlw mm3, 8 + + packuswb mm1, mm7 + packuswb mm3, 
mm7 + + movd DWORD PTR [edi], mm0 + movd DWORD PTR [edi+edx], mm1 + + + paddw mm5, mm6 + movd DWORD PTR [edi+edx*2], mm3 + + lea eax, [edi+edx*2] + paddw mm5, round_values + + psrlw mm5, 8 + add edi, 4 + + packuswb mm5, mm7 + movd DWORD PTR [eax+edx], mm5 + + add esi, 4 + sub ebx, 4 + + jg vs_5_4_loop + + pop ebx + } +} + + + +static +void horizontal_line_5_3_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + __declspec(align(16)) const unsigned short const53_1[] = { 0, 85, 171, 0 }; + __declspec(align(16)) const unsigned short const53_2[] = {256, 171, 85, 0 }; + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + __asm + { + + mov esi, source ; + mov edi, dest ; + + mov ecx, source_width ; + movq mm5, const53_1 ; + + pxor mm7, mm7 ; + movq mm6, const53_2 ; + + movq mm4, round_values ; + lea edx, [esi+ecx-5] ; + horizontal_line_5_3_loop: + + movq mm0, QWORD PTR [esi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psllw mm0, 8 ; + xx 00 xx 02 xx 04 xx 06 + psrlw mm1, 8 ; + 01 xx 03 xx 05 xx 07 xx + + psrlw mm0, 8 ; + 00 xx 02 xx 04 xx 06 xx + psllq mm1, 16 ; + xx xx 01 xx 03 xx 05 xx + + pmullw mm0, mm6 + + pmullw mm1, mm5 + add esi, 5 + + add edi, 3 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw mm1, 8 + + cmp esi, edx + packuswb mm1, mm7 + + movd DWORD PTR [edi-3], mm1 + jl horizontal_line_5_3_loop + +//exit condition + movq mm0, QWORD PTR [esi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psllw mm0, 8 ; + xx 00 xx 02 xx 04 xx 06 + psrlw mm1, 8 ; + 01 xx 03 xx 05 xx 07 xx + + psrlw mm0, 8 ; + 00 xx 02 xx 04 xx 06 xx + psllq mm1, 16 ; + xx xx 01 xx 03 xx 05 xx + + pmullw mm0, mm6 + + pmullw mm1, mm5 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw mm1, 8 + + packuswb mm1, mm7 + movd eax, mm1 + + mov edx, eax + shr edx, 16 + + mov WORD PTR[edi], ax + mov BYTE PTR[edi+2], dl + + } + +} + + +static +void 
vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + __declspec(align(16)) const unsigned short one_thirds[] = { 85, 85, 85, 85 }; + __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 }; + + __asm + { + push ebx + + mov esi, source // Get the source and destination pointer + mov ecx, src_pitch // Get the pitch size + + mov edi, dest // tow lines below + pxor mm7, mm7 // clear out mm7 + + mov edx, dest_pitch // Loop counter + movq mm5, one_thirds + + movq mm6, two_thirds + mov ebx, dest_width; + + vs_5_3_loop: + + movd mm0, DWORD ptr [esi] // src[0]; + movd mm1, DWORD ptr [esi+ecx] // src[1]; + + movd mm2, DWORD ptr [esi+ecx*2] + lea eax, [esi+ecx*2] // + + punpcklbw mm1, mm7 + punpcklbw mm2, mm7 + + pmullw mm1, mm5 + pmullw mm2, mm6 + + movd mm3, DWORD ptr [eax+ecx] + movd mm4, DWORD ptr [eax+ecx*2] + + punpcklbw mm3, mm7 + punpcklbw mm4, mm7 + + pmullw mm3, mm6 + pmullw mm4, mm5 + + + movd DWORD PTR [edi], mm0 + paddw mm1, mm2 + + paddw mm1, round_values + psrlw mm1, 8 + + packuswb mm1, mm7 + paddw mm3, mm4 + + paddw mm3, round_values + movd DWORD PTR [edi+edx], mm1 + + psrlw mm3, 8 + packuswb mm3, mm7 + + movd DWORD PTR [edi+edx*2], mm3 + + + add edi, 4 + add esi, 4 + + sub ebx, 4 + jg vs_5_3_loop + + pop ebx + } +} + + + + +/**************************************************************************** + * + * ROUTINE : horizontal_line_2_1_scale + * + * INPUTS : const unsigned char *source : + * unsigned int source_width : + * unsigned char *dest : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +static +void horizontal_line_2_1_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + + __asm + { + mov esi, source + mov edi, dest + + pxor mm7, mm7 + mov ecx, dest_width + + xor edx, edx + hs_2_1_loop: + + movq mm0, [esi+edx*2] + psllw mm0, 8 + + psrlw mm0, 8 + packuswb mm0, mm7 + + movd DWORD Ptr [edi+edx], mm0; + add edx, 4 + + cmp edx, ecx + jl hs_2_1_loop + + } +} + + + +static +void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + vpx_memcpy(dest, source, dest_width); +} + + + +static +void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + + __declspec(align(16)) const unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; + __declspec(align(16)) const unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; + __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 }; + __asm + { + mov esi, source + mov edi, dest + + mov eax, src_pitch + mov edx, dest_width + + pxor mm7, mm7 + sub esi, eax //back one line + + + lea ecx, [esi+edx]; + movq mm6, round_values; + + movq mm5, three_sixteenths; + movq mm4, ten_sixteenths; + + vs_2_1_i_loop: + movd mm0, [esi] // + movd mm1, [esi+eax] // + + movd mm2, [esi+eax*2] // + punpcklbw mm0, mm7 + + pmullw mm0, mm5 + punpcklbw mm1, mm7 + + pmullw mm1, mm4 + punpcklbw mm2, mm7 + + pmullw mm2, mm5 + paddw mm0, round_values + + paddw mm1, mm2 + paddw mm0, mm1 + + psrlw mm0, 8 + packuswb mm0, mm7 + + movd DWORD PTR [edi], mm0 + add esi, 4 + + add edi, 4; + cmp esi, ecx + jl vs_2_1_i_loop + + } +} + +void +register_mmxscalers(void) +{ + vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx; + vp8_vertical_band_1_2_scale = 
vertical_band_1_2_scale_mmx; + vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; + vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx; + vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx; + vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; + vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx; + vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx; + vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; + + vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; + vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; + vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + + + + vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; + vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; + vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; + vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; + vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; + vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; + vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; + +} diff --git a/vpx_scale/intel_linux/scalesystemdependant.c b/vpx_scale/intel_linux/scalesystemdependant.c new file mode 100644 index 000000000..9ed48bfc6 --- /dev/null +++ b/vpx_scale/intel_linux/scalesystemdependant.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +/**************************************************************************** +* +* Module Title : system_dependant.c +* +* Description : Miscellaneous system dependant functions +* +****************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "vpx_scale/vpxscale.h" +#include "cpuidlib.h" + +/**************************************************************************** +* Imports +*****************************************************************************/ +extern void register_generic_scalers(void); +extern void register_mmxscalers(void); + +/**************************************************************************** + * + * ROUTINE : post_proc_machine_specific_config + * + * INPUTS : UINT32 Version : Codec version number. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Checks for machine specifc features such as MMX support + * sets appropriate flags and function pointers. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void +vp8_scale_machine_specific_config(void) +{ + // If MMX supported then set to use MMX versions of functions else + // use original 'C' versions. 
+ int mmx_enabled; + int xmm_enabled; + int wmt_enabled; + + vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled); + + if (mmx_enabled || xmm_enabled || wmt_enabled) + { + register_mmxscalers(); + } + else + { + vp8_horizontal_line_1_2_scale = vp8cx_horizontal_line_1_2_scale_c; + vp8_vertical_band_1_2_scale = vp8cx_vertical_band_1_2_scale_c; + vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c; + vp8_horizontal_line_3_5_scale = vp8cx_horizontal_line_3_5_scale_c; + vp8_vertical_band_3_5_scale = vp8cx_vertical_band_3_5_scale_c; + vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c; + vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; + vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; + vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + vp8_horizontal_line_4_5_scale = vp8cx_horizontal_line_4_5_scale_c; + vp8_vertical_band_4_5_scale = vp8cx_vertical_band_4_5_scale_c; + vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c; + + + vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c; + vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c; + vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c; + vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c; + vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c; + vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c; + vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c; + + } +} diff --git a/vpx_scale/leapster/doptsystemdependant_lf.c b/vpx_scale/leapster/doptsystemdependant_lf.c new file mode 100644 index 000000000..ca1316730 --- /dev/null +++ b/vpx_scale/leapster/doptsystemdependant_lf.c @@ -0,0 +1,71 @@ +/* + * 
Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : system_dependant.c +* +* Description : Miscellaneous system dependant functions +* +****************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "vpx_scale/vpxscale.h" + +/**************************************************************************** +* Imports +*****************************************************************************/ +extern int register_generic_scalers(void); +extern int de_register_generic_scalers(void); + +/**************************************************************************** + * + * ROUTINE : vp8_scale_machine_specific_config + * + * INPUTS : UINT32 Version : Codec version number. + * + * OUTPUTS : None. + * + * RETURNS : int + * + * FUNCTION : Checks for machine specifc features such as MMX support + * sets appropriate flags and function pointers. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +int +vp8_scale_machine_specific_config() +{ + return register_generic_scalers(); +} + +/**************************************************************************** + * + * ROUTINE : vp8_scale_machine_specific_config + * + * INPUTS : UINT32 Version : Codec version number. + * + * OUTPUTS : None. + * + * RETURNS : int + * + * FUNCTION : Resets the funtion pointers and deallocates memory. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +int +scale_machine_specific_de_config() +{ + return de_register_generic_scalers(); +} diff --git a/vpx_scale/leapster/gen_scalers_lf.c b/vpx_scale/leapster/gen_scalers_lf.c new file mode 100644 index 000000000..1b9c7c745 --- /dev/null +++ b/vpx_scale/leapster/gen_scalers_lf.c @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** + * + * Module Title : gen_scalers.c + * + * Description : Generic image scaling functions. + * + ***************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "vpx_scale/vpxscale.h" + +/**************************************************************************** +* Imports +****************************************************************************/ + +/**************************************************************************** + * + * ROUTINE : vp8cx_horizontal_line_4_5_scale_c + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 4 to 5. + * + * SPECIAL NOTES : None. 
/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_4_5_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source pixels.
 *                  unsigned int source_width   : Number of source pixels in
 *                                                the line.
 *                  unsigned char *dest         : Pointer to destination.
 *                  unsigned int dest_width     : NOT USED.
 *
 *  OUTPUTS       : dest receives 5 bytes for every group of 4 source bytes.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies a horizontal line of pixels from source to
 *                  destination, scaling up by 4 to 5 with fixed-point
 *                  (x/256) linear interpolation.
 *
 *  SPECIAL NOTES : Every group except the last interpolates its fifth
 *                  output with the first pixel of the following group; the
 *                  last group replicates its final pixel instead.
 *                  A line shorter than one full group (source_width < 4) is
 *                  left untouched: the loop bound source_width - 4 is
 *                  unsigned and would otherwise wrap around and run far
 *                  past the end of both buffers.
 *
 ****************************************************************************/
static
void vp8cx_horizontal_line_4_5_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned i;
    unsigned int a, b, c;
    unsigned int last_group;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    /* Not even one complete 4-pixel group: nothing can be scaled safely. */
    if (source_width < 4)
        return;

    /* Safe now: source_width >= 4, so the subtraction cannot wrap. */
    last_group = source_width - 4;

    for (i = 0; i < last_group; i += 4)
    {
        a = src[0];
        b = src[1];
        des [0] = (unsigned char) a;
        des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
        c = src[2] * 154;               /* c is pre-weighted, shared by des[2] and des[3] */
        a = src[3];
        des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
        des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
        b = src[4];                     /* first pixel of the next group */
        des [4] = (unsigned char)((a * 205 + 51 * b + 128) >> 8);

        src += 4;
        des += 5;
    }

    /* Last group: no following pixel, so the fifth output repeats src[3]. */
    a = src[0];
    b = src[1];
    des [0] = (unsigned char)(a);
    des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
    c = src[2] * 154;
    a = src[3];
    des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
    des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
    des [4] = (unsigned char)(a);

}
/****************************************************************************
 *
 *  ROUTINE       : vp8cx_vertical_band_4_5_scale_c
 *
 *  INPUTS        : unsigned char *dest     : Pointer to the top row of the
 *                                            band.
 *                  unsigned int dest_pitch : Byte distance between rows.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : Rows 1 to 4 of the band are rewritten in place.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales a 4-row vertical band up to 5 rows, column by
 *                  column, with fixed-point (x/256) interpolation.
 *
 *  SPECIAL NOTES : Row 4 is interpolated between row 3 and row 5, so the
 *                  first line of the band below must already be present at
 *                  dest + 5 * dest_pitch.
 *
 ****************************************************************************/
static
void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; ++col)
    {
        unsigned char *px = dest + col;
        unsigned int top = px[0];
        unsigned int second = px[dest_pitch];
        unsigned int third_x154;
        unsigned int fourth;
        unsigned int below;

        px[dest_pitch] = (unsigned char)((top * 51 + second * 205 + 128) >> 8);

        /* Row 2 is read once, pre-weighted, and reused for two outputs. */
        third_x154 = px[dest_pitch * 2] * 154;
        fourth = px[dest_pitch * 3];

        px[dest_pitch * 2] = (unsigned char)((second * 102 + third_x154 + 128) >> 8);
        px[dest_pitch * 3] = (unsigned char)((third_x154 + fourth * 102 + 128) >> 8);

        /* Blend with the first line of the next band. */
        below = px[dest_pitch * 5];
        px[dest_pitch * 4] = (unsigned char)((fourth * 205 + below * 51 + 128) >> 8);
    }
}
/****************************************************************************
 *
 *  ROUTINE       : vp8cx_last_vertical_band_4_5_scale_c
 *
 *  INPUTS        : unsigned char *dest     : Pointer to the top row of the
 *                                            band.
 *                  unsigned int dest_pitch : Byte distance between rows.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : Rows 1 to 4 of the band are rewritten in place.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales the last 4-row vertical band up to 5 rows with
 *                  fixed-point (x/256) interpolation.
 *
 *  SPECIAL NOTES : There is no band below the last one, so row 4 is a plain
 *                  copy of the original row 3 rather than an interpolation.
 *
 ****************************************************************************/
static
void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        unsigned int top = px[0];
        unsigned int second = px[dest_pitch];
        unsigned int third_x154;
        unsigned int fourth;

        px[dest_pitch] = (unsigned char)((top * 51 + second * 205 + 128) >> 8);

        /* Row 2 is read once, pre-weighted, and reused for two outputs. */
        third_x154 = px[dest_pitch * 2] * 154;
        fourth = px[dest_pitch * 3];

        px[dest_pitch * 2] = (unsigned char)((second * 102 + third_x154 + 128) >> 8);
        px[dest_pitch * 3] = (unsigned char)((third_x154 + fourth * 102 + 128) >> 8);

        /* Nothing below to interpolate against: replicate the old row 3. */
        px[dest_pitch * 4] = (unsigned char) fourth;
    }
}
/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source pixels.
 *                  unsigned int source_width   : Number of source pixels in
 *                                                the line.
 *                  unsigned char *dest         : Pointer to destination.
 *                  unsigned int dest_width     : NOT USED.
 *
 *  OUTPUTS       : dest receives 5 bytes for every group of 3 source bytes.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies a horizontal line of pixels from source to
 *                  destination, scaling up by 3 to 5 with fixed-point
 *                  (x/256) linear interpolation.
 *
 *  SPECIAL NOTES : Every group except the last interpolates its fifth
 *                  output with the first pixel of the following group; the
 *                  last group replicates its final pixel instead.
 *                  A line shorter than one full group (source_width < 3) is
 *                  left untouched: the loop bound source_width - 3 is
 *                  unsigned and would otherwise wrap around and run far
 *                  past the end of both buffers.
 *
 ****************************************************************************/
static
void vp8cx_horizontal_line_3_5_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int a, b, c;
    unsigned int last_group;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    /* Not even one complete 3-pixel group: nothing can be scaled safely. */
    if (source_width < 3)
        return;

    /* Safe now: source_width >= 3, so the subtraction cannot wrap. */
    last_group = source_width - 3;

    for (i = 0; i < last_group; i += 3)
    {
        a = src[0];
        b = src[1];
        des [0] = (unsigned char)(a);
        des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);

        c = src[2] ;
        des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
        des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);

        a = src[3];                     /* first pixel of the next group */
        des [4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);

        src += 3;
        des += 5;
    }

    /* Last group: no following pixel, so the fifth output repeats src[2]. */
    a = src[0];
    b = src[1];
    des [0] = (unsigned char)(a);

    des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
    c = src[2] ;
    des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
    des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);

    des [4] = (unsigned char)(c);
}
/****************************************************************************
 *
 *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
 *
 *  INPUTS        : unsigned char *dest     : Pointer to the top row of the
 *                                            band.
 *                  unsigned int dest_pitch : Byte distance between rows.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : Rows 1 to 4 of the band are rewritten in place.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales a 3-row vertical band up to 5 rows, column by
 *                  column, with fixed-point (x/256) interpolation.
 *
 *  SPECIAL NOTES : Row 4 is interpolated between the original row 2 and
 *                  row 5, so the first line of the band below must already
 *                  be present at dest + 5 * dest_pitch.
 *
 ****************************************************************************/
static
void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; ++col)
    {
        unsigned char *px = dest + col;
        unsigned int top = px[0];
        unsigned int middle = px[dest_pitch];
        unsigned int bottom;
        unsigned int below;

        px[dest_pitch] = (unsigned char)((top * 102 + middle * 154 + 128) >> 8);

        /* Read row 2 before it is overwritten; rows 2, 3 and 4 all need it. */
        bottom = px[dest_pitch * 2];
        px[dest_pitch * 2] = (unsigned char)((middle * 205 + bottom * 51 + 128) >> 8);
        px[dest_pitch * 3] = (unsigned char)((middle * 51 + bottom * 205 + 128) >> 8);

        /* Blend with the first line of the next band. */
        below = px[dest_pitch * 5];
        px[dest_pitch * 4] = (unsigned char)((bottom * 154 + below * 102 + 128) >> 8);
    }
}
/****************************************************************************
 *
 *  ROUTINE       : vp8cx_last_vertical_band_3_5_scale_c
 *
 *  INPUTS        : unsigned char *dest     : Pointer to the top row of the
 *                                            band.
 *                  unsigned int dest_pitch : Byte distance between rows.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : Rows 1 to 4 of the band are rewritten in place.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales the last 3-row vertical band up to 5 rows with
 *                  fixed-point (x/256) interpolation.
 *
 *  SPECIAL NOTES : There is no band below the last one, so row 4 is a plain
 *                  copy of the original row 2 rather than an interpolation.
 *
 ****************************************************************************/
static
void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int col;

    for (col = 0; col < dest_width; col++)
    {
        unsigned char *px = dest + col;
        unsigned int top = px[0];
        unsigned int middle = px[dest_pitch];
        unsigned int bottom;

        px[dest_pitch] = (unsigned char)((top * 102 + middle * 154 + 128) >> 8);

        /* Read row 2 before it is overwritten; rows 2, 3 and 4 all need it. */
        bottom = px[dest_pitch * 2];
        px[dest_pitch * 2] = (unsigned char)((middle * 205 + bottom * 51 + 128) >> 8);
        px[dest_pitch * 3] = (unsigned char)((middle * 51 + bottom * 205 + 128) >> 8);

        /* Nothing below to interpolate against: replicate the old row 2. */
        px[dest_pitch * 4] = (unsigned char)(bottom);
    }
}
/****************************************************************************
 *
 *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to source pixels.
 *                  unsigned int source_width   : Number of source pixels in
 *                                                the line.
 *                  unsigned char *dest         : Pointer to destination.
 *                  unsigned int dest_width     : NOT USED.
 *
 *  OUTPUTS       : dest receives 2 bytes for every source byte.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies a horizontal line of pixels from source to
 *                  destination, scaling up by 1 to 2. Each source pixel is
 *                  followed by the rounded average of itself and its right
 *                  neighbour.
 *
 *  SPECIAL NOTES : The last pixel has no right neighbour and is simply
 *                  duplicated. An empty line (source_width == 0) is left
 *                  untouched: the loop bound source_width - 1 is unsigned
 *                  and would otherwise wrap around and run far past the end
 *                  of both buffers.
 *
 ****************************************************************************/
static
void vp8cx_horizontal_line_1_2_scale_c
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int a, b;
    unsigned int last_pixel;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    /* Nothing to scale; also keeps the subtraction below from wrapping. */
    if (source_width == 0)
        return;

    last_pixel = source_width - 1;

    for (i = 0; i < last_pixel; i += 1)
    {
        a = src[0];
        b = src[1];
        des [0] = (unsigned char)(a);
        des [1] = (unsigned char)((a + b + 1) >> 1);
        src += 1;
        des += 2;
    }

    /* Last pixel: no right neighbour, so it is written twice. */
    a = src[0];
    des [0] = (unsigned char)(a);
    des [1] = (unsigned char)(a);
}

/****************************************************************************
 *
 *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
 *
 *  INPUTS        : unsigned char *dest     : Pointer to the top row of the
 *                                            band.
 *                  unsigned int dest_pitch : Byte distance between rows.
 *                  unsigned int dest_width : Number of columns to process.
 *
 *  OUTPUTS       : The row at dest + dest_pitch is overwritten.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales a vertical band of pixels by 1 to 2. The band is
 *                  one row high; the inserted row is the rounded average of
 *                  the rows above and below it.
 *
 *  SPECIAL NOTES : The routine reads the row at dest + 2 * dest_pitch, i.e.
 *                  the first line of the band below the current band.
 *
 ****************************************************************************/
static
void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int i;
    unsigned int a, b;
    unsigned char *des = dest;

    for (i = 0; i < dest_width; i++)
    {
        a = des [0];
        b = des [dest_pitch * 2];

        des[dest_pitch] = (unsigned char)((a + b + 1) >> 1);

        des++;
    }
}
+ * + * RETURNS : void + * + * FUNCTION : Scales last vertical band of pixels by scale 1 to 2. The + * height of the band scaled is 1-pixel. + * + * SPECIAL NOTES : The routine does not have available the first line of + * the band below the current band, since this is the + * last band. + * + ****************************************************************************/ +static +void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + unsigned int i; + unsigned char *des = dest; + + for (i = 0; i < dest_width; ++i) + { + des[dest_pitch] = des[0]; + des++; + } +} + +#include "vpx_scale/vpxscale.h" +#include "vpx_mem/vpx_mem.h" + +struct vpxglobal_scalling_ptrs_t *g_scaling_ptrs = 0; + +int +register_generic_scalers(void) +{ + int rv = 0; + + g_scaling_ptrs = (struct vpxglobal_scalling_ptrs_t *)vpx_malloc(sizeof(struct vpxglobal_scalling_ptrs_t)); + + if (g_scaling_ptrs) + { + g_scaling_ptrs->vpxhorizontal_line_1_2_scale_t = vp8cx_horizontal_line_1_2_scale_c; + g_scaling_ptrs->vpxvertical_band_1_2_scale_t = vp8cx_vertical_band_1_2_scale_c; + g_scaling_ptrs->vpxlast_vertical_band_1_2_scale_t = vp8cx_last_vertical_band_1_2_scale_c; + g_scaling_ptrs->vpxhorizontal_line_3_5_scale_t = vp8cx_horizontal_line_3_5_scale_c; + g_scaling_ptrs->vpxvertical_band_3_5_scale_t = vp8cx_vertical_band_3_5_scale_c; + g_scaling_ptrs->vpxlast_vertical_band_3_5_scale_t = vp8cx_last_vertical_band_3_5_scale_c; + g_scaling_ptrs->vpxhorizontal_line_4_5_scale_t = vp8cx_horizontal_line_4_5_scale_c; + g_scaling_ptrs->vpxvertical_band_4_5_scale_t = vp8cx_vertical_band_4_5_scale_c; + g_scaling_ptrs->vpxlast_vertical_band_4_5_scale_t = vp8cx_last_vertical_band_4_5_scale_c; + } + else + { + rv = -1; + } + + /* + vp8_horizontal_line_1_2_scale = vp8cx_horizontal_line_1_2_scale_c; + vp8_vertical_band_1_2_scale = vp8cx_vertical_band_1_2_scale_c; + vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c; + 
vp8_horizontal_line_3_5_scale = vp8cx_horizontal_line_3_5_scale_c; + vp8_vertical_band_3_5_scale = vp8cx_vertical_band_3_5_scale_c; + vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c; + vp8_horizontal_line_4_5_scale = vp8cx_horizontal_line_4_5_scale_c; + vp8_vertical_band_4_5_scale = vp8cx_vertical_band_4_5_scale_c; + vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c; + */ + + return rv; +} + +int +de_register_generic_scalers(void) +{ + int rv = 0; + + if (g_scaling_ptrs) + { + vpx_free(g_scaling_ptrs); + g_scaling_ptrs = 0; + } + else + { + rv = -1; + } + + return rv; +} diff --git a/vpx_scale/leapster/vpxscale_lf.c b/vpx_scale/leapster/vpxscale_lf.c new file mode 100644 index 000000000..5f05e5de0 --- /dev/null +++ b/vpx_scale/leapster/vpxscale_lf.c @@ -0,0 +1,890 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** + * + * Module Title : scale.c + * + * Description : Image scaling functions. 
+ * + ***************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "stdlib.h" +#include "vpx_scale/vpxscale.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" +#include "codec_common_interface.h" + +/**************************************************************************** +* Exports +****************************************************************************/ +/* +void (*vp8_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +void (*vp8_last_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +void (*vp8_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +void (*vp8_last_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +void (*vp8_horizontal_line_1_2_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width); +void (*vp8_horizontal_line_3_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width); +void (*vp8_horizontal_line_4_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width); +void (*vp8_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +void (*vp8_last_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width); +*/ + + +typedef struct +{ + int expanded_frame_width; + int expanded_frame_height; + + int HScale; + int HRatio; + int VScale; + int VRatio; + + YV12_BUFFER_CONFIG *src_yuv_config; + YV12_BUFFER_CONFIG *dst_yuv_config; + +} SCALE_VARS; + + +/**************************************************************************** + * 
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_copy
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Number of bytes to copy.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Width of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 1 to 1 "scaling" of a horizontal line of pixels: the
 *                  line is copied unchanged.
 *
 *  SPECIAL NOTES : None.
 *
 *  ERRORS        : None.
 *
 ****************************************************************************/
static
void horizontal_line_copy(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void) dest_width;

    duck_memcpy(dest, source, source_width);
}
/****************************************************************************
 *
 *  ROUTINE       : null_scale
 *
 *  INPUTS        : unsigned char *dest     : Pointer to destination data (NOT USED).
 *                  unsigned int dest_pitch : Stride of destination data (NOT USED).
 *                  unsigned int dest_width : Width of destination data (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 1 to 1 scaling up for a vertical band: nothing to do,
 *                  the band is already in place.
 *
 *  SPECIAL NOTES : None.
 *
 *  ERRORS        : None.
 *
 ****************************************************************************/
static
void null_scale(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    (void) dest;
    (void) dest_pitch;
    (void) dest_width;
}

/****************************************************************************
 *
 *  ROUTINE       : scale1d_2t1_i
 *
 *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
 *                  int source_step              : Number of pixels to step on in source.
 *                  unsigned int source_scale    : Scale for source (UNUSED).
 *                  unsigned int source_length   : Length of source (UNUSED).
 *                  unsigned char *dest          : Pointer to output data array.
 *                  int dest_step                : Number of pixels to step on in destination.
 *                  unsigned int dest_scale      : Scale for destination (UNUSED).
 *                  unsigned int dest_length     : Length of destination.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs 2-to-1 interpolated scaling.
 *
 *  SPECIAL NOTES : None.
 */
/****************************************************************************
 *
 *  ROUTINE       : scale1d_2t1_i
 *
 *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
 *                  int source_step              : Number of pixels to step on in source.
 *                  unsigned int source_scale    : Scale for source (UNUSED).
 *                  unsigned int source_length   : Length of source (UNUSED).
 *                  unsigned char *dest          : Pointer to output data array.
 *                  int dest_step                : Number of pixels to step on in destination.
 *                  unsigned int dest_scale      : Scale for destination (UNUSED).
 *                  unsigned int dest_length     : Length of destination.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs 2-to-1 interpolated scaling. The first output
 *                  is a plain copy; every later output is a (3, 10, 3) / 16
 *                  weighted sum centred on every second source sample.
 *
 *  SPECIAL NOTES : Each interpolated output reads the source sample two
 *                  steps beyond its centre, so the source must extend that
 *                  far. A dest_length of 0 writes nothing.
 *
 ****************************************************************************/
static
void scale1d_2t1_i
(
    const unsigned char *source,
    int source_step,
    unsigned int source_scale,
    unsigned int source_length,
    unsigned char *dest,
    int dest_step,
    unsigned int dest_scale,
    unsigned int dest_length
)
{
    unsigned int i, j;
    unsigned int temp;

    (void) source_length;
    (void) source_scale;
    (void) dest_scale;

    /* nothing requested: do not touch dest[0] */
    if (dest_length == 0)
        return;

    source_step *= 2;
    dest[0] = source[0];

    for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step)
    {
        temp = 8;                               /* rounding term for >> 4 */
        temp += 3 * source[j-source_step];
        temp += 10 * source[j];
        temp += 3 * source[j+source_step];
        temp >>= 4;
        dest[i] = (unsigned char)(temp);
    }
}

/****************************************************************************
 *
 *  ROUTINE       : scale1d_2t1_ps
 *
 *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
 *                  int source_step              : Number of pixels to step on in source.
 *                  unsigned int source_scale    : Scale for source (UNUSED).
 *                  unsigned int source_length   : Length of source (UNUSED).
 *                  unsigned char *dest          : Pointer to output data array.
 *                  int dest_step                : Number of pixels to step on in destination.
 *                  unsigned int dest_scale      : Scale for destination (UNUSED).
 *                  unsigned int dest_length     : Length of destination.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs 2-to-1 point subsampled scaling.
 *
 *  SPECIAL NOTES : None.
 */
/****************************************************************************
 *
 *  ROUTINE       : scale1d_2t1_ps
 *
 *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
 *                  int source_step              : Number of pixels to step on in source.
 *                  unsigned int source_scale    : Scale for source (UNUSED).
 *                  unsigned int source_length   : Length of source (UNUSED).
 *                  unsigned char *dest          : Pointer to output data array.
 *                  int dest_step                : Number of pixels to step on in destination.
 *                  unsigned int dest_scale      : Scale for destination (UNUSED).
 *                  unsigned int dest_length     : Length of destination.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs 2-to-1 point subsampled scaling: every second
 *                  source sample is copied to the output.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void scale1d_2t1_ps
(
    const unsigned char *source,
    int source_step,
    unsigned int source_scale,
    unsigned int source_length,
    unsigned char *dest,
    int dest_step,
    unsigned int dest_scale,
    unsigned int dest_length
)
{
    const unsigned int in_stride  = (unsigned int)(source_step * 2);
    const unsigned int out_stride = (unsigned int)dest_step;
    const unsigned int out_end    = dest_length * out_stride;
    unsigned int in  = 0;
    unsigned int out = 0;

    (void) source_length;
    (void) source_scale;
    (void) dest_scale;

    while (out < out_end)
    {
        dest[out] = source[in];
        out += out_stride;
        in  += in_stride;
    }
}
/****************************************************************************
 *
 *  ROUTINE       : scale1d_c
 *
 *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
 *                  int source_step              : Number of pixels to step on in source.
 *                  unsigned int source_scale    : Scale for source.
 *                  unsigned int source_length   : Length of source (UNUSED).
 *                  unsigned char *dest          : Pointer to output data array.
 *                  int dest_step                : Number of pixels to step on in destination.
 *                  unsigned int dest_scale      : Scale for destination.
 *                  unsigned int dest_length     : Length of destination.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs linear interpolation in one dimension. Each
 *                  output blends the current pair of neighbouring source
 *                  samples with weights that always sum to dest_scale.
 *
 *  SPECIAL NOTES : The sample one step past the current left sample is
 *                  always fetched, even when its weight is zero.
 *
 ****************************************************************************/
static
void scale1d_c
(
    const unsigned char *source,
    int source_step,
    unsigned int source_scale,
    unsigned int source_length,
    unsigned char *dest,
    int dest_step,
    unsigned int dest_scale,
    unsigned int dest_length
)
{
    const unsigned int half       = dest_scale / 2;      /* rounding term */
    const unsigned int out_stride = (unsigned int)dest_step;
    const unsigned int out_end    = dest_length * out_stride;
    unsigned int weight_left  = dest_scale;
    unsigned int weight_right = 0;
    unsigned char px_left  = source[0];
    unsigned char px_right = source[source_step];
    unsigned int out;

    (void) source_length;

    // These asserts are needed if there are boundary issues...
    //assert ( dest_scale > source_scale );
    //assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );

    for (out = 0; out < out_end; out += out_stride)
    {
        dest[out] = (char)((weight_left * px_left + weight_right * px_right + half) / dest_scale);

        // advance the sampling position; every whole dest_scale that is
        // crossed moves the source window on by one sample
        for (weight_right += source_scale; weight_right > dest_scale; weight_right -= dest_scale)
        {
            source  += source_step;
            px_left  = source[0];
            px_right = source[source_step];
        }

        weight_left = dest_scale - weight_right;
    }
}

/****************************************************************************
 *
 *  ROUTINE       : Scale2D
 *
 *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
 *                  int source_pitch             : Stride of source image.
 *                  unsigned int source_width    : Width of input image.
 *                  unsigned int source_height   : Height of input image.
 *                  unsigned char *dest          : Pointer to output data array.
 *                  int dest_pitch               : Stride of destination image.
 *                  unsigned int dest_width      : Width of destination image.
 *                  unsigned int dest_height     : Height of destination image.
 *                  unsigned char *temp_area     : Pointer to temp work area.
 *                  unsigned char temp_area_height : Height of temp work area.
 *                  unsigned int hscale          : Horizontal scale factor numerator.
 *                  unsigned int hratio          : Horizontal scale factor denominator.
 *                  unsigned int vscale          : Vertical scale factor numerator.
 *                  unsigned int vratio          : Vertical scale factor denominator.
 *                  unsigned int interlaced      : Interlace flag.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
 *
 *  SPECIAL NOTES : Expansion is performed one band at a time to help with
 *                  caching.
 */
+ * + ****************************************************************************/ +static +void Scale2D +( + const unsigned char *source, + int source_pitch, + unsigned int source_width, + unsigned int source_height, + unsigned char *dest, + int dest_pitch, + unsigned int dest_width, + unsigned int dest_height, + unsigned char *temp_area, + unsigned char temp_area_height, + unsigned int hscale, + unsigned int hratio, + unsigned int vscale, + unsigned int vratio, + unsigned int interlaced +) +{ + unsigned int i, j, k; + unsigned int bands; + unsigned int dest_band_height; + unsigned int source_band_height; + + typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length, + unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length); + + Scale1D Scale1Dv = scale1d_c; + Scale1D Scale1Dh = scale1d_c; + + if (hscale == 2 && hratio == 1) + Scale1Dh = scale1d_2t1_ps; + + if (vscale == 2 && vratio == 1) + { + if (interlaced) + Scale1Dv = scale1d_2t1_ps; + else + Scale1Dv = scale1d_2t1_i; + } + + if (source_height == dest_height) + { + // for each band of the image + for (k = 0; k < dest_height; k++) + { + Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (dest_height > source_height) + { + dest_band_height = temp_area_height - 1; + source_band_height = dest_band_height * source_height / dest_height; + } + else + { + source_band_height = temp_area_height - 1; + dest_band_height = source_band_height * vratio / vscale; + } + + // first row needs to be done so that we can stay one row ahead for vertical zoom + Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width); + + // for each band of the image + bands = (dest_height + dest_band_height - 1) / dest_band_height; + + for (k = 0; k < bands; k++) + { + // scale one band horizontally + for (i = 1; i < 
source_band_height + 1; i++) + { + if (k * source_band_height + i < source_height) + { + Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1, + temp_area + i * dest_pitch, 1, hratio, dest_width); + } + else // Duplicate the last row + { + // copy temp_area row 0 over from last row in the past + duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch); + } + } + + // scale one band vertically + for (j = 0; j < dest_width; j++) + { + Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1, + &dest[j], dest_pitch, vratio, dest_band_height); + } + + // copy temp_area row 0 over from last row in the past + duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch); + + // move to the next band + source += source_band_height * source_pitch; + dest += dest_band_height * dest_pitch; + } +} + +/**************************************************************************** + * + * ROUTINE : vp8_scale_frame + * + * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be scaled. + * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold scaled frame. + * unsigned char *temp_area : Pointer to temp work area. + * unsigned char temp_area_height : Height of temp work area. + * unsigned int hscale : Horizontal scale factor numerator. + * unsigned int hratio : Horizontal scale factor denominator. + * unsigned int vscale : Vertical scale factor numerator. + * unsigned int vratio : Vertical scale factor denominator. + * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching. 
+ * + ****************************************************************************/ +void vp8_scale_frame +( + YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, + unsigned char temp_height, + unsigned int hscale, + unsigned int hratio, + unsigned int vscale, + unsigned int vratio, + unsigned int interlaced +) +{ + int i; + int dw = (hscale - 1 + src->y_width * hratio) / hscale; + int dh = (vscale - 1 + src->y_height * vratio) / vscale; + + // call our internal scaling routines!! + Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height, + (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh, + temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced); + + if (dw < (int)dst->y_width) + for (i = 0; i < dh; i++) + duck_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i*dst->y_stride+dw-2], dst->y_width - dw + 1); + + if (dh < (int)dst->y_height) + for (i = dh - 1; i < (int)dst->y_height; i++) + duck_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1); + + Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height, + (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2, + temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced); + + if (dw / 2 < (int)dst->uv_width) + for (i = 0; i < dst->uv_height; i++) + duck_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1); + + if (dh / 2 < (int)dst->uv_height) + for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++) + duck_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width); + + Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height, + (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2, + temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced); + + if (dw / 2 < 
(int)dst->uv_width) + for (i = 0; i < dst->uv_height; i++) + duck_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1); + + if (dh / 2 < (int) dst->uv_height) + for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++) + duck_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width); +} +/**************************************************************************** + * + * ROUTINE : any_ratio_2d_scale + * + * INPUTS : SCALE_INSTANCE *si : Pointer to post-processor instance (NOT USED). + * const unsigned char *source : Pointer to source image. + * unsigned int source_pitch : Stride of source image. + * unsigned int source_width : Width of source image. + * unsigned int source_height : Height of source image (NOT USED). + * unsigned char *dest : Pointer to destination image. + * unsigned int dest_pitch : Stride of destination image. + * unsigned int dest_width : Width of destination image. + * unsigned int dest_height : Height of destination image. + * + * OUTPUTS : None. + * + * RETURNS : int: 1 if image scaled, 0 if image could not be scaled. + * + * FUNCTION : Scale the image with changing apect ratio. + * + * SPECIAL NOTES : This scaling is a bi-linear scaling. Need to re-work the + * whole function for new scaling algorithm. 
+ * + ****************************************************************************/ +static +int any_ratio_2d_scale +( + SCALE_VARS *si, + const unsigned char *source, + unsigned int source_pitch, + unsigned int source_width, + unsigned int source_height, + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width, + unsigned int dest_height +) +{ + unsigned int i, k; + unsigned int src_band_height = 0; + unsigned int dest_band_height = 0; + + // suggested scale factors + int hs = si->HScale; + int hr = si->HRatio; + int vs = si->VScale; + int vr = si->VRatio; + + // assume the ratios are scalable instead of should be centered + int ratio_scalable = 1; + + void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL; + void (*vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL; + void (*last_vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL; + + (void) si; + + // find out the ratio for each direction + switch (hr * 10 / hs) + { + case 8: + // 4-5 Scale in Width direction + horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_4_5_scale_t; + break; + case 6: + // 3-5 Scale in Width direction + horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_3_5_scale_t; + break; + case 5: + // 1-2 Scale in Width direction + horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_1_2_scale_t; + break; + case 10: + // no scale in Width direction + horiz_line_scale = horizontal_line_copy; + break; + default: + // The ratio is not acceptable now + // throw("The ratio is not acceptable for now!"); + ratio_scalable = 0; + break; + } + + switch (vr * 10 / vs) + { + case 8: + // 4-5 Scale in vertical direction + vert_band_scale = g_scaling_ptrs->vpxvertical_band_4_5_scale_t; + last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_4_5_scale_t; + src_band_height = 4; + dest_band_height = 5; + break; + case 6: + // 3-5 Scale in vertical direction + vert_band_scale = 
g_scaling_ptrs->vpxvertical_band_3_5_scale_t; + last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_3_5_scale_t; + src_band_height = 3; + dest_band_height = 5; + break; + case 5: + // 1-2 Scale in vertical direction + vert_band_scale = g_scaling_ptrs->vpxvertical_band_1_2_scale_t; + last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_1_2_scale_t; + src_band_height = 1; + dest_band_height = 2; + break; + case 10: + // no scale in Width direction + vert_band_scale = null_scale; + last_vert_band_scale = null_scale; + src_band_height = 4; + dest_band_height = 4; + break; + default: + // The ratio is not acceptable now + // throw("The ratio is not acceptable for now!"); + ratio_scalable = 0; + break; + } + + if (ratio_scalable == 0) + return ratio_scalable; + + horiz_line_scale(source, source_width, dest, dest_width); + + // except last band + for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height - 1; k++) + { + // scale one band horizontally + for (i = 1; i < src_band_height; i++) + { + horiz_line_scale(source + i * source_pitch, + source_width, + dest + i * dest_pitch, + dest_width); + } + + // first line of next band + horiz_line_scale(source + src_band_height * source_pitch, + source_width, + dest + dest_band_height * dest_pitch, + dest_width); + + // Vertical scaling is in place + vert_band_scale(dest, dest_pitch, dest_width); + + // Next band... 
+ source += src_band_height * source_pitch; + dest += dest_band_height * dest_pitch; + } + + // scale one band horizontally + for (i = 1; i < src_band_height; i++) + { + horiz_line_scale(source + i * source_pitch, + source_width, + dest + i * dest_pitch, + dest_width); + } + + // Vertical scaling is in place + last_vert_band_scale(dest, dest_pitch, dest_width); + + return ratio_scalable; +} + +/**************************************************************************** + * + * ROUTINE : any_ratio_frame_scale + * + * INPUTS : SCALE_INSTANCE *si : Pointer to post-processor instance (NOT USED). + * unsigned char *frame_buffer : Pointer to source image. + * int YOffset : Offset from start of buffer to Y samples. + * int UVOffset : Offset from start of buffer to UV samples. + * + * OUTPUTS : None. + * + * RETURNS : int: 1 if image scaled, 0 if image could not be scaled. + * + * FUNCTION : Scale the image with changing apect ratio. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static +int any_ratio_frame_scale(SCALE_VARS *scale_vars, int YOffset, int UVOffset) +{ + int i; + int ew; + int eh; + + // suggested scale factors + int hs = scale_vars->HScale; + int hr = scale_vars->HRatio; + int vs = scale_vars->VScale; + int vr = scale_vars->VRatio; + + int ratio_scalable = 1; + + int sw = (scale_vars->expanded_frame_width * hr + hs - 1) / hs; + int sh = (scale_vars->expanded_frame_height * vr + vs - 1) / vs; + int dw = scale_vars->expanded_frame_width; + int dh = scale_vars->expanded_frame_height; + YV12_BUFFER_CONFIG *src_yuv_config = scale_vars->src_yuv_config; + YV12_BUFFER_CONFIG *dst_yuv_config = scale_vars->dst_yuv_config; + + if (hr == 3) + ew = (sw + 2) / 3 * 3 * hs / hr; + else + ew = (sw + 7) / 8 * 8 * hs / hr; + + if (vr == 3) + eh = (sh + 2) / 3 * 3 * vs / vr; + else + eh = (sh + 7) / 8 * 8 * vs / vr; + + ratio_scalable = any_ratio_2d_scale(scale_vars, + (const unsigned char 
*)src_yuv_config->y_buffer, + src_yuv_config->y_stride, sw, sh, + (unsigned char *) dst_yuv_config->y_buffer + YOffset, + dst_yuv_config->y_stride, dw, dh); + + for (i = 0; i < eh; i++) + duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride + dw, 0, ew - dw); + + for (i = dh; i < eh; i++) + duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride, 0, ew); + + if (ratio_scalable == 0) + return ratio_scalable; + + sw = (sw + 1) >> 1; + sh = (sh + 1) >> 1; + dw = (dw + 1) >> 1; + dh = (dh + 1) >> 1; + + any_ratio_2d_scale(scale_vars, + (const unsigned char *)src_yuv_config->u_buffer, + src_yuv_config->y_stride / 2, sw, sh, + (unsigned char *)dst_yuv_config->u_buffer + UVOffset, + dst_yuv_config->uv_stride, dw, dh); + + any_ratio_2d_scale(scale_vars, + (const unsigned char *)src_yuv_config->v_buffer, + src_yuv_config->y_stride / 2, sw, sh, + (unsigned char *)dst_yuv_config->v_buffer + UVOffset, + dst_yuv_config->uv_stride, dw, dh); + + return ratio_scalable; +} + +/**************************************************************************** + * + * ROUTINE : center_image + * + * INPUTS : SCALE_INSTANCE *si : Pointer to post-processor instance. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Centers the image without scaling in the output buffer. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +static void +center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_config) +{ + int i; + int row_offset, col_offset; + char *src_data_pointer; + char *dst_data_pointer; + + // center values + row_offset = (dst_yuv_config->y_height - src_yuv_config->y_height) / 2; + col_offset = (dst_yuv_config->y_width - src_yuv_config->y_width) / 2; + + // Y's + src_data_pointer = src_yuv_config->y_buffer; + dst_data_pointer = (char *)dst_yuv_config->y_buffer + (row_offset * dst_yuv_config->y_stride) + col_offset; + + for (i = 0; i < src_yuv_config->y_height; i++) + { + duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->y_width); + dst_data_pointer += dst_yuv_config->y_stride; + src_data_pointer += src_yuv_config->y_stride; + } + + row_offset /= 2; + col_offset /= 2; + + // U's + src_data_pointer = src_yuv_config->u_buffer; + dst_data_pointer = (char *)dst_yuv_config->u_buffer + (row_offset * dst_yuv_config->uv_stride) + col_offset; + + for (i = 0; i < src_yuv_config->uv_height; i++) + { + duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->uv_width); + dst_data_pointer += dst_yuv_config->uv_stride; + src_data_pointer += src_yuv_config->uv_stride; + } + + // V's + src_data_pointer = src_yuv_config->v_buffer; + dst_data_pointer = (char *)dst_yuv_config->v_buffer + (row_offset * dst_yuv_config->uv_stride) + col_offset; + + for (i = 0; i < src_yuv_config->uv_height; i++) + { + duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->uv_width); + dst_data_pointer += dst_yuv_config->uv_stride; + src_data_pointer += src_yuv_config->uv_stride; + } +} + +/**************************************************************************** + * + * ROUTINE : scale_or_center + * + * INPUTS : SCALE_INSTANCE *si : Pointer to post-processor instance. + * + * + * + * OUTPUTS : None. 
+ * + * RETURNS : void + * + * FUNCTION : Decides to scale or center image in scale buffer for blit + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void +vp8_yv12_scale_or_center +( + YV12_BUFFER_CONFIG *src_yuv_config, + YV12_BUFFER_CONFIG *dst_yuv_config, + int expanded_frame_width, + int expanded_frame_height, + int scaling_mode, + int HScale, + int HRatio, + int VScale, + int VRatio +) +{ +// if ( ppi->post_processing_level ) + // update_umvborder ( ppi, frame_buffer ); + + + switch (scaling_mode) + { + case SCALE_TO_FIT: + case MAINTAIN_ASPECT_RATIO: + { + SCALE_VARS scale_vars; + // center values +#if 1 + int row = (dst_yuv_config->y_height - expanded_frame_height) / 2; + int col = (dst_yuv_config->y_width - expanded_frame_width) / 2; +// int YOffset = row * dst_yuv_config->y_width + col; +// int UVOffset = (row>>1) * dst_yuv_config->uv_width + (col>>1); + int YOffset = row * dst_yuv_config->y_stride + col; + int UVOffset = (row >> 1) * dst_yuv_config->uv_stride + (col >> 1); +#else + int row = (src_yuv_config->y_height - expanded_frame_height) / 2; + int col = (src_yuv_config->y_width - expanded_frame_width) / 2; + int YOffset = row * src_yuv_config->y_width + col; + int UVOffset = (row >> 1) * src_yuv_config->uv_width + (col >> 1); +#endif + + scale_vars.dst_yuv_config = dst_yuv_config; + scale_vars.src_yuv_config = src_yuv_config; + scale_vars.HScale = HScale; + scale_vars.HRatio = HRatio; + scale_vars.VScale = VScale; + scale_vars.VRatio = VRatio; + scale_vars.expanded_frame_width = expanded_frame_width; + scale_vars.expanded_frame_height = expanded_frame_height; + + // perform center and scale + any_ratio_frame_scale(&scale_vars, YOffset, UVOffset); + + break; + } + case CENTER: + center_image(src_yuv_config, dst_yuv_config); + break; + + default: + break; + } +} diff --git a/vpx_scale/leapster/yv12extend.c b/vpx_scale/leapster/yv12extend.c new file mode 100644 index 000000000..480d971b4 
--- /dev/null +++ b/vpx_scale/leapster/yv12extend.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** + * + * Module Title : yv12extend.c + * + * Description : + * + ***************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +//#include <stdlib.h> +#include "vpx_scale/yv12config.h" +#include "vpx_mem/vpx_mem.h" + +/**************************************************************************** +* Exports +****************************************************************************/ + +/**************************************************************************** + * + ****************************************************************************/ +void +vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf) +{ + int i; + char *src_ptr1, *src_ptr2; + char *dest_ptr1, *dest_ptr2; + + unsigned int Border; + int plane_stride; + int plane_height; + int plane_width; + + /***********/ + /* Y Plane */ + /***********/ + Border = ybf->border; + plane_stride = ybf->y_stride; + plane_height = ybf->y_height; + plane_width = ybf->y_width; + + // copy the left and right most columns out + src_ptr1 = ybf->y_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + memset(dest_ptr1, src_ptr1[0], Border); + memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += 
plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->y_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)Border; i++) + { + memcpy(dest_ptr1, src_ptr1, plane_stride); + memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + plane_stride /= 2; + plane_height /= 2; + plane_width /= 2; + Border /= 2; + + /***********/ + /* U Plane */ + /***********/ + + // copy the left and right most columns out + src_ptr1 = ybf->u_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + memset(dest_ptr1, src_ptr1[0], Border); + memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->u_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + memcpy(dest_ptr1, src_ptr1, plane_stride); + memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } + + /***********/ + /* V Plane */ + /***********/ + + // copy the left and right most columns out + src_ptr1 = ybf->v_buffer; + src_ptr2 = src_ptr1 + plane_width - 1; + dest_ptr1 = src_ptr1 - Border; + dest_ptr2 = src_ptr2 + 1; + + for (i = 0; i < plane_height; i++) + { + memset(dest_ptr1, src_ptr1[0], Border); + memset(dest_ptr2, src_ptr2[0], Border); + src_ptr1 += plane_stride; + src_ptr2 += plane_stride; + dest_ptr1 += plane_stride; 
+ dest_ptr2 += plane_stride; + } + + // Now copy the top and bottom source lines into each line of the respective borders + src_ptr1 = ybf->v_buffer - Border; + src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; + dest_ptr1 = src_ptr1 - (Border * plane_stride); + dest_ptr2 = src_ptr2 + plane_stride; + + for (i = 0; i < (int)(Border); i++) + { + memcpy(dest_ptr1, src_ptr1, plane_stride); + memcpy(dest_ptr2, src_ptr2, plane_stride); + dest_ptr1 += plane_stride; + dest_ptr2 += plane_stride; + } +} +/**************************************************************************** + * + * ROUTINE : vp8_yv12_copy_frame + * + * INPUTS : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies the source image into the destination image and + * updates the destination's UMV borders. + * + * SPECIAL NOTES : The frames are assumed to be identical in size. + * + ****************************************************************************/ +void +vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) +{ + int row; + int i; + unsigned int *source; + _Uncached unsigned int *dest; + int height; + int width; + + height = src_ybc->y_height + (src_ybc->border * 2); + width = src_ybc->y_width + (src_ybc->border * 2); + width /= 4; + source = (unsigned int *)(src_ybc->y_buffer - (src_ybc->border * src_ybc->y_stride) - src_ybc->border); + dest = (_Uncached unsigned int *)(dst_ybc->y_buffer - (dst_ybc->border * dst_ybc->y_stride) - dst_ybc->border); + + for (row = 0; row < height; row++) + { + for (i = 0; i < width; i++) + { + dest[i] = source[i]; + } + + source += width; + dest += width; + } + + height = src_ybc->uv_height + (src_ybc->border); + width = src_ybc->uv_width + (src_ybc->border); + width /= 4; + + source = (unsigned int *)(src_ybc->u_buffer - (src_ybc->border / 2 * src_ybc->uv_stride) - src_ybc->border / 2); + dest = (_Uncached unsigned int *)(dst_ybc->u_buffer - (dst_ybc->border / 2 * dst_ybc->uv_stride) - dst_ybc->border 
/ 2); + + for (row = 0; row < height; row++) + { + for (i = 0; i < width; i++) + { + dest[i] = source[i]; + } + + source += width; + dest += width; + } + + source = (unsigned int *)(src_ybc->v_buffer - (src_ybc->border / 2 * src_ybc->uv_stride) - src_ybc->border / 2); + dest = (_Uncached unsigned int *)(dst_ybc->v_buffer - (dst_ybc->border / 2 * dst_ybc->uv_stride) - dst_ybc->border / 2); + + for (row = 0; row < height; row++) + { + for (i = 0; i < width; i++) + { + dest[i] = source[i]; + } + + source += width; + dest += width; + } + +} diff --git a/vpx_scale/scale_mode.h b/vpx_scale/scale_mode.h new file mode 100644 index 000000000..2a9ab7612 --- /dev/null +++ b/vpx_scale/scale_mode.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +***************************************************************************** +*/ + +#ifndef SCALE_MODE_H +#define SCALE_MODE_H + +typedef enum +{ + MAINTAIN_ASPECT_RATIO = 0x0, + SCALE_TO_FIT = 0x1, + CENTER = 0x2, + OTHER = 0x3 +} SCALE_MODE; + + +#endif diff --git a/vpx_scale/symbian/gen_scalers_armv4.asm b/vpx_scale/symbian/gen_scalers_armv4.asm new file mode 100644 index 000000000..1c904edae --- /dev/null +++ b/vpx_scale/symbian/gen_scalers_armv4.asm @@ -0,0 +1,773 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+;
+
+
+    EXPORT |horizontal_line_4_5_scale_armv4|
+    EXPORT |vertical_band_4_5_scale_armv4|
+    EXPORT |horizontal_line_2_3_scale_armv4|
+    EXPORT |vertical_band_2_3_scale_armv4|
+    EXPORT |horizontal_line_3_5_scale_armv4|
+    EXPORT |vertical_band_3_5_scale_armv4|
+    EXPORT |horizontal_line_3_4_scale_armv4|
+    EXPORT |vertical_band_3_4_scale_armv4|
+    EXPORT |horizontal_line_1_2_scale_armv4|
+    EXPORT |vertical_band_1_2_scale_armv4|
+
+    AREA |.text|, CODE, READONLY ; name this block of code
+
+src RN r0 ; r0: source pointer in the horizontal routines, dest pointer in the vertical ones
+srcw RN r1 ; r1: source_width in the horizontal routines (the vertical ones use r1 as dest_pitch)
+dest RN r2 ; r2: destination pointer in the horizontal routines (the vertical ones use r2 as dest_width)
+mask RN r12 ; byte-select mask (#255 in horizontal 4:5, 0xff00ff in vertical 1:2)
+c51_205 RN r10 ; loaded with 0x3300cd = (51 << 16) | 205 by the 4:5 and 3:5 routines
+c102_154 RN r11 ; loaded with 0x66009a = (102 << 16) | 154 by the 4:5 and 3:5 routines
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_4_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Width of source line in pixels (loop counter).
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 4 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+; r0 = UINT8 *source
+; r1 = UINT32 source_width
+; r2 = UINT8 *dest
+; r3 = UINT32 dest_width
+;)
+|horizontal_line_4_5_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    mov mask, #255 ; mask for selection
+    ldr c51_205, =0x3300cd ; (51 << 16) | 205
+    ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+    ldr r3, [src], #4 ; four pixels as one word; the low byte is treated as src[0]
+
+hl45_loop
+
+    and r4, r3, mask ; a = src[0]
+    and r5, mask, r3, lsr #8 ; b = src[1]
+    strb r4, [dest], #1 ; first output is a unchanged
+
+    orr r6, r4, r5, lsl #16 ; b | a
+    and r7, mask, r3, lsr #16 ; c = src[2]
+    mul r6, c51_205, r6 ; a * 51 + 205 * b (lands in bits 16..31 of the product)
+
+    orr r5, r5, r7, lsl #16 ; c | b
+    mul r5, c102_154, r5 ; b * 102 + 154 * c
+    add r6, r6, #0x8000 ; NOTE(review): bias is at bit 15; +128 rounding of the sum would be 0x800000 - confirm
+    and r8, mask, r3, lsr #24 ; d = src[3]
+    mov r6, r6, lsr #24 ; keep bits 24..31 = weighted sum >> 8
+    strb r6, [dest], #1
+
+    orr r7, r8, r7, lsl #16 ; c | d
+    mul r7, c102_154, r7 ; c * 154 + 102 * d
+    add r5, r5, #0x8000
+    ldr r3, [src], #4 ; next four pixels
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1
+
+    add r7, r7, #0x8000
+    and r9, mask, r3 ; e = src[4]
+    orr r9, r9, r8, lsl #16 ; d | e
+    mul r9, c51_205, r9 ; d * 205 + 51 * e
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1
+
+    add r9, r9, #0x8000
+    subs srcw, srcw, #4 ; four source pixels consumed
+    mov r9, r9, lsr #24
+    strb r9, [dest], #1
+
+    bne hl45_loop
+
+    and r4, r3, mask ; a, from the word loaded by the last loop pass
+    and r5, mask, r3, lsr #8 ; b (was lsl #8, which always masked to 0)
+    strb r4, [dest], #1
+
+    orr r6, r4, r5, lsl #16 ; b | a
+    mul r6, c51_205, r6
+
+    and r7, mask, r3, lsr #16 ; c (was lsl #16)
+    orr r5, r5, r7, lsl #16 ; c | b
+    mul r5, c102_154, r5
+    add r6, r6, #0x8000
+    and r8, mask, r3, lsr #24 ; d (was lsl #24)
+    mov r6, r6, lsr #24
+    strb r6, [dest], #1
+
+    orr r7, r8, r7, lsl #16 ; c | d
+    mul r7, c102_154, r7
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1
+
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1
+
+    ldrb r3, [src] ; byte after the last word loaded (src was already advanced) - NOTE(review): confirm intended
+    strb r3, [dest], #1
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE :
vertical_band_4_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+; * height of the band scaled is 4-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_4_5_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr c51_205, =0x3300cd ; (51 << 16) | 205
+    ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+vl45_loop
+    mov r3, src ; r3 walks down the current column
+    ldrb r4, [r3], r1 ; a = des [0]
+    ldrb r5, [r3], r1 ; b = des [dest_pitch]
+    ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+    add lr, src, r1 ; lr = write pointer, starting at row 1
+
+    orr r6, r4, r5, lsl #16 ; b | a
+    mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+    ldrb r8, [r3], r1 ; d = des[dest_pitch*3]
+    orr r5, r5, r7, lsl #16 ; c | b
+    mul r5, c102_154, r5 ; b * 102 + 154 * c
+    add r6, r6, #0x8000
+    orr r7, r8, r7, lsl #16 ; c | d
+    mov r6, r6, lsr #24 ; keep bits 24..31 = weighted sum >> 8
+    strb r6, [lr], r1 ; row 1
+
+    ldrb r9, [r3, r1] ; e = des [dest_pitch * 5]
+    mul r7, c102_154, r7 ; c * 154 + 102 * d
+    add r5, r5, #0x8000
+    orr r9, r9, r8, lsl #16 ; d | e
+    mov r5, r5, lsr #24
+    strb r5, [lr], r1 ; row 2
+
+    mul r9, c51_205, r9 ; d * 205 + 51 * e
+    add r7, r7, #0x8000
+    add src, src, #1 ; next column
+    mov r7, r7, lsr #24
+    strb r7, [lr], r1 ; row 3
+
+    add r9, r9, #0x8000
+    subs r2, r2, #1 ; one column done
+    mov r9, r9, lsr #24
+    strb r9, [lr], r1 ; row 4
+
+    bne vl45_loop
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_2_3_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Width of source line in pixels (loop counter).
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 2 to 3.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_2_3_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+    ldr lr, =85 ; weight of the outer pixel
+    ldr r12, =171 ; weight of the middle pixel
+
+hl23_loop
+
+    ldrb r3, [src], #1 ; a
+    ldrb r4, [src], #1 ; b
+    ldrb r5, [src] ; c (not consumed: it is a of the next pair)
+
+    strb r3, [dest], #1 ; first output is a unchanged
+    mul r4, r12, r4 ; b * 171
+    mla r6, lr, r3, r4 ; a * 85 + b * 171
+    mla r7, lr, r5, r4 ; c * 85 + b * 171
+
+    add r6, r6, #128 ; round
+    mov r6, r6, lsr #8
+    strb r6, [dest], #1
+
+    add r7, r7, #128 ; round
+    mov r7, r7, lsr #8
+    strb r7, [dest], #1
+
+    subs srcw, srcw, #2 ; two source pixels consumed
+    bne hl23_loop
+
+    ldrb r4, [src, #1] ; b
+    strb r5, [dest], #1 ; the c left over from the loop is a of the final pair
+    strb r4, [dest, #1] ; b goes one slot further on; the blend fills the gap
+
+    mul r4, r12, r4 ; b * 171
+    mla r6, lr, r5, r4 ; a * 85 + b *171
+
+    add r6, r6, #128 ; round
+    mov r6, r6, lsr #8
+    strb r6, [dest]
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_2_3_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+; * height of the band scaled is 2-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_2_3_scale_armv4| PROC
+    stmdb sp!, {r4 - r8, lr}
+    ldr lr, =85 ; weight of the outer row
+    ldr r12, =171 ; weight of the middle row
+    add r3, r1, r1, lsl #1 ; 3 * dest_pitch
+
+vl23_loop
+    ldrb r4, [src] ; a = des [0]
+    ldrb r5, [src, r1] ; b = des [dest_pitch]
+    ldrb r7, [src, r3] ; c = des [dest_pitch*3]
+    subs r2, r2, #1 ; flags are consumed by the bne below; nothing in between sets them
+
+    mul r5, r12, r5 ; b * 171
+    mla r6, lr, r4, r5 ; a * 85 + b * 171
+    mla r8, lr, r7, r5 ; c * 85 + b * 171
+
+    add r6, r6, #128 ; round
+    mov r6, r6, lsr #8
+    strb r6, [src, r1] ; row 1
+
+    add r8, r8, #128 ; round
+    mov r8, r8, lsr #8
+    strb r8, [src, r1, lsl #1] ; row 2
+
+    add src, src, #1 ; next column
+
+    bne vl23_loop
+
+    ldmia sp!, {r4 - r8, pc}
+    ENDP ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Width of source line in pixels (loop counter).
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_3_5_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr c51_205, =0x3300cd ; (51 << 16) | 205
+    ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+    ldrb r4, [src], #1 ; a = src[0]
+
+hl35_loop
+
+    ldrb r8, [src], #1 ; b = src[1]
+    strb r4, [dest], #1 ; first output is a unchanged
+
+    orr r6, r4, r8, lsl #16 ; b | a
+    ldrb r9, [src], #1 ; c = src[2]
+    mul r6, c102_154, r6 ; a * 102 + 154 * b (lands in bits 16..31 of the product)
+
+    orr r5, r9, r8, lsl #16 ; b | c
+    mul r5, c51_205, r5 ; b * 205 + 51 * c
+    add r6, r6, #0x8000 ; NOTE(review): bias is at bit 15; +128 rounding of the sum would be 0x800000 - confirm
+    ldrb r4, [src], #1 ; d = src[3]
+    mov r6, r6, lsr #24 ; keep bits 24..31 = weighted sum >> 8
+    strb r6, [dest], #1
+
+    orr r7, r8, r9, lsl #16 ; c | b
+    mul r7, c51_205, r7 ; c * 205 + 51 * b
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1
+
+    orr r9, r4, r9, lsl #16 ; c | d
+    mul r9, c102_154, r9 ; c * 154 + 102 * d
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1
+
+    add r9, r9, #0x8000
+    subs srcw, srcw, #3 ; three source pixels consumed
+    mov r9, r9, lsr #24
+    strb r9, [dest], #1
+
+    bpl hl35_loop ; loops while the counter is >= 0 (including exactly 0)
+
+    ldrb r8, [src], #1 ; b = src[1] (was loaded into r5, leaving a stale b in r8)
+    strb r4, [dest], #1
+
+    orr r6, r4, r8, lsl #16 ; b | a
+    ldrb r9, [src], #1 ; c = src[2]
+    mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+    orr r5, r9, r8, lsl #16 ; b | c
+    mul r5, c51_205, r5 ; b * 205 + 51 * c
+    add r6, r6, #0x8000
+    mov r6, r6, lsr #24
+    strb r6, [dest], #1
+
+    orr r7, r8, r9, lsl #16 ; c | b
+    mul r7, c51_205, r7 ; c * 205 + 51 * b
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1
+
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1
+    strb r9, [dest], #1 ; last output is c unchanged
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vp8cx_vertical_band_3_5_scale_c
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_3_5_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr c51_205, =0x3300cd ; (51 << 16) | 205
+    ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+vl35_loop
+    mov r3, src ; r3 walks down the current column
+    ldrb r4, [r3], r1 ; a = des [0]
+    ldrb r5, [r3], r1 ; b = des [dest_pitch]
+    ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+    add lr, src, r1 ; lr = write pointer, starting at row 1
+
+    orr r8, r4, r5, lsl #16 ; b | a
+    mul r6, c102_154, r8 ; a * 102 + 154 * b
+
+    ldrb r8, [r3, r1, lsl #1] ; d = des[dest_pitch*5]
+    orr r3, r7, r5, lsl #16 ; b | c
+    mul r9, c51_205, r3 ; b * 205 + 51 * c
+    add r6, r6, #0x8000
+    orr r3, r5, r7, lsl #16 ; c | b
+    mov r6, r6, lsr #24 ; keep bits 24..31 = weighted sum >> 8
+    strb r6, [lr], r1 ; row 1
+
+    mul r5, c51_205, r3 ; c * 205 + 51 * b
+    add r9, r9, #0x8000
+    orr r3, r8, r7, lsl #16 ; c | d
+    mov r9, r9, lsr #24
+    strb r9, [lr], r1 ; row 2
+
+    mul r7, c102_154, r3 ; c * 154 + 102 * d
+    add r5, r5, #0x8000
+    add src, src, #1 ; next column
+    mov r5, r5, lsr #24
+    strb r5, [lr], r1 ; row 3
+
+    add r7, r7, #0x8000
+    subs r2, r2, #1 ; one column done
+    mov r7, r7, lsr #24
+    strb r7, [lr], r1 ; row 4
+
+
+    bne vl35_loop
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_4_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Width of source line in pixels (loop counter).
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 4.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_3_4_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr r10, =64 ; weight 64
+    ldr r11, =192 ; weight 192
+    mov r9, #128 ; rounding term
+
+    ldrb r4, [src], #1 ; a = src[0]
+
+hl34_loop
+
+    ldrb r8, [src], #1 ; b = src[1]
+    ldrb r7, [src], #1 ; c = src[2]
+    strb r4, [dest], #1 ; first output is a unchanged
+
+    mla r4, r10, r4, r9 ; a*64 + 128
+    mla r4, r11, r8, r4 ; a*64 + b*192 + 128
+
+    add r8, r8, #1 ; b + 1
+    add r8, r8, r7 ; b + c + 1
+    mov r8, r8, asr #1 ; (b + c + 1) >> 1
+
+    mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+    strb r4, [dest], #1
+
+    strb r8, [dest], #1
+
+    ldrb r4, [src], #1 ; a of the next group
+
+    mla r7, r11, r7, r9 ; c*192 + 128
+    mla r7, r4, r10, r7 ; c*192 + next_a*64 + 128
+
+    subs srcw, srcw, #3 ; three source pixels consumed
+
+    mov r7, r7, asr #8 ; (c*192 + next_a*64 + 128) >> 8
+    strb r7, [dest], #1
+
+    bpl hl34_loop ; loops while the counter is >= 0 (including exactly 0)
+
+    ldrb r8, [src], #1 ; b = src[1]
+    ldrb r7, [src], #1 ; c = src[2]
+    strb r4, [dest], #1
+
+    mla r4, r10, r4, r9 ; a*64 + 128
+    mla r4, r11, r8, r4 ; a*64 + b*192 + 128
+    mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+    strb r4, [dest], #1
+
+    add r8, r8, #1 ; b + 1
+    add r8, r8, r7 ; b + c + 1
+    mov r8, r8, asr #1 ; (b + c + 1) >> 1
+    strb r8, [dest], #1
+    strb r7, [dest], #1 ; last output is c unchanged
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_4_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+; * height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_3_4_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr r10, =64 ; weight 64
+    ldr r11, =192 ; weight 192
+    mov r9, #128 ; rounding term
+
+; ldr r1,[r1]
+vl34_loop
+    mov r3, src ; r3 walks down the current column
+    ldrb r4, [r3], r1 ; a = des [0]
+    ldrb r5, [r3], r1 ; b = des [dest_pitch]
+    ldrb r7, [r3], r1 ; c = des [dest_pitch*2]
+    add lr, src, r1 ; lr = write pointer, starting at row 1
+
+    mla r4, r10, r4, r9 ; a*64 + 128
+    mla r4, r11, r5, r4 ; a*64 + b*192 + 128
+
+    add r5, r5, #1 ; b + 1
+    add r5, r5, r7 ; b + c + 1
+    mov r5, r5, asr #1 ; (b + c + 1) >> 1
+
+    mov r4, r4, asr #8 ; (a*64 + b*192 + 128) >> 8
+    strb r4, [lr], r1 ; row 1
+
+    ldrb r4, [r3, r1] ; a = des [dest_pitch*4]
+
+    strb r5, [lr], r1 ; row 2
+
+    mla r7, r11, r7, r9 ; c*192 + 128
+    mla r7, r4, r10, r7 ; c*192 + next_a*64 + 128
+    mov r7, r7, asr #8 ; (c*192 + next_a*64 + 128) >> 8
+
+    add src, src, #1 ; next column
+    subs r2, r2, #1 ; one column done
+
+    strb r7, [lr] ; row 3
+
+    bne vl34_loop
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_1_2_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Width of source line in pixels (loop counter).
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 1 to 2.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+|horizontal_line_1_2_scale_armv4| PROC
+    stmdb sp!, {r4 - r5, lr}
+
+    sub srcw, srcw, #1 ; NOTE(review): with source_width == 1 the counter goes negative at the first subs and the loop does not stop
+
+    ldrb r3, [src], #1 ; a
+    ldrb r4, [src], #1 ; b
+hl12_loop
+    subs srcw, srcw, #1 ; flags are consumed by ldrneb / bne below
+
+    add r5, r3, r4
+    add r5, r5, #1
+    mov r5, r5, lsr #1 ; (a + b + 1) >> 1
+
+    orr r5, r3, r5, lsl #8 ; low byte a, high byte the average
+    strh r5, [dest], #2 ; two output pixels in one halfword store
+
+    mov r3, r4 ; b becomes a of the next pair
+
+    ldrneb r4, [src], #1 ; fetch the next b only if the loop continues
+    bne hl12_loop
+
+    orr r5, r4, r4, lsl #8 ; last pixel duplicated
+    strh r5, [dest]
+
+    ldmia sp!, {r4 - r5, pc}
+    ENDP ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_1_2_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+; * height of the band scaled is 1-pixel.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+|vertical_band_1_2_scale_armv4| PROC
+    stmdb sp!, {r4 - r7, lr}
+
+    ldr mask, =0xff00ff ; mask for selection
+    ldr lr, = 0x010001 ; +1 rounding term for each 16-bit lane
+
+vl12_loop
+    mov r3, src
+    ldr r4, [r3], r1 ; four pixels of row 0; r3 now points at row 1
+    ldr r5, [r3, r1] ; four pixels of row 2
+
+    add src, src, #4 ; next four columns
+    subs r2, r2, #4 ; flags are consumed by the bpl below
+
+    and r6, r4, mask ; bytes 0 and 2 of row 0
+    and r7, r5, mask ; bytes 0 and 2 of row 2
+
+    add r6, r7, r6
+    add r6, r6, lr
+
+    and r4, mask, r4, lsr #8 ; bytes 1 and 3 of row 0
+    and r5, mask, r5, lsr #8 ; bytes 1 and 3 of row 2
+
+    mov r6, r6, lsr #1
+    and r6, r6, mask ; rounded averages of bytes 0 and 2
+
+    add r4, r5, r4
+    add r4, r4, lr
+
+    mov r4, r4, lsr #1
+    and r4, r4, mask ; rounded averages of bytes 1 and 3
+
+    orr r5, r6, r4, lsl #8 ; re-interleave the four averages
+
+    str r5, [r3] ; row 1
+
+    bpl vl12_loop ; loops while the counter is >= 0 (including exactly 0)
+
+    ldmia sp!, {r4 - r7, pc}
+    ENDP ;|vertical_band_1_2_scale_armv4|
+
+    END
diff --git a/vpx_scale/symbian/gen_scalers_armv4.s b/vpx_scale/symbian/gen_scalers_armv4.s
new file mode 100644
index 000000000..3dfd0b9b9
--- /dev/null
+++ b/vpx_scale/symbian/gen_scalers_armv4.s
@@ -0,0 +1,808 @@
+@ This file was created from a .asm file
+@ using the ads2gas.pl script.
+
+    .equ WIDE_REFERENCE, 0
+    .ifndef ARCHITECTURE
+    .equ ARCHITECTURE, 5
+    .endif
+    .global horizontal_line_4_5_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_4_5_scale_armv4, function
+    .endif
+    .global vertical_band_4_5_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_4_5_scale_armv4, function
+    .endif
+    .global horizontal_line_2_3_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_2_3_scale_armv4, function
+    .endif
+    .global vertical_band_2_3_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_2_3_scale_armv4, function
+    .endif
+    .global horizontal_line_3_5_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_3_5_scale_armv4, function
+    .endif
+    .global vertical_band_3_5_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_3_5_scale_armv4, function
+    .endif
+    .global horizontal_line_3_4_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_3_4_scale_armv4, function
+    .endif
+    .global vertical_band_3_4_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_3_4_scale_armv4, function
+    .endif
+    .global horizontal_line_1_2_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type horizontal_line_1_2_scale_armv4, function
+    .endif
+    .global vertical_band_1_2_scale_armv4
+    .ifndef NO_TYPE_PSEUDO_OP
+    .type vertical_band_1_2_scale_armv4, function
+    .endif
+
+.text
+
+src .req r0 @ r0: source pointer in the horizontal routines, dest pointer in the vertical ones
+srcw .req r1 @ r1: source_width in the horizontal routines (the vertical ones use r1 as dest_pitch)
+dest .req r2 @ r2: destination pointer in the horizontal routines (the vertical ones use r2 as dest_width)
+mask .req r12 @ byte-select mask (#255 in horizontal 4:5, 0xff00ff in vertical 1:2)
+c51_205 .req r10 @ loaded with 0x3300cd = (51 << 16) | 205 by the 4:5 and 3:5 routines
+c102_154 .req r11 @ loaded with 0x66009a = (102 << 16) | 154 by the 4:5 and 3:5 routines
+@/****************************************************************************
+@ *
+@ * ROUTINE : horizontal_line_4_5_scale_armv4
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Width of source line in pixels (loop counter).
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 4 to 5.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ ****************************************************************************/
+@void horizontal_line_4_5_scale_armv4
+@(
+@ r0 = UINT8 *source
+@ r1 = UINT32 source_width
+@ r2 = UINT8 *dest
+@ r3 = UINT32 dest_width
+@)
+_HorizontalLine_4_5_Scale_ARMv4:
+    horizontal_line_4_5_scale_armv4: @
+    stmdb sp!, {r4 - r11, lr}
+
+    mov mask, #255 @ mask for selection
+    ldr c51_205, =0x3300cd @ (51 << 16) | 205
+    ldr c102_154, =0x66009a @ (102 << 16) | 154
+
+    ldr r3, [src], #4 @ four pixels as one word; the low byte is treated as src[0]
+
+hl45_loop:
+
+    and r4, r3, mask @ a = src[0]
+    and r5, mask, r3, lsr #8 @ b = src[1]
+    strb r4, [dest], #1 @ first output is a unchanged
+
+    orr r6, r4, r5, lsl #16 @ b | a
+    and r7, mask, r3, lsr #16 @ c = src[2]
+    mul r6, c51_205, r6 @ a * 51 + 205 * b (lands in bits 16..31 of the product)
+
+    orr r5, r5, r7, lsl #16 @ c | b
+    mul r5, c102_154, r5 @ b * 102 + 154 * c
+    add r6, r6, #0x8000 @ NOTE(review): bias is at bit 15; +128 rounding of the sum would be 0x800000 - confirm
+    and r8, mask, r3, lsr #24 @ d = src[3]
+    mov r6, r6, lsr #24 @ keep bits 24..31 = weighted sum >> 8
+    strb r6, [dest], #1
+
+    orr r7, r8, r7, lsl #16 @ c | d
+    mul r7, c102_154, r7 @ c * 154 + 102 * d
+    add r5, r5, #0x8000
+    ldr r3, [src], #4 @ next four pixels
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1
+
+    add r7, r7, #0x8000
+    and r9, mask, r3 @ e = src[4]
+    orr r9, r9, r8, lsl #16 @ d | e
+    mul r9, c51_205, r9 @ d * 205 + 51 * e
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1
+
+    add r9, r9, #0x8000
+    subs srcw, srcw, #4 @ four source pixels consumed
+    mov r9, r9, lsr #24
+    strb r9, [dest], #1
+
+    bne hl45_loop
+
+    and r4, r3, mask @ a, from the word loaded by the last loop pass
+    and r5, mask, r3, lsr #8 @ b (was lsl #8, which always masked to 0)
+    strb r4, [dest], #1
+
+    orr r6, r4, r5, lsl #16 @ b | a
+    mul r6, c51_205, r6
+
+    and r7, mask, r3, lsr #16 @ c (was lsl #16)
+    orr r5, r5, r7, lsl #16 @ c | b
+    mul r5, c102_154, r5
+    add r6, r6, #0x8000
+    and r8, mask, r3, lsr #24 @ d (was lsl #24)
+    mov r6, r6, lsr #24
+    strb r6, [dest], #1
+
+    orr r7, r8, r7, lsl #16 @ c | d
+    mul r7, c102_154, r7
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1
+
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1
+
+    ldrb r3, [src] @ byte after the last word loaded (src was already advanced) - NOTE(review): confirm intended
+    strb r3, [dest], #1
+
+    ldmia sp!, {r4 - r11, pc}
+    @ @|horizontal_line_4_5_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vertical_band_4_5_scale_armv4
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+@ * height of the band scaled is 4-pixels.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_4_5_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+_VerticalBand_4_5_Scale_ARMv4:
+    vertical_band_4_5_scale_armv4: @
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr c51_205, =0x3300cd @ (51 << 16) | 205
+    ldr c102_154, =0x66009a @ (102 << 16) | 154
+
+vl45_loop:
+    mov r3, src @ r3 walks down the current column
+    ldrb r4, [r3], r1 @ a = des [0]
+    ldrb r5, [r3], r1 @ b = des [dest_pitch]
+    ldrb r7, [r3], r1 @ c = des[dest_pitch*2]
+    add lr, src, r1 @ lr = write pointer, starting at row 1
+
+    orr r6, r4, r5, lsl #16 @ b | a
+    mul r6, c51_205, r6 @ a * 51 + 205 * b
+
+    ldrb r8, [r3], r1 @ d = des[dest_pitch*3]
+    orr r5, r5, r7, lsl #16 @ c | b
+    mul r5, c102_154, r5 @ b * 102 + 154 * c
+    add r6, r6, #0x8000
+    orr r7, r8, r7, lsl #16 @ c | d
+    mov r6, r6, lsr #24 @ keep bits 24..31 = weighted sum >> 8
+    strb r6, [lr], r1 @ row 1
+
+    ldrb r9, [r3, r1] @ e = des [dest_pitch * 5]
+    mul r7, c102_154, r7 @ c * 154 + 102 * d
+    add r5, r5, #0x8000
+    orr r9, r9, r8, lsl #16 @ d | e
+    mov r5, r5, lsr #24
+    strb r5, [lr], r1 @ row 2
+
+    mul r9, c51_205, r9 @ d * 205 + 51 * e
+    add r7, r7, #0x8000
+    add src, src, #1 @ next column
+    mov r7, r7, lsr #24
+    strb r7, [lr], r1 @ row 3
+
+    add r9, r9, #0x8000
+    subs r2, r2, #1 @ one column done
+    mov r9, r9, lsr #24
+    strb r9, [lr], r1 @ row 4
+
+    bne vl45_loop
+
+    ldmia sp!, {r4 - r11, pc}
+    @ @|vertical_band_4_5_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE :
horizontal_line_2_3_scale_armv4
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Width of source line in pixels (loop counter).
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 2 to 3.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void horizontal_line_2_3_scale_armv4
+@(
+@ const unsigned char *source,
+@ unsigned int source_width,
+@ unsigned char *dest,
+@ unsigned int dest_width
+@)
+_HorizontalLine_2_3_Scale_ARMv4:
+    horizontal_line_2_3_scale_armv4: @
+    stmdb sp!, {r4 - r11, lr}
+    ldr lr, =85 @ weight of the outer pixel
+    ldr r12, =171 @ weight of the middle pixel
+
+hl23_loop:
+
+    ldrb r3, [src], #1 @ a
+    ldrb r4, [src], #1 @ b
+    ldrb r5, [src] @ c (not consumed: it is a of the next pair)
+
+    strb r3, [dest], #1 @ first output is a unchanged
+    mul r4, r12, r4 @ b * 171
+    mla r6, lr, r3, r4 @ a * 85 + b * 171
+    mla r7, lr, r5, r4 @ c * 85 + b * 171
+
+    add r6, r6, #128 @ round
+    mov r6, r6, lsr #8
+    strb r6, [dest], #1
+
+    add r7, r7, #128 @ round
+    mov r7, r7, lsr #8
+    strb r7, [dest], #1
+
+    subs srcw, srcw, #2 @ two source pixels consumed
+    bne hl23_loop
+
+    ldrb r4, [src, #1] @ b
+    strb r5, [dest], #1 @ the c left over from the loop is a of the final pair
+    strb r4, [dest, #1] @ b goes one slot further on; the blend fills the gap
+
+    mul r4, r12, r4 @ b * 171
+    mla r6, lr, r5, r4 @ a * 85 + b *171
+
+    add r6, r6, #128 @ round
+    mov r6, r6, lsr #8
+    strb r6, [dest]
+
+    ldmia sp!, {r4 - r11, pc}
+    @ @|horizontal_line_2_3_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vertical_band_2_3_scale_armv4
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+@ * height of the band scaled is 2-pixels.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_2_3_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+_VerticalBand_2_3_Scale_ARMv4:
+    vertical_band_2_3_scale_armv4: @
+    stmdb sp!, {r4 - r8, lr}
+    ldr lr, =85 @ weight of the outer row
+    ldr r12, =171 @ weight of the middle row
+    add r3, r1, r1, lsl #1 @ 3 * dest_pitch
+
+vl23_loop:
+    ldrb r4, [src] @ a = des [0]
+    ldrb r5, [src, r1] @ b = des [dest_pitch]
+    ldrb r7, [src, r3] @ c = des [dest_pitch*3]
+    subs r2, r2, #1 @ flags are consumed by the bne below; nothing in between sets them
+
+    mul r5, r12, r5 @ b * 171
+    mla r6, lr, r4, r5 @ a * 85 + b * 171
+    mla r8, lr, r7, r5 @ c * 85 + b * 171
+
+    add r6, r6, #128 @ round
+    mov r6, r6, lsr #8
+    strb r6, [src, r1] @ row 1
+
+    add r8, r8, #128 @ round
+    mov r8, r8, lsr #8
+    strb r8, [src, r1, lsl #1] @ row 2
+
+    add src, src, #1 @ next column
+
+    bne vl23_loop
+
+    ldmia sp!, {r4 - r8, pc}
+    @ @|vertical_band_2_3_scale_armv4|
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : horizontal_line_3_5_scale_armv4
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Width of source line in pixels (loop counter).
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 3 to 5.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void horizontal_line_3_5_scale_armv4
+@(
+@ const unsigned char *source,
+@ unsigned int source_width,
+@ unsigned char *dest,
+@ unsigned int dest_width
+@)
+_HorizontalLine_3_5_Scale_ARMv4:
+    horizontal_line_3_5_scale_armv4: @
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr c51_205, =0x3300cd @ (51 << 16) | 205
+    ldr c102_154, =0x66009a @ (102 << 16) | 154
+
+    ldrb r4, [src], #1 @ a = src[0]
+
+hl35_loop:
+
+    ldrb r8, [src], #1 @ b = src[1]
+    strb r4, [dest], #1 @ first output is a unchanged
+
+    orr r6, r4, r8, lsl #16 @ b | a
+    ldrb r9, [src], #1 @ c = src[2]
+    mul r6, c102_154, r6 @ a * 102 + 154 * b (lands in bits 16..31 of the product)
+
+    orr r5, r9, r8, lsl #16 @ b | c
+    mul r5, c51_205, r5 @ b * 205 + 51 * c
+    add r6, r6, #0x8000 @ NOTE(review): bias is at bit 15; +128 rounding of the sum would be 0x800000 - confirm
+    ldrb r4, [src], #1 @ d = src[3]
+    mov r6, r6, lsr #24 @ keep bits 24..31 = weighted sum >> 8
+    strb r6, [dest], #1
+
+    orr r7, r8, r9, lsl #16 @ c | b
+    mul r7, c51_205, r7 @ c * 205 + 51 * b
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1
+
+    orr r9, r4, r9, lsl #16 @ c | d
+    mul r9, c102_154, r9 @ c * 154 + 102 * d
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1
+
+    add r9, r9, #0x8000
+    subs srcw, srcw, #3 @ three source pixels consumed
+    mov r9, r9, lsr #24
+    strb r9, [dest], #1
+
+    bpl hl35_loop @ loops while the counter is >= 0 (including exactly 0)
+
+    ldrb r8, [src], #1 @ b = src[1] (was loaded into r5, leaving a stale b in r8)
+    strb r4, [dest], #1
+
+    orr r6, r4, r8, lsl #16 @ b | a
+    ldrb r9, [src], #1 @ c = src[2]
+    mul r6, c102_154, r6 @ a * 102 + 154 * b
+
+    orr r5, r9, r8, lsl #16 @ b | c
+    mul r5, c51_205, r5 @ b * 205 + 51 * c
+    add r6, r6, #0x8000
+    mov r6, r6, lsr #24
+    strb r6, [dest], #1
+
+    orr r7, r8, r9, lsl #16 @ c | b
+    mul r7, c51_205, r7 @ c * 205 + 51 * b
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1
+
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1
+    strb r9, [dest], #1 @ last output is c unchanged
+
+    ldmia sp!, {r4 - r11, pc}
+    @ @|horizontal_line_3_5_scale_armv4|
+
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vp8cx_vertical_band_3_5_scale_c
+@ *
+@ * INPUTS : unsigned char *dest :
Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+@ * height of the band scaled is 3-pixels.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_3_5_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+@ One column per iteration: rows 0, 1, 2 and 5 are read, rows 1..4 are
+@ written. "src" is the alias used for r0, which holds the dest pointer here.
+@ NOTE(review): the weighted sums sit in bits 16..31 of each product, so the
+@ #0x8000 bias followed by lsr #24 effectively truncates; rounding the sum
+@ by +128 would need #0x800000 - confirm which is intended.
+_VerticalBand_3_5_Scale_ARMv4:
+    vertical_band_3_5_scale_armv4: @
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr c51_205, =0x3300cd @ (51 << 16) | 205
+    ldr c102_154, =0x66009a @ (102 << 16) | 154
+
+vl35_loop:
+    mov r3, src @ r3 = read cursor walking down the column
+    ldrb r4, [r3], r1 @ a = des [0]
+    ldrb r5, [r3], r1 @ b = des [dest_pitch]
+    ldrb r7, [r3], r1 @ c = des[dest_pitch*2]
+    add lr, src, r1 @ lr = write cursor, starts at row 1
+
+    orr r8, r4, r5, lsl #16 @ b | a
+    mul r6, c102_154, r8 @ a * 102 + 154 * b
+
+    ldrb r8, [r3, r1, lsl #1] @ d = des[dest_pitch*5]
+    orr r3, r7, r5, lsl #16 @ b | c
+    mul r9, c51_205, r3 @ b * 205 + 51 * c
+    add r6, r6, #0x8000
+    orr r3, r5, r7, lsl #16 @ c | b
+    mov r6, r6, lsr #24
+    strb r6, [lr], r1 @ des [dest_pitch]
+
+    mul r5, c51_205, r3 @ c * 205 + 51 * b
+    add r9, r9, #0x8000
+    orr r3, r8, r7, lsl #16 @ c | d
+    mov r9, r9, lsr #24
+    strb r9, [lr], r1 @ des [dest_pitch*2]
+
+    mul r7, c102_154, r3 @ c * 154 + 102 * d
+    add r5, r5, #0x8000
+    add src, src, #1 @ next column
+    mov r5, r5, lsr #24
+    strb r5, [lr], r1 @ des [dest_pitch*3]
+
+    add r7, r7, #0x8000
+    subs r2, r2, #1
+    mov r7, r7, lsr #24
+    strb r7, [lr], r1 @ des [dest_pitch*4]
+
+
+    bne vl35_loop
+
+    ldmia sp!, {r4 - r11, pc}
+    @ end of vertical_band_3_5_scale_armv4
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : horizontal_line_3_4_scale_armv4
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Stride of source.
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 3 to 4.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ *
+@ ****************************************************************************/
+@void horizontal_line_3_4_scale_armv4
+@(
+@ const unsigned char *source,
+@ unsigned int source_width,
+@ unsigned char *dest,
+@ unsigned int dest_width
+@)
+@ Each group of 3 source pixels (a, b, c) plus the first pixel of the next
+@ group produces 4 output pixels.
+_HorizontalLine_3_4_Scale_ARMv4:
+    horizontal_line_3_4_scale_armv4: @
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr r10, =64 @ weight 64
+    ldr r11, =192 @ weight 192
+    mov r9, #128 @ rounding term
+
+    ldrb r4, [src], #1 @ a = src[0]
+
+hl34_loop:
+
+    ldrb r8, [src], #1 @ b = src[1]
+    ldrb r7, [src], #1 @ c = src[2]
+    strb r4, [dest], #1 @ des[0] = a
+
+    mla r4, r10, r4, r9 @ a*64 + 128
+    mla r4, r11, r8, r4 @ a*64 + b*192 + 128
+
+    add r8, r8, #1 @ b + 1
+    add r8, r8, r7 @ b + c + 1
+    mov r8, r8, asr #1 @ (b + c + 1) >> 1
+
+    mov r4, r4, asr #8 @ (a*64 + b*192 + 128) >> 8
+    strb r4, [dest], #1 @ des[1]
+
+    strb r8, [dest], #1 @ des[2]
+
+    ldrb r4, [src], #1 @ a of the next group (src[3])
+
+    mla r7, r11, r7, r9 @ c*192 + 128
+    mla r7, r4, r10, r7 @ next_a*64 + c*192 + 128
+
+    subs srcw, srcw, #3
+
+    mov r7, r7, asr #8 @ (next_a*64 + c*192 + 128) >> 8
+    strb r7, [dest], #1 @ des[3]
+
+    @ NOTE(review): bpl keeps looping while srcw - 3 >= 0, so for srcw = 3
+    @ the loop body runs twice before the tail - confirm the width
+    @ convention used by callers.
+    bpl hl34_loop
+
+    @ Tail: last group, no following pixel, so des[3] repeats c.
+    ldrb r8, [src], #1 @ b = src[1]
+    ldrb r7, [src], #1 @ c = src[2]
+    strb r4, [dest], #1 @ des[0] = a
+
+    mla r4, r10, r4, r9 @ a*64 + 128
+    mla r4, r11, r8, r4 @ a*64 + b*192 + 128
+    mov r4, r4, asr #8 @ (a*64 + b*192 + 128) >> 8
+    strb r4, [dest], #1 @ des[1]
+
+    add r8, r8, #1 @ b + 1
+    add r8, r8, r7 @ b + c + 1
+    mov r8, r8, asr #1 @ (b + c + 1) >> 1
+    strb r8, [dest], #1 @ des[2]
+    strb r7, [dest], #1 @ des[3] = c
+
+    ldmia sp!, {r4 - r11, pc}
+    @ end of horizontal_line_3_4_scale_armv4
+
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vertical_band_3_4_scale_armv4
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+@ * height of the band scaled is 3-pixels.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_3_4_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+@ One column per iteration: rows 0, 1, 2 and 4 are read, rows 1..3 are
+@ written. "src" is the alias used for r0, which holds the dest pointer here.
+_VerticalBand_3_4_Scale_ARMv4:
+    vertical_band_3_4_scale_armv4: @
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr r10, =64 @ weight 64
+    ldr r11, =192 @ weight 192
+    mov r9, #128 @ rounding term
+
+@ ldr r1,[r1]
+vl34_loop:
+    mov r3, src @ r3 = read cursor walking down the column
+    ldrb r4, [r3], r1 @ a = des [0]
+    ldrb r5, [r3], r1 @ b = des [dest_pitch]
+    ldrb r7, [r3], r1 @ c = des [dest_pitch*2]
+    add lr, src, r1 @ lr = write cursor, starts at row 1
+
+    mla r4, r10, r4, r9 @ a*64 + 128
+    mla r4, r11, r5, r4 @ a*64 + b*192 + 128
+
+    add r5, r5, #1 @ b + 1
+    add r5, r5, r7 @ b + c + 1
+    mov r5, r5, asr #1 @ (b + c + 1) >> 1
+
+    mov r4, r4, asr #8 @ (a*64 + b*192 + 128) >> 8
+    strb r4, [lr], r1 @ des [dest_pitch]
+
+    ldrb r4, [r3, r1] @ a = des [dest_pitch*4]
+
+    strb r5, [lr], r1 @ des [dest_pitch*2]
+
+    mla r7, r11, r7, r9 @ c*192 + 128
+    mla r7, r4, r10, r7 @ a*64 + c*192 + 128
+    mov r7, r7, asr #8 @ (a*64 + c*192 + 128) >> 8
+
+    add src, src, #1 @ next column
+    subs r2, r2, #1
+
+    strb r7, [lr] @ des [dest_pitch*3]
+
+    bne vl34_loop
+
+    ldmia sp!, {r4 - r11, pc}
+    @ end of vertical_band_3_4_scale_armv4
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : horizontal_line_1_2_scale_armv4
+@ *
+@ * INPUTS : const unsigned char *source : Pointer to source data.
+@ * unsigned int source_width : Stride of source.
+@ * unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_width : Stride of destination (NOT USED).
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Copies horizontal line of pixels from source to
+@ * destination scaling up by 1 to 2.
+@ *
+@ * SPECIAL NOTES : None.
+@ *
+@ ****************************************************************************/
+@void horizontal_line_1_2_scale_armv4
+@(
+@ const unsigned char *source,
+@ unsigned int source_width,
+@ unsigned char *dest,
+@ unsigned int dest_width
+@)
+@ Every source pixel yields two output pixels: the pixel itself and the
+@ rounded average with its right-hand neighbour. The final pixel is repeated.
+_HorizontalLine_1_2_Scale_ARMv4:
+    horizontal_line_1_2_scale_armv4: @
+    stmdb sp!, {r4 - r5, lr}
+
+    sub srcw, srcw, #1 @ the last pixel is handled after the loop
+
+    ldrb r3, [src], #1 @ a = src[0]
+    ldrb r4, [src], #1 @ b = src[1]
+hl12_loop:
+    subs srcw, srcw, #1 @ flags drive both ldrneb and bne below
+
+    add r5, r3, r4
+    add r5, r5, #1
+    mov r5, r5, lsr #1 @ (a + b + 1) >> 1
+
+    orr r5, r3, r5, lsl #8 @ a in bits 0..7, average in bits 8..15
+    strh r5, [dest], #2 @ both output pixels with one halfword store
+
+    mov r3, r4 @ b becomes the next a
+
+    ldrneb r4, [src], #1 @ fetch the next b only if another pass follows
+    bne hl12_loop
+
+    orr r5, r4, r4, lsl #8 @ last pixel duplicated
+    strh r5, [dest]
+
+    ldmia sp!, {r4 - r5, pc}
+    @ end of horizontal_line_1_2_scale_armv4
+
+@/****************************************************************************
+@ *
+@ * ROUTINE : vertical_band_1_2_scale_armv4
+@ *
+@ * INPUTS : unsigned char *dest : Pointer to destination data.
+@ * unsigned int dest_pitch : Stride of destination data.
+@ * unsigned int dest_width : Width of destination data.
+@ *
+@ * OUTPUTS : None.
+@ *
+@ * RETURNS : void
+@ *
+@ * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+@ * height of the band scaled is 1-pixel.
+@ *
+@ * SPECIAL NOTES : The routine uses the first line of the band below
+@ * the current band.
+@ *
+@ ****************************************************************************/
+@void vertical_band_1_2_scale_armv4
+@(
+@ r0 = UINT8 *dest
+@ r1 = UINT32 dest_pitch
+@ r2 = UINT32 dest_width
+@)
+@ Four columns per iteration: a word is read from row 0 and from row 2, and
+@ the per-byte rounded average is written to row 1. "src" is the alias used
+@ for r0, which holds the dest pointer here.
+_VerticalBand_1_2_Scale_ARMv4:
+    vertical_band_1_2_scale_armv4: @
+    stmdb sp!, {r4 - r7, lr}
+
+    ldr mask, =0xff00ff @ mask for selection
+    ldr lr, = 0x010001 @ +1 rounding term for each selected byte lane
+
+vl12_loop:
+    mov r3, src
+    ldr r4, [r3], r1 @ 4 pixels of row 0; r3 now points at row 1
+    ldr r5, [r3, r1] @ 4 pixels of row 2
+
+    add src, src, #4
+    subs r2, r2, #4 @ flags are consumed by the bpl below
+
+    and r6, r4, mask @ bytes 0 and 2 of row 0
+    and r7, r5, mask @ bytes 0 and 2 of row 2
+
+    add r6, r7, r6
+    add r6, r6, lr
+
+    and r4, mask, r4, lsr #8 @ bytes 1 and 3 of row 0
+    and r5, mask, r5, lsr #8 @ bytes 1 and 3 of row 2
+
+    mov r6, r6, lsr #1
+    and r6, r6, mask @ (x + y + 1) >> 1 for bytes 0 and 2
+
+    add r4, r5, r4
+    add r4, r4, lr
+
+    mov r4, r4, lsr #1
+    and r4, r4, mask @ (x + y + 1) >> 1 for bytes 1 and 3
+
+    orr r5, r6, r4, lsl #8 @ re-interleave the four averaged bytes
+
+    str r5, [r3] @ write row 1
+
+    @ NOTE(review): bpl keeps looping while the remaining count is >= 0,
+    @ so a width that is a multiple of 4 processes one extra 4-pixel group -
+    @ confirm callers rely on row padding.
+    bpl vl12_loop
+
+    ldmia sp!, {r4 - r7, pc}
+    @ end of vertical_band_1_2_scale_armv4
diff --git a/vpx_scale/symbian/scalesystemdependant.c b/vpx_scale/symbian/scalesystemdependant.c
new file mode 100644
index 000000000..a2acc3e9d
--- /dev/null
+++ b/vpx_scale/symbian/scalesystemdependant.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_machine_specific_config
+ *
+ * INPUTS : UINT32 Version : Codec version number.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Checks for machine specifc features such as MMX support
+ * sets appropriate flags and function pointers.
+ *
+ * SPECIAL NOTES : None.
+ * + ****************************************************************************/ +void vp8_scale_machine_specific_config() +{ +#ifndef VPX_NO_GLOBALS + vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_armv4; + vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_armv4; + vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c; + vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_armv4; + vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_armv4; + vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c; + vp8_horizontal_line_3_4_scale = horizontal_line_3_4_scale_armv4; + vp8_vertical_band_3_4_scale = vertical_band_3_4_scale_armv4; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = horizontal_line_2_3_scale_armv4; + vp8_vertical_band_2_3_scale = vertical_band_2_3_scale_armv4; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_armv4; + vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_armv4; + vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c; + + + vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c; + vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c; + vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c; + vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c; + vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c; + vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c; + vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c; +#endif +} diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk new file mode 100644 index 000000000..f4ab258ed --- /dev/null +++ b/vpx_scale/vpx_scale.mk @@ -0,0 +1,23 @@ +SCALE_SRCS-yes += vpx_scale.mk +SCALE_SRCS-yes += scale_mode.h +SCALE_SRCS-yes += yv12extend.h +SCALE_SRCS-yes += yv12config.h +SCALE_SRCS-yes += vpxscale.h 
+SCALE_SRCS-yes += generic/vpxscale.c +SCALE_SRCS-yes += generic/yv12config.c +SCALE_SRCS-yes += generic/yv12extend.c +SCALE_SRCS-yes += generic/scalesystemdependant.c +SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c + +#arm +SCALE_SRCS-$(HAVE_ARMV7) += arm/scalesystemdependant.c +SCALE_SRCS-$(HAVE_ARMV7) += arm/yv12extend_arm.c +SCALE_SRCS_REMOVE-$(HAVE_ARMV7) += generic/scalesystemdependant.c + +#neon +SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM) +SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copyframeyonly_neon$(ASM) +SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM) +SCALE_SRCS-$(HAVE_ARMV7) += arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM) + +SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes) diff --git a/vpx_scale/vpxscale.h b/vpx_scale/vpxscale.h new file mode 100644 index 000000000..9a86b75de --- /dev/null +++ b/vpx_scale/vpxscale.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +#ifndef VPXSCALE_H +#define VPXSCALE_H + +#include "vpx_scale/yv12config.h" +void vp8cx_horizontal_line_4_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_2_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_2_3_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_3_5_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_3_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_3_4_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_1_2_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_5_4_scale_c(const unsigned char *source, 
unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vp8cx_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vp8cx_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + + +extern void (*vp8_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_last_vertical_band_4_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_last_vertical_band_2_3_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_last_vertical_band_3_5_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_last_vertical_band_3_4_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_horizontal_line_1_2_scale)(const 
unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_horizontal_line_3_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_horizontal_line_3_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_horizontal_line_2_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_horizontal_line_4_5_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_last_vertical_band_1_2_scale)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_vertical_band_5_4_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_vertical_band_5_3_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_vertical_band_2_1_scale)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_vertical_band_2_1_scale_i)(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +extern void (*vp8_horizontal_line_2_1_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_horizontal_line_5_3_scale)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +extern void (*vp8_horizontal_line_5_4_scale)(const unsigned char *source, unsigned int source_width, unsigned char 
*dest, unsigned int dest_width); + +void horizontal_line_4_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_2_3_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_3_5_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_3_4_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void horizontal_line_1_2_scale_armv4(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +void vertical_band_4_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_2_3_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_3_5_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_3_4_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +void vertical_band_1_2_scale_armv4(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); + + +extern void dmachine_specific_config(int mmx_enabled, int xmm_enabled, int wmt_enabled); +extern void vp8_yv12_scale_or_center +( + YV12_BUFFER_CONFIG *src_yuv_config, + YV12_BUFFER_CONFIG *dst_yuv_config, + int expanded_frame_width, + int expanded_frame_height, + int scaling_mode, + int HScale, + int HRatio, + int VScale, + int VRatio +); +extern void vp8_scale_frame +( + YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, + unsigned char temp_height, + unsigned int hscale, + unsigned int hratio, + unsigned int vscale, + unsigned int vratio, + unsigned int interlaced +); +extern void vp8_scale_machine_specific_config(void); + +extern void 
(*vp8_yv12_extend_frame_borders_ptr)(YV12_BUFFER_CONFIG *ybf); +extern void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf); +extern void vp8_yv12_extend_frame_borders_neon(YV12_BUFFER_CONFIG *ybf); + +extern void (*vp8_yv12_copy_frame_yonly_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +extern void (*vp8_yv12_copy_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +#endif diff --git a/vpx_scale/wce/gen_scalers_armv4.asm b/vpx_scale/wce/gen_scalers_armv4.asm new file mode 100644 index 000000000..1c904edae --- /dev/null +++ b/vpx_scale/wce/gen_scalers_armv4.asm @@ -0,0 +1,773 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+;
+
+
+    EXPORT |horizontal_line_4_5_scale_armv4|
+    EXPORT |vertical_band_4_5_scale_armv4|
+    EXPORT |horizontal_line_2_3_scale_armv4|
+    EXPORT |vertical_band_2_3_scale_armv4|
+    EXPORT |horizontal_line_3_5_scale_armv4|
+    EXPORT |vertical_band_3_5_scale_armv4|
+    EXPORT |horizontal_line_3_4_scale_armv4|
+    EXPORT |vertical_band_3_4_scale_armv4|
+    EXPORT |horizontal_line_1_2_scale_armv4|
+    EXPORT |vertical_band_1_2_scale_armv4|
+
+    AREA |.text|, CODE, READONLY ; name this block of code
+
+; Register aliases shared by the routines in this file.
+src RN r0 ; first argument: source pointer (the vertical routines receive dest in r0)
+srcw RN r1 ; second argument of the horizontal routines: source width
+dest RN r2 ; third argument of the horizontal routines: destination pointer
+mask RN r12 ; byte-selection mask, loaded by the routines that use it
+c51_205 RN r10 ; packed weight pair, loaded with 0x3300cd = (51 << 16) | 205
+c102_154 RN r11 ; packed weight pair, loaded with 0x66009a = (102 << 16) | 154
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_4_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 4 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_4_5_scale_armv4
+;(
+; r0 = UINT8 *source
+; r1 = UINT32 source_width
+; r2 = UINT8 *dest
+; r3 = UINT32 dest_width
+;)
+; Each group of 4 source pixels (a, b, c, d), read as one 32-bit word, plus
+; the first pixel of the next word (e) produces 5 output pixels. Two 8-bit
+; samples are packed into one register (bits 0..7 and 16..23) and multiplied
+; by a packed weight pair, so the weighted sum lands in bits 16..31.
+; NOTE(review): the #0x8000 bias sits below that sum, so the following
+; lsr #24 effectively truncates; rounding the sum by +128 would need
+; #0x800000 - confirm which is intended.
+|horizontal_line_4_5_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    mov mask, #255 ; mask for selection
+    ldr c51_205, =0x3300cd ; (51 << 16) | 205
+    ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+    ldr r3, [src], #4 ; first 4 pixels
+
+hl45_loop
+
+    and r4, r3, mask ; a = src[0]
+    and r5, mask, r3, lsr #8 ; b = src[1]
+    strb r4, [dest], #1 ; des[0] = a
+
+    orr r6, r4, r5, lsl #16 ; b | a
+    and r7, mask, r3, lsr #16 ; c = src[2]
+    mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+    orr r5, r5, r7, lsl #16 ; c | b
+    mul r5, c102_154, r5 ; b * 102 + 154 * c
+    add r6, r6, #0x8000
+    and r8, mask, r3, lsr #24 ; d = src[3]
+    mov r6, r6, lsr #24
+    strb r6, [dest], #1 ; des[1]
+
+    orr r7, r8, r7, lsl #16 ; c | d
+    mul r7, c102_154, r7 ; c * 154 + 102 * d
+    add r5, r5, #0x8000
+    ldr r3, [src], #4 ; next 4 pixels
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1 ; des[2]
+
+    add r7, r7, #0x8000
+    and r9, mask, r3 ; e = src[4]
+    orr r9, r9, r8, lsl #16 ; d | e
+    mul r9, c51_205, r9 ; d * 205 + 51 * e
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1 ; des[3]
+
+    add r9, r9, #0x8000
+    subs srcw, srcw, #4
+    mov r9, r9, lsr #24
+    strb r9, [dest], #1 ; des[4]
+
+    bne hl45_loop
+
+    ; Tail: the word already held in r3 is the last group.
+    ; The byte extraction must shift RIGHT exactly as in the loop; the
+    ; original used lsl here, and (r3 << n) & 0xff is always 0, so b, c and
+    ; d were all read as zero.
+    and r4, r3, mask ; a = src[0]
+    and r5, mask, r3, lsr #8 ; b = src[1]
+    strb r4, [dest], #1 ; des[0] = a
+
+    orr r6, r4, r5, lsl #16 ; b | a
+    mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+    and r7, mask, r3, lsr #16 ; c = src[2]
+    orr r5, r5, r7, lsl #16 ; c | b
+    mul r5, c102_154, r5 ; b * 102 + 154 * c
+    add r6, r6, #0x8000
+    and r8, mask, r3, lsr #24 ; d = src[3]
+    mov r6, r6, lsr #24
+    strb r6, [dest], #1 ; des[1]
+
+    orr r7, r8, r7, lsl #16 ; c | d
+    mul r7, c102_154, r7 ; c * 154 + 102 * d
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1 ; des[2]
+
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1 ; des[3]
+
+    ; NOTE(review): src already points past the word processed above, so
+    ; this copies the byte following the last group rather than d (r8) -
+    ; confirm that is intended.
+    ldrb r3, [src]
+    strb r3, [dest], #1 ; des[4]
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|horizontal_line_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE :
vertical_band_4_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
+; * height of the band scaled is 4-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_4_5_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+; One column per iteration: rows 0, 1, 2, 3 and 5 are read, rows 1..4 are
+; written. "src" is the alias of r0, which holds the dest pointer here.
+; NOTE(review): the weighted sums sit in bits 16..31 of each product, so the
+; #0x8000 bias followed by lsr #24 effectively truncates; rounding the sum
+; by +128 would need #0x800000 - confirm which is intended.
+|vertical_band_4_5_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr c51_205, =0x3300cd ; (51 << 16) | 205
+    ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+vl45_loop
+    mov r3, src ; r3 = read cursor walking down the column
+    ldrb r4, [r3], r1 ; a = des [0]
+    ldrb r5, [r3], r1 ; b = des [dest_pitch]
+    ldrb r7, [r3], r1 ; c = des[dest_pitch*2]
+    add lr, src, r1 ; lr = write cursor, starts at row 1
+
+    orr r6, r4, r5, lsl #16 ; b | a
+    mul r6, c51_205, r6 ; a * 51 + 205 * b
+
+    ldrb r8, [r3], r1 ; d = des[dest_pitch*3]
+    orr r5, r5, r7, lsl #16 ; c | b
+    mul r5, c102_154, r5 ; b * 102 + 154 * c
+    add r6, r6, #0x8000
+    orr r7, r8, r7, lsl #16 ; c | d
+    mov r6, r6, lsr #24
+    strb r6, [lr], r1 ; des [dest_pitch]
+
+    ldrb r9, [r3, r1] ; e = des [dest_pitch * 5]
+    mul r7, c102_154, r7 ; c * 154 + 102 * d
+    add r5, r5, #0x8000
+    orr r9, r9, r8, lsl #16 ; d | e
+    mov r5, r5, lsr #24
+    strb r5, [lr], r1 ; des [dest_pitch*2]
+
+    mul r9, c51_205, r9 ; d * 205 + 51 * e
+    add r7, r7, #0x8000
+    add src, src, #1 ; next column
+    mov r7, r7, lsr #24
+    strb r7, [lr], r1 ; des [dest_pitch*3]
+
+    add r9, r9, #0x8000
+    subs r2, r2, #1
+    mov r9, r9, lsr #24
+    strb r9, [lr], r1 ; des [dest_pitch*4]
+
+    bne vl45_loop
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|vertical_band_4_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_2_3_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 2 to 3.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_2_3_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+; Each pair of source pixels (a, b) plus the first pixel of the next pair (c)
+; produces 3 output pixels.
+; NOTE(review): with srcw = 2 the loop body runs once and the tail then reads
+; src[2] and src[3] - confirm the width convention used by callers.
+|horizontal_line_2_3_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+    ldr lr, =85 ; weight of the outer sample
+    ldr r12, =171 ; weight of the middle sample
+
+hl23_loop
+
+    ldrb r3, [src], #1 ; a
+    ldrb r4, [src], #1 ; b
+    ldrb r5, [src] ; c (peeked; src is not advanced)
+
+    strb r3, [dest], #1 ; des[0] = a
+    mul r4, r12, r4 ; b * 171
+    mla r6, lr, r3, r4 ; a * 85 + b * 171
+    mla r7, lr, r5, r4 ; c * 85 + b * 171
+
+    add r6, r6, #128 ; round to nearest
+    mov r6, r6, lsr #8
+    strb r6, [dest], #1 ; des[1]
+
+    add r7, r7, #128 ; round to nearest
+    mov r7, r7, lsr #8
+    strb r7, [dest], #1 ; des[2]
+
+    subs srcw, srcw, #2
+    bne hl23_loop
+
+    ; Tail: r5 still holds the first pixel of the last pair (a).
+    ldrb r4, [src, #1] ; b
+    strb r5, [dest], #1 ; des[0] = a
+    strb r4, [dest, #1] ; des[2] = b
+
+    mul r4, r12, r4 ; b * 171
+    mla r6, lr, r5, r4 ; a * 85 + b *171
+
+    add r6, r6, #128 ; round to nearest
+    mov r6, r6, lsr #8
+    strb r6, [dest] ; des[1]
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|horizontal_line_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_2_3_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_pitch : Stride of destination data.
+; * unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 2 to 3. The
+; * height of the band scaled is 2-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; * the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_2_3_scale_armv4
+;(
+; r0 = UINT8 *dest
+; r1 = UINT32 dest_pitch
+; r2 = UINT32 dest_width
+;)
+; One column per iteration: rows 0, 1 and 3 of the column are read and
+; rows 1 and 2 are rewritten in place. "src" is the alias of r0, which holds
+; the dest pointer in this routine.
+|vertical_band_2_3_scale_armv4| PROC
+    stmdb sp!, {r4 - r8, lr}
+    ldr lr, =85 ; weight of the outer sample
+    ldr r12, =171 ; weight of the middle sample
+    add r3, r1, r1, lsl #1 ; 3 * dest_pitch
+
+vl23_loop
+    ldrb r4, [src] ; a = des [0]
+    ldrb r5, [src, r1] ; b = des [dest_pitch]
+    ldrb r7, [src, r3] ; c = des [dest_pitch*3]
+    subs r2, r2, #1 ; flags are consumed by the bne below; nothing in between sets them
+
+    mul r5, r12, r5 ; b * 171
+    mla r6, lr, r4, r5 ; a * 85 + b * 171
+    mla r8, lr, r7, r5 ; c * 85 + b * 171
+
+    add r6, r6, #128 ; round to nearest
+    mov r6, r6, lsr #8
+    strb r6, [src, r1] ; des [dest_pitch] = (a*85 + b*171 + 128) >> 8
+
+    add r8, r8, #128 ; round to nearest
+    mov r8, r8, lsr #8
+    strb r8, [src, r1, lsl #1] ; des [dest_pitch*2] = (b*171 + c*85 + 128) >> 8
+
+    add src, src, #1 ; next column
+
+    bne vl23_loop
+
+    ldmia sp!, {r4 - r8, pc}
+    ENDP ;|vertical_band_2_3_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_5_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; * unsigned int source_width : Stride of source.
+; * unsigned char *dest : Pointer to destination data.
+; * unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; * destination scaling up by 3 to 5.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_5_scale_armv4
+;(
+; const unsigned char *source,
+; unsigned int source_width,
+; unsigned char *dest,
+; unsigned int dest_width
+;)
+; Each group of 3 source pixels (a, b, c) plus the first pixel of the next
+; group (d) produces 5 output pixels. Two 8-bit samples are packed into one
+; register (one in bits 0..7, one in bits 16..23) and multiplied by a packed
+; weight pair, so the weighted sum of both samples lands in bits 16..31.
+; NOTE(review): the #0x8000 bias sits below that sum, so the following
+; lsr #24 effectively truncates; rounding the sum by +128 would need
+; #0x800000 - confirm which is intended.
+|horizontal_line_3_5_scale_armv4| PROC
+    stmdb sp!, {r4 - r11, lr}
+
+    ldr c51_205, =0x3300cd ; (51 << 16) | 205
+    ldr c102_154, =0x66009a ; (102 << 16) | 154
+
+    ldrb r4, [src], #1 ; a = src[0]
+
+hl35_loop
+
+    ldrb r8, [src], #1 ; b = src[1]
+    strb r4, [dest], #1 ; des[0] = a
+
+    orr r6, r4, r8, lsl #16 ; b | a
+    ldrb r9, [src], #1 ; c = src[2]
+    mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+    orr r5, r9, r8, lsl #16 ; b | c
+    mul r5, c51_205, r5 ; b * 205 + 51 * c
+    add r6, r6, #0x8000
+    ldrb r4, [src], #1 ; d = src[3], becomes a of the next group
+    mov r6, r6, lsr #24
+    strb r6, [dest], #1 ; des[1]
+
+    orr r7, r8, r9, lsl #16 ; c | b
+    mul r7, c51_205, r7 ; c * 205 + 51 * b
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1 ; des[2]
+
+    orr r9, r4, r9, lsl #16 ; c | d
+    mul r9, c102_154, r9 ; c * 154 + 102 * d
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1 ; des[3]
+
+    add r9, r9, #0x8000
+    subs srcw, srcw, #3
+    mov r9, r9, lsr #24
+    strb r9, [dest], #1 ; des[4]
+
+    ; NOTE(review): bpl keeps looping while srcw - 3 >= 0, so for srcw = 3
+    ; the loop body runs twice before the tail - confirm the width
+    ; convention used by callers.
+    bpl hl35_loop
+
+    ; Tail: last group, no following pixel, so des[4] repeats c.
+    ; b must be loaded into r8: every use below reads r8, and r5 is
+    ; overwritten before it is ever read (the original loaded b into r5 and
+    ; then blended with the stale b of the previous group).
+    ldrb r8, [src], #1 ; b = src[1]
+    strb r4, [dest], #1 ; des[0] = a
+
+    orr r6, r4, r8, lsl #16 ; b | a
+    ldrb r9, [src], #1 ; c = src[2]
+    mul r6, c102_154, r6 ; a * 102 + 154 * b
+
+    orr r5, r9, r8, lsl #16 ; b | c
+    mul r5, c51_205, r5 ; b * 205 + 51 * c
+    add r6, r6, #0x8000
+    mov r6, r6, lsr #24
+    strb r6, [dest], #1 ; des[1]
+
+    orr r7, r8, r9, lsl #16 ; c | b
+    mul r7, c51_205, r7 ; c * 205 + 51 * b
+    add r5, r5, #0x8000
+    mov r5, r5, lsr #24
+    strb r5, [dest], #1 ; des[2]
+
+    add r7, r7, #0x8000
+    mov r7, r7, lsr #24
+    strb r7, [dest], #1 ; des[3]
+    strb r9, [dest], #1 ; des[4] = c
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ;|horizontal_line_3_5_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_5_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; *          unsigned int dest_pitch : Stride of destination data.
+; *          unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
+; *            height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; *                 the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_5_scale_armv4
+;(
+;    r0 = UINT8 *dest
+;    r1 = UINT32 dest_pitch
+;    r2 = UINT32 dest_width
+;)
+; Processes one column per iteration, r2 iterations in total. With
+; a, b, c = rows 0, 1, 2 and d = row 5 (all read before any row is written):
+;    row 1 <- (a*102 + b*154 + 128) >> 8
+;    row 2 <- (b*205 + c*51  + 128) >> 8
+;    row 3 <- (b*51  + c*205 + 128) >> 8
+;    row 4 <- (c*154 + d*102 + 128) >> 8
+;
+; Two pixels are packed as lo | hi << 16 and multiplied by a packed weight
+; pair; bits 16..31 of the product hold the blended sum, so the rounding
+; term is 128 << 16 = 0x800000 and the result is taken with lsr #24.
+|vertical_band_3_5_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     c51_205, =0x3300cd          ; 51 << 16 | 205
+    ldr     c102_154, =0x66009a         ; 102 << 16 | 154
+
+vl35_loop
+    mov     r3, src                     ; r3 walks down the column for reads
+    ldrb    r4, [r3], r1                ; a = des [0]
+    ldrb    r5, [r3], r1                ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1                ; c = des[dest_pitch*2]
+    add     lr, src, r1                 ; lr walks down the column for writes, from row 1
+
+    orr     r8, r4, r5, lsl #16         ; b | a
+    mul     r6, c102_154, r8            ; a * 102 + 154 * b
+
+    ldrb    r8, [r3, r1, lsl #1]        ; d = des[dest_pitch*5] (r3 is at row 3 here)
+    orr     r3, r7, r5, lsl #16         ; b | c
+    mul     r9, c51_205, r3             ; b * 205 + 51 * c
+    add     r6, r6, #0x800000           ; + 128 (sum lives in bits 16..31)
+    orr     r3, r5, r7, lsl #16         ; c | b
+    mov     r6, r6, lsr #24
+    strb    r6, [lr], r1                ; des[dest_pitch]
+
+    mul     r5, c51_205, r3             ; c * 205 + 51 * b
+    add     r9, r9, #0x800000           ; + 128
+    orr     r3, r8, r7, lsl #16         ; c | d
+    mov     r9, r9, lsr #24
+    strb    r9, [lr], r1                ; des[dest_pitch*2]
+
+    mul     r7, c102_154, r3            ; c * 154 + 102 * d
+    add     r5, r5, #0x800000           ; + 128
+    add     src, src, #1                ; next column
+    mov     r5, r5, lsr #24
+    strb    r5, [lr], r1                ; des[dest_pitch*3]
+
+    add     r7, r7, #0x800000           ; + 128
+    subs    r2, r2, #1                  ; column counter; flags are kept until the bne below
+    mov     r7, r7, lsr #24
+    strb    r7, [lr], r1                ; des[dest_pitch*4]
+
+
+    bne     vl35_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_3_5_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_3_4_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; *          unsigned int source_width : Width of the source line (loop count).
+; *          unsigned char *dest : Pointer to destination data.
+; *          unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; *            destination scaling up by 3 to 4.
+; *
+; * SPECIAL NOTES : None.
+; *
+; *
+; ****************************************************************************/
+;void horizontal_line_3_4_scale_armv4
+;(
+;    const unsigned char *source,     r0 (alias src)
+;    unsigned int source_width,       r1 (alias srcw)
+;    unsigned char *dest,             r2 (alias dest)
+;    unsigned int dest_width          r3 (not read)
+;)
+; Each group of three source pixels a, b, c (plus d, the first pixel of the
+; next group) produces four output pixels:
+;    a, (a*64 + b*192 + 128) >> 8, (b + c + 1) >> 1, (c*192 + d*64 + 128) >> 8
+; The tail group has no following pixel and writes c unchanged as its
+; fourth output.
+;
+; NOTE(review): "bpl" is also taken when srcw reaches exactly 0, so the loop
+; body runs floor(source_width / 3) + 1 times before the tail group --
+; confirm this matches what callers expect.
+|horizontal_line_3_4_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r10, =64                    ; weight 64
+    ldr     r11, =192                   ; weight 192
+    mov     r9, #128                    ; rounding term
+
+    ldrb    r4, [src], #1               ; a = src[0]
+
+hl34_loop
+
+    ldrb    r8, [src], #1               ; b = src[1]
+    ldrb    r7, [src], #1               ; c = src[2]
+    strb    r4, [dest], #1              ; des[0] = a
+
+    mla     r4, r10, r4, r9             ; a*64 + 128
+    mla     r4, r11, r8, r4             ; a*64 + b*192 + 128
+
+    add     r8, r8, #1                  ; b + 1
+    add     r8, r8, r7                  ; b + c + 1
+    mov     r8, r8, asr #1              ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8              ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1              ; des[1]
+
+    strb    r8, [dest], #1              ; des[2]
+
+    ldrb    r4, [src], #1               ; d = src[3], the next group's a
+
+    mla     r7, r11, r7, r9             ; c*192 + 128
+    mla     r7, r4, r10, r7             ; c*192 + d*64 + 128
+
+    subs    srcw, srcw, #3              ; flags are kept until the bpl below
+
+    mov     r7, r7, asr #8              ; (c*192 + d*64 + 128) >> 8
+    strb    r7, [dest], #1              ; des[3]
+
+    bpl     hl34_loop
+
+    ; Tail group
+    ldrb    r8, [src], #1               ; b = src[1]
+    ldrb    r7, [src], #1               ; c = src[2]
+    strb    r4, [dest], #1              ; des[0] = a
+
+    mla     r4, r10, r4, r9             ; a*64 + 128
+    mla     r4, r11, r8, r4             ; a*64 + b*192 + 128
+    mov     r4, r4, asr #8              ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [dest], #1              ; des[1]
+
+    add     r8, r8, #1                  ; b + 1
+    add     r8, r8, r7                  ; b + c + 1
+    mov     r8, r8, asr #1              ; (b + c + 1) >> 1
+    strb    r8, [dest], #1              ; des[2]
+    strb    r7, [dest], #1              ; des[3] = c
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|horizontal_line_3_4_scale_armv4|
+
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_3_4_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; *          unsigned int dest_pitch : Stride of destination data.
+; *          unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 3 to 4. The
+; *            height of the band scaled is 3-pixels.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; *                 the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_3_4_scale_armv4
+;(
+;    r0 = UINT8 *dest
+;    r1 = UINT32 dest_pitch
+;    r2 = UINT32 dest_width
+;)
+; Processes one column per iteration, r2 iterations in total. With
+; a, b, c = rows 0, 1, 2 and d = row 4:
+;    row 1 <- (a*64 + b*192 + 128) >> 8
+;    row 2 <- (b + c + 1) >> 1
+;    row 3 <- (c*192 + d*64 + 128) >> 8
+|vertical_band_3_4_scale_armv4| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r10, =64                    ; weight 64
+    ldr     r11, =192                   ; weight 192
+    mov     r9, #128                    ; rounding term
+
+;   ldr     r1,[r1]
+vl34_loop
+    mov     r3, src                     ; r3 walks down the column for reads
+    ldrb    r4, [r3], r1                ; a = des [0]
+    ldrb    r5, [r3], r1                ; b = des [dest_pitch]
+    ldrb    r7, [r3], r1                ; c = des [dest_pitch*2]
+    add     lr, src, r1                 ; lr walks down the column for writes, from row 1
+
+    mla     r4, r10, r4, r9             ; a*64 + 128
+    mla     r4, r11, r5, r4             ; a*64 + b*192 + 128
+
+    add     r5, r5, #1                  ; b + 1
+    add     r5, r5, r7                  ; b + c + 1
+    mov     r5, r5, asr #1              ; (b + c + 1) >> 1
+
+    mov     r4, r4, asr #8              ; (a*64 + b*192 + 128) >> 8
+    strb    r4, [lr], r1                ; des [dest_pitch]
+
+    ldrb    r4, [r3, r1]                ; d = des [dest_pitch*4] (r3 is at row 3 here)
+
+    strb    r5, [lr], r1                ; des [dest_pitch*2]
+
+    mla     r7, r11, r7, r9             ; c*192 + 128
+    mla     r7, r4, r10, r7             ; c*192 + d*64 + 128
+    mov     r7, r7, asr #8              ; (c*192 + d*64 + 128) >> 8
+
+    add     src, src, #1                ; next column
+    subs    r2, r2, #1                  ; column counter; strb below leaves the flags alone
+
+    strb    r7, [lr]                    ; des [dest_pitch*3]
+
+    bne     vl34_loop
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ;|vertical_band_3_4_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : horizontal_line_1_2_scale_armv4
+; *
+; * INPUTS : const unsigned char *source : Pointer to source data.
+; *          unsigned int source_width : Width of the source line (loop count).
+; *          unsigned char *dest : Pointer to destination data.
+; *          unsigned int dest_width : Stride of destination (NOT USED).
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Copies horizontal line of pixels from source to
+; *            destination scaling up by 1 to 2.
+; *
+; * SPECIAL NOTES : None.
+; *
+; ****************************************************************************/
+;void horizontal_line_1_2_scale_armv4
+;(
+;    const unsigned char *source,     r0 (alias src)
+;    unsigned int source_width,       r1 (alias srcw)
+;    unsigned char *dest,             r2 (alias dest)
+;    unsigned int dest_width          (not read; r3 is reused as scratch)
+;)
+; For every source pixel a with right-hand neighbour b, writes the pair
+; a, (a + b + 1) >> 1. The last source pixel has no neighbour and is
+; written twice. Each pair is stored with a single halfword store.
+; NOTE(review): the strh stores put the first pixel in the low byte --
+; assumes little-endian data and a halfword-aligned dest; confirm.
+|horizontal_line_1_2_scale_armv4| PROC
+    stmdb   sp!, {r4 - r5, lr}
+
+    sub     srcw, srcw, #1              ; the loop handles source_width - 1 pixels
+
+    ldrb    r3, [src], #1               ; a = src[0]
+    ldrb    r4, [src], #1               ; b = src[1]
+hl12_loop
+    subs    srcw, srcw, #1              ; flags are kept until ldrneb / bne below
+
+    add     r5, r3, r4
+    add     r5, r5, #1
+    mov     r5, r5, lsr #1              ; (a + b + 1) >> 1
+
+    orr     r5, r3, r5, lsl #8          ; low byte a, high byte the average
+    strh    r5, [dest], #2
+
+    mov     r3, r4                      ; b becomes the next a
+
+    ldrneb  r4, [src], #1               ; fetch the next b only if another iteration follows
+    bne     hl12_loop
+
+    orr     r5, r4, r4, lsl #8          ; last pixel duplicated
+    strh    r5, [dest]
+
+    ldmia   sp!, {r4 - r5, pc}
+    ENDP    ;|horizontal_line_1_2_scale_armv4|
+
+;/****************************************************************************
+; *
+; * ROUTINE : vertical_band_1_2_scale_armv4
+; *
+; * INPUTS : unsigned char *dest : Pointer to destination data.
+; *          unsigned int dest_pitch : Stride of destination data.
+; *          unsigned int dest_width : Width of destination data.
+; *
+; * OUTPUTS : None.
+; *
+; * RETURNS : void
+; *
+; * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
+; *            height of the band scaled is 1-pixel.
+; *
+; * SPECIAL NOTES : The routine uses the first line of the band below
+; *                 the current band.
+; *
+; ****************************************************************************/
+;void vertical_band_1_2_scale_armv4
+;(
+;    r0 = UINT8 *dest
+;    r1 = UINT32 dest_pitch
+;    r2 = UINT32 dest_width
+;)
+; Handles four columns per iteration: loads one word from row 0 and one
+; from row 2, averages them byte-wise as (a + b + 1) >> 1 and stores the
+; word to row 1. Even and odd bytes are averaged separately in 16-bit
+; lanes so that carries cannot spill into a neighbouring pixel.
+; NOTE(review): the word loads/stores assume 4-byte aligned addresses --
+; confirm. "bpl" is also taken when r2 reaches exactly 0, so one extra
+; word is processed when dest_width is a multiple of 4 -- confirm this
+; is intended.
+|vertical_band_1_2_scale_armv4| PROC
+    stmdb   sp!, {r4 - r7, lr}
+
+    ldr     mask, =0xff00ff             ; selects bytes 0 and 2 of a word
+    ldr     lr, = 0x010001              ; +1 rounding term for both 16-bit lanes
+
+vl12_loop
+    mov     r3, src
+    ldr     r4, [r3], r1                ; four pixels of row 0; r3 now points at row 1
+    ldr     r5, [r3, r1]                ; four pixels of row 2
+
+    add     src, src, #4                ; next four columns
+    subs    r2, r2, #4                  ; flags are kept until the bpl below
+
+    and     r6, r4, mask                ; even bytes of row 0
+    and     r7, r5, mask                ; even bytes of row 2
+
+    add     r6, r7, r6
+    add     r6, r6, lr                  ; a + b + 1 per lane
+
+    and     r4, mask, r4, lsr #8        ; odd bytes of row 0
+    and     r5, mask, r5, lsr #8        ; odd bytes of row 2
+
+    mov     r6, r6, lsr #1
+    and     r6, r6, mask                ; drop the bit shifted in from the upper lane
+
+    add     r4, r5, r4
+    add     r4, r4, lr                  ; a + b + 1 per lane
+
+    mov     r4, r4, lsr #1
+    and     r4, r4, mask
+
+    orr     r5, r6, r4, lsl #8          ; interleave even and odd results
+
+    str     r5, [r3]                    ; write row 1
+
+    bpl     vl12_loop
+
+    ldmia   sp!, {r4 - r7, pc}
+    ENDP    ;|vertical_band_1_2_scale_armv4|
+
+    END
diff --git a/vpx_scale/wce/scalesystemdependant.c b/vpx_scale/wce/scalesystemdependant.c
new file mode 100644
index 000000000..a5a6a5275
--- /dev/null
+++ b/vpx_scale/wce/scalesystemdependant.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/vpxscale.h"
+
+/****************************************************************************
+* Imports
+*****************************************************************************/
+
+/****************************************************************************
+ *
+ * ROUTINE : vp8_scale_machine_specific_config
+ *
+ * INPUTS : None.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Points the scaler function pointers at the ARMv4 assembly
+ *            routines where one is assigned, and at the C routines
+ *            otherwise.
+ *
+ * SPECIAL NOTES : None.
+ * + ****************************************************************************/ +void vp8_scale_machine_specific_config() +{ + vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_armv4; + vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_armv4; + vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c; + vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_armv4; + vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_armv4; + vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c; + vp8_horizontal_line_3_4_scale = horizontal_line_3_4_scale_armv4; + vp8_vertical_band_3_4_scale = vertical_band_3_4_scale_armv4; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = horizontal_line_2_3_scale_armv4; + vp8_vertical_band_2_3_scale = vertical_band_2_3_scale_armv4; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_armv4; + vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_armv4; + vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c; + + + vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c; + vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c; + vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c; + vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c; + vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c; + vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c; + vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c; +} diff --git a/vpx_scale/win32/scaleopt.c b/vpx_scale/win32/scaleopt.c new file mode 100644 index 000000000..da0533e6b --- /dev/null +++ b/vpx_scale/win32/scaleopt.c @@ -0,0 +1,1749 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license and patent
+ * grant that can be found in the LICENSE file in the root of the source
+ * tree. All contributing project authors may be found in the AUTHORS
+ * file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : scaleopt.cpp
+*
+* Description : Optimized scaling functions
+*
+****************************************************************************/
+#include "pragmas.h"
+
+
+
+/****************************************************************************
+* Module Statics
+*
+* Blend weights are fractions of 256 (51 ~ 1/5, 102 ~ 2/5, 154 ~ 3/5,
+* 205 ~ 4/5), stored as four 16-bit words so one movq fills an MMX
+* register. The routines below add round_values and shift right by 8 after
+* the multiplies.
+****************************************************************************/
+__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };
+__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
+__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
+__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
+__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };   // +0.5 before the >> 8
+__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};               // +1 before the >> 1 average
+__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 };        // 4:5 weights for pixels 1..4
+__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };       // 4:5 weights for pixels 0..3
+__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};     // keeps byte 6 only
+__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 };       // 3:5 weights for pixels 1,2,2,3
+__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 };       // 3:5 weights for pixels 0,1,1,2
+
+
+
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_3_5_scale_mmx
+ *
+ * INPUTS : const unsigned char *source :
+ *          unsigned int source_width :
+ *          unsigned char *dest :
+ *
unsigned int dest_width :
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
+ *
+ * SPECIAL NOTES : dest_width is not used. Every group, including the
+ *                 final one after the loop, loads four bytes from source.
+ *                 NOTE(review): no emms is issued here; presumably the
+ *                 caller resets the MMX state -- confirm.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_3_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void) dest_width;
+
+    __asm
+    {
+
+        push ebx
+
+        mov esi, source
+        mov edi, dest
+
+        mov ecx, source_width
+        lea edx, [esi+ecx-3];           // loop end: start of the last 3-pixel group
+
+        movq mm5, const35_1             // mm5 = 66 xx cd xx 33 xx 9a xx
+        movq mm6, const35_2             // mm6 = 9a xx 33 xx cd xx 66 xx
+
+        movq mm4, round_values          // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor mm7, mm7                   // clear mm7
+
+        horiz_line_3_5_loop:
+
+        mov eax, DWORD PTR [esi]        // eax = 00 01 02 03
+        mov ebx, eax
+
+        and ebx, 0xffff00               // ebx = xx 01 02 xx
+        mov ecx, eax                    // ecx = 00 01 02 03
+
+        and eax, 0xffff0000             // eax = xx xx 02 03
+        xor ecx, eax                    // ecx = 00 01 xx xx
+
+        shr ebx, 8                      // ebx = 01 02 xx xx
+        or eax, ebx                     // eax = 01 02 02 03
+
+        shl ebx, 16                     // ebx = xx xx 01 02
+        movd mm1, eax                   // mm1 = 01 02 02 03 xx xx xx xx
+
+        or ebx, ecx                     // ebx = 00 01 01 02
+        punpcklbw mm1, mm7              // mm1 = 01 xx 02 xx 02 xx 03 xx
+
+        movd mm0, ebx                   // mm0 = 00 01 01 02
+        pmullw mm1, mm6                 // 01*154 02* 51 02*205 03*102
+
+        punpcklbw mm0, mm7              // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw mm0, mm5                 // 00*102 01*205 01* 51 02*154
+
+        mov [edi], ebx                  // write output 00 xx xx xx (bytes 1..3 are overwritten below)
+        add esi, 3
+
+        add edi, 5
+        paddw mm0, mm1
+
+        paddw mm0, mm4                  // add round values
+        psrlw mm0, 8
+
+        cmp esi, edx
+        packuswb mm0, mm7
+
+        movd DWORD Ptr [edi-4], mm0     // write output 01 02 03 04
+        jl horiz_line_3_5_loop
+
+//Exit: last group, pixel 02 stands in for the missing pixel 03
+        mov eax, DWORD PTR [esi]        // eax = 00 01 02 03
+        mov ebx, eax
+
+        and ebx, 0xffff00               // ebx = xx 01 02 xx
+        mov ecx, eax                    // ecx = 00 01 02 03
+
+        and eax, 0xffff0000             // eax = xx xx 02 03
+        xor ecx, eax                    // ecx = 00 01 xx xx
+
+        shr ebx, 8                      // ebx = 01 02 xx xx
+        or eax, ebx                     // eax = 01 02 02 03
+
+        shl eax, 8                      // eax = xx 01 02 02
+        and eax, 0xffff0000             // eax = xx xx 02 02
+
+        or eax, ebx                     // eax = 01 02 02 02
+
+        shl ebx, 16                     // ebx = xx xx 01 02
+        movd mm1, eax                   // mm1 = 01 02 02 02 xx xx xx xx
+
+        or ebx, ecx                     // ebx = 00 01 01 02
+        punpcklbw mm1, mm7              // mm1 = 01 xx 02 xx 02 xx 02 xx
+
+        movd mm0, ebx                   // mm0 = 00 01 01 02
+        pmullw mm1, mm6                 // 01*154 02* 51 02*205 02*102
+
+        punpcklbw mm0, mm7              // mm0 = 00 xx 01 xx 01 xx 02 xx
+        pmullw mm0, mm5                 // 00*102 01*205 01* 51 02*154
+
+        mov [edi], ebx                  // write output 00 xx xx xx
+        paddw mm0, mm1
+
+        paddw mm0, mm4                  // add round values
+        psrlw mm0, 8
+
+        packuswb mm0, mm7
+        movd DWORD Ptr [edi+1], mm0     // write output 01 02 03 04
+
+        pop ebx
+
+    }
+
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : horizontal_line_4_5_scale_mmx
+ *
+ * INPUTS : const unsigned char *source : Pointer to the source line.
+ *          unsigned int source_width : Number of source bytes; sets the
+ *                                      loop end pointer.
+ *          unsigned char *dest : Pointer to the destination line.
+ *          unsigned int dest_width : Not used.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
+ *
+ * SPECIAL NOTES : Eight source bytes become ten output bytes per
+ *                 iteration. In the final group the last source byte
+ *                 stands in for the byte that would follow it.
+ *
+ ****************************************************************************/
+static
+void horizontal_line_4_5_scale_mmx
+(
+    const unsigned char *source,
+    unsigned int source_width,
+    unsigned char *dest,
+    unsigned int dest_width
+)
+{
+    (void)dest_width;
+
+    __asm
+    {
+
+        mov esi, source
+        mov edi, dest
+
+        mov ecx, source_width
+        lea edx, [esi+ecx-8];           // loop end: start of the last 8-pixel group
+
+        movq mm5, const45_1             // mm5 = 33 xx 66 xx 9a xx cd xx
+        movq mm6, const45_2             // mm6 = cd xx 9a xx 66 xx 33 xx
+
+        movq mm4, round_values          // mm4 = 80 xx 80 xx 80 xx 80 xx
+        pxor mm7, mm7                   // clear mm7
+
+        horiz_line_4_5_loop:
+
+        movq mm0, QWORD PTR [esi]       // mm0 = 00 01 02 03 04 05 06 07
+        movq mm1, QWORD PTR [esi+1];    // mm1 = 01 02 03 04 05 06 07 08
+
+        movq mm2, mm0                   // mm2 = 00 01 02 03 04 05 06 07
+        movq mm3, mm1                   // mm3 = 01 02 03 04 05 06 07 08
+
+        movd DWORD PTR [edi], mm0       // write output 00 xx xx xx
+        punpcklbw mm0, mm7              // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw mm1, mm7              // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw mm0, mm5                 // 00* 51 01*102 02*154 03*205
+
+        pmullw mm1, mm6                 // 01*205 02*154 03*102 04* 51
+        punpckhbw mm2, mm7              // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd DWORD PTR [edi+5], mm2     // write output 05 xx xx xx
+        pmullw mm2, mm5                 // 04* 51 05*102 06*154 07*205
+
+        punpckhbw mm3, mm7              // mm3 = 05 xx 06 xx 07 xx 08 xx
+        pmullw mm3, mm6                 // 05*205 06*154 07*102 08* 51
+
+        paddw mm0, mm1                  // weighted sums
+        paddw mm0, mm4                  // added round values
+
+        psrlw mm0, 8                    // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb mm0, mm7
+
+        movd DWORD PTR [edi+1], mm0     // write output 01 02 03 04
+        add edi, 10
+
+        add esi, 8
+        paddw mm2, mm3                  // weighted sums
+
+        paddw mm2, mm4                  // added round values
+        cmp esi, edx
+
+        psrlw mm2, 8
+        packuswb mm2, mm7
+
+        movd DWORD PTR [edi-4], mm2     // write output 06 07 08 09
+        jl horiz_line_4_5_loop
+
+//Exit: last group, byte 07 is duplicated in place of byte 08
+        movq mm0, [esi]                 // mm0 = 00 01 02 03 04 05 06 07
+        movq mm1, mm0                   // mm1 = 00 01 02 03 04 05 06 07
+
+        movq mm2, mm0                   // mm2 = 00 01 02 03 04 05 06 07
+        psrlq mm1, 8                    // mm1 = 01 02 03 04 05 06 07 00
+
+        movq mm3, mask45                // mm3 = 00 00 00 00 00 00 ff 00
+        pand mm3, mm1                   // mm3 = 00 00 00 00 00 00 07 00
+
+        psllq mm3, 8                    // mm3 = 00 00 00 00 00 00 00 07
+        por mm1, mm3                    // mm1 = 01 02 03 04 05 06 07 07
+
+        movq mm3, mm1
+
+        movd DWORD PTR [edi], mm0       // write output 00 xx xx xx
+        punpcklbw mm0, mm7              // mm0 = 00 xx 01 xx 02 xx 03 xx
+
+        punpcklbw mm1, mm7              // mm1 = 01 xx 02 xx 03 xx 04 xx
+        pmullw mm0, mm5                 // 00* 51 01*102 02*154 03*205
+
+        pmullw mm1, mm6                 // 01*205 02*154 03*102 04* 51
+        punpckhbw mm2, mm7              // mm2 = 04 xx 05 xx 06 xx 07 xx
+
+        movd DWORD PTR [edi+5], mm2     // write output 05 xx xx xx
+        pmullw mm2, mm5                 // 04* 51 05*102 06*154 07*205
+
+        punpckhbw mm3, mm7              // mm3 = 05 xx 06 xx 07 xx 07 xx
+        pmullw mm3, mm6                 // 05*205 06*154 07*102 07* 51
+
+        paddw mm0, mm1                  // weighted sums
+        paddw mm0, mm4                  // added round values
+
+        psrlw mm0, 8                    // output: 01 xx 02 xx 03 xx 04 xx
+        packuswb mm0, mm7               // 01 02 03 04 xx xx xx xx
+
+        movd DWORD PTR [edi+1], mm0     // write output 01 02 03 04
+        paddw mm2, mm3                  // weighted sums
+
+        paddw mm2, mm4                  // added round values
+        psrlw mm2, 8
+
+        packuswb mm2, mm7
+        movd DWORD PTR [edi+6], mm2     // write output 06 07 08 09
+
+
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_4_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Top row of the band; scaled in place.
+ *          unsigned int dest_pitch : Byte offset from one row to the next.
+ *          unsigned int dest_width : Loop counter; reduced by 8 per pass.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ *                 the current band. The function also has a "C" only
+ *                 version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+
+        mov esi, dest                   // Get the source and destination pointer
+        mov ecx, dest_pitch             // Get the pitch size
+
+        lea edi, [esi+ecx*2]            // two lines below
+        add edi, ecx                    // three lines below
+
+        pxor mm7, mm7                   // clear out mm7
+        mov edx, dest_width             // Loop counter
+
+        vs_4_5_loop:
+
+        movq mm0, QWORD ptr [esi]       // src[0];
+        movq mm1, QWORD ptr [esi+ecx]   // src[1];
+
+        movq mm2, mm0                   // Make a copy
+        punpcklbw mm0, mm7              // unpack low to word
+
+        movq mm5, one_fifth
+        punpckhbw mm2, mm7              // unpack high to word
+
+        pmullw mm0, mm5                 // a * 1/5
+
+        movq mm3, mm1                   // make a copy
+        punpcklbw mm1, mm7              // unpack low to word
+
+        pmullw mm2, mm5                 // a * 1/5
+        movq mm6, four_fifths           // mm6 = 4/5
+
+        movq mm4, mm1                   // copy of low b
+        pmullw mm4, mm6                 // b * 4/5
+
+        punpckhbw mm3, mm7              // unpack high to word
+        movq mm5, mm3                   // copy of high b
+
+        pmullw mm5, mm6                 // b * 4/5
+        paddw mm0, mm4                  // a * 1/5 + b * 4/5
+
+        paddw mm2, mm5                  // a * 1/5 + b * 4/5
+        paddw mm0, round_values         // + 128
+
+        paddw mm2, round_values         // + 128
+        psrlw mm0, 8
+
+        psrlw mm2, 8
+        packuswb mm0, mm2               // des [1]
+
+        movq QWORD ptr [esi+ecx], mm0   // write des[1]
+        movq mm0, [esi+ecx*2]           // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq mm5, two_fifths
+        movq mm2, mm0                   // make a copy
+
+        pmullw mm1, mm5                 // b * 2/5
+        movq mm6, three_fifths
+
+
+        punpcklbw mm0, mm7              // unpack low to word
+        pmullw mm3, mm5                 // b * 2/5
+
+        movq mm4, mm0                   // make copy of c
+        punpckhbw mm2, mm7              // unpack high to word
+
+        pmullw mm4, mm6                 // c * 3/5
+        movq mm5, mm2
+
+        pmullw mm5, mm6                 // c * 3/5
+        paddw mm1, mm4                  // b * 2/5 + c * 3/5
+
+        paddw mm3, mm5                  // b * 2/5 + c * 3/5
+        paddw mm1, round_values         // + 128
+
+        paddw mm3, round_values         // + 128
+        psrlw mm1, 8
+
+        psrlw mm3, 8
+        packuswb mm1, mm3               // des[2]
+
+        movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+        movq mm1, [edi]                 // mm1=Src[3];
+
+        // mm0, mm2 --- Src[2]
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw mm0, mm6                 // c * 3/5
+        movq mm5, two_fifths            // mm5 = 2/5
+
+        movq mm3, mm1                   // make a copy
+        pmullw mm2, mm6                 // c * 3/5
+
+        punpcklbw mm1, mm7              // unpack low
+        movq mm4, mm1                   // make a copy
+
+        punpckhbw mm3, mm7              // unpack high
+        pmullw mm4, mm5                 // d * 2/5
+
+        movq mm6, mm3                   // make a copy
+        pmullw mm6, mm5                 // d * 2/5
+
+        paddw mm0, mm4                  // c * 3/5 + d * 2/5
+        paddw mm2, mm6                  // c * 3/5 + d * 2/5
+
+        paddw mm0, round_values         // + 128
+        paddw mm2, round_values         // + 128
+
+        psrlw mm0, 8
+        psrlw mm2, 8
+
+        packuswb mm0, mm2               // des[3]
+        movq QWORD ptr [edi], mm0       // write des[3]
+
+        // mm1, mm3 --- Src[3]
+        // mm7 -- cleared for unpacking
+
+        movq mm0, [edi+ecx*2]           // mm0, Src[0] of the next group
+
+        movq mm5, four_fifths           // mm5 = 4/5
+        pmullw mm1, mm5                 // d * 4/5
+
+        movq mm6, one_fifth             // mm6 = 1/5
+        movq mm2, mm0                   // make a copy
+
+        pmullw mm3, mm5                 // d * 4/5
+        punpcklbw mm0, mm7              // unpack low
+
+        pmullw mm0, mm6                 // an * 1/5
+        punpckhbw mm2, mm7              // unpack high
+
+        paddw mm1, mm0                  // d * 4/5 + an * 1/5
+        pmullw mm2, mm6                 // an * 1/5
+
+        paddw mm3, mm2                  // d * 4/5 + an * 1/5
+        paddw mm1, round_values         // + 128
+
+        paddw mm3, round_values         // + 128
+        psrlw mm1, 8
+
+        psrlw mm3, 8
+        packuswb mm1, mm3               // des[4]
+
+        movq QWORD ptr [edi+ecx], mm1   // write des[4]
+
+        add edi, 8                      // next eight columns
+        add esi, 8
+
+        sub edx, 8
+        jg vs_4_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_4_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Top row of the band; scaled in place.
+ *          unsigned int dest_pitch : Byte offset from one row to the next.
+ *          unsigned int dest_width : Loop counter; reduced by 8 per pass.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : None
+ *
+ * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
+ *
+ * SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx this routine does not
+ *                 read a row from a following band: des[4] is a copy of
+ *                 Src[3]. The function also has a "C" only version.
+ *
+ ****************************************************************************/
+static
+void last_vertical_band_4_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    __asm
+    {
+        mov esi, dest                   // Get the source and destination pointer
+        mov ecx, dest_pitch             // Get the pitch size
+
+        lea edi, [esi+ecx*2]            // two lines below
+        add edi, ecx                    // three lines below
+
+        pxor mm7, mm7                   // clear out mm7
+        mov edx, dest_width             // Loop counter
+
+        last_vs_4_5_loop:
+
+        movq mm0, QWORD ptr [esi]       // src[0];
+        movq mm1, QWORD ptr [esi+ecx]   // src[1];
+
+        movq mm2, mm0                   // Make a copy
+        punpcklbw mm0, mm7              // unpack low to word
+
+        movq mm5, one_fifth
+        punpckhbw mm2, mm7              // unpack high to word
+
+        pmullw mm0, mm5                 // a * 1/5
+
+        movq mm3, mm1                   // make a copy
+        punpcklbw mm1, mm7              // unpack low to word
+
+        pmullw mm2, mm5                 // a * 1/5
+        movq mm6, four_fifths           // mm6 = 4/5
+
+        movq mm4, mm1                   // copy of low b
+        pmullw mm4, mm6                 // b * 4/5
+
+        punpckhbw mm3, mm7              // unpack high to word
+        movq mm5, mm3                   // copy of high b
+
+        pmullw mm5, mm6                 // b * 4/5
+        paddw mm0, mm4                  // a * 1/5 + b * 4/5
+
+        paddw mm2, mm5                  // a * 1/5 + b * 4/5
+        paddw mm0, round_values         // + 128
+
+        paddw mm2, round_values         // + 128
+        psrlw mm0, 8
+
+        psrlw mm2, 8
+        packuswb mm0, mm2               // des [1]
+
+        movq QWORD ptr [esi+ecx], mm0   // write des[1]
+        movq mm0, [esi+ecx*2]           // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq mm5, two_fifths
+        movq mm2, mm0                   // make a copy
+
+        pmullw mm1, mm5                 // b * 2/5
+        movq mm6, three_fifths
+
+
+        punpcklbw mm0, mm7              // unpack low to word
+        pmullw mm3, mm5                 // b * 2/5
+
+        movq mm4, mm0                   // make copy of c
+        punpckhbw mm2, mm7              // unpack high to word
+
+        pmullw mm4, mm6                 // c * 3/5
+        movq mm5, mm2
+
+        pmullw mm5, mm6                 // c * 3/5
+        paddw mm1, mm4                  // b * 2/5 + c * 3/5
+
+        paddw mm3, mm5                  // b * 2/5 + c * 3/5
+        paddw mm1, round_values         // + 128
+
+        paddw mm3, round_values         // + 128
+        psrlw mm1, 8
+
+        psrlw mm3, 8
+        packuswb mm1, mm3               // des[2]
+
+        movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+        movq mm1, [edi]                 // mm1=Src[3];
+
+        movq QWORD ptr [edi+ecx], mm1   // write des[4]; plain copy of Src[3]
+
+        // mm0, mm2 --- Src[2]
+        // mm1 --- Src[3]
+        // mm6 --- 3/5
+        // mm7 for unpacking
+
+        pmullw mm0, mm6                 // c * 3/5
+        movq mm5, two_fifths            // mm5 = 2/5
+
+        movq mm3, mm1                   // make a copy
+        pmullw mm2, mm6                 // c * 3/5
+
+        punpcklbw mm1, mm7              // unpack low
+        movq mm4, mm1                   // make a copy
+
+        punpckhbw mm3, mm7              // unpack high
+        pmullw mm4, mm5                 // d * 2/5
+
+        movq mm6, mm3                   // make a copy
+        pmullw mm6, mm5                 // d * 2/5
+
+        paddw mm0, mm4                  // c * 3/5 + d * 2/5
+        paddw mm2, mm6                  // c * 3/5 + d * 2/5
+
+        paddw mm0, round_values         // + 128
+        paddw mm2, round_values         // + 128
+
+        psrlw mm0, 8
+        psrlw mm2, 8
+
+        packuswb mm0, mm2               // des[3]
+        movq QWORD ptr [edi], mm0       // write des[3]
+
+        // mm1, mm3 --- Src[3]
+        // mm7 -- cleared for unpacking
+        add edi, 8                      // next eight columns
+        add esi, 8
+
+        sub edx, 8
+        jg last_vs_4_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : vertical_band_3_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Top row of the band; scaled in place.
+ *          unsigned int dest_pitch : Byte offset from one row to the next.
+ *          unsigned int dest_width : Loop counter; reduced by 8 per pass.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
+ *
+ * SPECIAL NOTES : The routine uses the first line of the band below
+ *                 the current band. The function also has a "C" only
+ *                 version.
+ *
+ ****************************************************************************/
+static
+void vertical_band_3_5_scale_mmx
+(
+    unsigned char *dest,
+    unsigned int dest_pitch,
+    unsigned int dest_width
+)
+{
+    // Eight columns per pass. With a, b, c = rows 0, 1, 2 and an = row 5:
+    //   des[1] = a * 2/5 + b * 3/5      des[2] = b * 4/5 + c * 1/5
+    //   des[3] = b * 1/5 + c * 4/5      des[4] = c * 3/5 + an * 2/5
+    // Rows 1 and 2 are read before they are overwritten.
+    __asm
+    {
+        mov esi, dest                   // Get the source and destination pointer
+        mov ecx, dest_pitch             // Get the pitch size
+
+        lea edi, [esi+ecx*2]            // two lines below
+        add edi, ecx                    // three lines below
+
+        pxor mm7, mm7                   // clear out mm7
+        mov edx, dest_width             // Loop counter
+
+        vs_3_5_loop:
+
+        movq mm0, QWORD ptr [esi]       // src[0];
+        movq mm1, QWORD ptr [esi+ecx]   // src[1];
+
+        movq mm2, mm0                   // Make a copy
+        punpcklbw mm0, mm7              // unpack low to word
+
+        movq mm5, two_fifths            // mm5 = 2/5
+        punpckhbw mm2, mm7              // unpack high to word
+
+        pmullw mm0, mm5                 // a * 2/5
+
+        movq mm3, mm1                   // make a copy
+        punpcklbw mm1, mm7              // unpack low to word
+
+        pmullw mm2, mm5                 // a * 2/5
+        movq mm6, three_fifths          // mm6 = 3/5
+
+        movq mm4, mm1                   // copy of low b
+        pmullw mm4, mm6                 // b * 3/5
+
+        punpckhbw mm3, mm7              // unpack high to word
+        movq mm5, mm3                   // copy of high b
+
+        pmullw mm5, mm6                 // b * 3/5
+        paddw mm0, mm4                  // a * 2/5 + b * 3/5
+
+        paddw mm2, mm5                  // a * 2/5 + b * 3/5
+        paddw mm0, round_values         // + 128
+
+        paddw mm2, round_values         // + 128
+        psrlw mm0, 8
+
+        psrlw mm2, 8
+        packuswb mm0, mm2               // des [1]
+
+        movq QWORD ptr [esi+ecx], mm0   // write des[1]
+        movq mm0, [esi+ecx*2]           // mm0 = src[2]
+
+        // mm1, mm3 --- Src[1]
+        // mm0 --- Src[2]
+        // mm7 for unpacking
+
+        movq mm4, mm1                   // b low
+        pmullw mm1, four_fifths         // b * 4/5 low
+
+        movq mm5, mm3                   // b high
+        pmullw mm3, four_fifths         // b * 4/5 high
+
+        movq mm2, mm0                   // c
+        pmullw mm4, one_fifth           // b * 1/5
+
+        punpcklbw mm0, mm7              // c low
+        pmullw mm5, one_fifth           // b * 1/5
+
+        movq mm6, mm0                   // make copy of c low
+        punpckhbw mm2, mm7              // c high
+
+        pmullw mm6, one_fifth           // c * 1/5 low
+        movq mm7, mm2                   // make copy of c high (mm7 is no longer zero from here)
+
+        pmullw mm7, one_fifth           // c * 1/5 high
+        paddw mm1, mm6                  // b * 4/5 + c * 1/5 low
+
+        paddw mm3, mm7                  // b * 4/5 + c * 1/5 high
+        movq mm6, mm0                   // make copy of c low
+
+        pmullw mm6, four_fifths         // c * 4/5 low
+        movq mm7, mm2                   // make copy of c high
+
+        pmullw mm7, four_fifths         // c * 4/5 high
+
+        paddw mm4, mm6                  // b * 1/5 + c * 4/5 low
+        paddw mm5, mm7                  // b * 1/5 + c * 4/5 high
+
+        paddw mm1, round_values         // + 128
+        paddw mm3, round_values         // + 128
+
+        psrlw mm1, 8
+        psrlw mm3, 8
+
+        packuswb mm1, mm3               // des[2]
+        movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
+
+        paddw mm4, round_values         // + 128
+        paddw mm5, round_values         // + 128
+
+        psrlw mm4, 8
+        psrlw mm5, 8
+
+        packuswb mm4, mm5               // des[3]
+        movq QWORD ptr [edi], mm4       // write des[3]
+
+        // mm0, mm2 --- Src[2] (c), unpacked low / high
+
+        pxor mm7, mm7                   // clear mm7 for unpacking (also needed by the next pass)
+        movq mm1, [edi+ecx*2]           // mm1 = Src[0] of the next group
+
+        movq mm5, three_fifths          // mm5 = 3/5
+        pmullw mm0, mm5                 // c * 3/5
+
+        movq mm6, two_fifths            // mm6 = 2/5
+        movq mm3, mm1                   // make a copy
+
+        pmullw mm2, mm5                 // c * 3/5
+        punpcklbw mm1, mm7              // unpack low
+
+        pmullw mm1, mm6                 // an * 2/5
+        punpckhbw mm3, mm7              // unpack high
+
+        paddw mm0, mm1                  // c * 3/5 + an * 2/5
+        pmullw mm3, mm6                 // an * 2/5
+
+        paddw mm2, mm3                  // c * 3/5 + an * 2/5
+        paddw mm0, round_values         // + 128
+
+        paddw mm2, round_values         // + 128
+        psrlw mm0, 8
+
+        psrlw mm2, 8
+        packuswb mm0, mm2               // des[4]
+
+        movq QWORD ptr [edi+ecx], mm0   // write des[4]
+
+        add edi, 8                      // next eight columns
+        add esi, 8
+
+        sub edx, 8
+        jg vs_3_5_loop
+    }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : last_vertical_band_3_5_scale_mmx
+ *
+ * INPUTS : unsigned char *dest : Top row of the band; scaled in place.
+ *          unsigned int dest_pitch : Byte offset from one row to the next.
+ *          unsigned int dest_width : Loop counter; reduced by 8 per pass.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band of pixels.
+ *
+ * SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx this routine does not
+ *                 read a row from a following band: des[4] is a copy of
+ *                 Src[2]. The function also has a "C" only version.
+ * + ****************************************************************************/ +static +void last_vertical_band_3_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea edi, [esi+ecx*2] // tow lines below + add edi, ecx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + + last_vs_3_5_loop: + + movq mm0, QWORD ptr [esi] // src[0]; + movq mm1, QWORD ptr [esi+ecx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, two_fifths // mm5 = 2/5 + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 2/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 2/5 + movq mm6, three_fifths // mm6 = 3/5 + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 3/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 3/5 + paddw mm0, mm4 // a * 2/5 + b * 3/5 + + paddw mm2, mm5 // a * 2/5 + b * 3/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [esi+ecx], mm0 // write des[1] + movq mm0, [esi+ecx*2] // mm0 = src[2] + + + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm4, mm1 // b low + pmullw mm1, four_fifths // b * 4/5 low + + movq QWORD ptr [edi+ecx], mm0 // write des[4] + + movq mm5, mm3 // b high + pmullw mm3, four_fifths // b * 4/5 high + + movq mm2, mm0 // c + pmullw mm4, one_fifth // b * 1/5 + + punpcklbw mm0, mm7 // c low + pmullw mm5, one_fifth // b * 1/5 + + movq mm6, mm0 // make copy of c low + punpckhbw mm2, mm7 // c high + + pmullw mm6, one_fifth // c * 1/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, one_fifth // c * 1/5 high + paddw mm1, mm6 
// b * 4/5 + c * 1/5 low + + paddw mm3, mm7 // b * 4/5 + c * 1/5 high + movq mm6, mm0 // make copy of c low + + pmullw mm6, four_fifths // c * 4/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, four_fifths // c * 4/5 high + + paddw mm4, mm6 // b * 1/5 + c * 4/5 low + paddw mm5, mm7 // b * 1/5 + c * 4/5 high + + paddw mm1, round_values // + 128 + paddw mm3, round_values // + 128 + + psrlw mm1, 8 + psrlw mm3, 8 + + packuswb mm1, mm3 // des[2] + movq QWORD ptr [esi+ecx*2], mm1 // write des[2] + + paddw mm4, round_values // + 128 + paddw mm5, round_values // + 128 + + psrlw mm4, 8 + psrlw mm5, 8 + + packuswb mm4, mm5 // des[3] + movq QWORD ptr [edi], mm4 // write des[3] + + // mm0, mm2 --- Src[3] + + add edi, 8 + add esi, 8 + + sub edx, 8 + jg last_vs_3_5_loop + } +} + +/**************************************************************************** + * + * ROUTINE : vertical_band_1_2_scale_mmx + * + * INPUTS : unsigned char *dest : + * unsigned int dest_pitch : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of a band of pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. The function also has an "C" only + * version. 
+ * + ****************************************************************************/ +static +void vertical_band_1_2_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + vs_1_2_loop: + + movq mm0, [esi] // get Src[0] + movq mm1, [esi + ecx * 2] // get Src[1] + + movq mm2, mm0 // make copy before unpack + movq mm3, mm1 // make copy before unpack + + punpcklbw mm0, mm7 // low Src[0] + movq mm6, four_ones // mm6= 1, 1, 1, 1 + + punpcklbw mm1, mm7 // low Src[1] + paddw mm0, mm1 // low (a + b) + + punpckhbw mm2, mm7 // high Src[0] + paddw mm0, mm6 // low (a + b + 1) + + punpckhbw mm3, mm7 + paddw mm2, mm3 // high (a + b ) + + psraw mm0, 1 // low (a + b +1 )/2 + paddw mm2, mm6 // high (a + b + 1) + + psraw mm2, 1 // high (a + b + 1)/2 + packuswb mm0, mm2 // pack results + + movq [esi+ecx], mm0 // write out eight bytes + add esi, 8 + + sub edx, 8 + jg vs_1_2_loop + } + +} + +/**************************************************************************** + * + * ROUTINE : last_vertical_band_1_2_scale_mmx + * + * INPUTS : unsigned char *dest : + * unsigned int dest_pitch : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of band of pixels. + * + * SPECIAL NOTES : The routine uses the first line of the band below + * the current band. The function also has an "C" only + * version. 
+ * + ****************************************************************************/ +static +void last_vertical_band_1_2_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov esi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + mov edx, dest_width // Loop counter + + last_vs_1_2_loop: + + movq mm0, [esi] // get Src[0] + movq [esi+ecx], mm0 // write out eight bytes + + add esi, 8 + sub edx, 8 + + jg last_vs_1_2_loop + } +} + +/**************************************************************************** + * + * ROUTINE : horizontal_line_1_2_scale + * + * INPUTS : const unsigned char *source : + * unsigned int source_width : + * unsigned char *dest : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static +void horizontal_line_1_2_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + + __asm + { + mov esi, source + mov edi, dest + + pxor mm7, mm7 + movq mm6, four_ones + + mov ecx, source_width + + hs_1_2_loop: + + movq mm0, [esi] + movq mm1, [esi+1] + + movq mm2, mm0 + movq mm3, mm1 + + movq mm4, mm0 + punpcklbw mm0, mm7 + + punpcklbw mm1, mm7 + paddw mm0, mm1 + + paddw mm0, mm6 + punpckhbw mm2, mm7 + + punpckhbw mm3, mm7 + paddw mm2, mm3 + + paddw mm2, mm6 + psraw mm0, 1 + + psraw mm2, 1 + packuswb mm0, mm2 + + movq mm2, mm4 + punpcklbw mm2, mm0 + + movq [edi], mm2 + punpckhbw mm4, mm0 + + movq [edi+8], mm4 + add esi, 8 + + add edi, 16 + sub ecx, 8 + + cmp ecx, 8 + jg hs_1_2_loop + +// last eight pixel + + movq mm0, [esi] + movq mm1, mm0 + + movq mm2, mm0 + movq mm3, mm1 + + psrlq mm1, 8 + psrlq mm3, 56 + + psllq mm3, 56 + por mm1, mm3 + + movq mm3, mm1 + movq 
mm4, mm0 + + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + + paddw mm0, mm1 + paddw mm0, mm6 + + punpckhbw mm2, mm7 + punpckhbw mm3, mm7 + + paddw mm2, mm3 + paddw mm2, mm6 + + psraw mm0, 1 + psraw mm2, 1 + + packuswb mm0, mm2 + movq mm2, mm4 + + punpcklbw mm2, mm0 + movq [edi], mm2 + + punpckhbw mm4, mm0 + movq [edi+8], mm4 + } +} + + + + + +__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 }; +__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 }; + + +/**************************************************************************** + * + * ROUTINE : horizontal_line_5_4_scale_mmx + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 4 to 5. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +static +void horizontal_line_5_4_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + /* + unsigned i; + unsigned int a, b, c, d, e; + unsigned char *des = dest; + const unsigned char *src = source; + + (void) dest_width; + + for ( i=0; i<source_width; i+=5 ) + { + a = src[0]; + b = src[1]; + c = src[2]; + d = src[3]; + e = src[4]; + + des[0] = a; + des[1] = ((b*192 + c* 64 + 128)>>8); + des[2] = ((c*128 + d*128 + 128)>>8); + des[3] = ((d* 64 + e*192 + 128)>>8); + + src += 5; + des += 4; + } + */ + (void) dest_width; + + __asm + { + + mov esi, source ; + mov edi, dest ; + + mov ecx, source_width ; + movq mm5, const54_1 ; + + pxor mm7, mm7 ; + movq mm6, const54_2 ; + + movq mm4, round_values ; + lea edx, [esi+ecx] ; + horizontal_line_5_4_loop: + + movq mm0, QWORD PTR [esi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psrlq mm0, 8 ; + 01 02 03 04 05 06 07 xx + punpcklbw mm1, mm7 ; + xx 00 xx 01 xx 02 xx 03 + + punpcklbw mm0, mm7 ; + xx 01 xx 02 xx 03 xx 04 + pmullw mm1, mm5 + + pmullw mm0, mm6 + add esi, 5 + + add edi, 4 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw mm1, 8 + + cmp esi, edx + packuswb mm1, mm7 + + movd DWORD PTR [edi-4], mm1 + + jl horizontal_line_5_4_loop + + } + +} +__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 }; +__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 }; +__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 }; + +static +void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + + __asm + { + push ebx + + mov esi, source // Get the source and destination pointer + mov ecx, src_pitch // Get the pitch size + + mov edi, dest // tow lines below + 
pxor mm7, mm7 // clear out mm7 + + mov edx, dest_pitch // Loop counter + mov ebx, dest_width + + vs_5_4_loop: + + movd mm0, DWORD ptr [esi] // src[0]; + movd mm1, DWORD ptr [esi+ecx] // src[1]; + + movd mm2, DWORD ptr [esi+ecx*2] + lea eax, [esi+ecx*2] // + + punpcklbw mm1, mm7 + punpcklbw mm2, mm7 + + movq mm3, mm2 + pmullw mm1, three_fourths + + pmullw mm2, one_fourths + movd mm4, [eax+ecx] + + pmullw mm3, two_fourths + punpcklbw mm4, mm7 + + movq mm5, mm4 + pmullw mm4, two_fourths + + paddw mm1, mm2 + movd mm6, [eax+ecx*2] + + pmullw mm5, one_fourths + paddw mm1, round_values; + + paddw mm3, mm4 + psrlw mm1, 8 + + punpcklbw mm6, mm7 + paddw mm3, round_values + + pmullw mm6, three_fourths + psrlw mm3, 8 + + packuswb mm1, mm7 + packuswb mm3, mm7 + + movd DWORD PTR [edi], mm0 + movd DWORD PTR [edi+edx], mm1 + + + paddw mm5, mm6 + movd DWORD PTR [edi+edx*2], mm3 + + lea eax, [edi+edx*2] + paddw mm5, round_values + + psrlw mm5, 8 + add edi, 4 + + packuswb mm5, mm7 + movd DWORD PTR [eax+edx], mm5 + + add esi, 4 + sub ebx, 4 + + jg vs_5_4_loop + + pop ebx + } +} + + +__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 }; +__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 }; + + +static +void horizontal_line_5_3_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + + (void) dest_width; + __asm + { + + mov esi, source ; + mov edi, dest ; + + mov ecx, source_width ; + movq mm5, const53_1 ; + + pxor mm7, mm7 ; + movq mm6, const53_2 ; + + movq mm4, round_values ; + lea edx, [esi+ecx-5] ; + horizontal_line_5_3_loop: + + movq mm0, QWORD PTR [esi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psllw mm0, 8 ; + xx 00 xx 02 xx 04 xx 06 + psrlw mm1, 8 ; + 01 xx 03 xx 05 xx 07 xx + + psrlw mm0, 8 ; + 00 xx 02 xx 04 xx 06 xx + psllq mm1, 16 ; + xx xx 01 xx 03 xx 05 xx + + pmullw mm0, mm6 + + pmullw mm1, mm5 + add esi, 5 
+ + add edi, 3 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw mm1, 8 + + cmp esi, edx + packuswb mm1, mm7 + + movd DWORD PTR [edi-3], mm1 + jl horizontal_line_5_3_loop + +//exit condition + movq mm0, QWORD PTR [esi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psllw mm0, 8 ; + xx 00 xx 02 xx 04 xx 06 + psrlw mm1, 8 ; + 01 xx 03 xx 05 xx 07 xx + + psrlw mm0, 8 ; + 00 xx 02 xx 04 xx 06 xx + psllq mm1, 16 ; + xx xx 01 xx 03 xx 05 xx + + pmullw mm0, mm6 + + pmullw mm1, mm5 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw mm1, 8 + + packuswb mm1, mm7 + movd eax, mm1 + + mov edx, eax + shr edx, 16 + + mov WORD PTR[edi], ax + mov BYTE PTR[edi+2], dl + + } + +} + +__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 }; +__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 }; + +static +void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + + __asm + { + push ebx + + mov esi, source // Get the source and destination pointer + mov ecx, src_pitch // Get the pitch size + + mov edi, dest // tow lines below + pxor mm7, mm7 // clear out mm7 + + mov edx, dest_pitch // Loop counter + movq mm5, one_thirds + + movq mm6, two_thirds + mov ebx, dest_width; + + vs_5_3_loop: + + movd mm0, DWORD ptr [esi] // src[0]; + movd mm1, DWORD ptr [esi+ecx] // src[1]; + + movd mm2, DWORD ptr [esi+ecx*2] + lea eax, [esi+ecx*2] // + + punpcklbw mm1, mm7 + punpcklbw mm2, mm7 + + pmullw mm1, mm5 + pmullw mm2, mm6 + + movd mm3, DWORD ptr [eax+ecx] + movd mm4, DWORD ptr [eax+ecx*2] + + punpcklbw mm3, mm7 + punpcklbw mm4, mm7 + + pmullw mm3, mm6 + pmullw mm4, mm5 + + + movd DWORD PTR [edi], mm0 + paddw mm1, mm2 + + paddw mm1, round_values + psrlw mm1, 8 + + packuswb mm1, mm7 + paddw mm3, mm4 + + paddw mm3, round_values + movd DWORD PTR [edi+edx], mm1 + + psrlw mm3, 8 + packuswb mm3, mm7 + + movd DWORD PTR [edi+edx*2], mm3 + 
+ + add edi, 4 + add esi, 4 + + sub ebx, 4 + jg vs_5_3_loop + + pop ebx + } +} + + + + +/**************************************************************************** + * + * ROUTINE : horizontal_line_2_1_scale + * + * INPUTS : const unsigned char *source : + * unsigned int source_width : + * unsigned char *dest : + * unsigned int dest_width : + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static +void horizontal_line_2_1_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + (void) source_width; + __asm + { + mov esi, source + mov edi, dest + + pxor mm7, mm7 + mov ecx, dest_width + + xor edx, edx + hs_2_1_loop: + + movq mm0, [esi+edx*2] + psllw mm0, 8 + + psrlw mm0, 8 + packuswb mm0, mm7 + + movd DWORD Ptr [edi+edx], mm0; + add edx, 4 + + cmp edx, ecx + jl hs_2_1_loop + + } +} + + + +static +void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + (void) dest_pitch; + (void) src_pitch; + vpx_memcpy(dest, source, dest_width); +} + + +__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; +__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; + +static +void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) +{ + + (void) dest_pitch; + __asm + { + mov esi, source + mov edi, dest + + mov eax, src_pitch + mov edx, dest_width + + pxor mm7, mm7 + sub esi, eax //back one line + + + lea ecx, [esi+edx]; + movq mm6, round_values; + + movq mm5, three_sixteenths; + movq mm4, ten_sixteenths; + + vs_2_1_i_loop: + movd mm0, [esi] // + movd 
mm1, [esi+eax] // + + movd mm2, [esi+eax*2] // + punpcklbw mm0, mm7 + + pmullw mm0, mm5 + punpcklbw mm1, mm7 + + pmullw mm1, mm4 + punpcklbw mm2, mm7 + + pmullw mm2, mm5 + paddw mm0, round_values + + paddw mm1, mm2 + paddw mm0, mm1 + + psrlw mm0, 8 + packuswb mm0, mm7 + + movd DWORD PTR [edi], mm0 + add esi, 4 + + add edi, 4; + cmp esi, ecx + jl vs_2_1_i_loop + + } +} + + + +void +register_mmxscalers(void) +{ + vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx; + vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx; + vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; + vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx; + vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx; + vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; + vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx; + vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx; + vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; + + vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; + vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; + vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + + + + vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; + vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; + vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; + vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; + vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; + vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; + vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; + + + + +} diff --git a/vpx_scale/win32/scalesystemdependant.c b/vpx_scale/win32/scalesystemdependant.c new file mode 100644 
index 000000000..9ed48bfc6 --- /dev/null +++ b/vpx_scale/win32/scalesystemdependant.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : system_dependant.c +* +* Description : Miscellaneous system dependant functions +* +****************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "vpx_scale/vpxscale.h" +#include "cpuidlib.h" + +/**************************************************************************** +* Imports +*****************************************************************************/ +extern void register_generic_scalers(void); +extern void register_mmxscalers(void); + +/**************************************************************************** + * + * ROUTINE : post_proc_machine_specific_config + * + * INPUTS : UINT32 Version : Codec version number. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Checks for machine specifc features such as MMX support + * sets appropriate flags and function pointers. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void +vp8_scale_machine_specific_config(void) +{ + // If MMX supported then set to use MMX versions of functions else + // use original 'C' versions. 
+ int mmx_enabled; + int xmm_enabled; + int wmt_enabled; + + vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled); + + if (mmx_enabled || xmm_enabled || wmt_enabled) + { + register_mmxscalers(); + } + else + { + vp8_horizontal_line_1_2_scale = vp8cx_horizontal_line_1_2_scale_c; + vp8_vertical_band_1_2_scale = vp8cx_vertical_band_1_2_scale_c; + vp8_last_vertical_band_1_2_scale = vp8cx_last_vertical_band_1_2_scale_c; + vp8_horizontal_line_3_5_scale = vp8cx_horizontal_line_3_5_scale_c; + vp8_vertical_band_3_5_scale = vp8cx_vertical_band_3_5_scale_c; + vp8_last_vertical_band_3_5_scale = vp8cx_last_vertical_band_3_5_scale_c; + vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c; + vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c; + vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c; + vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c; + vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c; + vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c; + vp8_horizontal_line_4_5_scale = vp8cx_horizontal_line_4_5_scale_c; + vp8_vertical_band_4_5_scale = vp8cx_vertical_band_4_5_scale_c; + vp8_last_vertical_band_4_5_scale = vp8cx_last_vertical_band_4_5_scale_c; + + + vp8_vertical_band_5_4_scale = vp8cx_vertical_band_5_4_scale_c; + vp8_vertical_band_5_3_scale = vp8cx_vertical_band_5_3_scale_c; + vp8_vertical_band_2_1_scale = vp8cx_vertical_band_2_1_scale_c; + vp8_vertical_band_2_1_scale_i = vp8cx_vertical_band_2_1_scale_i_c; + vp8_horizontal_line_2_1_scale = vp8cx_horizontal_line_2_1_scale_c; + vp8_horizontal_line_5_3_scale = vp8cx_horizontal_line_5_3_scale_c; + vp8_horizontal_line_5_4_scale = vp8cx_horizontal_line_5_4_scale_c; + + } +} diff --git a/vpx_scale/x86_64/scaleopt.c b/vpx_scale/x86_64/scaleopt.c new file mode 100644 index 000000000..3d2d5f237 --- /dev/null +++ b/vpx_scale/x86_64/scaleopt.c @@ -0,0 +1,1749 @@ +/* + * Copyright (c) 2010 The VP8 project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +/**************************************************************************** +* +* Module Title : scaleopt.cpp +* +* Description : Optimized scaling functions +* +****************************************************************************/ +#include "pragmas.h" + + + +/**************************************************************************** +* Module Statics +****************************************************************************/ +__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 }; +__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 }; +__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 }; +__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 }; +__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; +__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; +__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; +__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; +__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; +__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; +__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; + + + +#include "vpx_scale/vpxscale.h" +#include "vpx_mem/vpx_mem.h" + +/**************************************************************************** +* +* ROUTINE : horizontal_line_3_5_scale_mmx +* +* INPUTS : const unsigned char *source : +* unsigned int source_width : +* unsigned 
char *dest : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels. +* +* SPECIAL NOTES : None. +* +****************************************************************************/ +static +void horizontal_line_3_5_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + + __asm + { + + push rbx + + mov rsi, source + mov rdi, dest + + mov ecx, source_width + lea rdx, [rsi+rcx-3]; + + movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx + movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx + + movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx + pxor mm7, mm7 // clear mm7 + + horiz_line_3_5_loop: + + mov eax, DWORD PTR [rsi] // eax = 00 01 02 03 + mov ebx, eax + + and ebx, 0xffff00 // ebx = xx 01 02 xx + mov ecx, eax // ecx = 00 01 02 03 + + and eax, 0xffff0000 // eax = xx xx 02 03 + xor ecx, eax // ecx = 00 01 xx xx + + shr ebx, 8 // ebx = 01 02 xx xx + or eax, ebx // eax = 01 02 02 03 + + shl ebx, 16 // ebx = xx xx 01 02 + movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx + + or ebx, ecx // ebx = 00 01 01 02 + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx + + movd mm0, ebx // mm0 = 00 01 01 02 + pmullw mm1, mm6 // + + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx + pmullw mm0, mm5 // + + mov [rdi], ebx // writeoutput 00 xx xx xx + add rsi, 3 + + add rdi, 5 + paddw mm0, mm1 + + paddw mm0, mm4 + psrlw mm0, 8 + + cmp rsi, rdx + packuswb mm0, mm7 + + movd DWORD Ptr [rdi-4], mm0 + jl horiz_line_3_5_loop + +//Exit: + mov eax, DWORD PTR [rsi] // eax = 00 01 02 03 + mov ebx, eax + + and ebx, 0xffff00 // ebx = xx 01 02 xx + mov ecx, eax // ecx = 00 01 02 03 + + and eax, 0xffff0000 // eax = xx xx 02 03 + xor ecx, eax // ecx = 00 01 xx xx + + shr ebx, 8 // ebx = 01 02 xx xx + or eax, ebx // eax = 01 02 02 03 + + shl eax, 8 // eax = xx 01 02 02 + and eax, 0xffff0000 // eax = xx xx 02 02 + 
+ or eax, ebx // eax = 01 02 02 02 + + shl ebx, 16 // ebx = xx xx 01 02 + movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx + + or ebx, ecx // ebx = 00 01 01 02 + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx + + movd mm0, ebx // mm0 = 00 01 01 02 + pmullw mm1, mm6 // + + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx + pmullw mm0, mm5 // + + mov [rdi], ebx // writeoutput 00 xx xx xx + paddw mm0, mm1 + + paddw mm0, mm4 + psrlw mm0, 8 + + packuswb mm0, mm7 + movd DWORD Ptr [rdi+1], mm0 + + pop rbx + + } + +} + + +/**************************************************************************** +* +* ROUTINE : horizontal_line_4_5_scale_mmx +* +* INPUTS : const unsigned char *source : +* unsigned int source_width : +* unsigned char *dest : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels. +* +* SPECIAL NOTES : None. +* +****************************************************************************/ +static +void horizontal_line_4_5_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void)dest_width; + + __asm + { + + mov rsi, source + mov rdi, dest + + mov ecx, source_width + lea rdx, [rsi+rcx-8]; + + movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx + movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx + + movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx + pxor mm7, mm7 // clear mm7 + + horiz_line_4_5_loop: + + movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07 + movq mm1, QWORD PTR [rsi+1]; // mm1 = 01 02 03 04 05 06 07 08 + + movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 + movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08 + + movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx + + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx + pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 + + pmullw mm1, mm6 // 01*205 02*154 03*102 04* 
51 + punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx + + movd DWORD PTR [rdi+5], mm2 // write ouput 05 xx xx xx + pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 + + punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx + pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51 + + paddw mm0, mm1 // added round values + paddw mm0, mm4 + + psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx + packuswb mm0, mm7 + + movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04 + add rdi, 10 + + add rsi, 8 + paddw mm2, mm3 // + + paddw mm2, mm4 // added round values + cmp rsi, rdx + + psrlw mm2, 8 + packuswb mm2, mm7 + + movd DWORD PTR [rdi-4], mm2 // writeoutput 06 07 08 09 + jl horiz_line_4_5_loop + +//Exit: + movq mm0, [rsi] // mm0 = 00 01 02 03 04 05 06 07 + movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07 + + movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07 + psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00 + + movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00 + pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00 + + psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07 + por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07 + + movq mm3, mm1 + + movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx + punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx + + punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx + pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205 + + pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51 + punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx + + movd DWORD PTR [rdi+5], mm2 // write ouput 05 xx xx xx + pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205 + + punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx + pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51 + + paddw mm0, mm1 // added round values + paddw mm0, mm4 + + psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx + packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx + + movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04 + paddw mm2, mm3 // + + paddw mm2, mm4 // added round values + psrlw mm2, 8 + + packuswb mm2, mm7 + movd DWORD PTR [rdi+6], mm2 // 
writeoutput 06 07 08 09 + + + } +} + +/**************************************************************************** +* +* ROUTINE : vertical_band_4_5_scale_mmx +* +* INPUTS : unsigned char *dest : +* unsigned int dest_pitch : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels. +* +* SPECIAL NOTES : The routine uses the first line of the band below +* the current band. The function also has a "C" only +* version. +* +****************************************************************************/ +static +void vertical_band_4_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + + mov rsi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea rdi, [rsi+rcx*2] // tow lines below + add rdi, rcx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + vs_4_5_loop: + + movq mm0, QWORD ptr [rsi] // src[0]; + movq mm1, QWORD ptr [rsi+rcx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, one_fifth + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 1/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 1/5 + movq mm6, four_fifths // constan + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 4/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 4/5 + paddw mm0, mm4 // a * 1/5 + b * 4/5 + + paddw mm2, mm5 // a * 1/5 + b * 4/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [rsi+rcx], mm0 // write des[1] + movq mm0, [rsi+rcx*2] // mm0 = src[2] + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm5, two_fifths + movq mm2, mm0 // make a 
copy + + pmullw mm1, mm5 // b * 2/5 + movq mm6, three_fifths + + + punpcklbw mm0, mm7 // unpack low to word + pmullw mm3, mm5 // b * 2/5 + + movq mm4, mm0 // make copy of c + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm4, mm6 // c * 3/5 + movq mm5, mm2 + + pmullw mm5, mm6 // c * 3/5 + paddw mm1, mm4 // b * 2/5 + c * 3/5 + + paddw mm3, mm5 // b * 2/5 + c * 3/5 + paddw mm1, round_values // + 128 + + paddw mm3, round_values // + 128 + psrlw mm1, 8 + + psrlw mm3, 8 + packuswb mm1, mm3 // des[2] + + movq QWORD ptr [rsi+rcx*2], mm1 // write des[2] + movq mm1, [rdi] // mm1=Src[3]; + + // mm0, mm2 --- Src[2] + // mm1 --- Src[3] + // mm6 --- 3/5 + // mm7 for unpacking + + pmullw mm0, mm6 // c * 3/5 + movq mm5, two_fifths // mm5 = 2/5 + + movq mm3, mm1 // make a copy + pmullw mm2, mm6 // c * 3/5 + + punpcklbw mm1, mm7 // unpack low + movq mm4, mm1 // make a copy + + punpckhbw mm3, mm7 // unpack high + pmullw mm4, mm5 // d * 2/5 + + movq mm6, mm3 // make a copy + pmullw mm6, mm5 // d * 2/5 + + paddw mm0, mm4 // c * 3/5 + d * 2/5 + paddw mm2, mm6 // c * 3/5 + d * 2/5 + + paddw mm0, round_values // + 128 + paddw mm2, round_values // + 128 + + psrlw mm0, 8 + psrlw mm2, 8 + + packuswb mm0, mm2 // des[3] + movq QWORD ptr [rdi], mm0 // write des[3] + + // mm1, mm3 --- Src[3] + // mm7 -- cleared for unpacking + + movq mm0, [rdi+rcx*2] // mm0, Src[0] of the next group + + movq mm5, four_fifths // mm5 = 4/5 + pmullw mm1, mm5 // d * 4/5 + + movq mm6, one_fifth // mm6 = 1/5 + movq mm2, mm0 // make a copy + + pmullw mm3, mm5 // d * 4/5 + punpcklbw mm0, mm7 // unpack low + + pmullw mm0, mm6 // an * 1/5 + punpckhbw mm2, mm7 // unpack high + + paddw mm1, mm0 // d * 4/5 + an * 1/5 + pmullw mm2, mm6 // an * 1/5 + + paddw mm3, mm2 // d * 4/5 + an * 1/5 + paddw mm1, round_values // + 128 + + paddw mm3, round_values // + 128 + psrlw mm1, 8 + + psrlw mm3, 8 + packuswb mm1, mm3 // des[4] + + movq QWORD ptr [rdi+rcx], mm1 // write des[4] + + add rdi, 8 + add rsi, 8 + + sub rdx, 8 + jg 
vs_4_5_loop + } +} + +/**************************************************************************** +* +* ROUTINE : last_vertical_band_4_5_scale_mmx +* +* INPUTS : unsigned char *dest : +* unsigned int dest_pitch : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : None +* +* FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image. +* +* SPECIAL NOTES : The routine uses the first line of the band below +* the current band. The function also has an "C" only +* version. +* +****************************************************************************/ +static +void last_vertical_band_4_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov rsi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea rdi, [rsi+rcx*2] // tow lines below + add rdi, rcx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + last_vs_4_5_loop: + + movq mm0, QWORD ptr [rsi] // src[0]; + movq mm1, QWORD ptr [rsi+rcx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, one_fifth + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 1/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 1/5 + movq mm6, four_fifths // constan + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 4/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 4/5 + paddw mm0, mm4 // a * 1/5 + b * 4/5 + + paddw mm2, mm5 // a * 1/5 + b * 4/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [rsi+rcx], mm0 // write des[1] + movq mm0, [rsi+rcx*2] // mm0 = src[2] + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm5, two_fifths + movq mm2, mm0 // 
make a copy + + pmullw mm1, mm5 // b * 2/5 + movq mm6, three_fifths + + + punpcklbw mm0, mm7 // unpack low to word + pmullw mm3, mm5 // b * 2/5 + + movq mm4, mm0 // make copy of c + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm4, mm6 // c * 3/5 + movq mm5, mm2 + + pmullw mm5, mm6 // c * 3/5 + paddw mm1, mm4 // b * 2/5 + c * 3/5 + + paddw mm3, mm5 // b * 2/5 + c * 3/5 + paddw mm1, round_values // + 128 + + paddw mm3, round_values // + 128 + psrlw mm1, 8 + + psrlw mm3, 8 + packuswb mm1, mm3 // des[2] + + movq QWORD ptr [rsi+rcx*2], mm1 // write des[2] + movq mm1, [rdi] // mm1=Src[3]; + + movq QWORD ptr [rdi+rcx], mm1 // write des[4]; + + // mm0, mm2 --- Src[2] + // mm1 --- Src[3] + // mm6 --- 3/5 + // mm7 for unpacking + + pmullw mm0, mm6 // c * 3/5 + movq mm5, two_fifths // mm5 = 2/5 + + movq mm3, mm1 // make a copy + pmullw mm2, mm6 // c * 3/5 + + punpcklbw mm1, mm7 // unpack low + movq mm4, mm1 // make a copy + + punpckhbw mm3, mm7 // unpack high + pmullw mm4, mm5 // d * 2/5 + + movq mm6, mm3 // make a copy + pmullw mm6, mm5 // d * 2/5 + + paddw mm0, mm4 // c * 3/5 + d * 2/5 + paddw mm2, mm6 // c * 3/5 + d * 2/5 + + paddw mm0, round_values // + 128 + paddw mm2, round_values // + 128 + + psrlw mm0, 8 + psrlw mm2, 8 + + packuswb mm0, mm2 // des[3] + movq QWORD ptr [rdi], mm0 // write des[3] + + // mm1, mm3 --- Src[3] + // mm7 -- cleared for unpacking + add rdi, 8 + add rsi, 8 + + sub rdx, 8 + jg last_vs_4_5_loop + } +} + +/**************************************************************************** +* +* ROUTINE : vertical_band_3_5_scale_mmx +* +* INPUTS : unsigned char *dest : +* unsigned int dest_pitch : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels. +* +* SPECIAL NOTES : The routine uses the first line of the band below +* the current band. The function also has an "C" only +* version. 
+* +****************************************************************************/ +static +void vertical_band_3_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov rsi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea rdi, [rsi+rcx*2] // two lines below + add rdi, rcx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + vs_3_5_loop: + + movq mm0, QWORD ptr [rsi] // src[0]; + movq mm1, QWORD ptr [rsi+rcx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, two_fifths // mm5 = 2/5 + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 2/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 2/5 + movq mm6, three_fifths // mm6 = 3/5 + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 3/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 3/5 + paddw mm0, mm4 // a * 2/5 + b * 3/5 + + paddw mm2, mm5 // a * 2/5 + b * 3/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [rsi+rcx], mm0 // write des[1] + movq mm0, [rsi+rcx*2] // mm0 = src[2] + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm4, mm1 // b low + pmullw mm1, four_fifths // b * 4/5 low + + movq mm5, mm3 // b high + pmullw mm3, four_fifths // b * 4/5 high + + movq mm2, mm0 // c + pmullw mm4, one_fifth // b * 1/5 + + punpcklbw mm0, mm7 // c low + pmullw mm5, one_fifth // b * 1/5 + + movq mm6, mm0 // make copy of c low + punpckhbw mm2, mm7 // c high + + pmullw mm6, one_fifth // c * 1/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, one_fifth // c * 1/5 high + paddw mm1, mm6 // b * 4/5 + c * 1/5 low + + paddw mm3, mm7 // b * 4/5 + c * 1/5 
high + movq mm6, mm0 // make copy of c low + + pmullw mm6, four_fifths // c * 4/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, four_fifths // c * 4/5 high + + paddw mm4, mm6 // b * 1/5 + c * 4/5 low + paddw mm5, mm7 // b * 1/5 + c * 4/5 high + + paddw mm1, round_values // + 128 + paddw mm3, round_values // + 128 + + psrlw mm1, 8 + psrlw mm3, 8 + + packuswb mm1, mm3 // des[2] + movq QWORD ptr [rsi+rcx*2], mm1 // write des[2] + + paddw mm4, round_values // + 128 + paddw mm5, round_values // + 128 + + psrlw mm4, 8 + psrlw mm5, 8 + + packuswb mm4, mm5 // des[3] + movq QWORD ptr [rdi], mm4 // write des[3] + + // mm0, mm2 --- Src[3] + + pxor mm7, mm7 // clear mm7 for unpacking + movq mm1, [rdi+rcx*2] // mm1 = Src[0] of the next group + + movq mm5, three_fifths // mm5 = 3/5 + pmullw mm0, mm5 // d * 3/5 + + movq mm6, two_fifths // mm6 = 2/5 + movq mm3, mm1 // make a copy + + pmullw mm2, mm5 // d * 3/5 + punpcklbw mm1, mm7 // unpack low + + pmullw mm1, mm6 // an * 2/5 + punpckhbw mm3, mm7 // unpack high + + paddw mm0, mm1 // d * 3/5 + an * 2/5 + pmullw mm3, mm6 // an * 2/5 + + paddw mm2, mm3 // d * 3/5 + an * 2/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des[4] + + movq QWORD ptr [rdi+rcx], mm0 // write des[4] + + add rdi, 8 + add rsi, 8 + + sub rdx, 8 + jg vs_3_5_loop + } +} + +/**************************************************************************** +* +* ROUTINE : last_vertical_band_3_5_scale_mmx +* +* INPUTS : unsigned char *dest : +* unsigned int dest_pitch : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels. +* +* SPECIAL NOTES : The routine uses the first line of the band below +* the current band. The function also has an "C" only +* version. 
+* +****************************************************************************/ +static +void last_vertical_band_3_5_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov rsi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + lea rdi, [rsi+rcx*2] // tow lines below + add rdi, rcx // three lines below + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + + last_vs_3_5_loop: + + movq mm0, QWORD ptr [rsi] // src[0]; + movq mm1, QWORD ptr [rsi+rcx] // src[1]; + + movq mm2, mm0 // Make a copy + punpcklbw mm0, mm7 // unpack low to word + + movq mm5, two_fifths // mm5 = 2/5 + punpckhbw mm2, mm7 // unpack high to word + + pmullw mm0, mm5 // a * 2/5 + + movq mm3, mm1 // make a copy + punpcklbw mm1, mm7 // unpack low to word + + pmullw mm2, mm5 // a * 2/5 + movq mm6, three_fifths // mm6 = 3/5 + + movq mm4, mm1 // copy of low b + pmullw mm4, mm6 // b * 3/5 + + punpckhbw mm3, mm7 // unpack high to word + movq mm5, mm3 // copy of high b + + pmullw mm5, mm6 // b * 3/5 + paddw mm0, mm4 // a * 2/5 + b * 3/5 + + paddw mm2, mm5 // a * 2/5 + b * 3/5 + paddw mm0, round_values // + 128 + + paddw mm2, round_values // + 128 + psrlw mm0, 8 + + psrlw mm2, 8 + packuswb mm0, mm2 // des [1] + + movq QWORD ptr [rsi+rcx], mm0 // write des[1] + movq mm0, [rsi+rcx*2] // mm0 = src[2] + + + + // mm1, mm3 --- Src[1] + // mm0 --- Src[2] + // mm7 for unpacking + + movq mm4, mm1 // b low + pmullw mm1, four_fifths // b * 4/5 low + + movq QWORD ptr [rdi+rcx], mm0 // write des[4] + + movq mm5, mm3 // b high + pmullw mm3, four_fifths // b * 4/5 high + + movq mm2, mm0 // c + pmullw mm4, one_fifth // b * 1/5 + + punpcklbw mm0, mm7 // c low + pmullw mm5, one_fifth // b * 1/5 + + movq mm6, mm0 // make copy of c low + punpckhbw mm2, mm7 // c high + + pmullw mm6, one_fifth // c * 1/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, one_fifth // c * 1/5 high + paddw mm1, mm6 
// b * 4/5 + c * 1/5 low + + paddw mm3, mm7 // b * 4/5 + c * 1/5 high + movq mm6, mm0 // make copy of c low + + pmullw mm6, four_fifths // c * 4/5 low + movq mm7, mm2 // make copy of c high + + pmullw mm7, four_fifths // c * 4/5 high + + paddw mm4, mm6 // b * 1/5 + c * 4/5 low + paddw mm5, mm7 // b * 1/5 + c * 4/5 high + + paddw mm1, round_values // + 128 + paddw mm3, round_values // + 128 + + psrlw mm1, 8 + psrlw mm3, 8 + + packuswb mm1, mm3 // des[2] + movq QWORD ptr [rsi+rcx*2], mm1 // write des[2] + + paddw mm4, round_values // + 128 + paddw mm5, round_values // + 128 + + psrlw mm4, 8 + psrlw mm5, 8 + + packuswb mm4, mm5 // des[3] + movq QWORD ptr [rdi], mm4 // write des[3] + + // mm0, mm2 --- Src[3] + + add rdi, 8 + add rsi, 8 + + sub rdx, 8 + jg last_vs_3_5_loop + } +} + +/**************************************************************************** +* +* ROUTINE : vertical_band_1_2_scale_mmx +* +* INPUTS : unsigned char *dest : +* unsigned int dest_pitch : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 1 to 2 up-scaling of a band of pixels. +* +* SPECIAL NOTES : The routine uses the first line of the band below +* the current band. The function also has an "C" only +* version. 
+* +****************************************************************************/ +static +void vertical_band_1_2_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + + mov rsi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + pxor mm7, mm7 // clear out mm7 + mov edx, dest_width // Loop counter + + vs_1_2_loop: + + movq mm0, [rsi] // get Src[0] + movq mm1, [rsi + rcx * 2] // get Src[1] + + movq mm2, mm0 // make copy before unpack + movq mm3, mm1 // make copy before unpack + + punpcklbw mm0, mm7 // low Src[0] + movq mm6, four_ones // mm6= 1, 1, 1, 1 + + punpcklbw mm1, mm7 // low Src[1] + paddw mm0, mm1 // low (a + b) + + punpckhbw mm2, mm7 // high Src[0] + paddw mm0, mm6 // low (a + b + 1) + + punpckhbw mm3, mm7 + paddw mm2, mm3 // high (a + b ) + + psraw mm0, 1 // low (a + b +1 )/2 + paddw mm2, mm6 // high (a + b + 1) + + psraw mm2, 1 // high (a + b + 1)/2 + packuswb mm0, mm2 // pack results + + movq [rsi+rcx], mm0 // write out eight bytes + add rsi, 8 + + sub rdx, 8 + jg vs_1_2_loop + } + +} + +/**************************************************************************** +* +* ROUTINE : last_vertical_band_1_2_scale_mmx +* +* INPUTS : unsigned char *dest : +* unsigned int dest_pitch : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 1 to 2 up-scaling of band of pixels. +* +* SPECIAL NOTES : The routine uses the first line of the band below +* the current band. The function also has an "C" only +* version. 
+* +****************************************************************************/ +static +void last_vertical_band_1_2_scale_mmx +( + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov rsi, dest // Get the source and destination pointer + mov ecx, dest_pitch // Get the pitch size + + mov edx, dest_width // Loop counter + + last_vs_1_2_loop: + + movq mm0, [rsi] // get Src[0] + movq [rsi+rcx], mm0 // write out eight bytes + + add rsi, 8 + sub rdx, 8 + + jg last_vs_1_2_loop + } +} + +/**************************************************************************** +* +* ROUTINE : horizontal_line_1_2_scale +* +* INPUTS : const unsigned char *source : +* unsigned int source_width : +* unsigned char *dest : +* unsigned int dest_width : +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. +* +* SPECIAL NOTES : None. +* +****************************************************************************/ +static +void horizontal_line_1_2_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + + __asm + { + mov rsi, source + mov rdi, dest + + pxor mm7, mm7 + movq mm6, four_ones + + mov ecx, source_width + + hs_1_2_loop: + + movq mm0, [rsi] + movq mm1, [rsi+1] + + movq mm2, mm0 + movq mm3, mm1 + + movq mm4, mm0 + punpcklbw mm0, mm7 + + punpcklbw mm1, mm7 + paddw mm0, mm1 + + paddw mm0, mm6 + punpckhbw mm2, mm7 + + punpckhbw mm3, mm7 + paddw mm2, mm3 + + paddw mm2, mm6 + psraw mm0, 1 + + psraw mm2, 1 + packuswb mm0, mm2 + + movq mm2, mm4 + punpcklbw mm2, mm0 + + movq [rdi], mm2 + punpckhbw mm4, mm0 + + movq [rdi+8], mm4 + add rsi, 8 + + add rdi, 16 + sub rcx, 8 + + cmp rcx, 8 + jg hs_1_2_loop + +// last eight pixel + + movq mm0, [rsi] + movq mm1, mm0 + + movq mm2, mm0 + movq mm3, mm1 + + psrlq mm1, 8 + psrlq mm3, 56 + + psllq mm3, 56 + por mm1, mm3 + + movq mm3, mm1 + movq mm4, mm0 + + 
punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + + paddw mm0, mm1 + paddw mm0, mm6 + + punpckhbw mm2, mm7 + punpckhbw mm3, mm7 + + paddw mm2, mm3 + paddw mm2, mm6 + + psraw mm0, 1 + psraw mm2, 1 + + packuswb mm0, mm2 + movq mm2, mm4 + + punpcklbw mm2, mm0 + movq [rdi], mm2 + + punpckhbw mm4, mm0 + movq [rdi+8], mm4 + } +} + + + + + +__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 }; +__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 }; + + +/**************************************************************************** +* +* ROUTINE : horizontal_line_5_4_scale_mmx +* +* INPUTS : const unsigned char *source : Pointer to source data. +* unsigned int source_width : Number of source pixels to process (loop bound). +* unsigned char *dest : Pointer to destination data. +* unsigned int dest_width : Stride of destination (NOT USED). +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : Copies horizontal line of pixels from source to +* destination scaling down by 5 to 4. +* +* SPECIAL NOTES : Each loop iteration loads 8 source bytes but advances +* by only 5, so the final load reads 3 bytes past the +* last 5-pixel group.
+* +****************************************************************************/ +static +void horizontal_line_5_4_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + /* + unsigned i; + unsigned int a, b, c, d, e; + unsigned char *des = dest; + const unsigned char *src = source; + + (void) dest_width; + + for ( i=0; i<source_width; i+=5 ) + { + a = src[0]; + b = src[1]; + c = src[2]; + d = src[3]; + e = src[4]; + + des[0] = a; + des[1] = ((b*192 + c* 64 + 128)>>8); + des[2] = ((c*128 + d*128 + 128)>>8); + des[3] = ((d* 64 + e*192 + 128)>>8); + + src += 5; + des += 4; + } + */ + __asm + { + + mov rsi, source ; + mov rdi, dest ; + + mov ecx, source_width ; + movq mm5, const54_1 ; + + pxor mm7, mm7 ; + movq mm6, const54_2 ; + + movq mm4, round_values ; + lea rdx, [rsi+rcx] ; + horizontal_line_5_4_loop: + + movq mm0, QWORD PTR [rsi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psrlq mm0, 8 ; + 01 02 03 04 05 06 07 xx + punpcklbw mm1, mm7 ; + xx 00 xx 01 xx 02 xx 03 + + punpcklbw mm0, mm7 ; + xx 01 xx 02 xx 03 xx 04 + pmullw mm1, mm5 + + pmullw mm0, mm6 + add rsi, 5 + + add rdi, 4 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw mm1, 8 + + cmp rsi, rdx + packuswb mm1, mm7 + + movd DWORD PTR [rdi-4], mm1 + + jl horizontal_line_5_4_loop + + } + +} +__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 }; +__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 }; +__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 }; + +static +void vertical_band_5_4_scale_mmx +( + unsigned char *source, + unsigned int src_pitch, + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + + __asm + { + + mov rsi, source // Get the source and destination pointer + mov ecx, src_pitch // Get the pitch size + + mov rdi, dest // tow lines below + pxor mm7, mm7 // clear 
out mm7 + + mov edx, dest_pitch // Loop counter + mov ebx, dest_width + + vs_5_4_loop: + + movd mm0, DWORD ptr [rsi] // src[0]; + movd mm1, DWORD ptr [rsi+rcx] // src[1]; + + movd mm2, DWORD ptr [rsi+rcx*2] + lea rax, [rsi+rcx*2] // + + punpcklbw mm1, mm7 + punpcklbw mm2, mm7 + + movq mm3, mm2 + pmullw mm1, three_fourths + + pmullw mm2, one_fourths + movd mm4, [rax+rcx] + + pmullw mm3, two_fourths + punpcklbw mm4, mm7 + + movq mm5, mm4 + pmullw mm4, two_fourths + + paddw mm1, mm2 + movd mm6, [rax+rcx*2] + + pmullw mm5, one_fourths + paddw mm1, round_values; + + paddw mm3, mm4 + psrlw mm1, 8 + + punpcklbw mm6, mm7 + paddw mm3, round_values + + pmullw mm6, three_fourths + psrlw mm3, 8 + + packuswb mm1, mm7 + packuswb mm3, mm7 + + movd DWORD PTR [rdi], mm0 + movd DWORD PTR [rdi+rdx], mm1 + + + paddw mm5, mm6 + movd DWORD PTR [rdi+rdx*2], mm3 + + lea rax, [rdi+rdx*2] + paddw mm5, round_values + + psrlw mm5, 8 + add rdi, 4 + + packuswb mm5, mm7 + movd DWORD PTR [rax+rdx], mm5 + + add rsi, 4 + sub rbx, 4 + + jg vs_5_4_loop + } +} + + +__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 }; +__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 }; + + +static +void horizontal_line_5_3_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + __asm + { + + mov rsi, source ; + mov rdi, dest ; + + mov ecx, source_width ; + movq mm5, const53_1 ; + + pxor mm7, mm7 ; + movq mm6, const53_2 ; + + movq mm4, round_values ; + lea rdx, [rsi+rcx-5] ; + horizontal_line_5_3_loop: + + movq mm0, QWORD PTR [rsi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psllw mm0, 8 ; + xx 00 xx 02 xx 04 xx 06 + psrlw mm1, 8 ; + 01 xx 03 xx 05 xx 07 xx + + psrlw mm0, 8 ; + 00 xx 02 xx 04 xx 06 xx + psllq mm1, 16 ; + xx xx 01 xx 03 xx 05 xx + + pmullw mm0, mm6 + + pmullw mm1, mm5 + add rsi, 5 + + add rdi, 3 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw 
mm1, 8 + + cmp rsi, rdx + packuswb mm1, mm7 + + movd DWORD PTR [rdi-3], mm1 + jl horizontal_line_5_3_loop + +//exit condition + movq mm0, QWORD PTR [rsi] ; + 00 01 02 03 04 05 06 07 + movq mm1, mm0 ; + 00 01 02 03 04 05 06 07 + + psllw mm0, 8 ; + xx 00 xx 02 xx 04 xx 06 + psrlw mm1, 8 ; + 01 xx 03 xx 05 xx 07 xx + + psrlw mm0, 8 ; + 00 xx 02 xx 04 xx 06 xx + psllq mm1, 16 ; + xx xx 01 xx 03 xx 05 xx + + pmullw mm0, mm6 + + pmullw mm1, mm5 + paddw mm1, mm0 + + paddw mm1, mm4 + psrlw mm1, 8 + + packuswb mm1, mm7 + movd rax, mm1 + + mov rdx, rax + shr rdx, 16 + + mov WORD PTR[rdi], ax + mov BYTE PTR[rdi+2], dl + + } + +} + +__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 }; +__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 }; + +static +void vertical_band_5_3_scale_mmx +( + unsigned char *source, + unsigned int src_pitch, + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + + __asm + { + + mov rsi, source // Get the source pointer + mov ecx, src_pitch // Get the source pitch + + mov rdi, dest // Get the destination pointer + pxor mm7, mm7 // clear out mm7 + + mov edx, dest_pitch // Get the destination pitch + movq mm5, one_thirds + + movq mm6, two_thirds + mov ebx, dest_width; // Loop counter (decremented by 4 per iteration) + + vs_5_3_loop: + + movd mm0, DWORD ptr [rsi] // src[0]; + movd mm1, DWORD ptr [rsi+rcx] // src[1]; + + movd mm2, DWORD ptr [rsi+rcx*2] + lea rax, [rsi+rcx*2] // + + punpcklbw mm1, mm7 + punpcklbw mm2, mm7 + + pmullw mm1, mm5 + pmullw mm2, mm6 + + movd mm3, DWORD ptr [rax+rcx] + movd mm4, DWORD ptr [rax+rcx*2] + + punpcklbw mm3, mm7 + punpcklbw mm4, mm7 + + pmullw mm3, mm6 + pmullw mm4, mm5 + + + movd DWORD PTR [rdi], mm0 + paddw mm1, mm2 + + paddw mm1, round_values + psrlw mm1, 8 + + packuswb mm1, mm7 + paddw mm3, mm4 + + paddw mm3, round_values + movd DWORD PTR [rdi+rdx], mm1 + + psrlw mm3, 8 + packuswb mm3, mm7 + + movd DWORD PTR [rdi+rdx*2], mm3 + + + add rdi, 4 + add rsi, 4 + + sub rbx, 4 + jg
vs_5_3_loop + } +} + + + + +/**************************************************************************** +* +* ROUTINE : horizontal_line_2_1_scale_mmx +* +* INPUTS : const unsigned char *source : Pointer to source pixels. +* unsigned int source_width : Not read by this routine. +* unsigned char *dest : Pointer to destination pixels. +* unsigned int dest_width : Number of output pixels (loop bound). +* +* OUTPUTS : None. +* +* RETURNS : void +* +* FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels: the +* even-indexed source pixels are kept and the odd-indexed +* ones are dropped. +* +* SPECIAL NOTES : dest_width is cast to void but is still used by the asm +* as the loop bound. Each iteration loads 8 source bytes +* and stores 4 destination bytes. +* +****************************************************************************/ +static +void horizontal_line_2_1_scale_mmx +( + const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width +) +{ + (void) dest_width; + + __asm + { + mov rsi, source + mov rdi, dest + + pxor mm7, mm7 + mov ecx, dest_width + + xor rdx, rdx + hs_2_1_loop: + + movq mm0, [rsi+rdx*2] + psllw mm0, 8 + + psrlw mm0, 8 + packuswb mm0, mm7 + + movd DWORD Ptr [rdi+rdx], mm0; + add rdx, 4 + + cmp rdx, rcx + jl hs_2_1_loop + + } +} + + + +static +void vertical_band_2_1_scale_mmx +( + unsigned char *source, + unsigned int src_pitch, + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width) +{ + vpx_memcpy(dest, source, dest_width); +} + + +__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; +__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; + +static +void vertical_band_2_1_scale_i_mmx +( + unsigned char *source, + unsigned int src_pitch, + unsigned char *dest, + unsigned int dest_pitch, + unsigned int dest_width +) +{ + __asm + { + mov rsi, source + mov rdi, dest + + mov eax, src_pitch + mov edx, dest_width + + pxor mm7, mm7 + sub rsi, rax //back one line + + + lea rcx, [rsi+rdx]; + movq mm6, round_values; + + movq mm5, three_sixteenths; + movq mm4, ten_sixteenths; + + vs_2_1_i_loop: + movd mm0, [rsi] // + movd mm1, [rsi+rax] // + + movd mm2, [rsi+rax*2] // + punpcklbw mm0, mm7 + + pmullw mm0, mm5 + punpcklbw mm1, mm7 + + pmullw mm1, mm4 +
punpcklbw mm2, mm7 + + pmullw mm2, mm5 + paddw mm0, round_values + + paddw mm1, mm2 + paddw mm0, mm1 + + psrlw mm0, 8 + packuswb mm0, mm7 + + movd DWORD PTR [rdi], mm0 + add rsi, 4 + + add rdi, 4; + cmp rsi, rcx + jl vs_2_1_i_loop + + } +} + + + +void +register_mmxscalers(void) +{ + vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx; + vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx; + vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx; + vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx; + vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx; + vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx; + vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx; + vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx; + vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx; + + vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; + vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; + vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; + vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; + vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; + vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; + vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; +} diff --git a/vpx_scale/x86_64/scalesystemdependant.c b/vpx_scale/x86_64/scalesystemdependant.c new file mode 100644 index 000000000..43f05a68c --- /dev/null +++ b/vpx_scale/x86_64/scalesystemdependant.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. 
+ */ + + +/**************************************************************************** +* +* Module Title : scalesystemdependant.c +* +* Description : Miscellaneous system dependent functions +* +****************************************************************************/ + +/**************************************************************************** +* Header Files +****************************************************************************/ +#include "vpx_scale/vpxscale.h" +#include "cpuidlib.h" + +/**************************************************************************** +* Imports +*****************************************************************************/ +extern void register_generic_scalers(void); +extern void register_mmxscalers(void); + +/**************************************************************************** + * + * ROUTINE : vp8_scale_machine_specific_config + * + * INPUTS : None. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Registers the scaler implementations. wmt_enabled is + * hard-coded to 1, so register_mmxscalers() is always + * called and the register_generic_scalers() branch is + * never taken. + * + * SPECIAL NOTES : No run-time CPU feature detection is performed here. + * + ****************************************************************************/ +void +vp8_scale_machine_specific_config(void) +{ + int wmt_enabled = 1; + + if (wmt_enabled) + { + register_mmxscalers(); + } + else + { + register_generic_scalers(); + } +} diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h new file mode 100644 index 000000000..a8d0ce45b --- /dev/null +++ b/vpx_scale/yv12config.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree.
+ */ + + +#ifndef YV12_CONFIG_H +#define YV12_CONFIG_H +#ifdef __cplusplus +extern "C" +{ +#endif + +#define VP7BORDERINPIXELS 48 +#define VP8BORDERINPIXELS 32 + + /************************************* + For INT_YUV: + + Y = (R+G*2+B)/4; + U = (R-B)/2; + V = (G*2 - R - B)/4; + And + R = Y+U-V; + G = Y+V; + B = Y-U-V; + ************************************/ + typedef enum + { + REG_YUV = 0, // Regular yuv + INT_YUV = 1 // The type of yuv that can be transferred to and from RGB through an integer transform + } + YUV_TYPE; + + typedef struct + { + int y_width; + int y_height; + int y_stride; +// int yinternal_width; + + int uv_width; + int uv_height; + int uv_stride; +// int uvinternal_width; + + unsigned char *y_buffer; + unsigned char *u_buffer; + unsigned char *v_buffer; + + unsigned char *buffer_alloc; + int border; + int frame_size; + YUV_TYPE clrtype; + } YV12_BUFFER_CONFIG; + + int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border); + int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf); + int vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf); + +#ifdef __cplusplus +} +#endif + + +#endif //YV12_CONFIG_H diff --git a/vpx_scale/yv12extend.h b/vpx_scale/yv12extend.h new file mode 100644 index 000000000..9968feae8 --- /dev/null +++ b/vpx_scale/yv12extend.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#ifndef YV12_EXTEND_H +#define YV12_EXTEND_H + +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + void vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf); + + /* Copy Y,U,V buffer data from src to dst, filling border of dst as well.
*/ + + void vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + void vp8_yv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); + +#ifdef __cplusplus +} +#endif + +#endif |