diff options
author | John Koleszar <jkoleszar@google.com> | 2013-04-05 15:56:50 -0700 |
---|---|---|
committer | Gerrit Code Review <gerrit@gerrit.golo.chromium.org> | 2013-04-05 15:56:50 -0700 |
commit | fa135d7b9e1ce14789e12a4e76e6f564b7d6799d (patch) | |
tree | 28b3836fc597af246f611b899be0e034573b725c /vp9/common | |
parent | 9161127ee9176d823e3cabe011501d95be5ec427 (diff) | |
parent | 05a79f2fbf0ea412142b96ae53796fd171663c2e (diff) | |
download | libvpx-fa135d7b9e1ce14789e12a4e76e6f564b7d6799d.tar libvpx-fa135d7b9e1ce14789e12a4e76e6f564b7d6799d.tar.gz libvpx-fa135d7b9e1ce14789e12a4e76e6f564b7d6799d.tar.bz2 libvpx-fa135d7b9e1ce14789e12a4e76e6f564b7d6799d.zip |
Merge changes Ibbfa68d6,Idb76a0e2 into experimental

* changes:
  Move EOB to per-plane data
  Move qcoeff, dqcoeff from BLOCKD to per-plane data
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/vp9_blockd.h | 53 | ||||
-rw-r--r-- | vp9/common/vp9_invtrans.c | 107 | ||||
-rw-r--r-- | vp9/common/vp9_mbpitch.c | 5 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 11 |
4 files changed, 110 insertions, 66 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 3a5824ada..a147ec747 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -249,8 +249,6 @@ typedef struct { } MODE_INFO; typedef struct blockd { - int16_t *qcoeff; - int16_t *dqcoeff; uint8_t *predictor; int16_t *diff; int16_t *dequant; @@ -284,15 +282,28 @@ struct scale_factors { #endif }; +enum { MAX_MB_PLANE = 3 }; + +struct mb_plane { + DECLARE_ALIGNED(16, int16_t, qcoeff[64 * 64]); + DECLARE_ALIGNED(16, int16_t, dqcoeff[64 * 64]); + DECLARE_ALIGNED(16, uint16_t, eobs[256]); +}; + +#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n)) + +#define MB_SUBBLOCK_FIELD(x, field, i) (\ + ((i) < 16) ? BLOCK_OFFSET((x)->plane[0].field, (i), 16) : \ + ((i) < 20) ? BLOCK_OFFSET((x)->plane[1].field, ((i) - 16), 16) : \ + BLOCK_OFFSET((x)->plane[2].field, ((i) - 20), 16)) + typedef struct macroblockd { DECLARE_ALIGNED(16, int16_t, diff[64*64+32*32*2]); /* from idct diff */ DECLARE_ALIGNED(16, uint8_t, predictor[384]); // unused for superblocks - DECLARE_ALIGNED(16, int16_t, qcoeff[64*64+32*32*2]); - DECLARE_ALIGNED(16, int16_t, dqcoeff[64*64+32*32*2]); - DECLARE_ALIGNED(16, uint16_t, eobs[256+64*2]); #if CONFIG_CODE_NONZEROCOUNT DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]); #endif + struct mb_plane plane[MAX_MB_PLANE]; /* 16 Y blocks, 4 U, 4 V, each with 16 entries. 
*/ BLOCKD block[24]; @@ -372,8 +383,8 @@ typedef struct macroblockd { void (*itxm_add_y_block)(int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd); void (*itxm_add_uv_block)(int16_t *q, const int16_t *dq, - uint8_t *pre, uint8_t *dst_u, uint8_t *dst_v, int stride, - struct macroblockd *xd); + uint8_t *pre, uint8_t *dst, int stride, + uint16_t *eobs); struct subpix_fn_table subpix; @@ -669,4 +680,32 @@ static int get_nzc_used(TX_SIZE tx_size) { return (tx_size >= TX_16X16); } #endif + +struct plane_block_idx { + int plane; + int block; +}; + +// TODO(jkoleszar): returning a struct so it can be used in a const context, +// expect to refactor this further later. +static INLINE struct plane_block_idx plane_block_idx(int y_blocks, + int b_idx) { + const int v_offset = y_blocks * 5 / 4; + struct plane_block_idx res; + + if (b_idx < y_blocks) { + res.plane = 0; + res.block = b_idx; + } else if (b_idx < v_offset) { + res.plane = 1; + res.block = b_idx - y_blocks; + } else { + assert(b_idx < y_blocks * 3 / 2); + res.plane = 2; + res.block = b_idx - v_offset; + } + return res; +} + + #endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index a03a66e33..0573b7df4 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -26,9 +26,12 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { for (i = 0; i < 16; i++) { TX_TYPE tx_type = get_tx_type_4x4(xd, i); if (tx_type != DCT_DCT) { - vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type); + vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), + xd->block[i].diff, 16, tx_type); } else { - vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff, + vp9_inverse_transform_b_4x4(xd, + xd->plane[0].eobs[i], + BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), xd->block[i].diff, 32); } } @@ -37,8 +40,14 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { void 
vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd) { int i; - for (i = 16; i < 24; i++) { - vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff, + for (i = 16; i < 20; i++) { + vp9_inverse_transform_b_4x4(xd, xd->plane[1].eobs[i - 16], + BLOCK_OFFSET(xd->plane[1].dqcoeff, i - 16, 16), + xd->block[i].diff, 16); + } + for (i = 20; i < 24; i++) { + vp9_inverse_transform_b_4x4(xd, xd->plane[2].eobs[i - 20], + BLOCK_OFFSET(xd->plane[2].dqcoeff, i - 20, 16), xd->block[i].diff, 16); } } @@ -60,19 +69,20 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { for (i = 0; i < 9; i += 8) { TX_TYPE tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { - vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type); + vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), + xd->block[i].diff, 16, tx_type); } else { - vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), &blockd[i].diff[0], 32); } } for (i = 2; i < 11; i += 8) { TX_TYPE tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { - vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff, - 16, tx_type); + vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, i + 2, 16), + xd->block[i].diff, 16, tx_type); } else { - vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0], + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, i + 2, 16), &blockd[i].diff[0], 32); } } @@ -82,8 +92,12 @@ void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd) { int i; BLOCKD *blockd = xd->block; - for (i = 16; i < 24; i += 4) { - vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], + for (i = 16; i < 20; i += 4) { + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[1].dqcoeff, i - 16, 16), + &blockd[i].diff[0], 16); + } + for (i = 20; i < 24; i += 4) { + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[2].dqcoeff, i - 20, 16), &blockd[i].diff[0], 16); } } @@ -102,9 +116,10 @@ void vp9_inverse_transform_mby_16x16(MACROBLOCKD 
*xd) { BLOCKD *bd = &xd->block[0]; TX_TYPE tx_type = get_tx_type_16x16(xd, 0); if (tx_type != DCT_DCT) { - vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type); + vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, 0, 16), + bd->diff, 16, tx_type); } else { - vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0], + vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, 0, 16), &xd->block[0].diff[0], 32); } } @@ -115,7 +130,7 @@ void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) { } void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd) { - vp9_short_idct32x32(xd->dqcoeff, xd->diff, 64); + vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[0].dqcoeff, 0, 16), xd->diff, 64); } void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd) { @@ -126,11 +141,11 @@ void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd) { const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4); if (tx_type == DCT_DCT) { - vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256, + vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256), xd->diff + x_idx * 16 + y_idx * 32 * 16, 64); } else { - vp9_short_iht16x16(xd->dqcoeff + n * 256, + vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256), xd->diff + x_idx * 16 + y_idx * 32 * 16, 32, tx_type); } } @@ -144,10 +159,10 @@ void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd) { const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2); if (tx_type == DCT_DCT) { - vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64, + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64), xd->diff + x_idx * 8 + y_idx * 32 * 8, 64); } else { - vp9_short_iht8x8(xd->dqcoeff + n * 64, + vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64), xd->diff + x_idx * 8 + y_idx * 32 * 8, 32, tx_type); } } @@ -161,19 +176,20 @@ void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd) { const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx); if (tx_type == DCT_DCT) { - vp9_inverse_transform_b_4x4(xd, 
xd->eobs[n], xd->dqcoeff + n * 16, + vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[n], + BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16), xd->diff + x_idx * 4 + y_idx * 4 * 32, 64); } else { - vp9_short_iht4x4(xd->dqcoeff + n * 16, + vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16), xd->diff + x_idx * 4 + y_idx * 4 * 32, 32, tx_type); } } } void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd) { - vp9_inverse_transform_b_16x16(xd->dqcoeff + 1024, + vp9_inverse_transform_b_16x16(xd->plane[1].dqcoeff, xd->diff + 1024, 32); - vp9_inverse_transform_b_16x16(xd->dqcoeff + 1280, + vp9_inverse_transform_b_16x16(xd->plane[2].dqcoeff, xd->diff + 1280, 32); } @@ -183,10 +199,10 @@ void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd) { for (n = 0; n < 4; n++) { const int x_idx = n & 1, y_idx = n >> 1; - vp9_inverse_transform_b_8x8(xd->dqcoeff + 1024 + n * 64, + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 64), xd->diff + 1024 + x_idx * 8 + y_idx * 16 * 8, 32); - vp9_inverse_transform_b_8x8(xd->dqcoeff + 1280 + n * 64, + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 64), xd->diff + 1280 + x_idx * 8 + y_idx * 16 * 8, 32); } @@ -198,12 +214,12 @@ void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd) { for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; - vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + n], - xd->dqcoeff + 1024 + n * 16, + vp9_inverse_transform_b_4x4(xd, xd->plane[1].eobs[n], + BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 16), xd->diff + 1024 + x_idx * 4 + y_idx * 16 * 4, 32); - vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + 16 + n], - xd->dqcoeff + 1280 + n * 16, + vp9_inverse_transform_b_4x4(xd, xd->plane[2].eobs[n], + BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 16), xd->diff + 1280 + x_idx * 4 + y_idx * 16 * 4, 32); } @@ -215,7 +231,7 @@ void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd) { for (n = 0; n < 4; n++) { const int x_idx = n & 1, y_idx = n >> 1; - vp9_short_idct32x32(xd->dqcoeff + n * 1024, + 
vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 1024), xd->diff + x_idx * 32 + y_idx * 32 * 64, 128); } } @@ -228,11 +244,11 @@ void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd) { const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4); if (tx_type == DCT_DCT) { - vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256, + vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256), xd->diff + x_idx * 16 + y_idx * 64 * 16, 128); } else { - vp9_short_iht16x16(xd->dqcoeff + n * 256, + vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256), xd->diff + x_idx * 16 + y_idx * 64 * 16, 64, tx_type); } } @@ -246,10 +262,10 @@ void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd) { const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2); if (tx_type == DCT_DCT) { - vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64, + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64), xd->diff + x_idx * 8 + y_idx * 64 * 8, 128); } else { - vp9_short_iht8x8(xd->dqcoeff + n * 64, + vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64), xd->diff + x_idx * 8 + y_idx * 64 * 8, 64, tx_type); } } @@ -263,19 +279,20 @@ void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd) { const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx); if (tx_type == DCT_DCT) { - vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16, + vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[n], + BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16), xd->diff + x_idx * 4 + y_idx * 4 * 64, 128); } else { - vp9_short_iht4x4(xd->dqcoeff + n * 16, + vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16), xd->diff + x_idx * 4 + y_idx * 4 * 64, 64, tx_type); } } } void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd) { - vp9_short_idct32x32(xd->dqcoeff + 4096, + vp9_short_idct32x32(xd->plane[1].dqcoeff, xd->diff + 4096, 64); - vp9_short_idct32x32(xd->dqcoeff + 4096 + 1024, + vp9_short_idct32x32(xd->plane[2].dqcoeff, xd->diff + 4096 + 
1024, 64); } @@ -285,9 +302,9 @@ void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd) { for (n = 0; n < 4; n++) { const int x_idx = n & 1, y_idx = n >> 1, off = x_idx * 16 + y_idx * 32 * 16; - vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + n * 256, + vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 256), xd->diff + 4096 + off, 64); - vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + 1024 + n * 256, + vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 256), xd->diff + 4096 + 1024 + off, 64); } } @@ -298,9 +315,9 @@ void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd) { for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2, off = x_idx * 8 + y_idx * 32 * 8; - vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + n * 64, + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 64), xd->diff + 4096 + off, 64); - vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + 1024 + n * 64, + vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 64), xd->diff + 4096 + 1024 + off, 64); } } @@ -311,11 +328,11 @@ void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd) { for (n = 0; n < 64; n++) { const int x_idx = n & 7, y_idx = n >> 3, off = x_idx * 4 + y_idx * 32 * 4; - vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + n], - xd->dqcoeff + 4096 + n * 16, + vp9_inverse_transform_b_4x4(xd, xd->plane[1].eobs[n], + BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 16), xd->diff + 4096 + off, 64); - vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + 64 + n], - xd->dqcoeff + 4096 + 1024 + n * 16, + vp9_inverse_transform_b_4x4(xd, xd->plane[2].eobs[n], + BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 16), xd->diff + 4096 + 1024 + off, 64); } } diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c index 85ba82dd3..b357c9ac9 100644 --- a/vp9/common/vp9_mbpitch.c +++ b/vp9/common/vp9_mbpitch.c @@ -99,11 +99,6 @@ void vp9_setup_block_dptrs(MACROBLOCKD *mb) { blockd[to].predictor = &mb->predictor[from]; } } - - for (r = 0; r < 24; r++) { - 
blockd[r].qcoeff = &mb->qcoeff[r * 16]; - blockd[r].dqcoeff = &mb->dqcoeff[r * 16]; - } } void vp9_build_block_doffsets(MACROBLOCKD *mb) { diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 8b6efc384..cf95524e0 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -29,9 +29,6 @@ forward_decls vp9_common_forward_decls prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd" specialize vp9_dequant_idct_add_y_block_8x8 -prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd" -specialize vp9_dequant_idct_add_uv_block_8x8 - prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob" specialize vp9_dequant_idct_add_16x16 @@ -44,15 +41,12 @@ specialize vp9_dequant_idct_add prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd" specialize vp9_dequant_idct_add_y_block -prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd" +prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs" specialize vp9_dequant_idct_add_uv_block prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob" specialize vp9_dequant_idct_add_32x32 -prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd" -specialize vp9_dequant_idct_add_uv_block_16x16 - # # RECON # @@ -606,8 +600,7 @@ prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch" 
specialize vp9_subtract_b mmx sse2 prototype int vp9_mbuverror "struct macroblock *mb" -specialize vp9_mbuverror mmx sse2 -vp9_mbuverror_sse2=vp9_mbuverror_xmm +specialize vp9_mbuverror prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch" specialize vp9_subtract_b mmx sse2 |