diff options
author | Yunqing Wang <yunqingwang@google.com> | 2013-11-21 16:43:37 -0800 |
---|---|---|
committer | Yunqing Wang <yunqingwang@google.com> | 2013-11-22 10:04:51 -0800 |
commit | ed36720b66ca71438a8e14a41f05e837d030da61 (patch) | |
tree | bd3d5325cfd2029fbe6e033abcb99e579c082b4d /vp9/common/arm | |
parent | 5925ba08a33cafceb0a2d21ca6d30923dc58f372 (diff) | |
download | libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar.gz libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar.bz2 libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.zip |
Do vertical loopfiltering in parallel
This patch follows the "Add filter_selectively_vert_row2 to enable
parallel loopfiltering" commit and adds an x86 SSE2 optimization
that filters 16 pixels in parallel. For the other optimized targets
(neon and dspr2), the 16-pixel functions are currently implemented by
calling the 8-pixel functions twice; true 16-pixel implementations can
be added later.
Decoder speedup:
tulip clip: 2% speed gain;
old_town_cross: 1.2% speed gain;
bus: 2% speed gain.
Change-Id: I4818a0c72f84b34f5fe678e496cf4a10238574b7
Diffstat (limited to 'vp9/common/arm')
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 31 |
1 files changed, 31 insertions, 0 deletions
```diff
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 2f022dc1d..435c6ae6f 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -31,3 +31,34 @@ void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
   vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
   vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
 }
+
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+                                           const uint8_t *blimit0,
+                                           const uint8_t *limit0,
+                                           const uint8_t *thresh0,
+                                           const uint8_t *blimit1,
+                                           const uint8_t *limit1,
+                                           const uint8_t *thresh1) {
+  vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+                                             const uint8_t *blimit0,
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {
+  vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1,
+                                       1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh) {
+  vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh);
+  vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh);
+}
```