summaryrefslogtreecommitdiff
path: root/vp9/common/arm
diff options
context:
space:
mode:
authorYunqing Wang <yunqingwang@google.com>2013-11-21 16:43:37 -0800
committerYunqing Wang <yunqingwang@google.com>2013-11-22 10:04:51 -0800
commited36720b66ca71438a8e14a41f05e837d030da61 (patch)
treebd3d5325cfd2029fbe6e033abcb99e579c082b4d /vp9/common/arm
parent5925ba08a33cafceb0a2d21ca6d30923dc58f372 (diff)
downloadlibvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar
libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar.gz
libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar.bz2
libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.zip
Do vertical loopfiltering in parallel
This patch follows the "Add filter_selectively_vert_row2 to enable parallel loopfiltering" commit and adds an x86 SSE2 optimization that does 16-pixel filtering in parallel. For the other optimizations (NEON and DSPr2), the 16-pixel functions are currently implemented by calling the 8-pixel functions twice; real 16-pixel functions can be added later. Decoder speedup: tulip clip: 2% speed gain; old_town_cross: 1.2% speed gain; bus: 2% speed gain. Change-Id: I4818a0c72f84b34f5fe678e496cf4a10238574b7
Diffstat (limited to 'vp9/common/arm')
-rw-r--r--vp9/common/arm/neon/vp9_loopfilter_16_neon.c31
1 files changed, 31 insertions, 0 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 2f022dc1d..435c6ae6f 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -31,3 +31,34 @@ void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
}
+
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p,  /* 16-pixel wrapper: two calls to the 8-pixel NEON routine */
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,  /* blimit0/limit0/thresh0 are forwarded to the first call */
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {  /* blimit1/limit1/thresh1 are forwarded to the second call */
+ vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);  /* first half, starting at s; final 1 is presumably a count - TODO confirm */
+ vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);  /* second half, starting 8 * p past s (8 rows down if p is the pitch - TODO confirm) */
+}
+
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p,  /* 16-pixel wrapper: two calls to the 8-pixel NEON mbloop routine */
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,  /* blimit0/limit0/thresh0 are forwarded to the first call */
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {  /* blimit1/limit1/thresh1 are forwarded to the second call */
+ vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);  /* first half, starting at s; final 1 is presumably a count - TODO confirm */
+ vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1,  /* second half, starting 8 * p past s (8 rows down if p is the pitch - TODO confirm) */
+ 1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p,  /* 16-pixel wrapper: two calls to vp9_mb_lpf_vertical_edge_w_neon */
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {  /* unlike the two-set wrappers, one blimit/limit/thresh set is shared by both calls */
+ vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh);  /* first half, starting at s */
+ vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh);  /* second half, starting 8 * p past s (8 rows down if p is the pitch - TODO confirm) */
+}