Do vertical loopfiltering in parallel

This patch follows the "Add filter_selectively_vert_row2 to enable parallel loopfiltering" commit and adds an x86 SSE2 optimization that filters 16 pixels in parallel. For the other optimized targets (neon and dspr2), the 16-pixel functions are currently implemented by calling the 8-pixel functions twice; real 16-pixel functions can be added later. Decoder speedup: tulip clip: 2% speed gain; old_town_cross: 1.2% speed gain; bus: 2% speed gain. Change-Id: I4818a0c72f84b34f5fe678e496cf4a10238574b7
author: Yunqing Wang <yunqingwang@google.com> 2013-11-21 16:43:37 -0800
committer: Yunqing Wang <yunqingwang@google.com> 2013-11-22 10:04:51 -0800
commit: ed36720b66ca71438a8e14a41f05e837d030da61 (patch)
tree: bd3d5325cfd2029fbe6e033abcb99e579c082b4d /vp9/common/arm
parent: 5925ba08a33cafceb0a2d21ca6d30923dc58f372 (diff)
download: libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar
libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar.gz
libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar.bz2
libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.zip
1 file changed, 31 insertions, 0 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 2f022dc1d..435c6ae6f 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -31,3 +31,34 @@ void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
   vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
   vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
 }
+
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+                                           const uint8_t *blimit0,
+                                           const uint8_t *limit0,
+                                           const uint8_t *thresh0,
+                                           const uint8_t *blimit1,
+                                           const uint8_t *limit1,
+                                           const uint8_t *thresh1) {
+  vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+                                             const uint8_t *blimit0,
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {
+  vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1,
+                                       1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh) {
+  vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh);
+  vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh);
+}
author	Yunqing Wang <yunqingwang@google.com>	2013-11-21 16:43:37 -0800
committer	Yunqing Wang <yunqingwang@google.com>	2013-11-22 10:04:51 -0800
commit	ed36720b66ca71438a8e14a41f05e837d030da61 (patch)
tree	bd3d5325cfd2029fbe6e033abcb99e579c082b4d /vp9/common/arm
parent	5925ba08a33cafceb0a2d21ca6d30923dc58f372 (diff)
download	libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar.gz libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.tar.bz2 libvpx-ed36720b66ca71438a8e14a41f05e837d030da61.zip