vpx_dsp/x86/convolve_sse2.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"

// Treats *reg as eight 16-bit words, numbered 7 6 5 4 3 2 1 0 from most to
// least significant, and broadcasts the 32-bit pair holding words 3 and 2
// across the whole register, yielding 3 2 3 2 3 2 3 2 as 16-bit words.
static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
  // Picking 32-bit element 1 for every output position replicates 16-bit
  // words 2 and 3 four times.
  return _mm_shuffle_epi32(*reg, _MM_SHUFFLE(1, 1, 1, 1));
}

// Treats *reg as eight 16-bit words, numbered 7 6 5 4 3 2 1 0 from most to
// least significant, and broadcasts the 32-bit pair holding words 5 and 4
// across the whole register, yielding 5 4 5 4 5 4 5 4 as 16-bit words.
static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
  // Picking 32-bit element 2 for every output position replicates 16-bit
  // words 4 and 5 four times.
  return _mm_shuffle_epi32(*reg, _MM_SHUFFLE(2, 2, 2, 2));
}

// Zero-extends the low eight bytes of each source to 16-bit words, multiplies
// them word-wise with the matching kernel and sums adjacent products into
// 32-bit words (_mm_madd_epi16). Returns the 32-bit word-wise sum of the two
// partial results. The upper eight bytes of *src_1 and *src_2 are not read.
static INLINE __m128i pad_multiply_add_add_epi8_sse2(
    const __m128i *const src_1, const __m128i *const src_2,
    const __m128i *const ker_1, const __m128i *const ker_2) {
  const __m128i zero = _mm_setzero_si128();
  // Interleaving with zero widens each unsigned byte to a 16-bit word.
  const __m128i wide_1 = _mm_unpacklo_epi8(*src_1, zero);
  const __m128i wide_2 = _mm_unpacklo_epi8(*src_2, zero);
  return _mm_add_epi32(_mm_madd_epi16(wide_1, *ker_1),
                       _mm_madd_epi16(wide_2, *ker_2));
}

// Multiplies the 16-bit words of *src_0 and of *src_1 by *ker, summing
// adjacent products into 32-bit words, then narrows both results to 16 bits
// with signed saturation. Words 0-3 of the return value come from *src_0 and
// words 4-7 from *src_1.
static INLINE __m128i multiply_add_packs_epi16_sse2(const __m128i *const src_0,
                                                    const __m128i *const src_1,
                                                    const __m128i *const ker) {
  const __m128i kernel = *ker;
  return _mm_packs_epi32(_mm_madd_epi16(*src_0, kernel),
                         _mm_madd_epi16(*src_1, kernel));
}
// Interleaves the 32-bit words of *src_1 and *src_2 (src_1[0], src_2[0],
// src_1[1], src_2[1], ...) and narrows them to eight 16-bit words with signed
// saturation.
static INLINE __m128i combine_epi32_sse2(const __m128i *const src_1,
                                         const __m128i *const src_2) {
  const __m128i first = *src_1;
  const __m128i second = *src_2;
  return _mm_packs_epi32(_mm_unpacklo_epi32(first, second),
                         _mm_unpackhi_epi32(first, second));
}

// Adds *half_depth to each 16-bit word of *src with signed saturation, then
// arithmetically shifts every word right by depth bits.
// NOTE(review): callers presumably pass half_depth == 1 << (depth - 1) in
// every word so that this rounds to nearest; confirm at the call sites.
static INLINE __m128i round_epi16_sse2(const __m128i *const src,
                                       const __m128i *const half_depth,
                                       const int depth) {
  return _mm_srai_epi16(_mm_adds_epi16(*src, *half_depth), depth);
}

#endif  // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_