Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: webrtc/webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S @ 0:4bda6873e34c

pub_scrub_3792 tip
Last change on this file since 0:4bda6873e34c was 0:4bda6873e34c, checked in by Michael Prittie <mprittie@…>, 6 years ago

Scrubbed password for publication.

File size: 11.1 KB
Line 
1@
2@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3@
4@ Use of this source code is governed by a BSD-style license
5@ that can be found in the LICENSE file in the root of the source
6@ tree. An additional intellectual property rights grant can be found
7@ in the file PATENTS.  All contributing project authors may
8@ be found in the AUTHORS file in the root of the source tree.
9@
10
11@ Contains a function for WebRtcIsacfix_AllpassFilter2FixDec16Neon()
12@ in iSAC codec, optimized for ARM Neon platform. Bit exact with the C
13@ reference implementation in filterbanks.c. Prototype
14@ C code is at end of this file.
15
16#include "webrtc/system_wrappers/interface/asm_defines.h"
17
GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
.align  2

@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
@    int16_t *data_ch1,  // Input and output in channel 1, in Q0
@    int16_t *data_ch2,  // Input and output in channel 2, in Q0
@    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
@    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
@    const int length,           // Length of the data buffers
@    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
@    int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
@
@ Filters both channels in place through two cascaded stages (factor[0]
@ first, then factor[1]) and writes the updated states back. The four
@ (channel, stage) combinations are processed as the four lanes of one
@ vector; stage 1 runs one sample behind stage 0.
@
@ In:     r0 = data_ch1, r1 = data_ch2, r2 = factor_ch1, r3 = factor_ch2
@         stack = length, filter_state_ch1, filter_state_ch2
@ Out:    none (data_ch1/data_ch2 and both filter states updated in memory)
@ Saved:  r4 - r7 (pushed and restored)
@ Clobb:  r0 - r3, d0 - d6, q8, q9, q15, flags
@
@ Register roles after set-up:
@   r0, r1   running pointers into data_ch1 / data_ch2
@   r2 = -2, r4 = 4   post-index strides used by the loop loads / stores
@   r3       loop counter, starts at length - 2
@   r6, r5   filter_state_ch1 / filter_state_ch2
@   d30      {factor_ch1[0], factor_ch1[1], factor_ch2[0], factor_ch2[1]}
@   d31      -d30
@   q8       {state0_ch1, state1_ch1, state0_ch2, state1_ch2}
@   d0       16-bit stage inputs: {ch1 stage 0, ch1 stage 1,
@                                  ch2 stage 0, ch2 stage 1}
@
@ NOTE(review): samples are consumed in pairs, exactly as in the prototype
@ C code; length is assumed to be even and >= 2 -- confirm against callers.

DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
  push {r4 - r7}              @ 16 bytes: stack arguments now start at sp + 16

  ldr r5, [sp, #24]           @ filter_state_ch2
  ldr r6, [sp, #20]           @ filter_state_ch1

  @ Initialize the Neon registers.
  vld1.16 d0[0], [r0]!        @ data_ch1[0]
  vld1.16 d0[2], [r1]!        @ data_ch2[0]
  vld1.32 d30[0], [r2]        @ factor_ch1[0], factor_ch1[1]
  vld1.32 d30[1], [r3]        @ factor_ch2[0], factor_ch2[1]
  vld1.32 d16[0], [r6]!       @ filter_state_ch1[0]
  vld1.32 d17[0], [r5]!       @ filter_state_ch2[0]
  vneg.s16 d31, d30           @ -factor, for the feedback multiply

  ldr r3, [sp, #16]           @ length
  mov r4, #4                  @ Post offset value for the loop
  mov r2, #-2                 @ Post offset value for the loop
  sub r3, #2                  @ Loop counter

  @ Loop unrolling pre-processing.
  @ Only the stage-0 lanes (0 and 2) carry meaningful values here; the
  @ stage-1 state lanes are overwritten by the loads that follow.
  vqdmull.s16 q1, d30, d0     @ a = (factor * sample) << 1, saturating
  vshll.s16 q0, d0, #16       @ sample << 16
  vqadd.s32 q2, q1, q8        @ b = sat(a + state)
  vshrn.i32 d6, q2, #16       @ b >> 16
  vmull.s16 q1, d31, d6       @ a = -factor * (b >> 16)
  vshl.s32 q1, #1             @ a <<= 1
  vqadd.s32 q8, q1, q0        @ state = sat(a + (sample << 16))
  vld1.32 d16[1], [r6]        @ filter_state_ch1[1]
  vld1.32 d17[1], [r5]        @ filter_state_ch2[1]
  sub r6, #4                  @ &filter_state_ch1[0]
  sub r5, #4                  @ &filter_state_ch2[0]
  vld1.16 d6[1], [r0], r2     @ data_ch1[1]; r0 back to &data_ch1[0]
  vld1.16 d6[3], [r1], r2     @ data_ch2[1]; r1 back to &data_ch2[0]
  vrev32.16 d0, d6            @ New sample -> stage-0 lane,
                              @ stage-0 output -> stage-1 lane

  @ The loop below is bottom-tested, so it always runs at least once if
  @ entered. When length - 2 <= 0 the post-processing alone must handle the
  @ last two samples, as "for (n = 0; n < length - 2; n += 2)" does in the
  @ prototype C code; entering the loop would touch data beyond the buffers.
  cmp r3, #0
  ble POST_LOOP

FOR_LOOP:
  @ Each pass produces data[n] and data[n + 1] for both channels.
  @ On entry r0 = &data_ch1[n] and r1 = &data_ch2[n].
  vqdmull.s16 q1, d30, d0
  vshll.s16 q0, d0, #16
  vqadd.s32 q2, q1, q8
  vshrn.i32 d4, q2, #16
  vmull.s16 q1, d31, d4
  vst1.16 d4[1], [r0], r4     @ Store data_ch1[n]
  vst1.16 d4[3], [r1], r4     @ Store data_ch2[n]
  vshl.s32 q1, #1
  vld1.16 d4[1], [r0], r2     @ Load data_ch1[n + 2]
  vld1.16 d4[3], [r1], r2     @ Load data_ch2[n + 2]
  vqadd.s32 q8, q1, q0
  vrev32.16 d0, d4
  vqdmull.s16 q1, d30, d0
  subs r3, #2                 @ Flags are consumed by the bgt below
  vqadd.s32 q2, q1, q8
  vshrn.i32 d6, q2, #16
  vmull.s16 q1, d31, d6
  vshll.s16 q0, d0, #16
  vst1.16 d6[1], [r0], r4     @ Store data_ch1[n + 1]
  vst1.16 d6[3], [r1], r4     @ Store data_ch2[n + 1]
  vshl.s32 q1, #1
  vld1.16 d6[1], [r0], r2     @ Load data_ch1[n + 3]
  vld1.16 d6[3], [r1], r2     @ Load data_ch2[n + 3]
  vqadd.s32 q8, q1, q0
  vrev32.16 d0, d6
  bgt FOR_LOOP

POST_LOOP:
  @ Loop unrolling post-processing.
  @ First half: last stage-0 update plus the stage-1 output for data[n].
  vqdmull.s16 q1, d30, d0
  vshll.s16 q0, d0, #16
  vqadd.s32 q2, q1, q8
  vshrn.i32 d4, q2, #16
  vmull.s16 q1, d31, d4
  vst1.16 d4[1], [r0]!        @ Store data_ch1[n]
  vst1.16 d4[3], [r1]!        @ Store data_ch2[n]
  vshl.s32 q1, #1
  vqadd.s32 q8, q1, q0        @ Lanes 0 and 2 now hold the final state0 values
  vrev32.16 d0, d4
  @ Second half: only the stage-1 lanes (1 and 3) are used from here on.
  vqdmull.s16 q1, d30, d0
  vshll.s16 q0, d0, #16
  vqadd.s32 q2, q1, q8
  vshrn.i32 d6, q2, #16
  vmull.s16 q1, d31, d6
  vst1.16 d6[1], [r0]         @ Store data_ch1[n + 1]
  vst1.16 d6[3], [r1]         @ Store data_ch2[n + 1]
  vshl.s32 q1, #1
  vst1.32 d16[0], [r6]!       @ Store filter_state_ch1[0]
  vqadd.s32 q9, q1, q0        @ Lanes 1 and 3 hold the final state1 values
  vst1.32 d17[0], [r5]!       @ Store filter_state_ch2[0]
  vst1.32 d18[1], [r6]        @ Store filter_state_ch1[1]
  vst1.32 d19[1], [r5]        @ Store filter_state_ch2[1]

  pop {r4 - r7}
  bx lr
121
122@void AllpassFilter2FixDec16BothChannels(
123@    int16_t *data_ch1,  // Input and output in channel 1, in Q0
124@    int16_t *data_ch2,  // Input and output in channel 2, in Q0
125@    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
126@    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
127@    const int length,  // Length of the data buffers
128@    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
129@    int32_t *filter_state_ch2) {  // Filter state for channel 2, in Q16
130@  int n = 0;
131@  int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
132@  int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
133@  int16_t sample0_ch1 = 0, sample0_ch2 = 0;
134@  int16_t sample1_ch1 = 0, sample1_ch2  = 0;
135@  int32_t a0_ch1 = 0, a0_ch2 = 0;
136@  int32_t b0_ch1 = 0, b0_ch2 = 0;
137@
138@  int32_t a1_ch1 = 0, a1_ch2 = 0;
139@  int32_t b1_ch1 = 0, b1_ch2 = 0;
140@  int32_t b2_ch1  = 0, b2_ch2 = 0;
141@
142@  // Loop unrolling preprocessing.
143@
144@  sample0_ch1 = data_ch1[n];
145@  sample0_ch2 = data_ch2[n];
146@
147@  a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample0_ch1) << 1;
148@  a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample0_ch2) << 1;
149@
150@  b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state0_ch1);
151@  b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state0_ch2); //Q16+Q16=Q16
152@
153@  a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b0_ch1 >> 16));
154@  a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b0_ch2 >> 16));
155@
156@  state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
157@  state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
158@
159@  sample1_ch1 = data_ch1[n + 1];
160@  sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
161@  sample1_ch2  = data_ch2[n + 1];
162@  sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
163@
164@
165@  for (n = 0; n < length - 2; n += 2) {
166@    a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample1_ch1) << 1;
167@    a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample0_ch1) << 1;
168@    a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample1_ch2 ) << 1;
169@    a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample0_ch2) << 1;
170@
171@    b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state0_ch1);
172@    b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state1_ch1); //Q16+Q16=Q16
173@    b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state0_ch2); //Q16+Q16=Q16
174@    b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state1_ch2); //Q16+Q16=Q16
175@
176@    a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b1_ch1 >> 16));
177@    a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b0_ch1 >> 16));
178@    a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b1_ch2 >> 16));
179@    a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b0_ch2 >> 16));
180@
181@    state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
182@    state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
183@    state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
184@    state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
185@
186@    sample0_ch1 = data_ch1[n + 2];
187@    sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
188@    sample0_ch2 = data_ch2[n + 2];
189@    sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
190@
191@    a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample0_ch1) << 1;
192@    a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample1_ch1) << 1;
193@    a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample0_ch2) << 1;
194@    a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample1_ch2 ) << 1;
195@
196@    b2_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state0_ch1);
197@    b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state1_ch1); //Q16+Q16=Q16
198@    b2_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state0_ch2); //Q16+Q16=Q16
199@    b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state1_ch2); //Q16+Q16=Q16
200@
201@    a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b2_ch1 >> 16));
202@    a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b1_ch1 >> 16));
203@    a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b2_ch2 >> 16));
204@    a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b1_ch2 >> 16));
205@
206@    state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
207@    state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
208@    state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
209@    state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
210@
211@
212@    sample1_ch1 = data_ch1[n + 3];
213@    sample0_ch1 = (int16_t) (b2_ch1  >> 16); //Save as Q0
214@    sample1_ch2 = data_ch2[n + 3];
215@    sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
216@
217@    data_ch1[n]     = (int16_t) (b0_ch1 >> 16); //Save as Q0
218@    data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
219@    data_ch2[n]     = (int16_t) (b0_ch2 >> 16);
220@    data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
221@  }
222@
223@  // Loop unrolling post-processing.
224@
225@  a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample1_ch1) << 1;
226@  a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample0_ch1) << 1;
227@  a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample1_ch2 ) << 1;
228@  a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample0_ch2) << 1;
229@
230@  b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state0_ch1);
231@  b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state1_ch1);
232@  b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state0_ch2);
233@  b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state1_ch2);
234@
235@  a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b1_ch1 >> 16));
236@  a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b0_ch1 >> 16));
237@  a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b1_ch2 >> 16));
238@  a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b0_ch2 >> 16));
239@
240@  state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
241@  state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
242@  state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
243@  state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
244@
245@  data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
246@  data_ch2[n] = (int16_t) (b0_ch2 >> 16);
247@
248@  sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
249@  sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
250@
251@  a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample1_ch1) << 1;
252@  a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample1_ch2 ) << 1;
253@
254@  b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state1_ch1); //Q16+Q16=Q16
255@  b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state1_ch2); //Q16+Q16=Q16
256@
257@  a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b1_ch1 >> 16));
258@  a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b1_ch2 >> 16));
259@
260@  state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
261@  state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
262@
263@  data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
264@  data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
265@
266@  filter_state_ch1[0] = state0_ch1;
267@  filter_state_ch1[1] = state1_ch1;
268@  filter_state_ch2[0] = state0_ch2;
269@  filter_state_ch2[1] = state1_ch2;
270@}
Note: See TracBrowser for help on using the repository browser.