Note: We no longer publish the latest version of our code here. We primarily use the kumc-bmi GitHub organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: webrtc/webrtc/modules/audio_processing/aec/aec_rdft.c @ 0:4bda6873e34c

pub_scrub_3792 tip
Last change on this file since 0:4bda6873e34c was 0:4bda6873e34c, checked in by Michael Prittie <mprittie@…>, 6 years ago

Scrubbed password for publication.

File size: 16.5 KB
Line 
/*
 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
 * Copyright Takuya OOURA, 1996-2001
 *
 * You may use, copy, modify and distribute this code for any purpose (include
 * commercial use) and without fee. Please refer to this package when you modify
 * this code.
 *
 * Changes by the WebRTC authors:
 *    - Trivial type modifications.
 *    - Minimal code subset to do rdft of length 128.
 *    - Optimizations because of known length.
 *
 *  All changes are covered by the WebRTC license and IP grant:
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
21
22#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
23
24#include <math.h>
25
26#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
27#include "webrtc/typedefs.h"
28
29// constants shared by all paths (C, SSE2).
30float rdft_w[64];
31// constants used by the C path.
32float rdft_wk3ri_first[32];
33float rdft_wk3ri_second[32];
34// constants used by SSE2 but initialized in C path.
35ALIGN16_BEG float ALIGN16_END rdft_wk1r[32];
36ALIGN16_BEG float ALIGN16_END rdft_wk2r[32];
37ALIGN16_BEG float ALIGN16_END rdft_wk3r[32];
38ALIGN16_BEG float ALIGN16_END rdft_wk1i[32];
39ALIGN16_BEG float ALIGN16_END rdft_wk2i[32];
40ALIGN16_BEG float ALIGN16_END rdft_wk3i[32];
41ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
42
43static int ip[16];
44
// Exchanges the two complex values (pairs of consecutive floats) that start
// at a[p] and a[q]. All four values are loaded before any store.
static void bitrv2_32_swap(float* a, int p, int q) {
  const float xr = a[p];
  const float xi = a[p + 1];
  const float yr = a[q];
  const float yi = a[q + 1];
  a[p] = yr;
  a[p + 1] = yi;
  a[q] = xr;
  a[q + 1] = xi;
}

// Reorders, in place, the 16 complex values stored as 32 interleaved floats
// in |a| so that the value at complex index c moves to the index whose 4-bit
// binary representation is the reverse of c.
//
// ip: caller-provided scratch table; entries 0 and 1 are overwritten
//     (with 0 and 16 respectively).
// a:  buffer of at least 32 floats.
static void bitrv2_32(int* ip, float* a) {
  const int n = 32;
  int j, k, m, m2;

  // Build the table of block offsets; for n == 32 the loop body runs once
  // and leaves m == 2.
  ip[0] = 0;
  {
    int l = n;
    m = 1;
    while ((m << 3) < l) {
      l >>= 1;
      for (j = 0; j < m; j++) {
        ip[m + j] = ip[j] + l;
      }
      m <<= 1;
    }
  }
  m2 = 2 * m;
  for (k = 0; k < m; k++) {
    for (j = 0; j < k; j++) {
      int j1 = 2 * j + ip[k];
      int k1 = 2 * k + ip[j];
      bitrv2_32_swap(a, j1, k1);
      j1 += m2;
      k1 += 2 * m2;
      bitrv2_32_swap(a, j1, k1);
      j1 += m2;
      k1 -= m2;
      bitrv2_32_swap(a, j1, k1);
      j1 += m2;
      k1 += 2 * m2;
      bitrv2_32_swap(a, j1, k1);
    }
    {
      // Remaining pair inside block k that the inner loop does not reach.
      const int j1 = 2 * k + m2 + ip[k];
      bitrv2_32_swap(a, j1, j1 + m2);
    }
  }
}
118
// Exchanges the two complex values (pairs of consecutive floats) that start
// at a[p] and a[q]. All four values are loaded before any store.
static void bitrv2_128_swap(float* a, unsigned int p, unsigned int q) {
  const float xr = a[p + 0];
  const float xi = a[p + 1];
  const float yr = a[q + 0];
  const float yi = a[q + 1];
  a[p + 0] = yr;
  a[p + 1] = yi;
  a[q + 0] = xr;
  a[q + 1] = xi;
}

// Reorders, in place, the 64 complex values stored as 128 interleaved floats
// in |a| so that the value at complex index c moves to the index whose 6-bit
// binary representation is the reverse of c.
//
// a: buffer of at least 128 floats.
static void bitrv2_128(float* a) {
  /*
      Following things have been attempted but are no faster:
      (a) Storing the swap indexes in a LUT (index calculations are done
          for 'free' while waiting on memory/L1).
      (b) Consolidate the load/store of two consecutive floats by a 64 bit
          integer (execution is memory/L1 bound).
      (c) Do a mix of floats and 64 bit integer to maximize register
          utilization (execution is memory/L1 bound).
      (d) Replacing the offset-table lookup by
          ((k<<31)>>25) + ((k >> 1)<<5).
      (e) Hard-coding of the offsets to completely eliminates index
          calculations.
  */

  // Precomputed block offsets (named so it does not shadow the file-scope
  // work area |ip|).
  static const unsigned int kBlockOffset[4] = {0, 64, 32, 96};
  unsigned int j, k;

  for (k = 0; k < 4; k++) {
    for (j = 0; j < k; j++) {
      unsigned int j1 = 2 * j + kBlockOffset[k];
      unsigned int k1 = 2 * k + kBlockOffset[j];
      bitrv2_128_swap(a, j1, k1);
      j1 += 8;
      k1 += 16;
      bitrv2_128_swap(a, j1, k1);
      j1 += 8;
      k1 -= 8;
      bitrv2_128_swap(a, j1, k1);
      j1 += 8;
      k1 += 16;
      bitrv2_128_swap(a, j1, k1);
    }
    {
      // Remaining pair inside block k that the inner loop does not reach.
      const unsigned int j1 = 2 * k + 8 + kBlockOffset[k];
      bitrv2_128_swap(a, j1, j1 + 8);
    }
  }
}
192
193static void makewt_32(void) {
194  const int nw = 32;
195  int j, nwh;
196  float delta, x, y;
197
198  ip[0] = nw;
199  ip[1] = 1;
200  nwh = nw >> 1;
201  delta = atanf(1.0f) / nwh;
202  rdft_w[0] = 1;
203  rdft_w[1] = 0;
204  rdft_w[nwh] = cosf(delta * nwh);
205  rdft_w[nwh + 1] = rdft_w[nwh];
206  for (j = 2; j < nwh; j += 2) {
207    x = cosf(delta * j);
208    y = sinf(delta * j);
209    rdft_w[j] = x;
210    rdft_w[j + 1] = y;
211    rdft_w[nw - j] = y;
212    rdft_w[nw - j + 1] = x;
213  }
214  bitrv2_32(ip + 2, rdft_w);
215
216  // pre-calculate constants used by cft1st_128 and cftmdl_128...
217  cftmdl_wk1r[0] = rdft_w[2];
218  cftmdl_wk1r[1] = rdft_w[2];
219  cftmdl_wk1r[2] = rdft_w[2];
220  cftmdl_wk1r[3] = -rdft_w[2];
221  {
222    int k1;
223
224    for (k1 = 0, j = 0; j < 128; j += 16, k1 += 2) {
225      const int k2 = 2 * k1;
226      const float wk2r = rdft_w[k1 + 0];
227      const float wk2i = rdft_w[k1 + 1];
228      float wk1r, wk1i;
229      // ... scalar version.
230      wk1r = rdft_w[k2 + 0];
231      wk1i = rdft_w[k2 + 1];
232      rdft_wk3ri_first[k1 + 0] = wk1r - 2 * wk2i * wk1i;
233      rdft_wk3ri_first[k1 + 1] = 2 * wk2i * wk1r - wk1i;
234      wk1r = rdft_w[k2 + 2];
235      wk1i = rdft_w[k2 + 3];
236      rdft_wk3ri_second[k1 + 0] = wk1r - 2 * wk2r * wk1i;
237      rdft_wk3ri_second[k1 + 1] = 2 * wk2r * wk1r - wk1i;
238      // ... vector version.
239      rdft_wk1r[k2 + 0] = rdft_w[k2 + 0];
240      rdft_wk1r[k2 + 1] = rdft_w[k2 + 0];
241      rdft_wk1r[k2 + 2] = rdft_w[k2 + 2];
242      rdft_wk1r[k2 + 3] = rdft_w[k2 + 2];
243      rdft_wk2r[k2 + 0] = rdft_w[k1 + 0];
244      rdft_wk2r[k2 + 1] = rdft_w[k1 + 0];
245      rdft_wk2r[k2 + 2] = -rdft_w[k1 + 1];
246      rdft_wk2r[k2 + 3] = -rdft_w[k1 + 1];
247      rdft_wk3r[k2 + 0] = rdft_wk3ri_first[k1 + 0];
248      rdft_wk3r[k2 + 1] = rdft_wk3ri_first[k1 + 0];
249      rdft_wk3r[k2 + 2] = rdft_wk3ri_second[k1 + 0];
250      rdft_wk3r[k2 + 3] = rdft_wk3ri_second[k1 + 0];
251      rdft_wk1i[k2 + 0] = -rdft_w[k2 + 1];
252      rdft_wk1i[k2 + 1] = rdft_w[k2 + 1];
253      rdft_wk1i[k2 + 2] = -rdft_w[k2 + 3];
254      rdft_wk1i[k2 + 3] = rdft_w[k2 + 3];
255      rdft_wk2i[k2 + 0] = -rdft_w[k1 + 1];
256      rdft_wk2i[k2 + 1] = rdft_w[k1 + 1];
257      rdft_wk2i[k2 + 2] = -rdft_w[k1 + 0];
258      rdft_wk2i[k2 + 3] = rdft_w[k1 + 0];
259      rdft_wk3i[k2 + 0] = -rdft_wk3ri_first[k1 + 1];
260      rdft_wk3i[k2 + 1] = rdft_wk3ri_first[k1 + 1];
261      rdft_wk3i[k2 + 2] = -rdft_wk3ri_second[k1 + 1];
262      rdft_wk3i[k2 + 3] = rdft_wk3ri_second[k1 + 1];
263    }
264  }
265}
266
267static void makect_32(void) {
268  float* c = rdft_w + 32;
269  const int nc = 32;
270  int j, nch;
271  float delta;
272
273  ip[1] = nc;
274  nch = nc >> 1;
275  delta = atanf(1.0f) / nch;
276  c[0] = cosf(delta * nch);
277  c[nch] = 0.5f * c[0];
278  for (j = 1; j < nch; j++) {
279    c[j] = 0.5f * cosf(delta * j);
280    c[nc - j] = 0.5f * sinf(delta * j);
281  }
282}
283
284static void cft1st_128_C(float* a) {
285  const int n = 128;
286  int j, k1, k2;
287  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
288  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
289
290  x0r = a[0] + a[2];
291  x0i = a[1] + a[3];
292  x1r = a[0] - a[2];
293  x1i = a[1] - a[3];
294  x2r = a[4] + a[6];
295  x2i = a[5] + a[7];
296  x3r = a[4] - a[6];
297  x3i = a[5] - a[7];
298  a[0] = x0r + x2r;
299  a[1] = x0i + x2i;
300  a[4] = x0r - x2r;
301  a[5] = x0i - x2i;
302  a[2] = x1r - x3i;
303  a[3] = x1i + x3r;
304  a[6] = x1r + x3i;
305  a[7] = x1i - x3r;
306  wk1r = rdft_w[2];
307  x0r = a[8] + a[10];
308  x0i = a[9] + a[11];
309  x1r = a[8] - a[10];
310  x1i = a[9] - a[11];
311  x2r = a[12] + a[14];
312  x2i = a[13] + a[15];
313  x3r = a[12] - a[14];
314  x3i = a[13] - a[15];
315  a[8] = x0r + x2r;
316  a[9] = x0i + x2i;
317  a[12] = x2i - x0i;
318  a[13] = x0r - x2r;
319  x0r = x1r - x3i;
320  x0i = x1i + x3r;
321  a[10] = wk1r * (x0r - x0i);
322  a[11] = wk1r * (x0r + x0i);
323  x0r = x3i + x1r;
324  x0i = x3r - x1i;
325  a[14] = wk1r * (x0i - x0r);
326  a[15] = wk1r * (x0i + x0r);
327  k1 = 0;
328  for (j = 16; j < n; j += 16) {
329    k1 += 2;
330    k2 = 2 * k1;
331    wk2r = rdft_w[k1 + 0];
332    wk2i = rdft_w[k1 + 1];
333    wk1r = rdft_w[k2 + 0];
334    wk1i = rdft_w[k2 + 1];
335    wk3r = rdft_wk3ri_first[k1 + 0];
336    wk3i = rdft_wk3ri_first[k1 + 1];
337    x0r = a[j + 0] + a[j + 2];
338    x0i = a[j + 1] + a[j + 3];
339    x1r = a[j + 0] - a[j + 2];
340    x1i = a[j + 1] - a[j + 3];
341    x2r = a[j + 4] + a[j + 6];
342    x2i = a[j + 5] + a[j + 7];
343    x3r = a[j + 4] - a[j + 6];
344    x3i = a[j + 5] - a[j + 7];
345    a[j + 0] = x0r + x2r;
346    a[j + 1] = x0i + x2i;
347    x0r -= x2r;
348    x0i -= x2i;
349    a[j + 4] = wk2r * x0r - wk2i * x0i;
350    a[j + 5] = wk2r * x0i + wk2i * x0r;
351    x0r = x1r - x3i;
352    x0i = x1i + x3r;
353    a[j + 2] = wk1r * x0r - wk1i * x0i;
354    a[j + 3] = wk1r * x0i + wk1i * x0r;
355    x0r = x1r + x3i;
356    x0i = x1i - x3r;
357    a[j + 6] = wk3r * x0r - wk3i * x0i;
358    a[j + 7] = wk3r * x0i + wk3i * x0r;
359    wk1r = rdft_w[k2 + 2];
360    wk1i = rdft_w[k2 + 3];
361    wk3r = rdft_wk3ri_second[k1 + 0];
362    wk3i = rdft_wk3ri_second[k1 + 1];
363    x0r = a[j + 8] + a[j + 10];
364    x0i = a[j + 9] + a[j + 11];
365    x1r = a[j + 8] - a[j + 10];
366    x1i = a[j + 9] - a[j + 11];
367    x2r = a[j + 12] + a[j + 14];
368    x2i = a[j + 13] + a[j + 15];
369    x3r = a[j + 12] - a[j + 14];
370    x3i = a[j + 13] - a[j + 15];
371    a[j + 8] = x0r + x2r;
372    a[j + 9] = x0i + x2i;
373    x0r -= x2r;
374    x0i -= x2i;
375    a[j + 12] = -wk2i * x0r - wk2r * x0i;
376    a[j + 13] = -wk2i * x0i + wk2r * x0r;
377    x0r = x1r - x3i;
378    x0i = x1i + x3r;
379    a[j + 10] = wk1r * x0r - wk1i * x0i;
380    a[j + 11] = wk1r * x0i + wk1i * x0r;
381    x0r = x1r + x3i;
382    x0i = x1i - x3r;
383    a[j + 14] = wk3r * x0r - wk3i * x0i;
384    a[j + 15] = wk3r * x0i + wk3i * x0r;
385  }
386}
387
388static void cftmdl_128_C(float* a) {
389  const int l = 8;
390  const int n = 128;
391  const int m = 32;
392  int j0, j1, j2, j3, k, k1, k2, m2;
393  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
394  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
395
396  for (j0 = 0; j0 < l; j0 += 2) {
397    j1 = j0 + 8;
398    j2 = j0 + 16;
399    j3 = j0 + 24;
400    x0r = a[j0 + 0] + a[j1 + 0];
401    x0i = a[j0 + 1] + a[j1 + 1];
402    x1r = a[j0 + 0] - a[j1 + 0];
403    x1i = a[j0 + 1] - a[j1 + 1];
404    x2r = a[j2 + 0] + a[j3 + 0];
405    x2i = a[j2 + 1] + a[j3 + 1];
406    x3r = a[j2 + 0] - a[j3 + 0];
407    x3i = a[j2 + 1] - a[j3 + 1];
408    a[j0 + 0] = x0r + x2r;
409    a[j0 + 1] = x0i + x2i;
410    a[j2 + 0] = x0r - x2r;
411    a[j2 + 1] = x0i - x2i;
412    a[j1 + 0] = x1r - x3i;
413    a[j1 + 1] = x1i + x3r;
414    a[j3 + 0] = x1r + x3i;
415    a[j3 + 1] = x1i - x3r;
416  }
417  wk1r = rdft_w[2];
418  for (j0 = m; j0 < l + m; j0 += 2) {
419    j1 = j0 + 8;
420    j2 = j0 + 16;
421    j3 = j0 + 24;
422    x0r = a[j0 + 0] + a[j1 + 0];
423    x0i = a[j0 + 1] + a[j1 + 1];
424    x1r = a[j0 + 0] - a[j1 + 0];
425    x1i = a[j0 + 1] - a[j1 + 1];
426    x2r = a[j2 + 0] + a[j3 + 0];
427    x2i = a[j2 + 1] + a[j3 + 1];
428    x3r = a[j2 + 0] - a[j3 + 0];
429    x3i = a[j2 + 1] - a[j3 + 1];
430    a[j0 + 0] = x0r + x2r;
431    a[j0 + 1] = x0i + x2i;
432    a[j2 + 0] = x2i - x0i;
433    a[j2 + 1] = x0r - x2r;
434    x0r = x1r - x3i;
435    x0i = x1i + x3r;
436    a[j1 + 0] = wk1r * (x0r - x0i);
437    a[j1 + 1] = wk1r * (x0r + x0i);
438    x0r = x3i + x1r;
439    x0i = x3r - x1i;
440    a[j3 + 0] = wk1r * (x0i - x0r);
441    a[j3 + 1] = wk1r * (x0i + x0r);
442  }
443  k1 = 0;
444  m2 = 2 * m;
445  for (k = m2; k < n; k += m2) {
446    k1 += 2;
447    k2 = 2 * k1;
448    wk2r = rdft_w[k1 + 0];
449    wk2i = rdft_w[k1 + 1];
450    wk1r = rdft_w[k2 + 0];
451    wk1i = rdft_w[k2 + 1];
452    wk3r = rdft_wk3ri_first[k1 + 0];
453    wk3i = rdft_wk3ri_first[k1 + 1];
454    for (j0 = k; j0 < l + k; j0 += 2) {
455      j1 = j0 + 8;
456      j2 = j0 + 16;
457      j3 = j0 + 24;
458      x0r = a[j0 + 0] + a[j1 + 0];
459      x0i = a[j0 + 1] + a[j1 + 1];
460      x1r = a[j0 + 0] - a[j1 + 0];
461      x1i = a[j0 + 1] - a[j1 + 1];
462      x2r = a[j2 + 0] + a[j3 + 0];
463      x2i = a[j2 + 1] + a[j3 + 1];
464      x3r = a[j2 + 0] - a[j3 + 0];
465      x3i = a[j2 + 1] - a[j3 + 1];
466      a[j0 + 0] = x0r + x2r;
467      a[j0 + 1] = x0i + x2i;
468      x0r -= x2r;
469      x0i -= x2i;
470      a[j2 + 0] = wk2r * x0r - wk2i * x0i;
471      a[j2 + 1] = wk2r * x0i + wk2i * x0r;
472      x0r = x1r - x3i;
473      x0i = x1i + x3r;
474      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
475      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
476      x0r = x1r + x3i;
477      x0i = x1i - x3r;
478      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
479      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
480    }
481    wk1r = rdft_w[k2 + 2];
482    wk1i = rdft_w[k2 + 3];
483    wk3r = rdft_wk3ri_second[k1 + 0];
484    wk3i = rdft_wk3ri_second[k1 + 1];
485    for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
486      j1 = j0 + 8;
487      j2 = j0 + 16;
488      j3 = j0 + 24;
489      x0r = a[j0 + 0] + a[j1 + 0];
490      x0i = a[j0 + 1] + a[j1 + 1];
491      x1r = a[j0 + 0] - a[j1 + 0];
492      x1i = a[j0 + 1] - a[j1 + 1];
493      x2r = a[j2 + 0] + a[j3 + 0];
494      x2i = a[j2 + 1] + a[j3 + 1];
495      x3r = a[j2 + 0] - a[j3 + 0];
496      x3i = a[j2 + 1] - a[j3 + 1];
497      a[j0 + 0] = x0r + x2r;
498      a[j0 + 1] = x0i + x2i;
499      x0r -= x2r;
500      x0i -= x2i;
501      a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
502      a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
503      x0r = x1r - x3i;
504      x0i = x1i + x3r;
505      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
506      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
507      x0r = x1r + x3i;
508      x0i = x1i - x3r;
509      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
510      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
511    }
512  }
513}
514
// Forward complex butterfly stages on the 128-float interleaved (re, im)
// buffer |a|, in place: runs the selected cft1st_128 and cftmdl_128 passes,
// then a final factor-free pass that combines the four complex values at
// offsets j, j + 32, j + 64 and j + 96.
//
// cft1st_128 / cftmdl_128 are function pointers and must have been set by
// aec_rdft_init() before this is called.
static void cftfsub_128(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  l = 32;
  for (j = 0; j < l; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    x0i = a[j + 1] + a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = a[j + 1] - a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i + x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i - x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i + x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i - x3r;
  }
}
544
// Backward counterpart of cftfsub_128(), used by aec_rdft_inverse_128().
// Runs the same cft1st_128 and cftmdl_128 passes on the 128-float
// interleaved (re, im) buffer |a|, then a final factor-free pass over the
// complex values at offsets j, j + 32, j + 64 and j + 96. That final pass
// differs from the forward one only in the signs applied to the imaginary
// parts.
//
// cft1st_128 / cftmdl_128 are function pointers and must have been set by
// aec_rdft_init() before this is called.
static void cftbsub_128(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  l = 32;

  for (j = 0; j < l; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    // Imaginary parts of the first two inputs enter negated.
    x0i = -a[j + 1] - a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = -a[j + 1] + a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i - x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i + x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i - x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i + x3r;
  }
}
575
576static void rftfsub_128_C(float* a) {
577  const float* c = rdft_w + 32;
578  int j1, j2, k1, k2;
579  float wkr, wki, xr, xi, yr, yi;
580
581  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
582    k2 = 128 - j2;
583    k1 = 32 - j1;
584    wkr = 0.5f - c[k1];
585    wki = c[j1];
586    xr = a[j2 + 0] - a[k2 + 0];
587    xi = a[j2 + 1] + a[k2 + 1];
588    yr = wkr * xr - wki * xi;
589    yi = wkr * xi + wki * xr;
590    a[j2 + 0] -= yr;
591    a[j2 + 1] -= yi;
592    a[k2 + 0] += yr;
593    a[k2 + 1] -= yi;
594  }
595}
596
597static void rftbsub_128_C(float* a) {
598  const float* c = rdft_w + 32;
599  int j1, j2, k1, k2;
600  float wkr, wki, xr, xi, yr, yi;
601
602  a[1] = -a[1];
603  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
604    k2 = 128 - j2;
605    k1 = 32 - j1;
606    wkr = 0.5f - c[k1];
607    wki = c[j1];
608    xr = a[j2 + 0] - a[k2 + 0];
609    xi = a[j2 + 1] + a[k2 + 1];
610    yr = wkr * xr + wki * xi;
611    yi = wkr * xi - wki * xr;
612    a[j2 + 0] = a[j2 + 0] - yr;
613    a[j2 + 1] = yi - a[j2 + 1];
614    a[k2 + 0] = yr + a[k2 + 0];
615    a[k2 + 1] = yi - a[k2 + 1];
616  }
617  a[65] = -a[65];
618}
619
// Forward real DFT of length 128, computed in place.
//
// a: buffer of 128 floats; overwritten with the transform output.
//
// Pipeline: bit-reversal permutation, forward complex stages, then the
// real-transform post-processing pass. Finally a[0] and a[1] are replaced by
// their sum and difference respectively.
// NOTE(review): in Ooura's packed layout a[0]/a[1] presumably hold the two
// purely real bins (DC and Nyquist) - confirm against the header/caller.
//
// aec_rdft_init() must have been called first (function pointers and tables).
void aec_rdft_forward_128(float* a) {
  float xi;
  bitrv2_128(a);
  cftfsub_128(a);
  rftfsub_128(a);
  xi = a[0] - a[1];
  a[0] += a[1];
  a[1] = xi;
}
629
// Inverse real DFT of length 128, computed in place.
//
// a: buffer of 128 floats in the layout produced by aec_rdft_forward_128();
//    overwritten with the result.
//
// First undoes the sum/difference packing of a[0] and a[1] (a[1] becomes
// half the difference, a[0] the remaining half-sum), then runs the inverse
// pre-processing pass, the bit-reversal permutation and the backward complex
// stages.
// NOTE(review): no overall normalization factor is applied here, so callers
// presumably scale the output themselves - confirm.
//
// aec_rdft_init() must have been called first (function pointers and tables).
void aec_rdft_inverse_128(float* a) {
  a[1] = 0.5f * (a[0] - a[1]);
  a[0] -= a[1];
  rftbsub_128(a);
  bitrv2_128(a);
  cftbsub_128(a);
}
637
638// code path selection
639rft_sub_128_t cft1st_128;
640rft_sub_128_t cftmdl_128;
641rft_sub_128_t rftfsub_128;
642rft_sub_128_t rftbsub_128;
643
644void aec_rdft_init(void) {
645  cft1st_128 = cft1st_128_C;
646  cftmdl_128 = cftmdl_128_C;
647  rftfsub_128 = rftfsub_128_C;
648  rftbsub_128 = rftbsub_128_C;
649#if defined(WEBRTC_ARCH_X86_FAMILY)
650  if (WebRtc_GetCPUInfo(kSSE2)) {
651    aec_rdft_init_sse2();
652  }
653#endif
654  // init library constants.
655  makewt_32();
656  makect_32();
657}
Note: See TracBrowser for help on using the repository browser.