update pffft.c again from the upstream master branch

This commit is contained in:
Fabian Greffrath 2024-12-02 12:35:01 +01:00
parent 6d9bdb8d25
commit a2b8f11ba0
2 changed files with 205 additions and 174 deletions


@@ -25,7 +25,7 @@
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
@@ -53,11 +53,14 @@
*/
/*
ChangeLog:
- 2011/10/02, version 1: This is the very first release of this file.
*/
#define _USE_MATH_DEFINES
#ifndef _USE_MATH_DEFINES
# define _USE_MATH_DEFINES // ask gently MSVC to define M_PI, M_SQRT2 etc.
#endif
#include "pffft.h"
#include <stdlib.h>
#include <stdio.h>
@@ -84,21 +87,34 @@
#endif
/*
vector support macros: the rest of the code is independent of
SSE/Altivec/NEON -- adding support for other platforms with 4-element
vectors should be limited to these macros
*/
// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE
/* select which SIMD intrinsics will be used */
#if !defined(PFFFT_SIMD_DISABLE)
# if (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
&& (defined(__VEC__) || defined(__ALTIVEC__))
# define PFFFT_SIMD_ALTIVEC
# elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) \
|| defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__wasm_simd128__)
// we test _M_ARM64EC before _M_X64 because when _M_ARM64EC is defined, the Microsoft compiler also defines _M_X64
# define PFFFT_SIMD_NEON
# elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
# define PFFFT_SIMD_SSE
# endif
#endif // PFFFT_SIMD_DISABLE
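As an aside, client code can check which branch was selected through the public pffft_simd_size() helper declared in pffft.h; a minimal sketch (the output text is illustrative):
#include <stdio.h>
#include "pffft.h"
int main(void) {
    /* pffft_simd_size() returns 4 when one of the SIMD branches above was
       selected at build time, and 1 for the scalar fallback. */
    printf("pffft SIMD width: %d float(s) per vector\n", pffft_simd_size());
    return 0;
}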
/*
Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
&& (defined(__VEC__) || defined(__ALTIVEC__))
#ifdef PFFFT_SIMD_ALTIVEC
#include <altivec.h>
typedef vector float v4sf;
# define SIMD_SZ 4
@@ -115,7 +131,7 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
}
# define VTRANSPOSE4(x0,x1,x2,x3) { \
v4sf y0 = vec_mergeh(x0, x2); \
v4sf y1 = vec_mergel(x0, x2); \
v4sf y2 = vec_mergeh(x1, x3); \
@@ -126,13 +142,12 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
x3 = vec_mergel(y1, y3); \
}
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
/*
SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP >= 1))
#elif defined(PFFFT_SIMD_SSE)
#include <xmmintrin.h>
typedef __m128 v4sf;
@@ -147,12 +162,12 @@ typedef __m128 v4sf;
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
/*
ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64))
#elif defined(PFFFT_SIMD_NEON)
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
@@ -174,7 +189,7 @@ typedef float32x4_t v4sf;
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
# warning "building with simd disabled !\n";
@@ -192,7 +207,7 @@ typedef float v4sf;
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
#endif
// shortcuts for complex multiplications
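For reference, here is a scalar sketch of what these macros compute per lane (the real VCPLXMUL / VCPLXMULCONJ update whole v4sf vectors in place; the helper names below are illustrative, not part of pffft):
/* (ar + i*ai) *= (br + i*bi) */
static void cplx_mul(float *ar, float *ai, float br, float bi) {
    float tmp = *ar;
    *ar = tmp * br - (*ai) * bi;  /* real part */
    *ai = tmp * bi + (*ai) * br;  /* imaginary part */
}
/* (ar + i*ai) *= (br - i*bi), i.e. multiply by the conjugate */
static void cplx_mul_conj(float *ar, float *ai, float br, float bi) {
    float tmp = *ar;
    *ar = tmp * br + (*ai) * bi;
    *ai = (*ai) * br - tmp * bi;
}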
@@ -216,7 +231,7 @@ typedef union v4sf_union {
/* detect bugs with the vector support macros */
void validate_pffft_simd(void) {
float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
v4sf_union a0, a1, a2, a3, t, u;
memcpy(a0.f, f, 4*sizeof(float));
memcpy(a1.f, f+4, 4*sizeof(float));
memcpy(a2.f, f+8, 4*sizeof(float));
@@ -245,9 +260,9 @@ void validate_pffft_simd(void) {
printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
assertv4(t, 8, 9, 6, 7);
VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
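When SIMD is enabled, this self-check can be driven from a tiny harness; a sketch (assuming the declaration is made visible to the caller):
void validate_pffft_simd(void); /* defined above in pffft.c */
int main(void) {
    validate_pffft_simd(); /* prints the macro results and asserts on any mismatch */
    return 0;
}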
#else
@@ -327,7 +342,7 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
di3 = VSUB(ci2, cr3);
wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch[i+l1ido] = dr2;
ch[i+l1ido + 1] = di2;
VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
ch[i+2*l1ido] = dr3;
@@ -359,7 +374,7 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
ch[1*l1ido + 0] = VADD(tr1, tr4);
ch[1*l1ido + 1] = VADD(ti1, ti4);
ch[2*l1ido + 0] = VSUB(tr2, tr3);
ch[2*l1ido + 1] = VSUB(ti2, ti3);
ch[3*l1ido + 0] = VSUB(tr1, tr4);
ch[3*l1ido + 1] = VSUB(ti1, ti4);
}
@@ -408,8 +423,8 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
passf5 and passb5 have been merged here; fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
const float *wa1, const float *wa2,
const float *wa3, const float *wa4, float fsign) {
static const float tr11 = .309016994374947f;
const float ti11 = .951056516295154f*fsign;
static const float tr12 = -.809016994374947f;
@@ -488,7 +503,7 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
for (i=2; i<ido; i+=2) {
v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
v4sf br = cc[i - 1 + k], bi = cc[i + k];
VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
ch[i + 2*k] = VADD(bi, ti2);
ch[2*(k+ido) - i] = VSUB(ti2, bi);
ch[i - 1 + 2*k] = VADD(br, tr2);
@@ -560,7 +575,7 @@ static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT
wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
VCPLXMULCONJ(dr3, di3, wr2, wi2);
cr2 = VADD(dr2, dr3);
ci2 = VADD(di2, di3);
ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
@@ -626,7 +641,7 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
static const float minus_hsqt2 = (float)-0.7071067811865475;
int i, k, l1ido = l1*ido;
{
const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
v4sf * RESTRICT ch_ = ch;
while (cc < cc_end) {
// this loop represents between 25% and 40% of total radf4_ps cost !
@@ -659,20 +674,20 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
cr3 = pc[2*l1ido+0];
ci3 = pc[2*l1ido+1];
wr = LD_PS1(wa2[i-2]);
wi = LD_PS1(wa2[i-1]);
VCPLXMULCONJ(cr3, ci3, wr, wi);
cr4 = pc[3*l1ido];
ci4 = pc[3*l1ido+1];
wr = LD_PS1(wa3[i-2]);
wi = LD_PS1(wa3[i-1]);
VCPLXMULCONJ(cr4, ci4, wr, wi);
/* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
tr1 = VADD(cr2,cr4);
tr4 = VSUB(cr4,cr2);
tr2 = VADD(pc[0],cr3);
tr3 = VSUB(pc[0],cr3);
ch[i - 1 + 4*k] = VADD(tr1,tr2);
@@ -698,8 +713,8 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
ch[ido-1 + 4*k] = VADD(tr1, c);
ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
ch[4*k + 1*ido] = VSUB(ti1, d);
ch[4*k + 3*ido] = VADD(ti1, d);
}
} /* radf4 */
@@ -712,7 +727,7 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
int i, k, l1ido = l1*ido;
v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
{
const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
v4sf *ch_ = ch;
while (ch < ch_end) {
v4sf a = cc[0], b = cc[4*ido-1];
@@ -725,7 +740,7 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
ch[2*l1ido] = VSUB(tr2, tr3);
ch[1*l1ido] = VSUB(tr1, tr4);
ch[3*l1ido] = VADD(tr1, tr4);
cc += 4*ido; ch += ido;
}
cc = cc_; ch = ch_;
@@ -784,7 +799,7 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
}
} /* radb4 */
static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
static const float tr11 = .309016994374947f;
@@ -871,8 +886,8 @@ static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT
#undef ch_ref
} /* radf5 */
static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
static const float tr11 = .309016994374947f;
static const float ti11 = .951056516295154f;
@@ -960,8 +975,8 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch
#undef ch_ref
} /* radb5 */
static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
const float *wa, const int *ifac) {
v4sf *in = (v4sf*)input_readonly;
v4sf *out = (in == work2 ? work1 : work2);
int nf = ifac[1], k1;
@@ -1007,8 +1022,8 @@ static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *w
return in; /* this is in fact the output .. */
} /* rfftf1 */
static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
const float *wa, const int *ifac) {
v4sf *in = (v4sf*)input_readonly;
v4sf *out = (in == work2 ? work1 : work2);
int nf = ifac[1], k1;
@@ -1054,6 +1069,7 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w
return in; /* this is in fact the output .. */
}
#define IFAC_MAX_SIZE 25 /* max number of integer factors for the decomposition, +2 */
static int decompose(int n, int *ifac, const int *ntryh) {
int nl = n, nf = 0, i, j = 0;
for (j=0; ntryh[j]; ++j) {
@@ -1062,6 +1078,7 @@ static int decompose(int n, int *ifac, const int *ntryh) {
int nq = nl / ntry;
int nr = nl - ntry * nq;
if (nr == 0) {
assert(2 + nf < IFAC_MAX_SIZE);
ifac[2+nf++] = ntry;
nl = nq;
if (ntry == 2 && nf != 1) {
@@ -1075,7 +1092,7 @@
}
}
ifac[0] = n;
ifac[1] = nf;
return nf;
}
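For orientation, ifac ends up laid out as { N, nf, factor_0, ..., factor_{nf-1} }, with any factor 2 moved to the front; e.g. with ntryh = {4,2,3,5,0}, decompose(96, ifac, ntryh) leaves ifac = {96, 4, 2, 4, 4, 3}. The decomposition consumes all of n exactly when n has no prime factors other than 2, 3 and 5, which a standalone helper could check like this (hypothetical, not part of pffft):
static int has_only_235_factors(int n) {
    if (n <= 0) return 0;
    while (n % 2 == 0) n /= 2;   /* strip factors of two */
    while (n % 3 == 0) n /= 3;   /* ...of three */
    while (n % 5 == 0) n /= 5;   /* ...of five */
    return n == 1;               /* nothing else may remain */
}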
@@ -1155,7 +1172,7 @@ void cffti1_ps(int n, float *wa, int *ifac)
v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
v4sf *in = (v4sf*)input_readonly;
v4sf *out = (in == work2 ? work1 : work2);
int nf = ifac[1], k1;
int l1 = 1;
int iw = 0;
@@ -1203,7 +1220,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
struct PFFFT_Setup {
int N;
int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
int ifac[15];
// holds the decomposition of N into small integer factors
int ifac[IFAC_MAX_SIZE]; // N, number of factors, then the factors (admitted values: 2, 3, 4 or 5)
pffft_transform_t transform;
v4sf *data; // allocated room for twiddle coefs
float *e; // points into 'data' , N/4*3 elements
@@ -1211,21 +1229,30 @@ struct PFFFT_Setup {
};
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
// validate N for negative values or potential int overflow
if (N < 0) {
return 0;
}
if (N > (1<<26)) {
// higher values of N will make you enter the integer overflow world...
assert(0);
return 0;
}
PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
int k, m;
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
//assert((N % 32) == 0);
s->N = N;
s->transform = transform;
s->transform = transform;
/* nb of complex simd vectors */
s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
s->e = (float*)s->data;
s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
for (k=0; k < s->Ncvec; ++k) {
int i = k/SIMD_SZ;
@@ -1265,7 +1292,7 @@ static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
v4sf g0, g1;
int k;
INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
*--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
for (k=1; k < N; ++k) {
v4sf h0, h1;
@@ -1300,7 +1327,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
v4sf *vout = (v4sf*)out;
assert(in != out);
if (setup->transform == PFFFT_REAL) {
int dk = N/32;
int k, dk = N/32;
if (direction == PFFFT_FORWARD) {
for (k=0; k < dk; ++k) {
INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
@@ -1318,12 +1345,12 @@
}
} else {
if (direction == PFFFT_FORWARD) {
for (k=0; k < Ncvec; ++k) {
for (k=0; k < Ncvec; ++k) {
int kk = (k/4) + (k%4)*(Ncvec/4);
INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
}
} else {
for (k=0; k < Ncvec; ++k) {
for (k=0; k < Ncvec; ++k) {
int kk = (k/4) + (k%4)*(Ncvec/4);
UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
}
@@ -1336,7 +1363,7 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
assert(in != out);
for (k=0; k < dk; ++k) {
for (k=0; k < dk; ++k) {
r0 = in[8*k+0]; i0 = in[8*k+1];
r1 = in[8*k+2]; i1 = in[8*k+3];
r2 = in[8*k+4]; i2 = in[8*k+5];
@@ -1354,7 +1381,7 @@
/*
transformation for each column is:
[1 1 1 1 0 0 0 0] [r0]
[1 0 -1 0 0 -1 0 1] [r1]
[1 -1 1 -1 0 0 0 0] [r2]
@@ -1362,14 +1389,14 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
[0 0 0 0 1 1 1 1] * [i0]
[0 1 0 -1 1 0 -1 0] [i1]
[0 0 0 0 1 -1 1 -1] [i2]
[0 -1 0 1 1 0 -1 0] [i3]
*/
r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
*out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
*out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
}
@@ -1380,7 +1407,7 @@ void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
assert(in != out);
for (k=0; k < dk; ++k) {
for (k=0; k < dk; ++k) {
r0 = in[8*k+0]; i0 = in[8*k+1];
r1 = in[8*k+2]; i1 = in[8*k+3];
r2 = in[8*k+4]; i2 = in[8*k+5];
@@ -1410,14 +1437,14 @@ void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
const v4sf *e, v4sf *out) {
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
r0 = *in0; i0 = *in1;
r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
VTRANSPOSE4(r0,r1,r2,r3);
VTRANSPOSE4(i0,i1,i2,i3);
/*
transformation for each column is:
@@ -1428,9 +1455,9 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *
[0 0 0 0 1 1 1 1] * [i0]
[0 -1 0 1 -1 0 1 0] [i1]
[0 -1 0 1 1 0 -1 0] [i2]
[0 0 0 0 -1 1 -1 1] [i3]
*/
//cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
@@ -1441,9 +1468,9 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *
//cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
r0 = VADD(sr0, sr1);
@@ -1499,19 +1526,19 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o
xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
for (k=1; k < dk; ++k) {
v4sf save_next = in[8*k+7];
pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
e + k*6, out + k*8);
save = save_next;
}
}
static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
const v4sf *e, v4sf *out, int first) {
v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
/*
transformation for each column is:
@@ -1523,12 +1550,12 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
[0 0 0 0 1 -1 1 -1] * [i0]
[0 -1 1 0 1 0 0 1] [i1]
[0 0 0 0 1 1 -1 -1] [i2]
[0 1 -1 0 1 0 0 1] [i3]
*/
v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
r0 = VADD(sr0, sr1);
@@ -1586,7 +1613,7 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
[ci2] [0 0 0 0 0 -2 0 2]
[ci3] [0 -s 0 s 0 -s 0 -s]
*/
for (k=1; k < dk; ++k) {
pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
}
@@ -1602,7 +1629,7 @@
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
pffft_direction_t direction, int ordered) {
int k, Ncvec = setup->Ncvec;
int nf_odd = (setup->ifac[1] & 1);
@@ -1620,44 +1647,44 @@
//assert(finput != foutput);
if (direction == PFFFT_FORWARD) {
ib = !ib;
if (setup->transform == PFFFT_REAL) {
if (setup->transform == PFFFT_REAL) {
ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
} else {
v4sf *tmp = buff[ib];
for (k=0; k < Ncvec; ++k) {
UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
}
ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
}
if (ordered) {
pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
} else ib = !ib;
} else {
if (vinput == buff[ib]) {
ib = !ib; // may happen when finput == foutput
}
if (ordered) {
pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
vinput = buff[ib]; ib = !ib;
}
if (setup->transform == PFFFT_REAL) {
pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
} else {
pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
for (k=0; k < Ncvec; ++k) {
INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
}
}
}
if (buff[ib] != voutput) {
/* extra copy required -- this situation should only happen when finput == foutput */
assert(finput==foutput);
@@ -1725,11 +1752,11 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
"vld1.f32 {q2,q3}, [%0,:128]! \n"
"vld1.f32 {q6,q7}, [%1,:128]! \n"
"vld1.f32 {q8,q9}, [r8,:128]! \n"
"vmul.f32 q10, q0, q4 \n"
"vmul.f32 q11, q0, q5 \n"
"vmul.f32 q12, q2, q6 \n"
"vmul.f32 q13, q2, q7 \n"
"vmul.f32 q12, q2, q6 \n"
"vmul.f32 q13, q2, q7 \n"
"vmls.f32 q10, q1, q5 \n"
"vmla.f32 q11, q1, q4 \n"
"vld1.f32 {q0,q1}, [r8,:128]! \n"
@@ -1779,12 +1806,12 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pfff
}
else if (direction == PFFFT_FORWARD) {
float x_N = in[N-1];
for (k=N-1; k > 1; --k) out[k] = in[k-1];
out[0] = in[0];
out[1] = x_N;
} else {
float x_N = in[1];
for (k=1; k < N-1; ++k) out[k] = in[k+1];
for (k=1; k < N-1; ++k) out[k] = in[k+1];
out[0] = in[0];
out[N-1] = x_N;
}
@@ -1792,7 +1819,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pfff
#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
pffft_direction_t direction, int ordered) {
int Ncvec = setup->Ncvec;
int nf_odd = (setup->ifac[1] & 1);
@@ -1808,29 +1835,29 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo
ib = (nf_odd ^ ordered ? 1 : 0);
if (direction == PFFFT_FORWARD) {
if (setup->transform == PFFFT_REAL) {
if (setup->transform == PFFFT_REAL) {
ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
} else {
ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
}
if (ordered) {
pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
}
} else {
if (input == buff[ib]) {
ib = !ib; // may happen when finput == foutput
}
if (ordered) {
pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
input = buff[!ib];
}
if (setup->transform == PFFFT_REAL) {
ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
} else {
ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
}
}


@@ -1,4 +1,4 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
@@ -24,7 +24,7 @@
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
@@ -44,33 +44,33 @@
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
/*
PFFFT : a Pretty Fast FFT.

This is basically an adaptation of the single precision fftpack
(v4) as found on netlib taking advantage of SIMD instructions found
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
For architectures where no SIMD instruction is available, the code
falls back to a scalar version.

Restrictions:

- 1D transforms only, with 32-bit single precision.

- supports only transforms for inputs of length N of the form
N=(2^a)*(3^b)*(5^c), a >= 5, b >= 0, c >= 0 (32, 48, 64, 96, 128,
144, 160, etc are all acceptable lengths). Performance is best for
128<=N<=8192.

- all (float*) pointers in the functions below are expected to
have a "simd-compatible" alignment, that is 16 bytes on x86 and
powerpc CPUs.

You can allocate such buffers with the functions
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
posix_memalign..)
*/
@@ -83,91 +83,95 @@
extern "C" {
#endif
/**
Opaque struct holding internal stuff (precomputed twiddle factors);
this struct can be shared by many threads as it contains only
read-only data.
*/
typedef struct PFFFT_Setup PFFFT_Setup;
/** Direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
/** Type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
/**
Prepare for performing transforms of size N -- the returned
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
Will return NULL if N is not suitable (too large / not decomposable with simple integer
factors..)
*/
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
void pffft_destroy_setup(PFFFT_Setup *);
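A minimal usage sketch for this pair (the value of N and the error handling are illustrative; for PFFFT_REAL, N must be a multiple of 32 in SIMD builds, 16 for PFFFT_COMPLEX, and factor into 2s, 3s and 5s):
#include "pffft.h"
static PFFFT_Setup *make_setup_or_null(void) {
    PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL); /* 1024 = 2^10 */
    if (!s) return 0; /* N was rejected */
    /* ... the setup may now be shared across threads ... */
    return s; /* caller eventually calls pffft_destroy_setup(s) */
}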
/**
Perform a Fourier transform. The z-domain data is stored in the
most efficient order for transforming it back, or using it for
convolution. If you need to have its content sorted in the
"usual" way, that is as an array of interleaved complex numbers,
either use pffft_transform_ordered, or call pffft_zreorder after
the forward fft, and before the backward fft.
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer should point to an area of N (2*N for complex
fft) floats, properly aligned. If 'work' is NULL, then stack will
be used instead (this is probably the best strategy for small
FFTs, say for N < 16384).
input and output may alias.
*/
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
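A round-trip sketch of the scaling convention above (names and N are illustrative; buffers come from pffft_aligned_malloc, declared further down):
#include "pffft.h"
static void roundtrip_demo(void) {
    const int N = 512;
    PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
    float *x, *freq, *work;
    int k;
    if (!s) return;
    x    = (float*)pffft_aligned_malloc(N * sizeof(float));
    freq = (float*)pffft_aligned_malloc(N * sizeof(float));
    work = (float*)pffft_aligned_malloc(N * sizeof(float));
    for (k = 0; k < N; ++k) x[k] = (float)k;           /* some test signal */
    pffft_transform(s, x, freq, work, PFFFT_FORWARD);  /* x -> z-domain */
    pffft_transform(s, freq, x, work, PFFFT_BACKWARD); /* back, scaled by N */
    for (k = 0; k < N; ++k) x[k] *= 1.0f / N;          /* undo the factor N */
    pffft_aligned_free(work); pffft_aligned_free(freq); pffft_aligned_free(x);
    pffft_destroy_setup(s);
}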
/**
Similar to pffft_transform, but makes sure that the output is
ordered as expected (interleaved complex numbers). This is
similar to calling pffft_transform and then pffft_zreorder.
input and output may alias.
*/
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/**
Call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
PFFFT_FORWARD) if you want to have the frequency components in
the correct "canonical" order, as interleaved complex numbers.
(for real transforms, both 0-frequency and half frequency
components, which are real, are assembled in the first entry as
F(0)+i*F(n/2+1). Note that the original fftpack did place
F(n/2+1) at the end of the arrays).
input and output should not alias.
*/
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
/**
Perform a multiplication of the frequency components of dft_a and
dft_b and accumulate them into dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
The operation performed is: dft_ab += (dft_a * dft_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
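A sketch of the typical circular-convolution flow this enables (all names are illustrative; a, b and out are aligned length-N buffers, s a PFFFT_REAL setup for N, work an aligned N-float scratch area):
static void fft_convolve(PFFFT_Setup *s, int N, const float *a, const float *b,
                         float *out, float *work) {
    float *A  = (float*)pffft_aligned_malloc(N * sizeof(float));
    float *B  = (float*)pffft_aligned_malloc(N * sizeof(float));
    float *AB = (float*)pffft_aligned_malloc(N * sizeof(float));
    int k;
    for (k = 0; k < N; ++k) AB[k] = 0;                 /* dft_ab accumulates */
    pffft_transform(s, a, A, work, PFFFT_FORWARD);     /* unordered z-domain */
    pffft_transform(s, b, B, work, PFFFT_FORWARD);
    pffft_zconvolve_accumulate(s, A, B, AB, 1.0f / N); /* fold in the 1/N */
    pffft_transform(s, AB, out, work, PFFFT_BACKWARD); /* circular convolution */
    pffft_aligned_free(AB); pffft_aligned_free(B); pffft_aligned_free(A);
}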
/**
The float buffers must have the correct alignment (16-byte boundary
on intel and powerpc). This function may be used to obtain such
correctly aligned buffers.
*/
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *);
/** Returns 4 or 1, depending on whether SSE/Altivec support was enabled when building pffft.c */
int pffft_simd_size(void);
#ifdef __cplusplus