update pffft.c again from the upstream master branch

This commit is contained in:
Fabian Greffrath 2024-12-02 12:35:01 +01:00
parent 6d9bdb8d25
commit a2b8f11ba0
2 changed files with 205 additions and 174 deletions


@@ -25,7 +25,7 @@
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
@@ -53,11 +53,14 @@
*/
/*
ChangeLog:
- 2011/10/02, version 1: This is the very first release of this file.
*/
#define _USE_MATH_DEFINES
#ifndef _USE_MATH_DEFINES
# define _USE_MATH_DEFINES // ask gently MSVC to define M_PI, M_SQRT2 etc.
#endif
#include "pffft.h"
#include <stdlib.h>
#include <stdio.h>
@@ -84,21 +87,34 @@
#endif
/*
vector support macros: the rest of the code is independent of
SSE/Altivec/NEON -- adding support for other platforms with 4-element
vectors should be limited to these macros
*/
// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE
/* select which SIMD intrinsics will be used */
#if !defined(PFFFT_SIMD_DISABLE)
# if (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
&& (defined(__VEC__) || defined(__ALTIVEC__))
# define PFFFT_SIMD_ALTIVEC
# elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) \
|| defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__wasm_simd128__)
// we test _M_ARM64EC before _M_X64 because when _M_ARM64EC is defined, the Microsoft compiler also defines _M_X64
# define PFFFT_SIMD_NEON
# elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
# define PFFFT_SIMD_SSE
# endif
#endif // PFFFT_SIMD_DISABLE
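As an aside, client code can check which branch was selected through the public pffft_simd_size() helper declared in pffft.h; a minimal sketch (the output text is illustrative):
#include <stdio.h>
#include "pffft.h"
int main(void) {
    /* pffft_simd_size() returns 4 when one of the SIMD branches above was
       selected at build time, and 1 for the scalar fallback. */
    printf("pffft SIMD width: %d float(s) per vector\n", pffft_simd_size());
    return 0;
}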
/*
Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
&& (defined(__VEC__) || defined(__ALTIVEC__))
#ifdef PFFFT_SIMD_ALTIVEC
#include <altivec.h>
typedef vector float v4sf;
# define SIMD_SZ 4
@@ -115,7 +131,7 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
}
# define VTRANSPOSE4(x0,x1,x2,x3) { \
v4sf y0 = vec_mergeh(x0, x2); \
v4sf y1 = vec_mergel(x0, x2); \
v4sf y2 = vec_mergeh(x1, x3); \
@@ -126,13 +142,12 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
x3 = vec_mergel(y1, y3); \
}
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
/*
SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP >= 1))
#elif defined(PFFFT_SIMD_SSE)
#include <xmmintrin.h>
typedef __m128 v4sf;
@@ -147,12 +162,12 @@ typedef __m128 v4sf;
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
/*
ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64))
#elif defined(PFFFT_SIMD_NEON)
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
@@ -174,7 +189,7 @@ typedef float32x4_t v4sf;
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
# warning "building with simd disabled !\n";
@@ -192,7 +207,7 @@ typedef float v4sf;
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
#endif
// shortcuts for complex multiplications
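For reference, here is a scalar sketch of what these macros compute per lane (the real VCPLXMUL / VCPLXMULCONJ update whole v4sf vectors in place; the helper names below are illustrative, not part of pffft):
/* (ar + i*ai) *= (br + i*bi) */
static void cplx_mul(float *ar, float *ai, float br, float bi) {
    float tmp = *ar;
    *ar = tmp * br - (*ai) * bi;  /* real part */
    *ai = tmp * bi + (*ai) * br;  /* imaginary part */
}
/* (ar + i*ai) *= (br - i*bi), i.e. multiply by the conjugate */
static void cplx_mul_conj(float *ar, float *ai, float br, float bi) {
    float tmp = *ar;
    *ar = tmp * br + (*ai) * bi;
    *ai = (*ai) * br - tmp * bi;
}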
@@ -216,7 +231,7 @@ typedef union v4sf_union {
/* detect bugs with the vector support macros */
void validate_pffft_simd(void) {
float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
v4sf_union a0, a1, a2, a3, t, u;
memcpy(a0.f, f, 4*sizeof(float));
memcpy(a1.f, f+4, 4*sizeof(float));
memcpy(a2.f, f+8, 4*sizeof(float));
@@ -245,9 +260,9 @@ void validate_pffft_simd(void) {
printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
assertv4(t, 8, 9, 6, 7);
VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
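When SIMD is enabled, this self-check can be driven from a tiny harness; a sketch (assuming the declaration is made visible to the caller):
void validate_pffft_simd(void); /* defined above in pffft.c */
int main(void) {
    validate_pffft_simd(); /* prints the macro results and asserts on any mismatch */
    return 0;
}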
#else
@@ -327,7 +342,7 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
di3 = VSUB(ci2, cr3);
wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch[i+l1ido] = dr2;
ch[i+l1ido + 1] = di2;
VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
ch[i+2*l1ido] = dr3;
@@ -359,7 +374,7 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
ch[1*l1ido + 0] = VADD(tr1, tr4);
ch[1*l1ido + 1] = VADD(ti1, ti4);
ch[2*l1ido + 0] = VSUB(tr2, tr3);
ch[2*l1ido + 1] = VSUB(ti2, ti3);
ch[3*l1ido + 0] = VSUB(tr1, tr4);
ch[3*l1ido + 1] = VSUB(ti1, ti4);
}
@@ -408,8 +423,8 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
passf5 and passb5 have been merged here; fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
const float *wa1, const float *wa2,
const float *wa3, const float *wa4, float fsign) {
static const float tr11 = .309016994374947f;
const float ti11 = .951056516295154f*fsign;
static const float tr12 = -.809016994374947f;
@@ -488,7 +503,7 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
for (i=2; i<ido; i+=2) {
v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
v4sf br = cc[i - 1 + k], bi = cc[i + k];
VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
ch[i + 2*k] = VADD(bi, ti2);
ch[2*(k+ido) - i] = VSUB(ti2, bi);
ch[i - 1 + 2*k] = VADD(br, tr2);
@@ -560,7 +575,7 @@ static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT
wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
VCPLXMULCONJ(dr3, di3, wr2, wi2);
cr2 = VADD(dr2, dr3);
ci2 = VADD(di2, di3);
ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
@@ -626,7 +641,7 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
static const float minus_hsqt2 = (float)-0.7071067811865475;
int i, k, l1ido = l1*ido;
{
const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
v4sf * RESTRICT ch_ = ch;
while (cc < cc_end) {
// this loop represents between 25% and 40% of total radf4_ps cost !
@@ -659,20 +674,20 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
cr3 = pc[2*l1ido+0];
ci3 = pc[2*l1ido+1];
wr = LD_PS1(wa2[i-2]);
wi = LD_PS1(wa2[i-1]);
VCPLXMULCONJ(cr3, ci3, wr, wi);
cr4 = pc[3*l1ido];
ci4 = pc[3*l1ido+1];
wr = LD_PS1(wa3[i-2]);
wi = LD_PS1(wa3[i-1]);
VCPLXMULCONJ(cr4, ci4, wr, wi);
/* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
tr1 = VADD(cr2,cr4);
tr4 = VSUB(cr4,cr2);
tr2 = VADD(pc[0],cr3);
tr3 = VSUB(pc[0],cr3);
ch[i - 1 + 4*k] = VADD(tr1,tr2);
@@ -698,8 +713,8 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
ch[ido-1 + 4*k] = VADD(tr1, c);
ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
ch[4*k + 1*ido] = VSUB(ti1, d);
ch[4*k + 3*ido] = VADD(ti1, d);
}
} /* radf4 */
@@ -712,7 +727,7 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
int i, k, l1ido = l1*ido;
v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
{
const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
v4sf *ch_ = ch;
while (ch < ch_end) {
v4sf a = cc[0], b = cc[4*ido-1];
@@ -725,7 +740,7 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
ch[2*l1ido] = VSUB(tr2, tr3);
ch[1*l1ido] = VSUB(tr1, tr4);
ch[3*l1ido] = VADD(tr1, tr4);
cc += 4*ido; ch += ido;
}
cc = cc_; ch = ch_;
@@ -784,7 +799,7 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
}
} /* radb4 */
static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
static const float tr11 = .309016994374947f;
@@ -871,8 +886,8 @@ static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT
#undef ch_ref
} /* radf5 */
static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
static const float tr11 = .309016994374947f;
static const float ti11 = .951056516295154f;
@@ -960,8 +975,8 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch
#undef ch_ref
} /* radb5 */
static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
const float *wa, const int *ifac) {
v4sf *in = (v4sf*)input_readonly;
v4sf *out = (in == work2 ? work1 : work2);
int nf = ifac[1], k1;
@@ -1007,8 +1022,8 @@ static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *w
return in; /* this is in fact the output .. */
} /* rfftf1 */
static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
const float *wa, const int *ifac) {
v4sf *in = (v4sf*)input_readonly;
v4sf *out = (in == work2 ? work1 : work2);
int nf = ifac[1], k1;
@@ -1054,6 +1069,7 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w
return in; /* this is in fact the output .. */
}
#define IFAC_MAX_SIZE 25 /* max number of integer factors for the decomposition, +2 */
static int decompose(int n, int *ifac, const int *ntryh) {
int nl = n, nf = 0, i, j = 0;
for (j=0; ntryh[j]; ++j) {
@@ -1062,6 +1078,7 @@ static int decompose(int n, int *ifac, const int *ntryh) {
int nq = nl / ntry;
int nr = nl - ntry * nq;
if (nr == 0) {
assert(2 + nf < IFAC_MAX_SIZE);
ifac[2+nf++] = ntry;
nl = nq;
if (ntry == 2 && nf != 1) {
@@ -1075,7 +1092,7 @@
}
}
ifac[0] = n;
ifac[1] = nf;
return nf;
}
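For orientation, ifac ends up laid out as { N, nf, factor_0, ..., factor_{nf-1} }, with any factor 2 moved to the front; e.g. with ntryh = {4,2,3,5,0}, decompose(96, ifac, ntryh) leaves ifac = {96, 4, 2, 4, 4, 3}. The decomposition consumes all of n exactly when n has no prime factors other than 2, 3 and 5, which a standalone helper could check like this (hypothetical, not part of pffft):
static int has_only_235_factors(int n) {
    if (n <= 0) return 0;
    while (n % 2 == 0) n /= 2;   /* strip factors of two */
    while (n % 3 == 0) n /= 3;   /* ...of three */
    while (n % 5 == 0) n /= 5;   /* ...of five */
    return n == 1;               /* nothing else may remain */
}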
@@ -1155,7 +1172,7 @@ void cffti1_ps(int n, float *wa, int *ifac)
v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
v4sf *in = (v4sf*)input_readonly;
v4sf *out = (in == work2 ? work1 : work2);
int nf = ifac[1], k1;
int l1 = 1;
int iw = 0;
@@ -1203,7 +1220,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
struct PFFFT_Setup {
int N;
int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
int ifac[15];
// holds the decomposition of N into small integer factors
int ifac[IFAC_MAX_SIZE]; // N, number of factors, then the factors (admitted values: 2, 3, 4 or 5)
pffft_transform_t transform;
v4sf *data; // allocated room for twiddle coefs
float *e; // points into 'data' , N/4*3 elements
@@ -1211,21 +1229,30 @@ struct PFFFT_Setup {
};
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
// validate N for negative values or potential int overflow
if (N < 0) {
return 0;
}
if (N > (1<<26)) {
// higher values of N will make you enter the integer overflow world...
assert(0);
return 0;
}
PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
int k, m;
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
//assert((N % 32) == 0);
s->N = N;
s->transform = transform;
s->transform = transform;
/* nb of complex simd vectors */
s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
s->e = (float*)s->data;
s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
for (k=0; k < s->Ncvec; ++k) {
int i = k/SIMD_SZ;
@@ -1265,7 +1292,7 @@ static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
v4sf g0, g1;
int k;
INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
*--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
for (k=1; k < N; ++k) {
v4sf h0, h1;
@@ -1300,7 +1327,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
v4sf *vout = (v4sf*)out;
assert(in != out);
if (setup->transform == PFFFT_REAL) {
int dk = N/32;
int k, dk = N/32;
if (direction == PFFFT_FORWARD) {
for (k=0; k < dk; ++k) {
INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
@@ -1318,12 +1345,12 @@
}
} else {
if (direction == PFFFT_FORWARD) {
for (k=0; k < Ncvec; ++k) {
for (k=0; k < Ncvec; ++k) {
int kk = (k/4) + (k%4)*(Ncvec/4);
INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
}
} else {
for (k=0; k < Ncvec; ++k) {
for (k=0; k < Ncvec; ++k) {
int kk = (k/4) + (k%4)*(Ncvec/4);
UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
}
@@ -1336,7 +1363,7 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
assert(in != out);
for (k=0; k < dk; ++k) {
for (k=0; k < dk; ++k) {
r0 = in[8*k+0]; i0 = in[8*k+1];
r1 = in[8*k+2]; i1 = in[8*k+3];
r2 = in[8*k+4]; i2 = in[8*k+5];
@@ -1354,7 +1381,7 @@
/*
transformation for each column is:
[1 1 1 1 0 0 0 0] [r0]
[1 0 -1 0 0 -1 0 1] [r1]
[1 -1 1 -1 0 0 0 0] [r2]
@@ -1362,14 +1389,14 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
[0 0 0 0 1 1 1 1] * [i0]
[0 1 0 -1 1 0 -1 0] [i1]
[0 0 0 0 1 -1 1 -1] [i2]
[0 -1 0 1 1 0 -1 0] [i3]
*/
r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
*out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
*out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
}
@@ -1380,7 +1407,7 @@ void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
assert(in != out);
for (k=0; k < dk; ++k) {
for (k=0; k < dk; ++k) {
r0 = in[8*k+0]; i0 = in[8*k+1];
r1 = in[8*k+2]; i1 = in[8*k+3];
r2 = in[8*k+4]; i2 = in[8*k+5];
@@ -1410,14 +1437,14 @@ void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
const v4sf *e, v4sf *out) {
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
r0 = *in0; i0 = *in1;
r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
VTRANSPOSE4(r0,r1,r2,r3);
VTRANSPOSE4(i0,i1,i2,i3);
/*
transformation for each column is:
@@ -1428,9 +1455,9 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *
[0 0 0 0 1 1 1 1] * [i0]
[0 -1 0 1 -1 0 1 0] [i1]
[0 -1 0 1 1 0 -1 0] [i2]
[0 0 0 0 -1 1 -1 1] [i3]
*/
//cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
@@ -1441,9 +1468,9 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *
//cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
r0 = VADD(sr0, sr1);
@@ -1499,19 +1526,19 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o
xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
for (k=1; k < dk; ++k) {
v4sf save_next = in[8*k+7];
pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
e + k*6, out + k*8);
save = save_next;
}
}
static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
const v4sf *e, v4sf *out, int first) {
v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
/*
transformation for each column is:
@@ -1523,12 +1550,12 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
[0 0 0 0 1 -1 1 -1] * [i0]
[0 -1 1 0 1 0 0 1] [i1]
[0 0 0 0 1 1 -1 -1] [i2]
[0 1 -1 0 1 0 0 1] [i3]
*/
v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
r0 = VADD(sr0, sr1);
@@ -1586,7 +1613,7 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
[ci2] [0 0 0 0 0 -2 0 2]
[ci3] [0 -s 0 s 0 -s 0 -s]
*/
for (k=1; k < dk; ++k) {
pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
}
@@ -1602,7 +1629,7 @@
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
pffft_direction_t direction, int ordered) {
int k, Ncvec = setup->Ncvec;
int nf_odd = (setup->ifac[1] & 1);
@@ -1620,44 +1647,44 @@
//assert(finput != foutput);
if (direction == PFFFT_FORWARD) {
ib = !ib;
if (setup->transform == PFFFT_REAL) {
if (setup->transform == PFFFT_REAL) {
ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
} else {
v4sf *tmp = buff[ib];
for (k=0; k < Ncvec; ++k) {
UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
}
ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
}
if (ordered) {
pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
} else ib = !ib;
} else {
if (vinput == buff[ib]) {
ib = !ib; // may happen when finput == foutput
}
if (ordered) {
pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
vinput = buff[ib]; ib = !ib;
}
if (setup->transform == PFFFT_REAL) {
pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
} else {
pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
for (k=0; k < Ncvec; ++k) {
INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
}
}
}
if (buff[ib] != voutput) {
/* extra copy required -- this situation should only happen when finput == foutput */
assert(finput==foutput);
@@ -1725,11 +1752,11 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
"vld1.f32 {q2,q3}, [%0,:128]! \n"
"vld1.f32 {q6,q7}, [%1,:128]! \n"
"vld1.f32 {q8,q9}, [r8,:128]! \n"
"vmul.f32 q10, q0, q4 \n"
"vmul.f32 q11, q0, q5 \n"
"vmul.f32 q12, q2, q6 \n"
"vmul.f32 q13, q2, q7 \n"
"vmul.f32 q12, q2, q6 \n"
"vmul.f32 q13, q2, q7 \n"
"vmls.f32 q10, q1, q5 \n"
"vmla.f32 q11, q1, q4 \n"
"vld1.f32 {q0,q1}, [r8,:128]! \n"
@@ -1779,12 +1806,12 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pfff
}
else if (direction == PFFFT_FORWARD) {
float x_N = in[N-1];
for (k=N-1; k > 1; --k) out[k] = in[k-1];
out[0] = in[0];
out[1] = x_N;
} else {
float x_N = in[1];
for (k=1; k < N-1; ++k) out[k] = in[k+1];
for (k=1; k < N-1; ++k) out[k] = in[k+1];
out[0] = in[0];
out[N-1] = x_N;
}
@@ -1792,7 +1819,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pfff
#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
pffft_direction_t direction, int ordered) {
int Ncvec = setup->Ncvec;
int nf_odd = (setup->ifac[1] & 1);
@@ -1808,29 +1835,29 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo
ib = (nf_odd ^ ordered ? 1 : 0);
if (direction == PFFFT_FORWARD) {
if (setup->transform == PFFFT_REAL) {
if (setup->transform == PFFFT_REAL) {
ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
} else {
ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
}
if (ordered) {
pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
}
} else {
if (input == buff[ib]) {
ib = !ib; // may happen when finput == foutput
}
if (ordered) {
pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
input = buff[!ib];
}
if (setup->transform == PFFFT_REAL) {
ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
} else {
ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
}
}


@@ -1,4 +1,4 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
@@ -24,7 +24,7 @@
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
@@ -44,33 +44,33 @@
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
/*
PFFFT : a Pretty Fast FFT.

This is basically an adaptation of the single precision fftpack
(v4) as found on netlib taking advantage of SIMD instructions found
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
For architectures where no SIMD instruction is available, the code
falls back to a scalar version.

Restrictions:

- 1D transforms only, with 32-bit single precision.

- supports only transforms for inputs of length N of the form
N=(2^a)*(3^b)*(5^c), a >= 5, b >= 0, c >= 0 (32, 48, 64, 96, 128,
144, 160, etc are all acceptable lengths). Performance is best for
128<=N<=8192.

- all (float*) pointers in the functions below are expected to
have a "simd-compatible" alignment, that is 16 bytes on x86 and
powerpc CPUs.

You can allocate such buffers with the functions
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
posix_memalign..)
*/
@@ -83,91 +83,95 @@
extern "C" {
#endif
/**
Opaque struct holding internal stuff (precomputed twiddle factors);
this struct can be shared by many threads as it contains only
read-only data.
*/
typedef struct PFFFT_Setup PFFFT_Setup;
/** Direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
/** Type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
/**
Prepare for performing transforms of size N -- the returned
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
Will return NULL if N is not suitable (too large / not decomposable with simple integer
factors..)
*/
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
void pffft_destroy_setup(PFFFT_Setup *);
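A minimal usage sketch for this pair (the value of N and the error handling are illustrative; for PFFFT_REAL, N must be a multiple of 32 in SIMD builds, 16 for PFFFT_COMPLEX, and factor into 2s, 3s and 5s):
#include "pffft.h"
static PFFFT_Setup *make_setup_or_null(void) {
    PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL); /* 1024 = 2^10 */
    if (!s) return 0; /* N was rejected */
    /* ... the setup may now be shared across threads ... */
    return s; /* caller eventually calls pffft_destroy_setup(s) */
}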
/**
Perform a Fourier transform. The z-domain data is stored in the
most efficient order for transforming it back, or using it for
convolution. If you need to have its content sorted in the
"usual" way, that is as an array of interleaved complex numbers,
either use pffft_transform_ordered, or call pffft_zreorder after
the forward fft, and before the backward fft.
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer should point to an area of N (2*N for complex
fft) floats, properly aligned. If 'work' is NULL, then stack will
be used instead (this is probably the best strategy for small
FFTs, say for N < 16384).
input and output may alias.
*/
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
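A round-trip sketch of the scaling convention above (names and N are illustrative; buffers come from pffft_aligned_malloc, declared further down):
#include "pffft.h"
static void roundtrip_demo(void) {
    const int N = 512;
    PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
    float *x, *freq, *work;
    int k;
    if (!s) return;
    x    = (float*)pffft_aligned_malloc(N * sizeof(float));
    freq = (float*)pffft_aligned_malloc(N * sizeof(float));
    work = (float*)pffft_aligned_malloc(N * sizeof(float));
    for (k = 0; k < N; ++k) x[k] = (float)k;           /* some test signal */
    pffft_transform(s, x, freq, work, PFFFT_FORWARD);  /* x -> z-domain */
    pffft_transform(s, freq, x, work, PFFFT_BACKWARD); /* back, scaled by N */
    for (k = 0; k < N; ++k) x[k] *= 1.0f / N;          /* undo the factor N */
    pffft_aligned_free(work); pffft_aligned_free(freq); pffft_aligned_free(x);
    pffft_destroy_setup(s);
}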
/**
Similar to pffft_transform, but makes sure that the output is
ordered as expected (interleaved complex numbers). This is
similar to calling pffft_transform and then pffft_zreorder.
input and output may alias.
*/
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/**
Call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
PFFFT_FORWARD) if you want to have the frequency components in
the correct "canonical" order, as interleaved complex numbers.
(for real transforms, both 0-frequency and half frequency
components, which are real, are assembled in the first entry as
F(0)+i*F(n/2+1). Note that the original fftpack did place
F(n/2+1) at the end of the arrays).
input and output should not alias.
*/
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
/**
Perform a multiplication of the frequency components of dft_a and
dft_b and accumulate them into dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
The operation performed is: dft_ab += (dft_a * dft_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
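A sketch of the typical circular-convolution flow this enables (all names are illustrative; a, b and out are aligned length-N buffers, s a PFFFT_REAL setup for N, work an aligned N-float scratch area):
static void fft_convolve(PFFFT_Setup *s, int N, const float *a, const float *b,
                         float *out, float *work) {
    float *A  = (float*)pffft_aligned_malloc(N * sizeof(float));
    float *B  = (float*)pffft_aligned_malloc(N * sizeof(float));
    float *AB = (float*)pffft_aligned_malloc(N * sizeof(float));
    int k;
    for (k = 0; k < N; ++k) AB[k] = 0;                 /* dft_ab accumulates */
    pffft_transform(s, a, A, work, PFFFT_FORWARD);     /* unordered z-domain */
    pffft_transform(s, b, B, work, PFFFT_FORWARD);
    pffft_zconvolve_accumulate(s, A, B, AB, 1.0f / N); /* fold in the 1/N */
    pffft_transform(s, AB, out, work, PFFFT_BACKWARD); /* circular convolution */
    pffft_aligned_free(AB); pffft_aligned_free(B); pffft_aligned_free(A);
}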
/**
The float buffers must have the correct alignment (16-byte boundary
on intel and powerpc). This function may be used to obtain such
correctly aligned buffers.
*/
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *);
/** Returns 4 or 1, depending on whether SSE/Altivec support was enabled when building pffft.c */
int pffft_simd_size(void);
#ifdef __cplusplus