update pffft.c again from the upstream master branch

Fabian Greffrath 2024-12-02 12:35:01 +01:00
parent 6d9bdb8d25
commit a2b8f11ba0
2 changed files with 205 additions and 174 deletions

diff --git a/pffft.c b/pffft.c
--- a/pffft.c
+++ b/pffft.c

@@ -57,7 +57,10 @@
   - 2011/10/02, version 1: This is the very first release of this file.
 */
 
-#define _USE_MATH_DEFINES
+#ifndef _USE_MATH_DEFINES
+# define _USE_MATH_DEFINES // ask gently MSVC to define M_PI, M_SQRT2 etc.
+#endif
+
 #include "pffft.h"
 #include <stdlib.h>
 #include <stdio.h>
@@ -94,11 +97,24 @@
 // define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
 //#define PFFFT_SIMD_DISABLE
 
+/* select which SIMD intrinsics will be used */
+#if !defined(PFFFT_SIMD_DISABLE)
+# if (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
+     && (defined(__VEC__) || defined(__ALTIVEC__))
+#   define PFFFT_SIMD_ALTIVEC
+# elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) \
+       || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__wasm_simd128__)
+    // we test _M_ARM64EC before _M_X64 because when _M_ARM64EC is defined, the microsoft compiler also defines _M_X64
+#   define PFFFT_SIMD_NEON
+# elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+#   define PFFFT_SIMD_SSE
+# endif
+#endif // PFFFT_SIMD_DISABLE
+
 /*
   Altivec support macros
 */
-#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
-    && (defined(__VEC__) || defined(__ALTIVEC__))
+#ifdef PFFFT_SIMD_ALTIVEC
 #include <altivec.h>
 typedef vector float v4sf;
 # define SIMD_SZ 4
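
The practical effect of the new selection block can be checked with a tiny probe: pffft_simd_size() (declared in pffft.h, second file below) returns 4 when one of the SSE/NEON/Altivec paths was picked at build time and 1 for the scalar fallback (or when compiling with -DPFFFT_SIMD_DISABLE). A minimal sketch, assuming pffft.c is compiled and linked alongside:

#include <stdio.h>
#include "pffft.h"

int main(void) {
  /* 4 => one of PFFFT_SIMD_SSE / PFFFT_SIMD_NEON / PFFFT_SIMD_ALTIVEC was
     selected at compile time; 1 => scalar fallback. */
  printf("simd width: %d floats\n", pffft_simd_size());
  return 0;
}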
@@ -126,13 +142,12 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
     x3 = vec_mergel(y1, y3);    \
   }
 # define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
-# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
+# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
 
 /*
   SSE1 support macros
 */
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
-      (defined(_M_IX86_FP) && _M_IX86_FP >= 1))
+#elif defined(PFFFT_SIMD_SSE)
 
 #include <xmmintrin.h>
 typedef __m128 v4sf;
@@ -147,12 +162,12 @@ typedef __m128 v4sf;
 # define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
 # define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
 # define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
-# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
+# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
 
 /*
   ARM NEON support macros
 */
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64))
+#elif defined(PFFFT_SIMD_NEON)
 # include <arm_neon.h>
 typedef float32x4_t v4sf;
 # define SIMD_SZ 4
@@ -174,7 +189,7 @@ typedef float32x4_t v4sf;
 // marginally faster version
 //# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
 # define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
-# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
+# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
 #else
 # if !defined(PFFFT_SIMD_DISABLE)
 #   warning "building with simd disabled !\n";
@@ -192,7 +207,7 @@ typedef float v4sf;
 # define VMADD(a,b,c) ((a)*(b)+(c))
 # define VSUB(a,b) ((a)-(b))
 # define LD_PS1(p) (p)
-# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
+# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
 #endif
 
 // shortcuts for complex multiplcations
@@ -1054,6 +1069,7 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w
   return in; /* this is in fact the output .. */
 }
 
+#define IFAC_MAX_SIZE 25 /* max number of integer factors for the decomposition, +2 */
 static int decompose(int n, int *ifac, const int *ntryh) {
   int nl = n, nf = 0, i, j = 0;
   for (j=0; ntryh[j]; ++j) {
@@ -1062,6 +1078,7 @@ static int decompose(int n, int *ifac, const int *ntryh) {
       int nq = nl / ntry;
       int nr = nl - ntry * nq;
       if (nr == 0) {
+        assert(2 + nf < IFAC_MAX_SIZE);
        ifac[2+nf++] = ntry;
        nl = nq;
        if (ntry == 2 && nf != 1) {
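
A note on why the new assert is headroom rather than a reachable limit: pffft_new_setup() (below) now caps N at 2^26, and because radix 4 is tried before radix 2 at most one factor 2 survives, so the factor count nf tops out around 16 (worst case N = 3^16), well under IFAC_MAX_SIZE - 2. For illustration, a self-contained sketch of the same factorization; the demo value 480 and the candidate list (mirroring the real-FFT path) are assumptions:

#include <assert.h>
#include <stdio.h>

#define IFAC_MAX_SIZE 25 /* as in pffft.c: max number of factors, +2 */

/* Same greedy factorization as pffft's decompose(): ifac[0] receives n,
   ifac[1] the factor count nf, ifac[2..2+nf-1] the factors. A factor 2,
   if any, is moved to the front so the radix-2 pass runs first. */
static int decompose(int n, int *ifac, const int *ntryh) {
  int nl = n, nf = 0, i, j;
  for (j = 0; ntryh[j]; ++j) {
    int ntry = ntryh[j];
    while (nl != 1) {
      int nq = nl / ntry;
      int nr = nl - ntry * nq;
      if (nr != 0) break;          /* ntry exhausted, try next candidate */
      assert(2 + nf < IFAC_MAX_SIZE);
      ifac[2 + nf++] = ntry;
      nl = nq;
      if (ntry == 2 && nf != 1) {  /* shift factors right, put the 2 first */
        for (i = 2; i <= nf; ++i) ifac[nf - i + 3] = ifac[nf - i + 2];
        ifac[2] = 2;
      }
    }
  }
  ifac[0] = n; ifac[1] = nf;
  return nf;
}

int main(void) {
  static const int ntryh[] = { 4, 2, 3, 5, 0 }; /* candidate radixes */
  int ifac[IFAC_MAX_SIZE], i;
  int nf = decompose(480, ifac, ntryh);         /* 480 = 2 * 4 * 4 * 3 * 5 */
  for (i = 0; i < nf; ++i) printf("%d ", ifac[2 + i]);
  printf("\n");                                 /* prints: 2 4 4 3 5 */
  return 0;
}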
@@ -1203,7 +1220,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
 struct PFFFT_Setup {
   int N;
   int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
-  int ifac[15]; // hold the decomposition into small integers of N
+  int ifac[IFAC_MAX_SIZE]; // N , number of factors, factors (admitted values: 2, 3, 4 ou 5)
   pffft_transform_t transform;
   v4sf *data; // allocated room for twiddle coefs
   float *e;   // points into 'data' , N/4*3 elements
@@ -1211,6 +1229,15 @@ struct PFFFT_Setup {
 };
 
 PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
+  // validate N for negative values or potential int overflow
+  if (N < 0) {
+    return 0;
+  }
+  if (N > (1<<26)) {
+    // higher values of N will make you enter in the integer overflow world...
+    assert(0);
+    return 0;
+  }
   PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
   int k, m;
   /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
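
With this guard, callers get a NULL return for unsuitable sizes instead of silent integer overflow; note, though, that in builds with assertions enabled the assert(0) aborts before the return is reached. A minimal sketch of the defensive calling pattern (the oversized 1<<27 is just a demo value):

#include <stdio.h>
#include "pffft.h"

int main(void) {
  /* 1<<27 exceeds the new 1<<26 cap; expect NULL in NDEBUG builds. */
  PFFFT_Setup *s = pffft_new_setup(1 << 27, PFFFT_REAL);
  if (!s) {
    fprintf(stderr, "pffft_new_setup rejected the requested size\n");
    return 1;
  }
  pffft_destroy_setup(s);
  return 0;
}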
@@ -1300,7 +1327,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
   v4sf *vout = (v4sf*)out;
   assert(in != out);
   if (setup->transform == PFFFT_REAL) {
-    int dk = N/32;
+    int k, dk = N/32;
     if (direction == PFFFT_FORWARD) {
       for (k=0; k < dk; ++k) {
         INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
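
Since this hunk touches pffft_zreorder(), a short usage sketch may help: it shows the two-step path (raw transform, then reorder into canonical interleaved bins) and the in != out requirement enforced by the assert above. N = 64 and the constant input are demo assumptions:

#include <stdio.h>
#include "pffft.h"

int main(void) {
  const int N = 64;  /* real-FFT sizes must be multiples of 32 */
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
  if (!s) return 1;
  float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *zdom = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
  int i;
  for (i = 0; i < N; ++i) in[i] = 1.f;  /* constant signal: all energy in bin 0 */

  pffft_transform(s, in, zdom, 0, PFFFT_FORWARD); /* raw z-domain order; NULL work => stack */
  pffft_zreorder(s, zdom, out, PFFFT_FORWARD);    /* canonical order; zdom != out required */
  printf("DC bin: %g\n", out[0]);                 /* 64, transforms are unnormalized */

  pffft_aligned_free(out); pffft_aligned_free(zdom); pffft_aligned_free(in);
  pffft_destroy_setup(s);
  return 0;
}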

diff --git a/pffft.h b/pffft.h
--- a/pffft.h
+++ b/pffft.h

@@ -83,26 +83,30 @@
 extern "C" {
 #endif
 
-/* opaque struct holding internal stuff (precomputed twiddle factors)
+/**
+   Opaque struct holding internal stuff (precomputed twiddle factors)
    this struct can be shared by many threads as it contains only
    read-only data.
 */
 typedef struct PFFFT_Setup PFFFT_Setup;
 
-/* direction of the transform */
+/** Direction of the transform */
 typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
 
-/* type of transform */
+/** Type of transform */
 typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
 
-/*
-  prepare for performing transforms of size N -- the returned
+/**
+  Prepare for performing transforms of size N -- the returned
   PFFFT_Setup structure is read-only so it can safely be shared by
   multiple concurrent threads.
+
+  Will return NULL if N is not suitable (too large / no decomposable with simple integer
+  factors..)
 */
 PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
 void pffft_destroy_setup(PFFFT_Setup *);
 
-/*
+/**
   Perform a Fourier transform , The z-domain data is stored in the
   most efficient order for transforming it back, or using it for
   convolution. If you need to have its content sorted in the
@@ -122,7 +126,7 @@ extern "C" {
 */
 void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
 
-/*
+/**
   Similar to pffft_transform, but makes sure that the output is
   ordered as expected (interleaved complex numbers). This is
   similar to calling pffft_transform and then pffft_zreorder.
@@ -131,7 +135,7 @@ extern "C" {
 */
 void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
 
-/*
+/**
   call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
   PFFFT_FORWARD) if you want to have the frequency components in
   the correct "canonical" order, as interleaved complex numbers.
@@ -145,7 +149,7 @@ extern "C" {
 */
 void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
 
-/*
+/**
   Perform a multiplication of the frequency components of dft_a and
   dft_b and accumulate them into dft_ab. The arrays should have
   been obtained with pffft_transform(.., PFFFT_FORWARD) and should
@@ -159,7 +163,7 @@ extern "C" {
 */
 void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
 
-/*
+/**
   the float buffers must have the correct alignment (16-byte boundary
   on intel and powerpc). This function may be used to obtain such
   correctly aligned buffers.
@@ -167,7 +171,7 @@ extern "C" {
 void *pffft_aligned_malloc(size_t nb_bytes);
 void pffft_aligned_free(void *);
 
-/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
+/** return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
 int pffft_simd_size(void);
 
 #ifdef __cplusplus
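
Taken together, these functions support the FFT convolution pattern that pffft_zconvolve_accumulate() exists for. A closing sketch (demo impulse signals assumed) that folds the 1/N normalization of the unscaled backward transform into the scaling argument:

#include <stdio.h>
#include "pffft.h"

int main(void) {
  const int N = 64;
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
  if (!s) return 1;
  float *a  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *b  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *A  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *B  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *AB = (float*)pffft_aligned_malloc(N * sizeof(float));
  int i;
  /* impulses at 0 and 3; AB must start zeroed, zconvolve accumulates into it */
  for (i = 0; i < N; ++i) { a[i] = (i == 0); b[i] = (i == 3); AB[i] = 0.f; }

  pffft_transform(s, a, A, 0, PFFFT_FORWARD);        /* z-domain order is fine here */
  pffft_transform(s, b, B, 0, PFFFT_FORWARD);
  pffft_zconvolve_accumulate(s, A, B, AB, 1.0f / N); /* scaling folds in 1/N */
  pffft_transform(s, AB, a, 0, PFFFT_BACKWARD);      /* a[] now holds the circular convolution */
  printf("a[3] = %g\n", a[3]);                       /* expect 1: delta_0 * delta_3 = delta_3 */

  pffft_aligned_free(AB); pffft_aligned_free(B); pffft_aligned_free(A);
  pffft_aligned_free(b); pffft_aligned_free(a);
  pffft_destroy_setup(s);
  return 0;
}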