update pffft.c again from the upstream master branch

This commit is contained in:
Fabian Greffrath 2024-12-02 12:35:01 +01:00
parent 6d9bdb8d25
commit a2b8f11ba0
2 changed files with 205 additions and 174 deletions

View File

@ -57,7 +57,10 @@
- 2011/10/02, version 1: This is the very first release of this file.
*/
#define _USE_MATH_DEFINES
#ifndef _USE_MATH_DEFINES
# define _USE_MATH_DEFINES // ask gently MSVC to define M_PI, M_SQRT2 etc.
#endif
#include "pffft.h"
#include <stdlib.h>
#include <stdio.h>
@ -94,11 +97,24 @@
// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE
/* select which SIMD intrinsics will be used */
#if !defined(PFFFT_SIMD_DISABLE)
# if (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
&& (defined(__VEC__) || defined(__ALTIVEC__))
# define PFFFT_SIMD_ALTIVEC
# elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) \
|| defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__wasm_simd128__)
// we test _M_ARM64EC before _M_X64 because when _M_ARM64EC is defined, the microsoft compiler also defines _M_X64
# define PFFFT_SIMD_NEON
# elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
# define PFFFT_SIMD_SSE
# endif
#endif // PFFFT_SIMD_DISABLE
/*
Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
&& (defined(__VEC__) || defined(__ALTIVEC__))
#ifdef PFFFT_SIMD_ALTIVEC
#include <altivec.h>
typedef vector float v4sf;
# define SIMD_SZ 4
@ -126,13 +142,12 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
x3 = vec_mergel(y1, y3); \
}
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
/*
SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP >= 1))
#elif defined(PFFFT_SIMD_SSE)
#include <xmmintrin.h>
typedef __m128 v4sf;
@ -147,12 +162,12 @@ typedef __m128 v4sf;
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
/*
ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64))
#elif defined(PFFFT_SIMD_NEON)
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
@ -174,7 +189,7 @@ typedef float32x4_t v4sf;
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
# warning "building with simd disabled !\n";
@ -192,7 +207,7 @@ typedef float v4sf;
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
#endif
// shortcuts for complex multiplications
@ -1054,6 +1069,7 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w
return in; /* this is in fact the output .. */
}
#define IFAC_MAX_SIZE 25 /* max number of integer factors for the decomposition, +2 */
static int decompose(int n, int *ifac, const int *ntryh) {
int nl = n, nf = 0, i, j = 0;
for (j=0; ntryh[j]; ++j) {
@ -1062,6 +1078,7 @@ static int decompose(int n, int *ifac, const int *ntryh) {
int nq = nl / ntry;
int nr = nl - ntry * nq;
if (nr == 0) {
assert(2 + nf < IFAC_MAX_SIZE);
ifac[2+nf++] = ntry;
nl = nq;
if (ntry == 2 && nf != 1) {
@ -1203,7 +1220,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
struct PFFFT_Setup {
int N;
int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
int ifac[15];
// hold the decomposition into small integers of N
int ifac[IFAC_MAX_SIZE]; // N , number of factors, factors (admitted values: 2, 3, 4 ou 5)
pffft_transform_t transform;
v4sf *data; // allocated room for twiddle coefs
float *e; // points into 'data' , N/4*3 elements
@ -1211,6 +1229,15 @@ struct PFFFT_Setup {
};
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
// validate N for negative values or potential int overflow
if (N < 0) {
return 0;
}
if (N > (1<<26)) {
// higher values of N will make you enter in the integer overflow world...
assert(0);
return 0;
}
PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
int k, m;
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
@ -1300,7 +1327,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
v4sf *vout = (v4sf*)out;
assert(in != out);
if (setup->transform == PFFFT_REAL) {
int dk = N/32;
int k, dk = N/32;
if (direction == PFFFT_FORWARD) {
for (k=0; k < dk; ++k) {
INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);

View File

@ -83,26 +83,30 @@
extern "C" {
#endif
/* opaque struct holding internal stuff (precomputed twiddle factors)
/**
Opaque struct holding internal stuff (precomputed twiddle factors).
This struct can be shared by many threads as it contains only
read-only data.
*/
typedef struct PFFFT_Setup PFFFT_Setup;
/* direction of the transform */
/** Direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
/* type of transform */
/** Type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
/*
prepare for performing transforms of size N -- the returned
/**
Prepare for performing transforms of size N -- the returned
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
Will return NULL if N is not suitable (too large / not decomposable into simple integer
factors...)
*/
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
void pffft_destroy_setup(PFFFT_Setup *);
/*
/**
Perform a Fourier transform. The z-domain data is stored in the
most efficient order for transforming it back, or using it for
convolution. If you need to have its content sorted in the
@ -122,7 +126,7 @@ extern "C" {
*/
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
/**
Similar to pffft_transform, but makes sure that the output is
ordered as expected (interleaved complex numbers). This is
similar to calling pffft_transform and then pffft_zreorder.
@ -131,7 +135,7 @@ extern "C" {
*/
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
/**
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
PFFFT_FORWARD) if you want to have the frequency components in
the correct "canonical" order, as interleaved complex numbers.
@ -145,7 +149,7 @@ extern "C" {
*/
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
/*
/**
Perform a multiplication of the frequency components of dft_a and
dft_b and accumulate them into dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
@ -159,7 +163,7 @@ extern "C" {
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
/*
/**
the float buffers must have the correct alignment (16-byte boundary
on intel and powerpc). This function may be used to obtain such
correctly aligned buffers.
@ -167,7 +171,7 @@ extern "C" {
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *);
/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
/** Return 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
int pffft_simd_size(void);
#ifdef __cplusplus