mirror of
https://github.com/fabiangreffrath/woof.git
synced 2025-09-11 05:47:48 -04:00
update pffft.c again from the upstream master branch
This commit is contained in:
parent
6d9bdb8d25
commit
a2b8f11ba0
51
third-party/pffft/pffft.c
vendored
51
third-party/pffft/pffft.c
vendored
@ -57,7 +57,10 @@
|
||||
- 2011/10/02, version 1: This is the very first release of this file.
|
||||
*/
|
||||
|
||||
#define _USE_MATH_DEFINES
|
||||
#ifndef _USE_MATH_DEFINES
|
||||
# define _USE_MATH_DEFINES // gently ask MSVC to define M_PI, M_SQRT2 etc.
|
||||
#endif
|
||||
|
||||
#include "pffft.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
@ -94,11 +97,24 @@
|
||||
// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
|
||||
//#define PFFFT_SIMD_DISABLE
|
||||
|
||||
/* select which SIMD intrinsics will be used */
|
||||
#if !defined(PFFFT_SIMD_DISABLE)
|
||||
# if (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
|
||||
&& (defined(__VEC__) || defined(__ALTIVEC__))
|
||||
# define PFFFT_SIMD_ALTIVEC
|
||||
# elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) \
|
||||
|| defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__wasm_simd128__)
|
||||
// we test _M_ARM64EC before _M_X64 because when _M_ARM64EC is defined, the microsoft compiler also defines _M_X64
|
||||
# define PFFFT_SIMD_NEON
|
||||
# elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
|
||||
# define PFFFT_SIMD_SSE
|
||||
# endif
|
||||
#endif // PFFFT_SIMD_DISABLE
|
||||
|
||||
/*
|
||||
Altivec support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
|
||||
&& (defined(__VEC__) || defined(__ALTIVEC__))
|
||||
#ifdef PFFFT_SIMD_ALTIVEC
|
||||
#include <altivec.h>
|
||||
typedef vector float v4sf;
|
||||
# define SIMD_SZ 4
|
||||
@ -126,13 +142,12 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
|
||||
x3 = vec_mergel(y1, y3); \
|
||||
}
|
||||
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
|
||||
# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
|
||||
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
|
||||
|
||||
/*
|
||||
SSE1 support macros
|
||||
*/
|
||||
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
|
||||
(defined(_M_IX86_FP) && _M_IX86_FP >= 1))
|
||||
#elif defined(PFFFT_SIMD_SSE)
|
||||
|
||||
#include <xmmintrin.h>
|
||||
typedef __m128 v4sf;
|
||||
@ -147,12 +162,12 @@ typedef __m128 v4sf;
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
|
||||
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
|
||||
# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
|
||||
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
|
||||
|
||||
/*
|
||||
ARM NEON support macros
|
||||
*/
|
||||
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64))
|
||||
#elif defined(PFFFT_SIMD_NEON)
|
||||
# include <arm_neon.h>
|
||||
typedef float32x4_t v4sf;
|
||||
# define SIMD_SZ 4
|
||||
@ -174,7 +189,7 @@ typedef float32x4_t v4sf;
|
||||
// marginally faster version
|
||||
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
|
||||
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
|
||||
# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
|
||||
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
|
||||
#else
|
||||
# if !defined(PFFFT_SIMD_DISABLE)
|
||||
# warning "building with simd disabled !\n";
|
||||
@ -192,7 +207,7 @@ typedef float v4sf;
|
||||
# define VMADD(a,b,c) ((a)*(b)+(c))
|
||||
# define VSUB(a,b) ((a)-(b))
|
||||
# define LD_PS1(p) (p)
|
||||
# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
|
||||
# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
|
||||
#endif
|
||||
|
||||
// shortcuts for complex multiplications
|
||||
@ -1054,6 +1069,7 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w
|
||||
return in; /* this is in fact the output .. */
|
||||
}
|
||||
|
||||
#define IFAC_MAX_SIZE 25 /* max number of integer factors for the decomposition, +2 */
|
||||
static int decompose(int n, int *ifac, const int *ntryh) {
|
||||
int nl = n, nf = 0, i, j = 0;
|
||||
for (j=0; ntryh[j]; ++j) {
|
||||
@ -1062,6 +1078,7 @@ static int decompose(int n, int *ifac, const int *ntryh) {
|
||||
int nq = nl / ntry;
|
||||
int nr = nl - ntry * nq;
|
||||
if (nr == 0) {
|
||||
assert(2 + nf < IFAC_MAX_SIZE);
|
||||
ifac[2+nf++] = ntry;
|
||||
nl = nq;
|
||||
if (ntry == 2 && nf != 1) {
|
||||
@ -1203,7 +1220,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
|
||||
struct PFFFT_Setup {
|
||||
int N;
|
||||
int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
|
||||
int ifac[15];
|
||||
// hold the decomposition into small integers of N
|
||||
int ifac[IFAC_MAX_SIZE]; // N, number of factors, factors (admitted values: 2, 3, 4 or 5)
|
||||
pffft_transform_t transform;
|
||||
v4sf *data; // allocated room for twiddle coefs
|
||||
float *e; // points into 'data' , N/4*3 elements
|
||||
@ -1211,6 +1229,15 @@ struct PFFFT_Setup {
|
||||
};
|
||||
|
||||
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
|
||||
// validate N for negative values or potential int overflow
|
||||
if (N < 0) {
|
||||
return 0;
|
||||
}
|
||||
if (N > (1<<26)) {
|
||||
// higher values of N will make you enter in the integer overflow world...
|
||||
assert(0);
|
||||
return 0;
|
||||
}
|
||||
PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
|
||||
int k, m;
|
||||
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
|
||||
@ -1300,7 +1327,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
|
||||
v4sf *vout = (v4sf*)out;
|
||||
assert(in != out);
|
||||
if (setup->transform == PFFFT_REAL) {
|
||||
int dk = N/32;
|
||||
int k, dk = N/32;
|
||||
if (direction == PFFFT_FORWARD) {
|
||||
for (k=0; k < dk; ++k) {
|
||||
INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
|
||||
|
26
third-party/pffft/pffft.h
vendored
26
third-party/pffft/pffft.h
vendored
@ -83,26 +83,30 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* opaque struct holding internal stuff (precomputed twiddle factors)
|
||||
/**
|
||||
Opaque struct holding internal stuff (precomputed twiddle factors)
|
||||
this struct can be shared by many threads as it contains only
|
||||
read-only data.
|
||||
*/
|
||||
typedef struct PFFFT_Setup PFFFT_Setup;
|
||||
|
||||
/* direction of the transform */
|
||||
/** Direction of the transform */
|
||||
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
|
||||
|
||||
/* type of transform */
|
||||
/** Type of transform */
|
||||
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
|
||||
|
||||
/*
|
||||
prepare for performing transforms of size N -- the returned
|
||||
/**
|
||||
Prepare for performing transforms of size N -- the returned
|
||||
PFFFT_Setup structure is read-only so it can safely be shared by
|
||||
multiple concurrent threads.
|
||||
|
||||
Will return NULL if N is not suitable (too large / not decomposable into simple integer
|
||||
factors..)
|
||||
*/
|
||||
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
|
||||
void pffft_destroy_setup(PFFFT_Setup *);
|
||||
/*
|
||||
/**
|
||||
Perform a Fourier transform. The z-domain data is stored in the
|
||||
most efficient order for transforming it back, or using it for
|
||||
convolution. If you need to have its content sorted in the
|
||||
@ -122,7 +126,7 @@ extern "C" {
|
||||
*/
|
||||
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
/**
|
||||
Similar to pffft_transform, but makes sure that the output is
|
||||
ordered as expected (interleaved complex numbers). This is
|
||||
similar to calling pffft_transform and then pffft_zreorder.
|
||||
@ -131,7 +135,7 @@ extern "C" {
|
||||
*/
|
||||
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
/**
|
||||
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
|
||||
PFFFT_FORWARD) if you want to have the frequency components in
|
||||
the correct "canonical" order, as interleaved complex numbers.
|
||||
@ -145,7 +149,7 @@ extern "C" {
|
||||
*/
|
||||
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
/**
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and accumulate them into dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
@ -159,7 +163,7 @@ extern "C" {
|
||||
*/
|
||||
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
|
||||
/*
|
||||
/**
|
||||
the float buffers must have the correct alignment (16-byte boundary
|
||||
on intel and powerpc). This function may be used to obtain such
|
||||
correctly aligned buffers.
|
||||
@ -167,7 +171,7 @@ extern "C" {
|
||||
void *pffft_aligned_malloc(size_t nb_bytes);
|
||||
void pffft_aligned_free(void *);
|
||||
|
||||
/* return 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
|
||||
/** return 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
|
||||
int pffft_simd_size(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
Loading…
x
Reference in New Issue
Block a user