update pffft.c again from the upstream master branch

2025-09-11 05:47:48 -04:00 · 2024-12-02 12:35:01 +01:00 · 2024-12-02 12:35:01 +01:00 · a2b8f11ba0
commit a2b8f11ba0
parent 6d9bdb8d25
2 changed files with 205 additions and 174 deletions
--- a/third-party/pffft/pffft.c
+++ b/third-party/pffft/pffft.c
@ -57,7 +57,10 @@
  - 2011/10/02, version 1: This is the very first release of this file.
 */

-#define _USE_MATH_DEFINES
+#ifndef _USE_MATH_DEFINES
+#  define _USE_MATH_DEFINES // ask gently MSVC to define M_PI, M_SQRT2 etc.
+#endif
+
 #include "pffft.h"
 #include <stdlib.h>
 #include <stdio.h>
@ -94,11 +97,24 @@
 // define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
 //#define PFFFT_SIMD_DISABLE

+/* select which SIMD intrinsics will be used */
+#if !defined(PFFFT_SIMD_DISABLE)
+#  if (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
+   && (defined(__VEC__) || defined(__ALTIVEC__))
+#    define PFFFT_SIMD_ALTIVEC
+#  elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64)  \
+   || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__wasm_simd128__)
+     // we test _M_ARM64EC before _M_X64 because when _M_ARM64EC is defined, the microsoft compiler also defines _M_X64
+#    define PFFFT_SIMD_NEON
+#  elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+#    define PFFFT_SIMD_SSE
+#   endif
+#endif // PFFFT_SIMD_DISABLE
+
 /*
  Altivec support macros
 */
-#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
-    && (defined(__VEC__) || defined(__ALTIVEC__))
+#ifdef PFFFT_SIMD_ALTIVEC
 #include <altivec.h>
 typedef vector float v4sf;
 #  define SIMD_SZ 4
@ -126,13 +142,12 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
    x3 = vec_mergel(y1, y3);                    \
  }
 #  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
-#  define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
+#  define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)

 /*
  SSE1 support macros
 */
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
-    (defined(_M_IX86_FP) && _M_IX86_FP >= 1))
+#elif defined(PFFFT_SIMD_SSE)

 #include <xmmintrin.h>
 typedef __m128 v4sf;
@ -147,12 +162,12 @@ typedef __m128 v4sf;
 #  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
 #  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
 #  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
-#  define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
+#  define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)

 /*
  ARM NEON support macros
 */
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64))
+#elif defined(PFFFT_SIMD_NEON)
 #  include <arm_neon.h>
 typedef float32x4_t v4sf;
 #  define SIMD_SZ 4
@ -174,7 +189,7 @@ typedef float32x4_t v4sf;
 // marginally faster version
 //#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
 #  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
-#  define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
+#  define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
 #else
 #  if !defined(PFFFT_SIMD_DISABLE)
 #    warning "building with simd disabled !\n";
@ -192,7 +207,7 @@ typedef float v4sf;
 #  define VMADD(a,b,c) ((a)*(b)+(c))
 #  define VSUB(a,b) ((a)-(b))
 #  define LD_PS1(p) (p)
-#  define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
+#  define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
 #endif

 // shortcuts for complex multiplcations
@ -1054,6 +1069,7 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w
  return in; /* this is in fact the output .. */
 }

+#define IFAC_MAX_SIZE 25 /* max number of integer factors for the decomposition, +2 */
 static int decompose(int n, int *ifac, const int *ntryh) {
  int nl = n, nf = 0, i, j = 0;
  for (j=0; ntryh[j]; ++j) {
@ -1062,6 +1078,7 @@ static int decompose(int n, int *ifac, const int *ntryh) {
      int nq = nl / ntry;
      int nr = nl - ntry * nq;
      if (nr == 0) {
+        assert(2 + nf < IFAC_MAX_SIZE);
        ifac[2+nf++] = ntry;
        nl = nq;
        if (ntry == 2 && nf != 1) {
@ -1203,7 +1220,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
 struct PFFFT_Setup {
  int     N;
  int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
-  int ifac[15];
+  // hold the decomposition into small integers of N
+  int ifac[IFAC_MAX_SIZE]; // N, number of factors, factors (admitted values: 2, 3, 4 or 5)
  pffft_transform_t transform;
  v4sf *data; // allocated room for twiddle coefs
  float *e;    // points into 'data' , N/4*3 elements
@ -1211,6 +1229,15 @@ struct PFFFT_Setup {
 };

 PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
+  // validate N for negative values or potential int overflow
+  if (N < 0) {
+    return 0;
+  }
+  if (N > (1<<26)) {
+    // higher values of N will make you enter in the integer overflow world...
+    assert(0);
+    return 0;
+  }
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
  int k, m;
  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
@ -1300,7 +1327,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
  v4sf *vout = (v4sf*)out;
  assert(in != out);
  if (setup->transform == PFFFT_REAL) {
-    int dk = N/32;
+    int k, dk = N/32;
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < dk; ++k) {
        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
--- a/third-party/pffft/pffft.h
+++ b/third-party/pffft/pffft.h
@ -83,26 +83,30 @@
 extern "C" {
 #endif

-  /* opaque struct holding internal stuff (precomputed twiddle factors)
+  /**
+     Opaque struct holding internal stuff (precomputed twiddle factors)
     this struct can be shared by many threads as it contains only
     read-only data.
  */
  typedef struct PFFFT_Setup PFFFT_Setup;

-  /* direction of the transform */
+  /** Direction of the transform */
  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;

-  /* type of transform */
+  /** Type of transform */
  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;

-  /*
-    prepare for performing transforms of size N -- the returned
+  /**
+    Prepare for performing transforms of size N -- the returned
    PFFFT_Setup structure is read-only so it can safely be shared by
    multiple concurrent threads.
+
+    Will return NULL if N is not suitable (too large / not decomposable into simple
+    integer factors).
  */
  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
  void pffft_destroy_setup(PFFFT_Setup *);
-  /* 
+  /**
    Perform a Fourier transform , The z-domain data is stored in the
    most efficient order for transforming it back, or using it for
    convolution. If you need to have its content sorted in the
@ -122,7 +126,7 @@ extern "C" {
  */
  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);

-  /* 
+  /**
    Similar to pffft_transform, but makes sure that the output is
    ordered as expected (interleaved complex numbers).  This is
    similar to calling pffft_transform and then pffft_zreorder.
@ -131,7 +135,7 @@ extern "C" {
  */
  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);

-  /* 
+  /**
    call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
    PFFFT_FORWARD) if you want to have the frequency components in
    the correct "canonical" order, as interleaved complex numbers.
@ -145,7 +149,7 @@ extern "C" {
  */
  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);

-  /* 
+  /**
    Perform a multiplication of the frequency components of dft_a and
    dft_b and accumulate them into dft_ab. The arrays should have
    been obtained with pffft_transform(.., PFFFT_FORWARD) and should
@ -159,7 +163,7 @@ extern "C" {
  */
  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);

-  /*
+  /**
    the float buffers must have the correct alignment (16-byte boundary
    on intel and powerpc). This function may be used to obtain such
    correctly aligned buffers.
@ -167,7 +171,7 @@ extern "C" {
  void *pffft_aligned_malloc(size_t nb_bytes);
  void pffft_aligned_free(void *);

-  /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
+  /** return 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
  int pffft_simd_size(void);

 #ifdef __cplusplus