Bug Summary

File:numpy/fft/_pocketfft.c
Warning:line 2372, column 9
PyObject ownership leak with reference count of 1

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name _pocketfft.c -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-output=html -analyzer-checker=python -analyzer-disable-checker=deadcode -analyzer-config prune-paths=true,suppress-c++-stdlib=true,suppress-null-return-paths=false,crosscheck-with-z3=true,model-path=/opt/pyrefcon/lib/pyrefcon/models/models -analyzer-config experimental-enable-naive-ctu-analysis=true,ctu-dir=/tmp/pyrefcon/numpy/csa-scan,ctu-index-name=/tmp/pyrefcon/numpy/csa-scan/externalDefMap.txt,ctu-invocation-list=/tmp/pyrefcon/numpy/csa-scan/invocations.yaml,display-ctu-progress=false -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -target-feature +sse -target-feature +sse2 -target-feature +sse3 -tune-cpu generic -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/pyrefcon/numpy -resource-dir /opt/pyrefcon/lib/clang/13.0.0 -isystem /opt/pyrefcon/lib/pyrefcon/models/python3.8 -D NDEBUG -D _FORTIFY_SOURCE=2 -I numpy/core/include -I build/src.linux-x86_64-3.8/numpy/core/include/numpy -I build/src.linux-x86_64-3.8/numpy/distutils/include -I numpy/core/src/common -I numpy/core/src -I numpy/core -I numpy/core/src/npymath -I numpy/core/src/multiarray -I numpy/core/src/umath -I numpy/core/src/npysort -I numpy/core/src/_simd -I build/src.linux-x86_64-3.8/numpy/core/src/common -I build/src.linux-x86_64-3.8/numpy/core/src/npymath -internal-isystem /opt/pyrefcon/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-result -Wsign-compare -Wall -Wformat -Werror=format-security -Wformat -Werror=format-security -Wdate-time -fdebug-compilation-dir=/tmp/pyrefcon/numpy -ferror-limit 19 -fwrapv -pthread -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/pyrefcon/numpy/csa-scan/reports -x c numpy/fft/_pocketfft.c

numpy/fft/_pocketfft.c

1/*
2 * This file is part of pocketfft.
3 * Licensed under a 3-clause BSD style license - see LICENSE.md
4 */
5
6/*
7 * Main implementation file.
8 *
9 * Copyright (C) 2004-2018 Max-Planck-Society
10 * \author Martin Reinecke
11 */
12
13#define NPY_NO_DEPRECATED_API0x0000000E NPY_API_VERSION0x0000000E
14
15#include "Python.h"
16#include "numpy/arrayobject.h"
17
18#include <math.h>
19#include <string.h>
20#include <stdlib.h>
21
22#include "npy_config.h"
23#define restrictrestrict NPY_RESTRICTNPY_RESTRICT
24
25#define RALLOC(type,num)((type *)malloc((num)*sizeof(type))) \
26 ((type *)malloc((num)*sizeof(type)))
27#define DEALLOC(ptr)do { free(ptr); (ptr)=((void*)0); } while(0) \
28 do { free(ptr); (ptr)=NULL((void*)0); } while(0)
29
30#define SWAP(a,b,type)do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0) \
31 do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
32
33#ifdef __GNUC__4
34#define NOINLINE__attribute__((noinline)) __attribute__((noinline))
35#define WARN_UNUSED_RESULT__attribute__ ((warn_unused_result)) __attribute__ ((warn_unused_result))
36#else
37#define NOINLINE__attribute__((noinline))
38#define WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
39#endif
40
41struct cfft_plan_i;
42typedef struct cfft_plan_i * cfft_plan;
43struct rfft_plan_i;
44typedef struct rfft_plan_i * rfft_plan;
45
46// adapted from https://stackoverflow.com/questions/42792939/
47// CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
48static void my_sincosm1pi (double a, double *restrictrestrict res)
49 {
50 double s = a * a;
51 /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
52 double r = -1.0369917389758117e-4;
53 r = fma (r, s, 1.9294935641298806e-3);
54 r = fma (r, s, -2.5806887942825395e-2);
55 r = fma (r, s, 2.3533063028328211e-1);
56 r = fma (r, s, -1.3352627688538006e+0);
57 r = fma (r, s, 4.0587121264167623e+0);
58 r = fma (r, s, -4.9348022005446790e+0);
59 double c = r*s;
60 /* Approximate sin(pi*x) for x in [-0.25,0.25] */
61 r = 4.6151442520157035e-4;
62 r = fma (r, s, -7.3700183130883555e-3);
63 r = fma (r, s, 8.2145868949323936e-2);
64 r = fma (r, s, -5.9926452893214921e-1);
65 r = fma (r, s, 2.5501640398732688e+0);
66 r = fma (r, s, -5.1677127800499516e+0);
67 s = s * a;
68 r = r * s;
69 s = fma (a, 3.1415926535897931e+0, r);
70 res[0] = c;
71 res[1] = s;
72 }
73
74NOINLINE__attribute__((noinline)) static void calc_first_octant(size_t den, double * restrictrestrict res)
75 {
76 size_t n = (den+4)>>3;
77 if (n==0) return;
78 res[0]=1.; res[1]=0.;
79 if (n==1) return;
80 size_t l1=(size_t)sqrt(n);
81 for (size_t i=1; i<l1; ++i)
82 my_sincosm1pi((2.*i)/den,&res[2*i]);
83 size_t start=l1;
84 while(start<n)
85 {
86 double cs[2];
87 my_sincosm1pi((2.*start)/den,cs);
88 res[2*start] = cs[0]+1.;
89 res[2*start+1] = cs[1];
90 size_t end = l1;
91 if (start+end>n) end = n-start;
92 for (size_t i=1; i<end; ++i)
93 {
94 double csx[2]={res[2*i], res[2*i+1]};
95 res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
96 res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
97 }
98 start += l1;
99 }
100 for (size_t i=1; i<l1; ++i)
101 res[2*i] += 1.;
102 }
103
104NOINLINE__attribute__((noinline)) static void calc_first_quadrant(size_t n, double * restrictrestrict res)
105 {
106 double * restrictrestrict p = res+n;
107 calc_first_octant(n<<1, p);
108 size_t ndone=(n+2)>>2;
109 size_t i=0, idx1=0, idx2=2*ndone-2;
110 for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
111 {
112 res[idx1] = p[2*i];
113 res[idx1+1] = p[2*i+1];
114 res[idx2] = p[2*i+3];
115 res[idx2+1] = p[2*i+2];
116 }
117 if (i!=ndone)
118 {
119 res[idx1 ] = p[2*i];
120 res[idx1+1] = p[2*i+1];
121 }
122 }
123
124NOINLINE__attribute__((noinline)) static void calc_first_half(size_t n, double * restrictrestrict res)
125 {
126 int ndone=(n+1)>>1;
127 double * p = res+n-1;
128 calc_first_octant(n<<2, p);
129 int i4=0, in=n, i=0;
130 for (; i4<=in-i4; ++i, i4+=4) // octant 0
131 {
132 res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
133 }
134 for (; i4-in <= 0; ++i, i4+=4) // octant 1
135 {
136 int xm = in-i4;
137 res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
138 }
139 for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
140 {
141 int xm = i4-in;
142 res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
143 }
144 for (; i<ndone; ++i, i4+=4) // octant 3
145 {
146 int xm = 2*in-i4;
147 res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
148 }
149 }
150
151NOINLINE__attribute__((noinline)) static void fill_first_quadrant(size_t n, double * restrictrestrict res)
152 {
153 const double hsqt2 = 0.707106781186547524400844362104849;
154 size_t quart = n>>2;
155 if ((n&7)==0)
156 res[quart] = res[quart+1] = hsqt2;
157 for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
158 {
159 res[j ] = res[i+1];
160 res[j+1] = res[i ];
161 }
162 }
163
164NOINLINE__attribute__((noinline)) static void fill_first_half(size_t n, double * restrictrestrict res)
165 {
166 size_t half = n>>1;
167 if ((n&3)==0)
168 for (size_t i=0; i<half; i+=2)
169 {
170 res[i+half] = -res[i+1];
171 res[i+half+1] = res[i ];
172 }
173 else
174 for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
175 {
176 res[j ] = -res[i ];
177 res[j+1] = res[i+1];
178 }
179 }
180
181NOINLINE__attribute__((noinline)) static void fill_second_half(size_t n, double * restrictrestrict res)
182 {
183 if ((n&1)==0)
184 for (size_t i=0; i<n; ++i)
185 res[i+n] = -res[i];
186 else
187 for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
188 {
189 res[j ] = res[i ];
190 res[j+1] = -res[i+1];
191 }
192 }
193
194NOINLINE__attribute__((noinline)) static void sincos_2pibyn_half(size_t n, double * restrictrestrict res)
195 {
196 if ((n&3)==0)
197 {
198 calc_first_octant(n, res);
199 fill_first_quadrant(n, res);
200 fill_first_half(n, res);
201 }
202 else if ((n&1)==0)
203 {
204 calc_first_quadrant(n, res);
205 fill_first_half(n, res);
206 }
207 else
208 calc_first_half(n, res);
209 }
210
211NOINLINE__attribute__((noinline)) static void sincos_2pibyn(size_t n, double * restrictrestrict res)
212 {
213 sincos_2pibyn_half(n, res);
214 fill_second_half(n, res);
215 }
216
217NOINLINE__attribute__((noinline)) static size_t largest_prime_factor (size_t n)
218 {
219 size_t res=1;
220 size_t tmp;
221 while (((tmp=(n>>1))<<1)==n)
222 { res=2; n=tmp; }
223
224 size_t limit=(size_t)sqrt(n+0.01);
225 for (size_t x=3; x<=limit; x+=2)
226 while (((tmp=(n/x))*x)==n)
227 {
228 res=x;
229 n=tmp;
230 limit=(size_t)sqrt(n+0.01);
231 }
232 if (n>1) res=n;
233
234 return res;
235 }
236
237NOINLINE__attribute__((noinline)) static double cost_guess (size_t n)
238 {
239 const double lfp=1.1; // penalty for non-hardcoded larger factors
240 size_t ni=n;
241 double result=0.;
242 size_t tmp;
243 while (((tmp=(n>>1))<<1)==n)
244 { result+=2; n=tmp; }
245
246 size_t limit=(size_t)sqrt(n+0.01);
247 for (size_t x=3; x<=limit; x+=2)
248 while ((tmp=(n/x))*x==n)
249 {
250 result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
251 n=tmp;
252 limit=(size_t)sqrt(n+0.01);
253 }
254 if (n>1) result+=(n<=5) ? n : lfp*n;
255
256 return result*ni;
257 }
258
259/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
260NOINLINE__attribute__((noinline)) static size_t good_size(size_t n)
261 {
262 if (n<=6) return n;
263
264 size_t bestfac=2*n;
265 for (size_t f2=1; f2<bestfac; f2*=2)
266 for (size_t f23=f2; f23<bestfac; f23*=3)
267 for (size_t f235=f23; f235<bestfac; f235*=5)
268 for (size_t f2357=f235; f2357<bestfac; f2357*=7)
269 for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
270 if (f235711>=n) bestfac=f235711;
271 return bestfac;
272 }
273
274typedef struct cmplx {
275 double r,i;
276} cmplx;
277
278#define NFCT25 25
279typedef struct cfftp_fctdata
280 {
281 size_t fct;
282 cmplx *tw, *tws;
283 } cfftp_fctdata;
284
285typedef struct cfftp_plan_i
286 {
287 size_t length, nfct;
288 cmplx *mem;
289 cfftp_fctdata fct[NFCT25];
290 } cfftp_plan_i;
291typedef struct cfftp_plan_i * cfftp_plan;
292
293#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
294#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
295#define SCALEC(a,b) { a.r*=b; a.i*=b; }
296#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
297#define ROTM90(a){ double tmp_=-a.r; a.r=a.i; a.i=tmp_; } { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
298#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
299#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
300#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
301/* a = b*c */
302#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
303/* a = conj(b)*c*/
304#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
305
306#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
307/* a = b*c */
308#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
309/* a *= b */
310#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
311
312NOINLINE__attribute__((noinline)) static void pass2b (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
313 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa)
314 {
315 const size_t cdim=2;
316
317 if (ido==1)
318 for (size_t k=0; k<l1; ++k)
319 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
320 else
321 for (size_t k=0; k<l1; ++k)
322 {
323 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
324 for (size_t i=1; i<ido; ++i)
325 {
326 cmplx t;
327 PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
328 A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
329 }
330 }
331 }
332
333NOINLINE__attribute__((noinline)) static void pass2f (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
334 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa)
335 {
336 const size_t cdim=2;
337
338 if (ido==1)
339 for (size_t k=0; k<l1; ++k)
340 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
341 else
342 for (size_t k=0; k<l1; ++k)
343 {
344 PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
345 for (size_t i=1; i<ido; ++i)
346 {
347 cmplx t;
348 PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
349 A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
350 }
351 }
352 }
353
354#define PREP3(idx)cmplx t0 = CC(idx,0,k), t1, t2; PMC (t1,t2,CC(idx,1,k),CC(idx
,2,k)) CH(idx,k,0).r=t0.r+t1.r; CH(idx,k,0).i=t0.i+t1.i;
\
355 cmplx t0 = CC(idx,0,k), t1, t2; \
356 PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
357 CH(idx,k,0).r=t0.r+t1.r; \
358 CH(idx,k,0).i=t0.i+t1.i;
359#define PARTSTEP3a(u1,u2,twr,twi){ cmplx ca,cb; ca.r=t0.r+twr*t1.r; ca.i=t0.i+twr*t1.i; cb.i=twi
*t2.r; cb.r=-(twi*t2.i); PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) }
\
360 { \
361 cmplx ca,cb; \
362 ca.r=t0.r+twr*t1.r; \
363 ca.i=t0.i+twr*t1.i; \
364 cb.i=twi*t2.r; \
365 cb.r=-(twi*t2.i); \
366 PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
367 }
368
369#define PARTSTEP3b(u1,u2,twr,twi){ cmplx ca,cb,da,db; ca.r=t0.r+twr*t1.r; ca.i=t0.i+twr*t1.i; cb
.i=twi*t2.r; cb.r=-(twi*t2.i); PMC(da,db,ca,cb) A_EQ_B_MUL_C (
CH(i,k,u1),WA(u1-1,i),da) A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i)
,db) }
\
370 { \
371 cmplx ca,cb,da,db; \
372 ca.r=t0.r+twr*t1.r; \
373 ca.i=t0.i+twr*t1.i; \
374 cb.i=twi*t2.r; \
375 cb.r=-(twi*t2.i); \
376 PMC(da,db,ca,cb) \
377 A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
378 A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
379 }
380NOINLINE__attribute__((noinline)) static void pass3b (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
381 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa)
382 {
383 const size_t cdim=3;
384 const double tw1r=-0.5, tw1i= 0.86602540378443864676;
385
386 if (ido==1)
387 for (size_t k=0; k<l1; ++k)
388 {
389 PREP3(0)cmplx t0 = CC(0,0,k), t1, t2; PMC (t1,t2,CC(0,1,k),CC(0,2,k))
CH(0,k,0).r=t0.r+t1.r; CH(0,k,0).i=t0.i+t1.i;
390 PARTSTEP3a(1,2,tw1r,tw1i){ cmplx ca,cb; ca.r=t0.r+tw1r*t1.r; ca.i=t0.i+tw1r*t1.i; cb.i
=tw1i*t2.r; cb.r=-(tw1i*t2.i); PMC(CH(0,k,1),CH(0,k,2),ca,cb)
}
391 }
392 else
393 for (size_t k=0; k<l1; ++k)
394 {
395 {
396 PREP3(0)cmplx t0 = CC(0,0,k), t1, t2; PMC (t1,t2,CC(0,1,k),CC(0,2,k))
CH(0,k,0).r=t0.r+t1.r; CH(0,k,0).i=t0.i+t1.i;
397 PARTSTEP3a(1,2,tw1r,tw1i){ cmplx ca,cb; ca.r=t0.r+tw1r*t1.r; ca.i=t0.i+tw1r*t1.i; cb.i
=tw1i*t2.r; cb.r=-(tw1i*t2.i); PMC(CH(0,k,1),CH(0,k,2),ca,cb)
}
398 }
399 for (size_t i=1; i<ido; ++i)
400 {
401 PREP3(i)cmplx t0 = CC(i,0,k), t1, t2; PMC (t1,t2,CC(i,1,k),CC(i,2,k))
CH(i,k,0).r=t0.r+t1.r; CH(i,k,0).i=t0.i+t1.i;
402 PARTSTEP3b(1,2,tw1r,tw1i){ cmplx ca,cb,da,db; ca.r=t0.r+tw1r*t1.r; ca.i=t0.i+tw1r*t1.i
; cb.i=tw1i*t2.r; cb.r=-(tw1i*t2.i); PMC(da,db,ca,cb) A_EQ_B_MUL_C
(CH(i,k,1),WA(1 -1,i),da) A_EQ_B_MUL_C (CH(i,k,2),WA(2 -1,i)
,db) }
403 }
404 }
405 }
406#define PARTSTEP3f(u1,u2,twr,twi){ cmplx ca,cb,da,db; ca.r=t0.r+twr*t1.r; ca.i=t0.i+twr*t1.i; cb
.i=twi*t2.r; cb.r=-(twi*t2.i); PMC(da,db,ca,cb) A_EQ_CB_MUL_C
(CH(i,k,u1),WA(u1-1,i),da) A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1
,i),db) }
\
407 { \
408 cmplx ca,cb,da,db; \
409 ca.r=t0.r+twr*t1.r; \
410 ca.i=t0.i+twr*t1.i; \
411 cb.i=twi*t2.r; \
412 cb.r=-(twi*t2.i); \
413 PMC(da,db,ca,cb) \
414 A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
415 A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
416 }
417NOINLINE__attribute__((noinline)) static void pass3f (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
418 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa)
419 {
420 const size_t cdim=3;
421 const double tw1r=-0.5, tw1i= -0.86602540378443864676;
422
423 if (ido==1)
424 for (size_t k=0; k<l1; ++k)
425 {
426 PREP3(0)cmplx t0 = CC(0,0,k), t1, t2; PMC (t1,t2,CC(0,1,k),CC(0,2,k))
CH(0,k,0).r=t0.r+t1.r; CH(0,k,0).i=t0.i+t1.i;
427 PARTSTEP3a(1,2,tw1r,tw1i){ cmplx ca,cb; ca.r=t0.r+tw1r*t1.r; ca.i=t0.i+tw1r*t1.i; cb.i
=tw1i*t2.r; cb.r=-(tw1i*t2.i); PMC(CH(0,k,1),CH(0,k,2),ca,cb)
}
428 }
429 else
430 for (size_t k=0; k<l1; ++k)
431 {
432 {
433 PREP3(0)cmplx t0 = CC(0,0,k), t1, t2; PMC (t1,t2,CC(0,1,k),CC(0,2,k))
CH(0,k,0).r=t0.r+t1.r; CH(0,k,0).i=t0.i+t1.i;
434 PARTSTEP3a(1,2,tw1r,tw1i){ cmplx ca,cb; ca.r=t0.r+tw1r*t1.r; ca.i=t0.i+tw1r*t1.i; cb.i
=tw1i*t2.r; cb.r=-(tw1i*t2.i); PMC(CH(0,k,1),CH(0,k,2),ca,cb)
}
435 }
436 for (size_t i=1; i<ido; ++i)
437 {
438 PREP3(i)cmplx t0 = CC(i,0,k), t1, t2; PMC (t1,t2,CC(i,1,k),CC(i,2,k))
CH(i,k,0).r=t0.r+t1.r; CH(i,k,0).i=t0.i+t1.i;
439 PARTSTEP3f(1,2,tw1r,tw1i){ cmplx ca,cb,da,db; ca.r=t0.r+tw1r*t1.r; ca.i=t0.i+tw1r*t1.i
; cb.i=tw1i*t2.r; cb.r=-(tw1i*t2.i); PMC(da,db,ca,cb) A_EQ_CB_MUL_C
(CH(i,k,1),WA(1 -1,i),da) A_EQ_CB_MUL_C (CH(i,k,2),WA(2 -1,i
),db) }
440 }
441 }
442 }
443
444NOINLINE__attribute__((noinline)) static void pass4b (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
445 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa)
446 {
447 const size_t cdim=4;
448
449 if (ido==1)
450 for (size_t k=0; k<l1; ++k)
451 {
452 cmplx t1, t2, t3, t4;
453 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
454 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
455 ROT90(t4)
456 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
457 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
458 }
459 else
460 for (size_t k=0; k<l1; ++k)
461 {
462 {
463 cmplx t1, t2, t3, t4;
464 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
465 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
466 ROT90(t4)
467 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
468 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
469 }
470 for (size_t i=1; i<ido; ++i)
471 {
472 cmplx c2, c3, c4, t1, t2, t3, t4;
473 cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
474 PMC(t2,t1,cc0,cc2)
475 PMC(t3,t4,cc1,cc3)
476 ROT90(t4)
477 cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
478 PMC(CH(i,k,0),c3,t2,t3)
479 PMC(c2,c4,t1,t4)
480 A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
481 A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
482 A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
483 }
484 }
485 }
486NOINLINE__attribute__((noinline)) static void pass4f (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
487 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa)
488 {
489 const size_t cdim=4;
490
491 if (ido==1)
492 for (size_t k=0; k<l1; ++k)
493 {
494 cmplx t1, t2, t3, t4;
495 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
496 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
497 ROTM90(t4){ double tmp_=-t4.r; t4.r=t4.i; t4.i=tmp_; }
498 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
499 PMC(CH(0,k,1),CH(0,k,3),t1,t4)
500 }
501 else
502 for (size_t k=0; k<l1; ++k)
503 {
504 {
505 cmplx t1, t2, t3, t4;
506 PMC(t2,t1,CC(0,0,k),CC(0,2,k))
507 PMC(t3,t4,CC(0,1,k),CC(0,3,k))
508 ROTM90(t4){ double tmp_=-t4.r; t4.r=t4.i; t4.i=tmp_; }
509 PMC(CH(0,k,0),CH(0,k,2),t2,t3)
510 PMC (CH(0,k,1),CH(0,k,3),t1,t4)
511 }
512 for (size_t i=1; i<ido; ++i)
513 {
514 cmplx c2, c3, c4, t1, t2, t3, t4;
515 cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
516 PMC(t2,t1,cc0,cc2)
517 PMC(t3,t4,cc1,cc3)
518 ROTM90(t4){ double tmp_=-t4.r; t4.r=t4.i; t4.i=tmp_; }
519 cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
520 PMC(CH(i,k,0),c3,t2,t3)
521 PMC(c2,c4,t1,t4)
522 A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
523 A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
524 A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
525 }
526 }
527 }
528
529#define PREP5(idx)cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; PMC (t1,t4,CC(idx,1,k
),CC(idx,4,k)) PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) CH(idx,k,0
).r=t0.r+t1.r+t2.r; CH(idx,k,0).i=t0.i+t1.i+t2.i;
\
530 cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
531 PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
532 PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
533 CH(idx,k,0).r=t0.r+t1.r+t2.r; \
534 CH(idx,k,0).i=t0.i+t1.i+t2.i;
535
536#define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi){ cmplx ca,cb; ca.r=t0.r+twar*t1.r+twbr*t2.r; ca.i=t0.i+twar*
t1.i+twbr*t2.i; cb.i=twai*t4.r twbi*t3.r; cb.r=-(twai*t4.i twbi
*t3.i); PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) }
\
537 { \
538 cmplx ca,cb; \
539 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
540 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
541 cb.i=twai*t4.r twbi*t3.r; \
542 cb.r=-(twai*t4.i twbi*t3.i); \
543 PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
544 }
545
546#define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi){ cmplx ca,cb,da,db; ca.r=t0.r+twar*t1.r+twbr*t2.r; ca.i=t0.i
+twar*t1.i+twbr*t2.i; cb.i=twai*t4.r twbi*t3.r; cb.r=-(twai*t4
.i twbi*t3.i); PMC(da,db,ca,cb) A_EQ_B_MUL_C (CH(i,k,u1),WA(u1
-1,i),da) A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) }
\
547 { \
548 cmplx ca,cb,da,db; \
549 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
550 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
551 cb.i=twai*t4.r twbi*t3.r; \
552 cb.r=-(twai*t4.i twbi*t3.i); \
553 PMC(da,db,ca,cb) \
554 A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
555 A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
556 }
557NOINLINE__attribute__((noinline)) static void pass5b (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
558 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa)
559 {
560 const size_t cdim=5;
561 const double tw1r= 0.3090169943749474241,
562 tw1i= 0.95105651629515357212,
563 tw2r= -0.8090169943749474241,
564 tw2i= 0.58778525229247312917;
565
566 if (ido==1)
567 for (size_t k=0; k<l1; ++k)
568 {
569 PREP5(0)cmplx t0 = CC(0,0,k), t1, t2, t3, t4; PMC (t1,t4,CC(0,1,k),CC
(0,4,k)) PMC (t2,t3,CC(0,2,k),CC(0,3,k)) CH(0,k,0).r=t0.r+t1.
r+t2.r; CH(0,k,0).i=t0.i+t1.i+t2.i;
570 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i){ cmplx ca,cb; ca.r=t0.r+tw1r*t1.r+tw2r*t2.r; ca.i=t0.i+tw1r*
t1.i+tw2r*t2.i; cb.i=+tw1i*t4.r +tw2i*t3.r; cb.r=-(+tw1i*t4.i
+tw2i*t3.i); PMC(CH(0,k,1),CH(0,k,4),ca,cb) }
571 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i){ cmplx ca,cb; ca.r=t0.r+tw2r*t1.r+tw1r*t2.r; ca.i=t0.i+tw2r*
t1.i+tw1r*t2.i; cb.i=+tw2i*t4.r -tw1i*t3.r; cb.r=-(+tw2i*t4.i
-tw1i*t3.i); PMC(CH(0,k,2),CH(0,k,3),ca,cb) }
572 }
573 else
574 for (size_t k=0; k<l1; ++k)
575 {
576 {
577 PREP5(0)cmplx t0 = CC(0,0,k), t1, t2, t3, t4; PMC (t1,t4,CC(0,1,k),CC
(0,4,k)) PMC (t2,t3,CC(0,2,k),CC(0,3,k)) CH(0,k,0).r=t0.r+t1.
r+t2.r; CH(0,k,0).i=t0.i+t1.i+t2.i;
578 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i){ cmplx ca,cb; ca.r=t0.r+tw1r*t1.r+tw2r*t2.r; ca.i=t0.i+tw1r*
t1.i+tw2r*t2.i; cb.i=+tw1i*t4.r +tw2i*t3.r; cb.r=-(+tw1i*t4.i
+tw2i*t3.i); PMC(CH(0,k,1),CH(0,k,4),ca,cb) }
579 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i){ cmplx ca,cb; ca.r=t0.r+tw2r*t1.r+tw1r*t2.r; ca.i=t0.i+tw2r*
t1.i+tw1r*t2.i; cb.i=+tw2i*t4.r -tw1i*t3.r; cb.r=-(+tw2i*t4.i
-tw1i*t3.i); PMC(CH(0,k,2),CH(0,k,3),ca,cb) }
580 }
581 for (size_t i=1; i<ido; ++i)
582 {
583 PREP5(i)cmplx t0 = CC(i,0,k), t1, t2, t3, t4; PMC (t1,t4,CC(i,1,k),CC
(i,4,k)) PMC (t2,t3,CC(i,2,k),CC(i,3,k)) CH(i,k,0).r=t0.r+t1.
r+t2.r; CH(i,k,0).i=t0.i+t1.i+t2.i;
584 PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i){ cmplx ca,cb,da,db; ca.r=t0.r+tw1r*t1.r+tw2r*t2.r; ca.i=t0.i
+tw1r*t1.i+tw2r*t2.i; cb.i=+tw1i*t4.r +tw2i*t3.r; cb.r=-(+tw1i
*t4.i +tw2i*t3.i); PMC(da,db,ca,cb) A_EQ_B_MUL_C (CH(i,k,1),WA
(1 -1,i),da) A_EQ_B_MUL_C (CH(i,k,4),WA(4 -1,i),db) }
585 PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i){ cmplx ca,cb,da,db; ca.r=t0.r+tw2r*t1.r+tw1r*t2.r; ca.i=t0.i
+tw2r*t1.i+tw1r*t2.i; cb.i=+tw2i*t4.r -tw1i*t3.r; cb.r=-(+tw2i
*t4.i -tw1i*t3.i); PMC(da,db,ca,cb) A_EQ_B_MUL_C (CH(i,k,2),WA
(2 -1,i),da) A_EQ_B_MUL_C (CH(i,k,3),WA(3 -1,i),db) }
586 }
587 }
588 }
589#define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi){ cmplx ca,cb,da,db; ca.r=t0.r+twar*t1.r+twbr*t2.r; ca.i=t0.i
+twar*t1.i+twbr*t2.i; cb.i=twai*t4.r twbi*t3.r; cb.r=-(twai*t4
.i twbi*t3.i); PMC(da,db,ca,cb) A_EQ_CB_MUL_C (CH(i,k,u1),WA(
u1-1,i),da) A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) }
\
590 { \
591 cmplx ca,cb,da,db; \
592 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
593 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
594 cb.i=twai*t4.r twbi*t3.r; \
595 cb.r=-(twai*t4.i twbi*t3.i); \
596 PMC(da,db,ca,cb) \
597 A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
598 A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
599 }
600NOINLINE__attribute__((noinline)) static void pass5f (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
601 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa)
602 {
603 const size_t cdim=5;
604 const double tw1r= 0.3090169943749474241,
605 tw1i= -0.95105651629515357212,
606 tw2r= -0.8090169943749474241,
607 tw2i= -0.58778525229247312917;
608
609 if (ido==1)
610 for (size_t k=0; k<l1; ++k)
611 {
612 PREP5(0)cmplx t0 = CC(0,0,k), t1, t2, t3, t4; PMC (t1,t4,CC(0,1,k),CC
(0,4,k)) PMC (t2,t3,CC(0,2,k),CC(0,3,k)) CH(0,k,0).r=t0.r+t1.
r+t2.r; CH(0,k,0).i=t0.i+t1.i+t2.i;
613 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i){ cmplx ca,cb; ca.r=t0.r+tw1r*t1.r+tw2r*t2.r; ca.i=t0.i+tw1r*
t1.i+tw2r*t2.i; cb.i=+tw1i*t4.r +tw2i*t3.r; cb.r=-(+tw1i*t4.i
+tw2i*t3.i); PMC(CH(0,k,1),CH(0,k,4),ca,cb) }
614 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i){ cmplx ca,cb; ca.r=t0.r+tw2r*t1.r+tw1r*t2.r; ca.i=t0.i+tw2r*
t1.i+tw1r*t2.i; cb.i=+tw2i*t4.r -tw1i*t3.r; cb.r=-(+tw2i*t4.i
-tw1i*t3.i); PMC(CH(0,k,2),CH(0,k,3),ca,cb) }
615 }
616 else
617 for (size_t k=0; k<l1; ++k)
618 {
619 {
620 PREP5(0)cmplx t0 = CC(0,0,k), t1, t2, t3, t4; PMC (t1,t4,CC(0,1,k),CC
(0,4,k)) PMC (t2,t3,CC(0,2,k),CC(0,3,k)) CH(0,k,0).r=t0.r+t1.
r+t2.r; CH(0,k,0).i=t0.i+t1.i+t2.i;
621 PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i){ cmplx ca,cb; ca.r=t0.r+tw1r*t1.r+tw2r*t2.r; ca.i=t0.i+tw1r*
t1.i+tw2r*t2.i; cb.i=+tw1i*t4.r +tw2i*t3.r; cb.r=-(+tw1i*t4.i
+tw2i*t3.i); PMC(CH(0,k,1),CH(0,k,4),ca,cb) }
622 PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i){ cmplx ca,cb; ca.r=t0.r+tw2r*t1.r+tw1r*t2.r; ca.i=t0.i+tw2r*
t1.i+tw1r*t2.i; cb.i=+tw2i*t4.r -tw1i*t3.r; cb.r=-(+tw2i*t4.i
-tw1i*t3.i); PMC(CH(0,k,2),CH(0,k,3),ca,cb) }
623 }
624 for (size_t i=1; i<ido; ++i)
625 {
626 PREP5(i)cmplx t0 = CC(i,0,k), t1, t2, t3, t4; PMC (t1,t4,CC(i,1,k),CC
(i,4,k)) PMC (t2,t3,CC(i,2,k),CC(i,3,k)) CH(i,k,0).r=t0.r+t1.
r+t2.r; CH(i,k,0).i=t0.i+t1.i+t2.i;
627 PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i){ cmplx ca,cb,da,db; ca.r=t0.r+tw1r*t1.r+tw2r*t2.r; ca.i=t0.i
+tw1r*t1.i+tw2r*t2.i; cb.i=+tw1i*t4.r +tw2i*t3.r; cb.r=-(+tw1i
*t4.i +tw2i*t3.i); PMC(da,db,ca,cb) A_EQ_CB_MUL_C (CH(i,k,1),
WA(1 -1,i),da) A_EQ_CB_MUL_C (CH(i,k,4),WA(4 -1,i),db) }
628 PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i){ cmplx ca,cb,da,db; ca.r=t0.r+tw2r*t1.r+tw1r*t2.r; ca.i=t0.i
+tw2r*t1.i+tw1r*t2.i; cb.i=+tw2i*t4.r -tw1i*t3.r; cb.r=-(+tw2i
*t4.i -tw1i*t3.i); PMC(da,db,ca,cb) A_EQ_CB_MUL_C (CH(i,k,2),
WA(2 -1,i),da) A_EQ_CB_MUL_C (CH(i,k,3),WA(3 -1,i),db) }
629 }
630 }
631 }
632
633#define PREP7(idx)cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; PMC (t2,t7,CC
(idx,1,k),CC(idx,6,k)) PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) PMC
(t4,t5,CC(idx,3,k),CC(idx,4,k)) CH(idx,k,0).r=t1.r+t2.r+t3.r
+t4.r; CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
\
634 cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
635 PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
636 PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
637 PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
638 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
639 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
640
641#define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2){ cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; ca.i=t1.i+x1
*t2.i+x2*t3.i+x3*t4.i; cb.i=y1*t7.r y2*t6.r y3*t5.r; cb.r=-(y1
*t7.i y2*t6.i y3*t5.i); PMC(out1,out2,ca,cb) }
\
642 { \
643 cmplx ca,cb; \
644 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
645 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
646 cb.i=y1*t7.r y2*t6.r y3*t5.r; \
647 cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
648 PMC(out1,out2,ca,cb) \
649 }
650#define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3){ cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; ca.i=t1.i+x1
*t2.i+x2*t3.i+x3*t4.i; cb.i=y1*t7.r y2*t6.r y3*t5.r; cb.r=-(y1
*t7.i y2*t6.i y3*t5.i); PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) }
\
651 PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)){ cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; ca.i=t1.i+x1
*t2.i+x2*t3.i+x3*t4.i; cb.i=y1*t7.r y2*t6.r y3*t5.r; cb.r=-(y1
*t7.i y2*t6.i y3*t5.i); PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) }
652#define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4
.r; ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; cb.i=y1*t7.r y2*t6.r y3
*t5.r; cb.r=-(y1*t7.i y2*t6.i y3*t5.i); PMC(da,db,ca,cb) } MULPMSIGNC
(CH(i,k,u1),WA(u1-1,i),da) MULPMSIGNC (CH(i,k,u2),WA(u2-1,i)
,db) }
\
653 { \
654 cmplx da,db; \
655 PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db){ cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; ca.i=t1.i+x1
*t2.i+x2*t3.i+x3*t4.i; cb.i=y1*t7.r y2*t6.r y3*t5.r; cb.r=-(y1
*t7.i y2*t6.i y3*t5.i); PMC(da,db,ca,cb) }
\
656 MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
657 MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
658 }
659
660NOINLINE__attribute__((noinline)) static void pass7(size_t ido, size_t l1, const cmplx * restrictrestrict cc,
661 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa, const int sign)
662 {
663 const size_t cdim=7;
664 const double tw1r= 0.623489801858733530525,
665 tw1i= sign * 0.7818314824680298087084,
666 tw2r= -0.222520933956314404289,
667 tw2i= sign * 0.9749279121818236070181,
668 tw3r= -0.9009688679024191262361,
669 tw3i= sign * 0.4338837391175581204758;
670
671 if (ido==1)
672 for (size_t k=0; k<l1; ++k)
673 {
674 PREP7(0)cmplx t1 = CC(0,0,k), t2, t3, t4, t5, t6, t7; PMC (t2,t7,CC(0
,1,k),CC(0,6,k)) PMC (t3,t6,CC(0,2,k),CC(0,5,k)) PMC (t4,t5,CC
(0,3,k),CC(0,4,k)) CH(0,k,0).r=t1.r+t2.r+t3.r+t4.r; CH(0,k,0)
.i=t1.i+t2.i+t3.i+t4.i;
675 PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i){ cmplx ca,cb; ca.r=t1.r+tw1r*t2.r+tw2r*t3.r+tw3r*t4.r; ca.i=
t1.i+tw1r*t2.i+tw2r*t3.i+tw3r*t4.i; cb.i=+tw1i*t7.r +tw2i*t6.
r +tw3i*t5.r; cb.r=-(+tw1i*t7.i +tw2i*t6.i +tw3i*t5.i); PMC(CH
(0,k,1),CH(0,k,6),ca,cb) }
676 PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i){ cmplx ca,cb; ca.r=t1.r+tw2r*t2.r+tw3r*t3.r+tw1r*t4.r; ca.i=
t1.i+tw2r*t2.i+tw3r*t3.i+tw1r*t4.i; cb.i=+tw2i*t7.r -tw3i*t6.
r -tw1i*t5.r; cb.r=-(+tw2i*t7.i -tw3i*t6.i -tw1i*t5.i); PMC(CH
(0,k,2),CH(0,k,5),ca,cb) }
677 PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i){ cmplx ca,cb; ca.r=t1.r+tw3r*t2.r+tw1r*t3.r+tw2r*t4.r; ca.i=
t1.i+tw3r*t2.i+tw1r*t3.i+tw2r*t4.i; cb.i=+tw3i*t7.r -tw1i*t6.
r +tw2i*t5.r; cb.r=-(+tw3i*t7.i -tw1i*t6.i +tw2i*t5.i); PMC(CH
(0,k,3),CH(0,k,4),ca,cb) }
678 }
679 else
680 for (size_t k=0; k<l1; ++k)
681 {
682 {
683 PREP7(0)cmplx t1 = CC(0,0,k), t2, t3, t4, t5, t6, t7; PMC (t2,t7,CC(0
,1,k),CC(0,6,k)) PMC (t3,t6,CC(0,2,k),CC(0,5,k)) PMC (t4,t5,CC
(0,3,k),CC(0,4,k)) CH(0,k,0).r=t1.r+t2.r+t3.r+t4.r; CH(0,k,0)
.i=t1.i+t2.i+t3.i+t4.i;
684 PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i){ cmplx ca,cb; ca.r=t1.r+tw1r*t2.r+tw2r*t3.r+tw3r*t4.r; ca.i=
t1.i+tw1r*t2.i+tw2r*t3.i+tw3r*t4.i; cb.i=+tw1i*t7.r +tw2i*t6.
r +tw3i*t5.r; cb.r=-(+tw1i*t7.i +tw2i*t6.i +tw3i*t5.i); PMC(CH
(0,k,1),CH(0,k,6),ca,cb) }
685 PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i){ cmplx ca,cb; ca.r=t1.r+tw2r*t2.r+tw3r*t3.r+tw1r*t4.r; ca.i=
t1.i+tw2r*t2.i+tw3r*t3.i+tw1r*t4.i; cb.i=+tw2i*t7.r -tw3i*t6.
r -tw1i*t5.r; cb.r=-(+tw2i*t7.i -tw3i*t6.i -tw1i*t5.i); PMC(CH
(0,k,2),CH(0,k,5),ca,cb) }
686 PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i){ cmplx ca,cb; ca.r=t1.r+tw3r*t2.r+tw1r*t3.r+tw2r*t4.r; ca.i=
t1.i+tw3r*t2.i+tw1r*t3.i+tw2r*t4.i; cb.i=+tw3i*t7.r -tw1i*t6.
r +tw2i*t5.r; cb.r=-(+tw3i*t7.i -tw1i*t6.i +tw2i*t5.i); PMC(CH
(0,k,3),CH(0,k,4),ca,cb) }
687 }
688 for (size_t i=1; i<ido; ++i)
689 {
690 PREP7(i)cmplx t1 = CC(i,0,k), t2, t3, t4, t5, t6, t7; PMC (t2,t7,CC(i
,1,k),CC(i,6,k)) PMC (t3,t6,CC(i,2,k),CC(i,5,k)) PMC (t4,t5,CC
(i,3,k),CC(i,4,k)) CH(i,k,0).r=t1.r+t2.r+t3.r+t4.r; CH(i,k,0)
.i=t1.i+t2.i+t3.i+t4.i;
691 PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+tw1r*t2.r+tw2r*t3.r+tw3r
*t4.r; ca.i=t1.i+tw1r*t2.i+tw2r*t3.i+tw3r*t4.i; cb.i=+tw1i*t7
.r +tw2i*t6.r +tw3i*t5.r; cb.r=-(+tw1i*t7.i +tw2i*t6.i +tw3i*
t5.i); PMC(da,db,ca,cb) } MULPMSIGNC (CH(i,k,1),WA(1 -1,i),da
) MULPMSIGNC (CH(i,k,6),WA(6 -1,i),db) }
692 PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+tw2r*t2.r+tw3r*t3.r+tw1r
*t4.r; ca.i=t1.i+tw2r*t2.i+tw3r*t3.i+tw1r*t4.i; cb.i=+tw2i*t7
.r -tw3i*t6.r -tw1i*t5.r; cb.r=-(+tw2i*t7.i -tw3i*t6.i -tw1i*
t5.i); PMC(da,db,ca,cb) } MULPMSIGNC (CH(i,k,2),WA(2 -1,i),da
) MULPMSIGNC (CH(i,k,5),WA(5 -1,i),db) }
693 PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+tw3r*t2.r+tw1r*t3.r+tw2r
*t4.r; ca.i=t1.i+tw3r*t2.i+tw1r*t3.i+tw2r*t4.i; cb.i=+tw3i*t7
.r -tw1i*t6.r +tw2i*t5.r; cb.r=-(+tw3i*t7.i -tw1i*t6.i +tw2i*
t5.i); PMC(da,db,ca,cb) } MULPMSIGNC (CH(i,k,3),WA(3 -1,i),da
) MULPMSIGNC (CH(i,k,4),WA(4 -1,i),db) }
694 }
695 }
696 }
697
698#define PREP11(idx)cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
; PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) PMC (t3,t10,CC(idx,2,
k),CC(idx, 9,k)) PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) PMC (t5
,t8 ,CC(idx,4,k),CC(idx, 7,k)) PMC (t6,t7 ,CC(idx,5,k),CC(idx
, 6,k)) CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; CH(idx,k
,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
\
699 cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
700 PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
701 PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
702 PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
703 PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
704 PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
705 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
706 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
707
708#define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2){ cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6
.r; ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; cb.i=y1
*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; cb.r=-(y1*t11.i y2*t10
.i y3*t9.i y4*t8.i y5*t7.i ); PMC(out1,out2,ca,cb) }
\
709 { \
710 cmplx ca,cb; \
711 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
712 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
713 cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
714 cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
715 PMC(out1,out2,ca,cb) \
716 }
717#define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5){ cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6
.r; ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; cb.i=y1
*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; cb.r=-(y1*t11.i y2*t10
.i y3*t9.i y4*t8.i y5*t7.i ); PMC(CH(0,k,u1),CH(0,k,u2),ca,cb
) }
\
718 PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)){ cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6
.r; ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; cb.i=y1
*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; cb.r=-(y1*t11.i y2*t10
.i y3*t9.i y4*t8.i y5*t7.i ); PMC(CH(0,k,u1),CH(0,k,u2),ca,cb
) }
719#define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4
.r+x4*t5.r+x5*t6.r; ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i
+x5*t6.i; cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; cb.
r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); PMC(da,db,ca
,cb) } MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) MULPMSIGNC (CH(i
,k,u2),WA(u2-1,i),db) }
\
720 { \
721 cmplx da,db; \
722 PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db){ cmplx ca,cb; ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6
.r; ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; cb.i=y1
*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; cb.r=-(y1*t11.i y2*t10
.i y3*t9.i y4*t8.i y5*t7.i ); PMC(da,db,ca,cb) }
\
723 MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
724 MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
725 }
726
727NOINLINE__attribute__((noinline)) static void pass11 (size_t ido, size_t l1, const cmplx * restrictrestrict cc,
728 cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa, const int sign)
729 {
730 const size_t cdim=11;
731 const double tw1r = 0.8412535328311811688618,
732 tw1i = sign * 0.5406408174555975821076,
733 tw2r = 0.4154150130018864255293,
734 tw2i = sign * 0.9096319953545183714117,
735 tw3r = -0.1423148382732851404438,
736 tw3i = sign * 0.9898214418809327323761,
737 tw4r = -0.6548607339452850640569,
738 tw4i = sign * 0.755749574354258283774,
739 tw5r = -0.9594929736144973898904,
740 tw5i = sign * 0.2817325568414296977114;
741
742 if (ido==1)
743 for (size_t k=0; k<l1; ++k)
744 {
745 PREP11(0)cmplx t1 = CC(0,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
; PMC (t2,t11,CC(0,1,k),CC(0,10,k)) PMC (t3,t10,CC(0,2,k),CC(
0, 9,k)) PMC (t4,t9 ,CC(0,3,k),CC(0, 8,k)) PMC (t5,t8 ,CC(0,4
,k),CC(0, 7,k)) PMC (t6,t7 ,CC(0,5,k),CC(0, 6,k)) CH(0,k,0).r
=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; CH(0,k,0).i=t1.i+t2.i+t3.i+t4
.i+t5.i+t6.i;
746 PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i){ cmplx ca,cb; ca.r=t1.r+tw1r*t2.r+tw2r*t3.r+tw3r*t4.r+tw4r*t5
.r+tw5r*t6.r; ca.i=t1.i+tw1r*t2.i+tw2r*t3.i+tw3r*t4.i+tw4r*t5
.i+tw5r*t6.i; cb.i=+tw1i*t11.r +tw2i*t10.r +tw3i*t9.r +tw4i*t8
.r +tw5i*t7.r; cb.r=-(+tw1i*t11.i +tw2i*t10.i +tw3i*t9.i +tw4i
*t8.i +tw5i*t7.i ); PMC(CH(0,k,1),CH(0,k,10),ca,cb) }
747 PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i){ cmplx ca,cb; ca.r=t1.r+tw2r*t2.r+tw4r*t3.r+tw5r*t4.r+tw3r*t5
.r+tw1r*t6.r; ca.i=t1.i+tw2r*t2.i+tw4r*t3.i+tw5r*t4.i+tw3r*t5
.i+tw1r*t6.i; cb.i=+tw2i*t11.r +tw4i*t10.r -tw5i*t9.r -tw3i*t8
.r -tw1i*t7.r; cb.r=-(+tw2i*t11.i +tw4i*t10.i -tw5i*t9.i -tw3i
*t8.i -tw1i*t7.i ); PMC(CH(0,k,2),CH(0,k,9),ca,cb) }
748 PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i){ cmplx ca,cb; ca.r=t1.r+tw3r*t2.r+tw5r*t3.r+tw2r*t4.r+tw1r*t5
.r+tw4r*t6.r; ca.i=t1.i+tw3r*t2.i+tw5r*t3.i+tw2r*t4.i+tw1r*t5
.i+tw4r*t6.i; cb.i=+tw3i*t11.r -tw5i*t10.r -tw2i*t9.r +tw1i*t8
.r +tw4i*t7.r; cb.r=-(+tw3i*t11.i -tw5i*t10.i -tw2i*t9.i +tw1i
*t8.i +tw4i*t7.i ); PMC(CH(0,k,3),CH(0,k,8),ca,cb) }
749 PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i){ cmplx ca,cb; ca.r=t1.r+tw4r*t2.r+tw3r*t3.r+tw1r*t4.r+tw5r*t5
.r+tw2r*t6.r; ca.i=t1.i+tw4r*t2.i+tw3r*t3.i+tw1r*t4.i+tw5r*t5
.i+tw2r*t6.i; cb.i=+tw4i*t11.r -tw3i*t10.r +tw1i*t9.r +tw5i*t8
.r -tw2i*t7.r; cb.r=-(+tw4i*t11.i -tw3i*t10.i +tw1i*t9.i +tw5i
*t8.i -tw2i*t7.i ); PMC(CH(0,k,4),CH(0,k,7),ca,cb) }
750 PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i){ cmplx ca,cb; ca.r=t1.r+tw5r*t2.r+tw1r*t3.r+tw4r*t4.r+tw2r*t5
.r+tw3r*t6.r; ca.i=t1.i+tw5r*t2.i+tw1r*t3.i+tw4r*t4.i+tw2r*t5
.i+tw3r*t6.i; cb.i=+tw5i*t11.r -tw1i*t10.r +tw4i*t9.r -tw2i*t8
.r +tw3i*t7.r; cb.r=-(+tw5i*t11.i -tw1i*t10.i +tw4i*t9.i -tw2i
*t8.i +tw3i*t7.i ); PMC(CH(0,k,5),CH(0,k,6),ca,cb) }
751 }
752 else
753 for (size_t k=0; k<l1; ++k)
754 {
755 {
756 PREP11(0)cmplx t1 = CC(0,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
; PMC (t2,t11,CC(0,1,k),CC(0,10,k)) PMC (t3,t10,CC(0,2,k),CC(
0, 9,k)) PMC (t4,t9 ,CC(0,3,k),CC(0, 8,k)) PMC (t5,t8 ,CC(0,4
,k),CC(0, 7,k)) PMC (t6,t7 ,CC(0,5,k),CC(0, 6,k)) CH(0,k,0).r
=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; CH(0,k,0).i=t1.i+t2.i+t3.i+t4
.i+t5.i+t6.i;
757 PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i){ cmplx ca,cb; ca.r=t1.r+tw1r*t2.r+tw2r*t3.r+tw3r*t4.r+tw4r*t5
.r+tw5r*t6.r; ca.i=t1.i+tw1r*t2.i+tw2r*t3.i+tw3r*t4.i+tw4r*t5
.i+tw5r*t6.i; cb.i=+tw1i*t11.r +tw2i*t10.r +tw3i*t9.r +tw4i*t8
.r +tw5i*t7.r; cb.r=-(+tw1i*t11.i +tw2i*t10.i +tw3i*t9.i +tw4i
*t8.i +tw5i*t7.i ); PMC(CH(0,k,1),CH(0,k,10),ca,cb) }
758 PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i){ cmplx ca,cb; ca.r=t1.r+tw2r*t2.r+tw4r*t3.r+tw5r*t4.r+tw3r*t5
.r+tw1r*t6.r; ca.i=t1.i+tw2r*t2.i+tw4r*t3.i+tw5r*t4.i+tw3r*t5
.i+tw1r*t6.i; cb.i=+tw2i*t11.r +tw4i*t10.r -tw5i*t9.r -tw3i*t8
.r -tw1i*t7.r; cb.r=-(+tw2i*t11.i +tw4i*t10.i -tw5i*t9.i -tw3i
*t8.i -tw1i*t7.i ); PMC(CH(0,k,2),CH(0,k,9),ca,cb) }
759 PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i){ cmplx ca,cb; ca.r=t1.r+tw3r*t2.r+tw5r*t3.r+tw2r*t4.r+tw1r*t5
.r+tw4r*t6.r; ca.i=t1.i+tw3r*t2.i+tw5r*t3.i+tw2r*t4.i+tw1r*t5
.i+tw4r*t6.i; cb.i=+tw3i*t11.r -tw5i*t10.r -tw2i*t9.r +tw1i*t8
.r +tw4i*t7.r; cb.r=-(+tw3i*t11.i -tw5i*t10.i -tw2i*t9.i +tw1i
*t8.i +tw4i*t7.i ); PMC(CH(0,k,3),CH(0,k,8),ca,cb) }
760 PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i){ cmplx ca,cb; ca.r=t1.r+tw4r*t2.r+tw3r*t3.r+tw1r*t4.r+tw5r*t5
.r+tw2r*t6.r; ca.i=t1.i+tw4r*t2.i+tw3r*t3.i+tw1r*t4.i+tw5r*t5
.i+tw2r*t6.i; cb.i=+tw4i*t11.r -tw3i*t10.r +tw1i*t9.r +tw5i*t8
.r -tw2i*t7.r; cb.r=-(+tw4i*t11.i -tw3i*t10.i +tw1i*t9.i +tw5i
*t8.i -tw2i*t7.i ); PMC(CH(0,k,4),CH(0,k,7),ca,cb) }
761 PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i){ cmplx ca,cb; ca.r=t1.r+tw5r*t2.r+tw1r*t3.r+tw4r*t4.r+tw2r*t5
.r+tw3r*t6.r; ca.i=t1.i+tw5r*t2.i+tw1r*t3.i+tw4r*t4.i+tw2r*t5
.i+tw3r*t6.i; cb.i=+tw5i*t11.r -tw1i*t10.r +tw4i*t9.r -tw2i*t8
.r +tw3i*t7.r; cb.r=-(+tw5i*t11.i -tw1i*t10.i +tw4i*t9.i -tw2i
*t8.i +tw3i*t7.i ); PMC(CH(0,k,5),CH(0,k,6),ca,cb) }
762 }
763 for (size_t i=1; i<ido; ++i)
764 {
765 PREP11(i)cmplx t1 = CC(i,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
; PMC (t2,t11,CC(i,1,k),CC(i,10,k)) PMC (t3,t10,CC(i,2,k),CC(
i, 9,k)) PMC (t4,t9 ,CC(i,3,k),CC(i, 8,k)) PMC (t5,t8 ,CC(i,4
,k),CC(i, 7,k)) PMC (t6,t7 ,CC(i,5,k),CC(i, 6,k)) CH(i,k,0).r
=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; CH(i,k,0).i=t1.i+t2.i+t3.i+t4
.i+t5.i+t6.i;
766 PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+tw1r*t2.r+tw2r*t3.r+tw3r
*t4.r+tw4r*t5.r+tw5r*t6.r; ca.i=t1.i+tw1r*t2.i+tw2r*t3.i+tw3r
*t4.i+tw4r*t5.i+tw5r*t6.i; cb.i=+tw1i*t11.r +tw2i*t10.r +tw3i
*t9.r +tw4i*t8.r +tw5i*t7.r; cb.r=-(+tw1i*t11.i +tw2i*t10.i +
tw3i*t9.i +tw4i*t8.i +tw5i*t7.i ); PMC(da,db,ca,cb) } MULPMSIGNC
(CH(i,k,1),WA(1 -1,i),da) MULPMSIGNC (CH(i,k,10),WA(10 -1,i)
,db) }
767 PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+tw2r*t2.r+tw4r*t3.r+tw5r
*t4.r+tw3r*t5.r+tw1r*t6.r; ca.i=t1.i+tw2r*t2.i+tw4r*t3.i+tw5r
*t4.i+tw3r*t5.i+tw1r*t6.i; cb.i=+tw2i*t11.r +tw4i*t10.r -tw5i
*t9.r -tw3i*t8.r -tw1i*t7.r; cb.r=-(+tw2i*t11.i +tw4i*t10.i -
tw5i*t9.i -tw3i*t8.i -tw1i*t7.i ); PMC(da,db,ca,cb) } MULPMSIGNC
(CH(i,k,2),WA(2 -1,i),da) MULPMSIGNC (CH(i,k,9),WA(9 -1,i),db
) }
768 PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+tw3r*t2.r+tw5r*t3.r+tw2r
*t4.r+tw1r*t5.r+tw4r*t6.r; ca.i=t1.i+tw3r*t2.i+tw5r*t3.i+tw2r
*t4.i+tw1r*t5.i+tw4r*t6.i; cb.i=+tw3i*t11.r -tw5i*t10.r -tw2i
*t9.r +tw1i*t8.r +tw4i*t7.r; cb.r=-(+tw3i*t11.i -tw5i*t10.i -
tw2i*t9.i +tw1i*t8.i +tw4i*t7.i ); PMC(da,db,ca,cb) } MULPMSIGNC
(CH(i,k,3),WA(3 -1,i),da) MULPMSIGNC (CH(i,k,8),WA(8 -1,i),db
) }
769 PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+tw4r*t2.r+tw3r*t3.r+tw1r
*t4.r+tw5r*t5.r+tw2r*t6.r; ca.i=t1.i+tw4r*t2.i+tw3r*t3.i+tw1r
*t4.i+tw5r*t5.i+tw2r*t6.i; cb.i=+tw4i*t11.r -tw3i*t10.r +tw1i
*t9.r +tw5i*t8.r -tw2i*t7.r; cb.r=-(+tw4i*t11.i -tw3i*t10.i +
tw1i*t9.i +tw5i*t8.i -tw2i*t7.i ); PMC(da,db,ca,cb) } MULPMSIGNC
(CH(i,k,4),WA(4 -1,i),da) MULPMSIGNC (CH(i,k,7),WA(7 -1,i),db
) }
770 PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i){ cmplx da,db; { cmplx ca,cb; ca.r=t1.r+tw5r*t2.r+tw1r*t3.r+tw4r
*t4.r+tw2r*t5.r+tw3r*t6.r; ca.i=t1.i+tw5r*t2.i+tw1r*t3.i+tw4r
*t4.i+tw2r*t5.i+tw3r*t6.i; cb.i=+tw5i*t11.r -tw1i*t10.r +tw4i
*t9.r -tw2i*t8.r +tw3i*t7.r; cb.r=-(+tw5i*t11.i -tw1i*t10.i +
tw4i*t9.i -tw2i*t8.i +tw3i*t7.i ); PMC(da,db,ca,cb) } MULPMSIGNC
(CH(i,k,5),WA(5 -1,i),da) MULPMSIGNC (CH(i,k,6),WA(6 -1,i),db
) }
771 }
772 }
773 }
774
775#define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
776#define CX2(a,b) cc[(a)+idl1*(b)]
777#define CH2(a,b) ch[(a)+idl1*(b)]
778
779NOINLINE__attribute__((noinline)) static int passg (size_t ido, size_t ip, size_t l1,
780 cmplx * restrictrestrict cc, cmplx * restrictrestrict ch, const cmplx * restrictrestrict wa,
781 const cmplx * restrictrestrict csarr, const int sign)
782 {
783 const size_t cdim=ip;
784 size_t ipph = (ip+1)/2;
785 size_t idl1 = ido*l1;
786
787 cmplx * restrictrestrict wal=RALLOC(cmplx,ip)((cmplx *)malloc((ip)*sizeof(cmplx)));
788 if (!wal) return -1;
789 wal[0]=(cmplx){1.,0.};
790 for (size_t i=1; i<ip; ++i)
791 wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
792
793 for (size_t k=0; k<l1; ++k)
794 for (size_t i=0; i<ido; ++i)
795 CH(i,k,0) = CC(i,0,k);
796 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
797 for (size_t k=0; k<l1; ++k)
798 for (size_t i=0; i<ido; ++i)
799 PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
800 for (size_t k=0; k<l1; ++k)
801 for (size_t i=0; i<ido; ++i)
802 {
803 cmplx tmp = CH(i,k,0);
804 for (size_t j=1; j<ipph; ++j)
805 ADDC(tmp,tmp,CH(i,k,j))
806 CX(i,k,0) = tmp;
807 }
808 for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
809 {
810 // j=0
811 for (size_t ik=0; ik<idl1; ++ik)
812 {
813 CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
814 CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
815 CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
816 CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
817 }
818
819 size_t iwal=2*l;
820 size_t j=3, jc=ip-3;
821 for (; j<ipph-1; j+=2, jc-=2)
822 {
823 iwal+=l; if (iwal>ip) iwal-=ip;
824 cmplx xwal=wal[iwal];
825 iwal+=l; if (iwal>ip) iwal-=ip;
826 cmplx xwal2=wal[iwal];
827 for (size_t ik=0; ik<idl1; ++ik)
828 {
829 CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
830 CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
831 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
832 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
833 }
834 }
835 for (; j<ipph; ++j, --jc)
836 {
837 iwal+=l; if (iwal>ip) iwal-=ip;
838 cmplx xwal=wal[iwal];
839 for (size_t ik=0; ik<idl1; ++ik)
840 {
841 CX2(ik,l).r += CH2(ik,j).r*xwal.r;
842 CX2(ik,l).i += CH2(ik,j).i*xwal.r;
843 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
844 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
845 }
846 }
847 }
848 DEALLOC(wal)do { free(wal); (wal)=((void*)0); } while(0);
849
850 // shuffling and twiddling
851 if (ido==1)
852 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
853 for (size_t ik=0; ik<idl1; ++ik)
854 {
855 cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
856 PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
857 }
858 else
859 {
860 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
861 for (size_t k=0; k<l1; ++k)
862 {
863 cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
864 PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
865 for (size_t i=1; i<ido; ++i)
866 {
867 cmplx x1, x2;
868 PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
869 size_t idij=(j-1)*(ido-1)+i-1;
870 MULPMSIGNC (CX(i,k,j),wa[idij],x1)
871 idij=(jc-1)*(ido-1)+i-1;
872 MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
873 }
874 }
875 }
876 return 0;
877 }
878
879#undef CH2
880#undef CX2
881#undef CX
882
883NOINLINE__attribute__((noinline)) WARN_UNUSED_RESULT__attribute__ ((warn_unused_result)) static int pass_all(cfftp_plan plan, cmplx c[], double fct,
884 const int sign)
885 {
886 if (plan->length==1) return 0;
887 size_t len=plan->length;
888 size_t l1=1, nf=plan->nfct;
889 cmplx *ch = RALLOC(cmplx, len)((cmplx *)malloc((len)*sizeof(cmplx)));
890 if (!ch) return -1;
891 cmplx *p1=c, *p2=ch;
892
893 for(size_t k1=0; k1<nf; k1++)
894 {
895 size_t ip=plan->fct[k1].fct;
896 size_t l2=ip*l1;
897 size_t ido = len/l2;
898 if (ip==4)
899 sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
900 : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
901 else if(ip==2)
902 sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
903 : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
904 else if(ip==3)
905 sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
906 : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
907 else if(ip==5)
908 sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
909 : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
910 else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
911 else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
912 else
913 {
914 if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
915 { DEALLOC(ch)do { free(ch); (ch)=((void*)0); } while(0); return -1; }
916 SWAP(p1,p2,cmplx *)do { cmplx * tmp_=(p1); (p1)=(p2); (p2)=tmp_; } while(0);
917 }
918 SWAP(p1,p2,cmplx *)do { cmplx * tmp_=(p1); (p1)=(p2); (p2)=tmp_; } while(0);
919 l1=l2;
920 }
921 if (p1!=c)
922 {
923 if (fct!=1.)
924 for (size_t i=0; i<len; ++i)
925 {
926 c[i].r = ch[i].r*fct;
927 c[i].i = ch[i].i*fct;
928 }
929 else
930 memcpy (c,p1,len*sizeof(cmplx));
931 }
932 else
933 if (fct!=1.)
934 for (size_t i=0; i<len; ++i)
935 {
936 c[i].r *= fct;
937 c[i].i *= fct;
938 }
939 DEALLOC(ch)do { free(ch); (ch)=((void*)0); } while(0);
940 return 0;
941 }
942
943#undef PMSIGNC
944#undef A_EQ_B_MUL_C
945#undef A_EQ_CB_MUL_C
946#undef MULPMSIGNC
947#undef MULPMSIGNCEQ
948
949#undef WA
950#undef CC
951#undef CH
952#undef ROT90
953#undef SCALEC
954#undef ADDC
955#undef PMC
956
957NOINLINE__attribute__((noinline)) WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
958static int cfftp_forward(cfftp_plan plan, double c[], double fct)
959 { return pass_all(plan,(cmplx *)c, fct, -1); }
960
961NOINLINE__attribute__((noinline)) WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
962static int cfftp_backward(cfftp_plan plan, double c[], double fct)
963 { return pass_all(plan,(cmplx *)c, fct, 1); }
964
965NOINLINE__attribute__((noinline)) WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
966static int cfftp_factorize (cfftp_plan plan)
967 {
968 size_t length=plan->length;
969 size_t nfct=0;
970 while ((length%4)==0)
971 { if (nfct>=NFCT25) return -1; plan->fct[nfct++].fct=4; length>>=2; }
972 if ((length%2)==0)
973 {
974 length>>=1;
975 // factor 2 should be at the front of the factor list
976 if (nfct>=NFCT25) return -1;
977 plan->fct[nfct++].fct=2;
978 SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t)do { size_t tmp_=(plan->fct[0].fct); (plan->fct[0].fct)
=(plan->fct[nfct-1].fct); (plan->fct[nfct-1].fct)=tmp_;
} while(0)
;
979 }
980 size_t maxl=(size_t)(sqrt((double)length))+1;
981 for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
982 if ((length%divisor)==0)
983 {
984 while ((length%divisor)==0)
985 {
986 if (nfct>=NFCT25) return -1;
987 plan->fct[nfct++].fct=divisor;
988 length/=divisor;
989 }
990 maxl=(size_t)(sqrt((double)length))+1;
991 }
992 if (length>1) plan->fct[nfct++].fct=length;
993 plan->nfct=nfct;
994 return 0;
995 }
996
997NOINLINE__attribute__((noinline)) static size_t cfftp_twsize (cfftp_plan plan)
998 {
999 size_t twsize=0, l1=1;
1000 for (size_t k=0; k<plan->nfct; ++k)
1001 {
1002 size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1003 twsize+=(ip-1)*(ido-1);
1004 if (ip>11)
1005 twsize+=ip;
1006 l1*=ip;
1007 }
1008 return twsize;
1009 }
1010
1011NOINLINE__attribute__((noinline)) WARN_UNUSED_RESULT__attribute__ ((warn_unused_result)) static int cfftp_comp_twiddle (cfftp_plan plan)
1012 {
1013 size_t length=plan->length;
1014 double *twid = RALLOC(double, 2*length)((double *)malloc((2*length)*sizeof(double)));
1015 if (!twid) return -1;
1016 sincos_2pibyn(length, twid);
1017 size_t l1=1;
1018 size_t memofs=0;
1019 for (size_t k=0; k<plan->nfct; ++k)
1020 {
1021 size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
1022 plan->fct[k].tw=plan->mem+memofs;
1023 memofs+=(ip-1)*(ido-1);
1024 for (size_t j=1; j<ip; ++j)
1025 for (size_t i=1; i<ido; ++i)
1026 {
1027 plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
1028 plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
1029 }
1030 if (ip>11)
1031 {
1032 plan->fct[k].tws=plan->mem+memofs;
1033 memofs+=ip;
1034 for (size_t j=0; j<ip; ++j)
1035 {
1036 plan->fct[k].tws[j].r = twid[2*j*l1*ido];
1037 plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
1038 }
1039 }
1040 l1*=ip;
1041 }
1042 DEALLOC(twid)do { free(twid); (twid)=((void*)0); } while(0);
1043 return 0;
1044 }
1045
1046static cfftp_plan make_cfftp_plan (size_t length)
1047 {
1048 if (length==0) return NULL((void*)0);
1049 cfftp_plan plan = RALLOC(cfftp_plan_i,1)((cfftp_plan_i *)malloc((1)*sizeof(cfftp_plan_i)));
1050 if (!plan) return NULL((void*)0);
1051 plan->length=length;
1052 plan->nfct=0;
1053 for (size_t i=0; i<NFCT25; ++i)
1054 plan->fct[i]=(cfftp_fctdata){0,0,0};
1055 plan->mem=0;
1056 if (length==1) return plan;
1057 if (cfftp_factorize(plan)!=0) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1058 size_t tws=cfftp_twsize(plan);
1059 plan->mem=RALLOC(cmplx,tws)((cmplx *)malloc((tws)*sizeof(cmplx)));
1060 if (!plan->mem) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1061 if (cfftp_comp_twiddle(plan)!=0)
1062 { DEALLOC(plan->mem)do { free(plan->mem); (plan->mem)=((void*)0); } while(0
)
; DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1063 return plan;
1064 }
1065
1066static void destroy_cfftp_plan (cfftp_plan plan)
1067 {
1068 DEALLOC(plan->mem)do { free(plan->mem); (plan->mem)=((void*)0); } while(0
)
;
1069 DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0);
1070 }
1071
1072typedef struct rfftp_fctdata
1073 {
1074 size_t fct;
1075 double *tw, *tws;
1076 } rfftp_fctdata;
1077
1078typedef struct rfftp_plan_i
1079 {
1080 size_t length, nfct;
1081 double *mem;
1082 rfftp_fctdata fct[NFCT25];
1083 } rfftp_plan_i;
1084typedef struct rfftp_plan_i * rfftp_plan;
1085
1086#define WA(x,i) wa[(i)+(x)*(ido-1)]
1087#define PM(a,b,c,d) { a=c+d; b=c-d; }
1088/* (a+ib) = conj(c+id) * (e+if) */
1089#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
1090
1091#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1092#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
1093
1094NOINLINE__attribute__((noinline)) static void radf2 (size_t ido, size_t l1, const double * restrictrestrict cc,
1095 double * restrictrestrict ch, const double * restrictrestrict wa)
1096 {
1097 const size_t cdim=2;
1098
1099 for (size_t k=0; k<l1; k++)
1100 PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
1101 if ((ido&1)==0)
1102 for (size_t k=0; k<l1; k++)
1103 {
1104 CH( 0,1,k) = -CC(ido-1,k,1);
1105 CH(ido-1,0,k) = CC(ido-1,k,0);
1106 }
1107 if (ido<=2) return;
1108 for (size_t k=0; k<l1; k++)
1109 for (size_t i=2; i<ido; i+=2)
1110 {
1111 size_t ic=ido-i;
1112 double tr2, ti2;
1113 MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1114 PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
1115 PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0))
1116 }
1117 }
1118
1119NOINLINE__attribute__((noinline)) static void radf3(size_t ido, size_t l1, const double * restrictrestrict cc,
1120 double * restrictrestrict ch, const double * restrictrestrict wa)
1121 {
1122 const size_t cdim=3;
1123 static const double taur=-0.5, taui=0.86602540378443864676;
1124
1125 for (size_t k=0; k<l1; k++)
1126 {
1127 double cr2=CC(0,k,1)+CC(0,k,2);
1128 CH(0,0,k) = CC(0,k,0)+cr2;
1129 CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1130 CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1131 }
1132 if (ido==1) return;
1133 for (size_t k=0; k<l1; k++)
1134 for (size_t i=2; i<ido; i+=2)
1135 {
1136 size_t ic=ido-i;
1137 double di2, di3, dr2, dr3;
1138 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
1139 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
1140 double cr2=dr2+dr3; // c add
1141 double ci2=di2+di3;
1142 CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
1143 CH(i ,0,k) = CC(i ,k,0)+ci2;
1144 double tr2 = CC(i-1,k,0)+taur*cr2; // c add
1145 double ti2 = CC(i ,k,0)+taur*ci2;
1146 double tr3 = taui*(di2-di3); // t3 = taui*i*(d3-d2)?
1147 double ti3 = taui*(dr3-dr2);
1148 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
1149 PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
1150 }
1151 }
1152
1153NOINLINE__attribute__((noinline)) static void radf4(size_t ido, size_t l1, const double * restrictrestrict cc,
1154 double * restrictrestrict ch, const double * restrictrestrict wa)
1155 {
1156 const size_t cdim=4;
1157 static const double hsqt2=0.70710678118654752440;
1158
1159 for (size_t k=0; k<l1; k++)
1160 {
1161 double tr1,tr2;
1162 PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
1163 PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
1164 PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
1165 }
1166 if ((ido&1)==0)
1167 for (size_t k=0; k<l1; k++)
1168 {
1169 double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1170 double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1171 PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
1172 PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2))
1173 }
1174 if (ido<=2) return;
1175 for (size_t k=0; k<l1; k++)
1176 for (size_t i=2; i<ido; i+=2)
1177 {
1178 size_t ic=ido-i;
1179 double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1180 MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1181 MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1182 MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1183 PM(tr1,tr4,cr4,cr2)
1184 PM(ti1,ti4,ci2,ci4)
1185 PM(tr2,tr3,CC(i-1,k,0),cr3)
1186 PM(ti2,ti3,CC(i ,k,0),ci3)
1187 PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
1188 PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2)
1189 PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
1190 PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3)
1191 }
1192 }
1193
1194NOINLINE__attribute__((noinline)) static void radf5(size_t ido, size_t l1, const double * restrictrestrict cc,
1195 double * restrictrestrict ch, const double * restrictrestrict wa)
1196 {
1197 const size_t cdim=5;
1198 static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1199 tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1200
1201 for (size_t k=0; k<l1; k++)
1202 {
1203 double cr2, cr3, ci4, ci5;
1204 PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
1205 PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
1206 CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1207 CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1208 CH(0,2,k)=ti11*ci5+ti12*ci4;
1209 CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1210 CH(0,4,k)=ti12*ci5-ti11*ci4;
1211 }
1212 if (ido==1) return;
1213 for (size_t k=0; k<l1;++k)
1214 for (size_t i=2; i<ido; i+=2)
1215 {
1216 double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
1217 dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
1218 size_t ic=ido-i;
1219 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1220 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1221 MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1222 MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
1223 PM(cr2,ci5,dr5,dr2)
1224 PM(ci2,cr5,di2,di5)
1225 PM(cr3,ci4,dr4,dr3)
1226 PM(ci3,cr4,di3,di4)
1227 CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
1228 CH(i ,0,k)=CC(i ,k,0)+ci2+ci3;
1229 tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
1230 ti2=CC(i ,k,0)+tr11*ci2+tr12*ci3;
1231 tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
1232 ti3=CC(i ,k,0)+tr12*ci2+tr11*ci3;
1233 MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
1234 MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
1235 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
1236 PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2)
1237 PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
1238 PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3)
1239 }
1240 }
1241
1242#undef CC
1243#undef CH
1244#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1245#define C2(a,b) cc[(a)+idl1*(b)]
1246#define CH2(a,b) ch[(a)+idl1*(b)]
1247#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1248#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1249NOINLINE__attribute__((noinline)) static void radfg(size_t ido, size_t ip, size_t l1,
1250 double * restrictrestrict cc, double * restrictrestrict ch, const double * restrictrestrict wa,
1251 const double * restrictrestrict csarr)
1252 {
1253 const size_t cdim=ip;
1254 size_t ipph=(ip+1)/2;
1255 size_t idl1 = ido*l1;
1256
1257 if (ido>1)
1258 {
1259 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114
1260 {
1261 size_t is=(j-1)*(ido-1),
1262 is2=(jc-1)*(ido-1);
1263 for (size_t k=0; k<l1; ++k) // 113
1264 {
1265 size_t idij=is;
1266 size_t idij2=is2;
1267 for (size_t i=1; i<=ido-2; i+=2) // 112
1268 {
1269 double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1270 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1271 double x1=wa[idij]*t1 + wa[idij+1]*t2,
1272 x2=wa[idij]*t2 - wa[idij+1]*t1,
1273 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1274 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1275 C1(i ,k,j ) = x1+x3;
1276 C1(i ,k,jc) = x2-x4;
1277 C1(i+1,k,j ) = x2+x4;
1278 C1(i+1,k,jc) = x3-x1;
1279 idij+=2;
1280 idij2+=2;
1281 }
1282 }
1283 }
1284 }
1285
1286 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123
1287 for (size_t k=0; k<l1; ++k) // 122
1288 {
1289 double t1=C1(0,k,j), t2=C1(0,k,jc);
1290 C1(0,k,j ) = t1+t2;
1291 C1(0,k,jc) = t2-t1;
1292 }
1293
1294//everything in C
1295//memset(ch,0,ip*l1*ido*sizeof(double));
1296
1297 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127
1298 {
1299 for (size_t ik=0; ik<idl1; ++ik) // 124
1300 {
1301 CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1302 CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1303 }
1304 size_t iang = 2*l;
1305 size_t j=3, jc=ip-3;
1306 for (; j<ipph-3; j+=4,jc-=4) // 126
1307 {
1308 iang+=l; if (iang>=ip) iang-=ip;
1309 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1310 iang+=l; if (iang>=ip) iang-=ip;
1311 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1312 iang+=l; if (iang>=ip) iang-=ip;
1313 double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1314 iang+=l; if (iang>=ip) iang-=ip;
1315 double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1316 for (size_t ik=0; ik<idl1; ++ik) // 125
1317 {
1318 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1319 +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1320 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1321 +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1322 }
1323 }
1324 for (; j<ipph-1; j+=2,jc-=2) // 126
1325 {
1326 iang+=l; if (iang>=ip) iang-=ip;
1327 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1328 iang+=l; if (iang>=ip) iang-=ip;
1329 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1330 for (size_t ik=0; ik<idl1; ++ik) // 125
1331 {
1332 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
1333 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
1334 }
1335 }
1336 for (; j<ipph; ++j,--jc) // 126
1337 {
1338 iang+=l; if (iang>=ip) iang-=ip;
1339 double ar=csarr[2*iang], ai=csarr[2*iang+1];
1340 for (size_t ik=0; ik<idl1; ++ik) // 125
1341 {
1342 CH2(ik,l ) += ar*C2(ik,j );
1343 CH2(ik,lc) += ai*C2(ik,jc);
1344 }
1345 }
1346 }
1347 for (size_t ik=0; ik<idl1; ++ik) // 101
1348 CH2(ik,0) = C2(ik,0);
1349 for (size_t j=1; j<ipph; ++j) // 129
1350 for (size_t ik=0; ik<idl1; ++ik) // 128
1351 CH2(ik,0) += C2(ik,j);
1352
1353// everything in CH at this point!
1354//memset(cc,0,ip*l1*ido*sizeof(double));
1355
1356 for (size_t k=0; k<l1; ++k) // 131
1357 for (size_t i=0; i<ido; ++i) // 130
1358 CC(i,0,k) = CH(i,k,0);
1359
1360 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137
1361 {
1362 size_t j2=2*j-1;
1363 for (size_t k=0; k<l1; ++k) // 136
1364 {
1365 CC(ido-1,j2,k) = CH(0,k,j);
1366 CC(0,j2+1,k) = CH(0,k,jc);
1367 }
1368 }
1369
1370 if (ido==1) return;
1371
1372 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140
1373 {
1374 size_t j2=2*j-1;
1375 for(size_t k=0; k<l1; ++k) // 139
1376 for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138
1377 {
1378 CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc);
1379 CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc);
1380 CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
1381 CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
1382 }
1383 }
1384 }
1385#undef C1
1386#undef C2
1387#undef CH2
1388
1389#undef CH
1390#undef CC
1391#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1392#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1393
1394NOINLINE__attribute__((noinline)) static void radb2(size_t ido, size_t l1, const double * restrictrestrict cc,
1395 double * restrictrestrict ch, const double * restrictrestrict wa)
1396 {
1397 const size_t cdim=2;
1398
1399 for (size_t k=0; k<l1; k++)
1400 PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
1401 if ((ido&1)==0)
1402 for (size_t k=0; k<l1; k++)
1403 {
1404 CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
1405 CH(ido-1,k,1) =-2.*CC(0 ,1,k);
1406 }
1407 if (ido<=2) return;
1408 for (size_t k=0; k<l1;++k)
1409 for (size_t i=2; i<ido; i+=2)
1410 {
1411 size_t ic=ido-i;
1412 double ti2, tr2;
1413 PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
1414 PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k))
1415 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
1416 }
1417 }
1418
1419NOINLINE__attribute__((noinline)) static void radb3(size_t ido, size_t l1, const double * restrictrestrict cc,
1420 double * restrictrestrict ch, const double * restrictrestrict wa)
1421 {
1422 const size_t cdim=3;
1423 static const double taur=-0.5, taui=0.86602540378443864676;
1424
1425 for (size_t k=0; k<l1; k++)
1426 {
1427 double tr2=2.*CC(ido-1,1,k);
1428 double cr2=CC(0,0,k)+taur*tr2;
1429 CH(0,k,0)=CC(0,0,k)+tr2;
1430 double ci3=2.*taui*CC(0,2,k);
1431 PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
1432 }
1433 if (ido==1) return;
1434 for (size_t k=0; k<l1; k++)
1435 for (size_t i=2; i<ido; i+=2)
1436 {
1437 size_t ic=ido-i;
1438 double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
1439 double ti2=CC(i ,2,k)-CC(ic ,1,k);
1440 double cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2
1441 double ci2=CC(i ,0,k)+taur*ti2;
1442 CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2
1443 CH(i ,k,0)=CC(i ,0,k)+ti2;
1444 double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
1445 double ci3=taui*(CC(i ,2,k)+CC(ic ,1,k));
1446 double di2, di3, dr2, dr3;
1447 PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
1448 PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
1449 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
1450 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1451 }
1452 }
1453
1454NOINLINE__attribute__((noinline)) static void radb4(size_t ido, size_t l1, const double * restrictrestrict cc,
1455 double * restrictrestrict ch, const double * restrictrestrict wa)
1456 {
1457 const size_t cdim=4;
1458 static const double sqrt2=1.41421356237309504880;
1459
1460 for (size_t k=0; k<l1; k++)
1461 {
1462 double tr1, tr2;
1463 PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
1464 double tr3=2.*CC(ido-1,1,k);
1465 double tr4=2.*CC(0,2,k);
1466 PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
1467 PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
1468 }
1469 if ((ido&1)==0)
1470 for (size_t k=0; k<l1; k++)
1471 {
1472 double tr1,tr2,ti1,ti2;
1473 PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k))
1474 PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
1475 CH(ido-1,k,0)=tr2+tr2;
1476 CH(ido-1,k,1)=sqrt2*(tr1-ti1);
1477 CH(ido-1,k,2)=ti2+ti2;
1478 CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
1479 }
1480 if (ido<=2) return;
1481 for (size_t k=0; k<l1;++k)
1482 for (size_t i=2; i<ido; i+=2)
1483 {
1484 double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1485 size_t ic=ido-i;
1486 PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
1487 PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k))
1488 PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k))
1489 PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
1490 PM (CH(i-1,k,0),cr3,tr2,tr3)
1491 PM (CH(i ,k,0),ci3,ti2,ti3)
1492 PM (cr4,cr2,tr1,tr4)
1493 PM (ci2,ci4,ti1,ti4)
1494 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
1495 MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
1496 MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
1497 }
1498 }
1499
1500NOINLINE__attribute__((noinline)) static void radb5(size_t ido, size_t l1, const double * restrictrestrict cc,
1501 double * restrictrestrict ch, const double * restrictrestrict wa)
1502 {
1503 const size_t cdim=5;
1504 static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1505 tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1506
1507 for (size_t k=0; k<l1; k++)
1508 {
1509 double ti5=CC(0,2,k)+CC(0,2,k);
1510 double ti4=CC(0,4,k)+CC(0,4,k);
1511 double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
1512 double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
1513 CH(0,k,0)=CC(0,0,k)+tr2+tr3;
1514 double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
1515 double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
1516 double ci4, ci5;
1517 MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1518 PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
1519 PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
1520 }
1521 if (ido==1) return;
1522 for (size_t k=0; k<l1;++k)
1523 for (size_t i=2; i<ido; i+=2)
1524 {
1525 size_t ic=ido-i;
1526 double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
1527 PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
1528 PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k))
1529 PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
1530 PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k))
1531 CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
1532 CH(i ,k,0)=CC(i ,0,k)+ti2+ti3;
1533 double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
1534 double ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3;
1535 double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
1536 double ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3;
1537 double ci4, ci5, cr5, cr4;
1538 MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
1539 MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1540 double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
1541 PM(dr4,dr3,cr3,ci4)
1542 PM(di3,di4,ci3,cr4)
1543 PM(dr5,dr2,cr2,ci5)
1544 PM(di2,di5,ci2,cr5)
1545 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
1546 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1547 MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
1548 MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
1549 }
1550 }
1551
1552#undef CC
1553#undef CH
1554#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1555#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1556#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1557#define C2(a,b) cc[(a)+idl1*(b)]
1558#define CH2(a,b) ch[(a)+idl1*(b)]
1559
1560NOINLINE__attribute__((noinline)) static void radbg(size_t ido, size_t ip, size_t l1,
1561 double * restrictrestrict cc, double * restrictrestrict ch, const double * restrictrestrict wa,
1562 const double * restrictrestrict csarr)
1563 {
1564 const size_t cdim=ip;
1565 size_t ipph=(ip+1)/ 2;
1566 size_t idl1 = ido*l1;
1567
1568 for (size_t k=0; k<l1; ++k) // 102
1569 for (size_t i=0; i<ido; ++i) // 101
1570 CH(i,k,0) = CC(i,0,k);
1571 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108
1572 {
1573 size_t j2=2*j-1;
1574 for (size_t k=0; k<l1; ++k)
1575 {
1576 CH(0,k,j ) = 2*CC(ido-1,j2,k);
1577 CH(0,k,jc) = 2*CC(0,j2+1,k);
1578 }
1579 }
1580
1581 if (ido!=1)
1582 {
1583 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111
1584 {
1585 size_t j2=2*j-1;
1586 for (size_t k=0; k<l1; ++k)
1587 for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109
1588 {
1589 CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k);
1590 CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k);
1591 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
1592 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
1593 }
1594 }
1595 }
1596 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
1597 {
1598 for (size_t ik=0; ik<idl1; ++ik)
1599 {
1600 C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
1601 C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
1602 }
1603 size_t iang=2*l;
1604 size_t j=3,jc=ip-3;
1605 for(; j<ipph-3; j+=4,jc-=4)
1606 {
1607 iang+=l; if(iang>ip) iang-=ip;
1608 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1609 iang+=l; if(iang>ip) iang-=ip;
1610 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1611 iang+=l; if(iang>ip) iang-=ip;
1612 double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1613 iang+=l; if(iang>ip) iang-=ip;
1614 double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1615 for (size_t ik=0; ik<idl1; ++ik)
1616 {
1617 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
1618 +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
1619 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
1620 +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
1621 }
1622 }
1623 for(; j<ipph-1; j+=2,jc-=2)
1624 {
1625 iang+=l; if(iang>ip) iang-=ip;
1626 double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1627 iang+=l; if(iang>ip) iang-=ip;
1628 double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1629 for (size_t ik=0; ik<idl1; ++ik)
1630 {
1631 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
1632 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
1633 }
1634 }
1635 for(; j<ipph; ++j,--jc)
1636 {
1637 iang+=l; if(iang>ip) iang-=ip;
1638 double war=csarr[2*iang], wai=csarr[2*iang+1];
1639 for (size_t ik=0; ik<idl1; ++ik)
1640 {
1641 C2(ik,l ) += war*CH2(ik,j );
1642 C2(ik,lc) += wai*CH2(ik,jc);
1643 }
1644 }
1645 }
1646 for (size_t j=1; j<ipph; ++j)
1647 for (size_t ik=0; ik<idl1; ++ik)
1648 CH2(ik,0) += CH2(ik,j);
1649 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124
1650 for (size_t k=0; k<l1; ++k)
1651 {
1652 CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
1653 CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
1654 }
1655
1656 if (ido==1) return;
1657
1658 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127
1659 for (size_t k=0; k<l1; ++k)
1660 for (size_t i=1; i<=ido-2; i+=2)
1661 {
1662 CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc);
1663 CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc);
1664 CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc);
1665 CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc);
1666 }
1667
1668// All in CH
1669
1670 for (size_t j=1; j<ip; ++j)
1671 {
1672 size_t is = (j-1)*(ido-1);
1673 for (size_t k=0; k<l1; ++k)
1674 {
1675 size_t idij = is;
1676 for (size_t i=1; i<=ido-2; i+=2)
1677 {
1678 double t1=CH(i,k,j), t2=CH(i+1,k,j);
1679 CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
1680 CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
1681 idij+=2;
1682 }
1683 }
1684 }
1685 }
1686#undef C1
1687#undef C2
1688#undef CH2
1689
1690#undef CC
1691#undef CH
1692#undef PM
1693#undef MULPM
1694#undef WA
1695
1696static void copy_and_norm(double *c, double *p1, size_t n, double fct)
1697 {
1698 if (p1!=c)
1699 {
1700 if (fct!=1.)
1701 for (size_t i=0; i<n; ++i)
1702 c[i] = fct*p1[i];
1703 else
1704 memcpy (c,p1,n*sizeof(double));
1705 }
1706 else
1707 if (fct!=1.)
1708 for (size_t i=0; i<n; ++i)
1709 c[i] *= fct;
1710 }
1711
1712WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
1713static int rfftp_forward(rfftp_plan plan, double c[], double fct)
1714 {
1715 if (plan->length==1) return 0;
1716 size_t n=plan->length;
1717 size_t l1=n, nf=plan->nfct;
1718 double *ch = RALLOC(double, n)((double *)malloc((n)*sizeof(double)));
1719 if (!ch) return -1;
1720 double *p1=c, *p2=ch;
1721
1722 for(size_t k1=0; k1<nf;++k1)
1723 {
1724 size_t k=nf-k1-1;
1725 size_t ip=plan->fct[k].fct;
1726 size_t ido=n / l1;
1727 l1 /= ip;
1728 if(ip==4)
1729 radf4(ido, l1, p1, p2, plan->fct[k].tw);
1730 else if(ip==2)
1731 radf2(ido, l1, p1, p2, plan->fct[k].tw);
1732 else if(ip==3)
1733 radf3(ido, l1, p1, p2, plan->fct[k].tw);
1734 else if(ip==5)
1735 radf5(ido, l1, p1, p2, plan->fct[k].tw);
1736 else
1737 {
1738 radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1739 SWAP (p1,p2,double *)do { double * tmp_=(p1); (p1)=(p2); (p2)=tmp_; } while(0);
1740 }
1741 SWAP (p1,p2,double *)do { double * tmp_=(p1); (p1)=(p2); (p2)=tmp_; } while(0);
1742 }
1743 copy_and_norm(c,p1,n,fct);
1744 DEALLOC(ch)do { free(ch); (ch)=((void*)0); } while(0);
1745 return 0;
1746 }
1747
1748WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
1749static int rfftp_backward(rfftp_plan plan, double c[], double fct)
1750 {
1751 if (plan->length==1) return 0;
1752 size_t n=plan->length;
1753 size_t l1=1, nf=plan->nfct;
1754 double *ch = RALLOC(double, n)((double *)malloc((n)*sizeof(double)));
1755 if (!ch) return -1;
1756 double *p1=c, *p2=ch;
1757
1758 for(size_t k=0; k<nf; k++)
1759 {
1760 size_t ip = plan->fct[k].fct,
1761 ido= n/(ip*l1);
1762 if(ip==4)
1763 radb4(ido, l1, p1, p2, plan->fct[k].tw);
1764 else if(ip==2)
1765 radb2(ido, l1, p1, p2, plan->fct[k].tw);
1766 else if(ip==3)
1767 radb3(ido, l1, p1, p2, plan->fct[k].tw);
1768 else if(ip==5)
1769 radb5(ido, l1, p1, p2, plan->fct[k].tw);
1770 else
1771 radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1772 SWAP (p1,p2,double *)do { double * tmp_=(p1); (p1)=(p2); (p2)=tmp_; } while(0);
1773 l1*=ip;
1774 }
1775 copy_and_norm(c,p1,n,fct);
1776 DEALLOC(ch)do { free(ch); (ch)=((void*)0); } while(0);
1777 return 0;
1778 }
1779
1780WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
1781static int rfftp_factorize (rfftp_plan plan)
1782 {
1783 size_t length=plan->length;
1784 size_t nfct=0;
1785 while ((length%4)==0)
1786 { if (nfct>=NFCT25) return -1; plan->fct[nfct++].fct=4; length>>=2; }
1787 if ((length%2)==0)
1788 {
1789 length>>=1;
1790 // factor 2 should be at the front of the factor list
1791 if (nfct>=NFCT25) return -1;
1792 plan->fct[nfct++].fct=2;
1793 SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t)do { size_t tmp_=(plan->fct[0].fct); (plan->fct[0].fct)
=(plan->fct[nfct-1].fct); (plan->fct[nfct-1].fct)=tmp_;
} while(0)
;
1794 }
1795 size_t maxl=(size_t)(sqrt((double)length))+1;
1796 for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
1797 if ((length%divisor)==0)
1798 {
1799 while ((length%divisor)==0)
1800 {
1801 if (nfct>=NFCT25) return -1;
1802 plan->fct[nfct++].fct=divisor;
1803 length/=divisor;
1804 }
1805 maxl=(size_t)(sqrt((double)length))+1;
1806 }
1807 if (length>1) plan->fct[nfct++].fct=length;
1808 plan->nfct=nfct;
1809 return 0;
1810 }
1811
1812static size_t rfftp_twsize(rfftp_plan plan)
1813 {
1814 size_t twsize=0, l1=1;
1815 for (size_t k=0; k<plan->nfct; ++k)
1816 {
1817 size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1818 twsize+=(ip-1)*(ido-1);
1819 if (ip>5) twsize+=2*ip;
1820 l1*=ip;
1821 }
1822 return twsize;
1823 return 0;
1824 }
1825
1826WARN_UNUSED_RESULT__attribute__ ((warn_unused_result)) NOINLINE__attribute__((noinline)) static int rfftp_comp_twiddle (rfftp_plan plan)
1827 {
1828 size_t length=plan->length;
1829 double *twid = RALLOC(double, 2*length)((double *)malloc((2*length)*sizeof(double)));
1830 if (!twid) return -1;
1831 sincos_2pibyn_half(length, twid);
1832 size_t l1=1;
1833 double *ptr=plan->mem;
1834 for (size_t k=0; k<plan->nfct; ++k)
1835 {
1836 size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
1837 if (k<plan->nfct-1) // last factor doesn't need twiddles
1838 {
1839 plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
1840 for (size_t j=1; j<ip; ++j)
1841 for (size_t i=1; i<=(ido-1)/2; ++i)
1842 {
1843 plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
1844 plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
1845 }
1846 }
1847 if (ip>5) // special factors required by *g functions
1848 {
1849 plan->fct[k].tws=ptr; ptr+=2*ip;
1850 plan->fct[k].tws[0] = 1.;
1851 plan->fct[k].tws[1] = 0.;
1852 for (size_t i=1; i<=(ip>>1); ++i)
1853 {
1854 plan->fct[k].tws[2*i ] = twid[2*i*(length/ip)];
1855 plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
1856 plan->fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)];
1857 plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
1858 }
1859 }
1860 l1*=ip;
1861 }
1862 DEALLOC(twid)do { free(twid); (twid)=((void*)0); } while(0);
1863 return 0;
1864 }
1865
1866NOINLINE__attribute__((noinline)) static rfftp_plan make_rfftp_plan (size_t length)
1867 {
1868 if (length==0) return NULL((void*)0);
1869 rfftp_plan plan = RALLOC(rfftp_plan_i,1)((rfftp_plan_i *)malloc((1)*sizeof(rfftp_plan_i)));
1870 if (!plan) return NULL((void*)0);
1871 plan->length=length;
1872 plan->nfct=0;
1873 plan->mem=NULL((void*)0);
1874 for (size_t i=0; i<NFCT25; ++i)
1875 plan->fct[i]=(rfftp_fctdata){0,0,0};
1876 if (length==1) return plan;
1877 if (rfftp_factorize(plan)!=0) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1878 size_t tws=rfftp_twsize(plan);
1879 plan->mem=RALLOC(double,tws)((double *)malloc((tws)*sizeof(double)));
1880 if (!plan->mem) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1881 if (rfftp_comp_twiddle(plan)!=0)
1882 { DEALLOC(plan->mem)do { free(plan->mem); (plan->mem)=((void*)0); } while(0
)
; DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1883 return plan;
1884 }
1885
1886NOINLINE__attribute__((noinline)) static void destroy_rfftp_plan (rfftp_plan plan)
1887 {
1888 DEALLOC(plan->mem)do { free(plan->mem); (plan->mem)=((void*)0); } while(0
)
;
1889 DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0);
1890 }
1891
1892typedef struct fftblue_plan_i
1893 {
1894 size_t n, n2;
1895 cfftp_plan plan;
1896 double *mem;
1897 double *bk, *bkf;
1898 } fftblue_plan_i;
1899typedef struct fftblue_plan_i * fftblue_plan;
1900
1901NOINLINE__attribute__((noinline)) static fftblue_plan make_fftblue_plan (size_t length)
1902 {
1903 fftblue_plan plan = RALLOC(fftblue_plan_i,1)((fftblue_plan_i *)malloc((1)*sizeof(fftblue_plan_i)));
1904 if (!plan) return NULL((void*)0);
1905 plan->n = length;
1906 plan->n2 = good_size(plan->n*2-1);
1907 plan->mem = RALLOC(double, 2*plan->n+2*plan->n2)((double *)malloc((2*plan->n+2*plan->n2)*sizeof(double)
))
;
1908 if (!plan->mem) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1909 plan->bk = plan->mem;
1910 plan->bkf = plan->bk+2*plan->n;
1911
1912/* initialize b_k */
1913 double *tmp = RALLOC(double,4*plan->n)((double *)malloc((4*plan->n)*sizeof(double)));
1914 if (!tmp) { DEALLOC(plan->mem)do { free(plan->mem); (plan->mem)=((void*)0); } while(0
)
; DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1915 sincos_2pibyn(2*plan->n,tmp);
1916 plan->bk[0] = 1;
1917 plan->bk[1] = 0;
1918
1919 size_t coeff=0;
1920 for (size_t m=1; m<plan->n; ++m)
1921 {
1922 coeff+=2*m-1;
1923 if (coeff>=2*plan->n) coeff-=2*plan->n;
1924 plan->bk[2*m ] = tmp[2*coeff ];
1925 plan->bk[2*m+1] = tmp[2*coeff+1];
1926 }
1927
1928 /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
1929 double xn2 = 1./plan->n2;
1930 plan->bkf[0] = plan->bk[0]*xn2;
1931 plan->bkf[1] = plan->bk[1]*xn2;
1932 for (size_t m=2; m<2*plan->n; m+=2)
1933 {
1934 plan->bkf[m] = plan->bkf[2*plan->n2-m] = plan->bk[m] *xn2;
1935 plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
1936 }
1937 for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
1938 plan->bkf[m]=0.;
1939 plan->plan=make_cfftp_plan(plan->n2);
1940 if (!plan->plan)
1941 { DEALLOC(tmp)do { free(tmp); (tmp)=((void*)0); } while(0); DEALLOC(plan->mem)do { free(plan->mem); (plan->mem)=((void*)0); } while(0
)
; DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1942 if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
1943 { DEALLOC(tmp)do { free(tmp); (tmp)=((void*)0); } while(0); DEALLOC(plan->mem)do { free(plan->mem); (plan->mem)=((void*)0); } while(0
)
; DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
1944 DEALLOC(tmp)do { free(tmp); (tmp)=((void*)0); } while(0);
1945
1946 return plan;
1947 }
1948
1949NOINLINE__attribute__((noinline)) static void destroy_fftblue_plan (fftblue_plan plan)
1950 {
1951 DEALLOC(plan->mem)do { free(plan->mem); (plan->mem)=((void*)0); } while(0
)
;
1952 destroy_cfftp_plan(plan->plan);
1953 DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0);
1954 }
1955
1956NOINLINE__attribute__((noinline)) WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
1957static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
1958 {
1959 size_t n=plan->n;
1960 size_t n2=plan->n2;
1961 double *bk = plan->bk;
1962 double *bkf = plan->bkf;
1963 double *akf = RALLOC(double, 2*n2)((double *)malloc((2*n2)*sizeof(double)));
1964 if (!akf) return -1;
1965
1966/* initialize a_k and FFT it */
1967 if (isign>0)
1968 for (size_t m=0; m<2*n; m+=2)
1969 {
1970 akf[m] = c[m]*bk[m] - c[m+1]*bk[m+1];
1971 akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
1972 }
1973 else
1974 for (size_t m=0; m<2*n; m+=2)
1975 {
1976 akf[m] = c[m]*bk[m] + c[m+1]*bk[m+1];
1977 akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
1978 }
1979 for (size_t m=2*n; m<2*n2; ++m)
1980 akf[m]=0;
1981
1982 if (cfftp_forward (plan->plan,akf,fct)!=0)
1983 { DEALLOC(akf)do { free(akf); (akf)=((void*)0); } while(0); return -1; }
1984
1985/* do the convolution */
1986 if (isign>0)
1987 for (size_t m=0; m<2*n2; m+=2)
1988 {
1989 double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1990 akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1];
1991 akf[m+1] = im;
1992 }
1993 else
1994 for (size_t m=0; m<2*n2; m+=2)
1995 {
1996 double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1997 akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1];
1998 akf[m+1] = im;
1999 }
2000
2001/* inverse FFT */
2002 if (cfftp_backward (plan->plan,akf,1.)!=0)
2003 { DEALLOC(akf)do { free(akf); (akf)=((void*)0); } while(0); return -1; }
2004
2005/* multiply by b_k */
2006 if (isign>0)
2007 for (size_t m=0; m<2*n; m+=2)
2008 {
2009 c[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1];
2010 c[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1];
2011 }
2012 else
2013 for (size_t m=0; m<2*n; m+=2)
2014 {
2015 c[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1];
2016 c[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1];
2017 }
2018 DEALLOC(akf)do { free(akf); (akf)=((void*)0); } while(0);
2019 return 0;
2020 }
2021
2022WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
2023static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
2024 { return fftblue_fft(plan,c,1,fct); }
2025
2026WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
2027static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
2028 { return fftblue_fft(plan,c,-1,fct); }
2029
2030WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
2031static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
2032 {
2033 size_t n=plan->n;
2034 double *tmp = RALLOC(double,2*n)((double *)malloc((2*n)*sizeof(double)));
2035 if (!tmp) return -1;
2036 tmp[0]=c[0];
2037 tmp[1]=0.;
2038 memcpy (tmp+2,c+1, (n-1)*sizeof(double));
2039 if ((n&1)==0) tmp[n+1]=0.;
2040 for (size_t m=2; m<n; m+=2)
2041 {
2042 tmp[2*n-m]=tmp[m];
2043 tmp[2*n-m+1]=-tmp[m+1];
2044 }
2045 if (fftblue_fft(plan,tmp,1,fct)!=0)
2046 { DEALLOC(tmp)do { free(tmp); (tmp)=((void*)0); } while(0); return -1; }
2047 for (size_t m=0; m<n; ++m)
2048 c[m] = tmp[2*m];
2049 DEALLOC(tmp)do { free(tmp); (tmp)=((void*)0); } while(0);
2050 return 0;
2051 }
2052
2053WARN_UNUSED_RESULT__attribute__ ((warn_unused_result))
2054static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
2055 {
2056 size_t n=plan->n;
2057 double *tmp = RALLOC(double,2*n)((double *)malloc((2*n)*sizeof(double)));
2058 if (!tmp) return -1;
2059 for (size_t m=0; m<n; ++m)
2060 {
2061 tmp[2*m] = c[m];
2062 tmp[2*m+1] = 0.;
2063 }
2064 if (fftblue_fft(plan,tmp,-1,fct)!=0)
2065 { DEALLOC(tmp)do { free(tmp); (tmp)=((void*)0); } while(0); return -1; }
2066 c[0] = tmp[0];
2067 memcpy (c+1, tmp+2, (n-1)*sizeof(double));
2068 DEALLOC(tmp)do { free(tmp); (tmp)=((void*)0); } while(0);
2069 return 0;
2070 }
2071
2072typedef struct cfft_plan_i
2073 {
2074 cfftp_plan packplan;
2075 fftblue_plan blueplan;
2076 } cfft_plan_i;
2077
2078static cfft_plan make_cfft_plan (size_t length)
2079 {
2080 if (length==0) return NULL((void*)0);
2081 cfft_plan plan = RALLOC(cfft_plan_i,1)((cfft_plan_i *)malloc((1)*sizeof(cfft_plan_i)));
2082 if (!plan) return NULL((void*)0);
2083 plan->blueplan=0;
2084 plan->packplan=0;
2085 if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2086 {
2087 plan->packplan=make_cfftp_plan(length);
2088 if (!plan->packplan) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
2089 return plan;
2090 }
2091 double comp1 = cost_guess(length);
2092 double comp2 = 2*cost_guess(good_size(2*length-1));
2093 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2094 if (comp2<comp1) // use Bluestein
2095 {
2096 plan->blueplan=make_fftblue_plan(length);
2097 if (!plan->blueplan) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
2098 }
2099 else
2100 {
2101 plan->packplan=make_cfftp_plan(length);
2102 if (!plan->packplan) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
2103 }
2104 return plan;
2105 }
2106
2107static void destroy_cfft_plan (cfft_plan plan)
2108 {
2109 if (plan->blueplan)
2110 destroy_fftblue_plan(plan->blueplan);
2111 if (plan->packplan)
2112 destroy_cfftp_plan(plan->packplan);
2113 DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0);
2114 }
2115
2116WARN_UNUSED_RESULT__attribute__ ((warn_unused_result)) static int cfft_backward(cfft_plan plan, double c[], double fct)
2117 {
2118 if (plan->packplan)
2119 return cfftp_backward(plan->packplan,c,fct);
2120 // if (plan->blueplan)
2121 return cfftblue_backward(plan->blueplan,c,fct);
2122 }
2123
2124WARN_UNUSED_RESULT__attribute__ ((warn_unused_result)) static int cfft_forward(cfft_plan plan, double c[], double fct)
2125 {
2126 if (plan->packplan)
2127 return cfftp_forward(plan->packplan,c,fct);
2128 // if (plan->blueplan)
2129 return cfftblue_forward(plan->blueplan,c,fct);
2130 }
2131
2132typedef struct rfft_plan_i
2133 {
2134 rfftp_plan packplan;
2135 fftblue_plan blueplan;
2136 } rfft_plan_i;
2137
2138static rfft_plan make_rfft_plan (size_t length)
2139 {
2140 if (length==0) return NULL((void*)0);
2141 rfft_plan plan = RALLOC(rfft_plan_i,1)((rfft_plan_i *)malloc((1)*sizeof(rfft_plan_i)));
2142 if (!plan) return NULL((void*)0);
2143 plan->blueplan=0;
2144 plan->packplan=0;
2145 if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2146 {
2147 plan->packplan=make_rfftp_plan(length);
2148 if (!plan->packplan) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
2149 return plan;
2150 }
2151 double comp1 = 0.5*cost_guess(length);
2152 double comp2 = 2*cost_guess(good_size(2*length-1));
2153 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2154 if (comp2<comp1) // use Bluestein
2155 {
2156 plan->blueplan=make_fftblue_plan(length);
2157 if (!plan->blueplan) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
2158 }
2159 else
2160 {
2161 plan->packplan=make_rfftp_plan(length);
2162 if (!plan->packplan) { DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0); return NULL((void*)0); }
2163 }
2164 return plan;
2165 }
2166
2167static void destroy_rfft_plan (rfft_plan plan)
2168 {
2169 if (plan->blueplan)
2170 destroy_fftblue_plan(plan->blueplan);
2171 if (plan->packplan)
2172 destroy_rfftp_plan(plan->packplan);
2173 DEALLOC(plan)do { free(plan); (plan)=((void*)0); } while(0);
2174 }
2175
2176WARN_UNUSED_RESULT__attribute__ ((warn_unused_result)) static int rfft_backward(rfft_plan plan, double c[], double fct)
2177 {
2178 if (plan->packplan)
2179 return rfftp_backward(plan->packplan,c,fct);
2180 else // if (plan->blueplan)
2181 return rfftblue_backward(plan->blueplan,c,fct);
2182 }
2183
2184WARN_UNUSED_RESULT__attribute__ ((warn_unused_result)) static int rfft_forward(rfft_plan plan, double c[], double fct)
2185 {
2186 if (plan->packplan)
2187 return rfftp_forward(plan->packplan,c,fct);
2188 else // if (plan->blueplan)
2189 return rfftblue_forward(plan->blueplan,c,fct);
2190 }
2191
2192static PyObject *
2193execute_complex(PyObject *a1, int is_forward, double fct)
2194{
2195 PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int,
PyObject *)) PyArray_API[69])
(a1,
2196 PyArray_DescrFromType(*(PyArray_Descr * (*)(int)) PyArray_API[45])(NPY_CDOUBLE), 1, 0,
2197 NPY_ARRAY_ENSURECOPY0x0020 | NPY_ARRAY_DEFAULT((0x0001 | (0x0100 | 0x0400))) |
2198 NPY_ARRAY_ENSUREARRAY0x0040 | NPY_ARRAY_FORCECAST0x0010,
2199 NULL((void*)0));
2200 if (!data) return NULL((void*)0);
2201
2202 int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
2203 cfft_plan plan=NULL((void*)0);
2204
2205 int nrepeats = PyArray_SIZE(data)(*(npy_intp (*)(npy_intp const *, int)) PyArray_API[158])(PyArray_DIMS
(data), PyArray_NDIM(data))
/npts;
2206 double *dptr = (double *)PyArray_DATA(data);
2207 int fail=0;
2208 Py_BEGIN_ALLOW_THREADS{ PyThreadState *_save; _save = PyEval_SaveThread();;
2209 plan = make_cfft_plan(npts);
2210 if (!plan) fail=1;
2211 if (!fail)
2212 for (int i = 0; i < nrepeats; i++) {
2213 int res = is_forward ?
2214 cfft_forward(plan, dptr, fct) : cfft_backward(plan, dptr, fct);
2215 if (res!=0) { fail=1; break; }
2216 dptr += npts*2;
2217 }
2218 if (plan) destroy_cfft_plan(plan);
2219 Py_END_ALLOW_THREADSPyEval_RestoreThread(_save); };
2220 if (fail) {
2221 Py_XDECREF(data)_Py_XDECREF(((PyObject*)(data)));
2222 return PyErr_NoMemory();
2223 }
2224 return (PyObject *)data;
2225}
2226
2227static PyObject *
2228execute_real_forward(PyObject *a1, double fct)
2229{
2230 rfft_plan plan=NULL((void*)0);
2231 int fail = 0;
2232 PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int,
PyObject *)) PyArray_API[69])
(a1,
2233 PyArray_DescrFromType(*(PyArray_Descr * (*)(int)) PyArray_API[45])(NPY_DOUBLE), 1, 0,
2234 NPY_ARRAY_DEFAULT((0x0001 | (0x0100 | 0x0400))) | NPY_ARRAY_ENSUREARRAY0x0040 | NPY_ARRAY_FORCECAST0x0010,
2235 NULL((void*)0));
2236 if (!data) return NULL((void*)0);
2237
2238 int ndim = PyArray_NDIM(data);
2239 const npy_intp *odim = PyArray_DIMS(data);
2240 int npts = odim[ndim - 1];
2241 npy_intp *tdim=(npy_intp *)malloc(ndim*sizeof(npy_intp));
2242 if (!tdim)
2243 { Py_XDECREF(data)_Py_XDECREF(((PyObject*)(data))); return NULL((void*)0); }
2244 for (int d=0; d<ndim-1; ++d)
2245 tdim[d] = odim[d];
2246 tdim[ndim-1] = npts/2 + 1;
2247 PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(*(PyObject * (*)(int, npy_intp const *, PyArray_Descr *, int
)) PyArray_API[184])
(ndim,
2248 tdim, PyArray_DescrFromType(*(PyArray_Descr * (*)(int)) PyArray_API[45])(NPY_CDOUBLE), 0);
2249 free(tdim);
2250 if (!ret) fail=1;
2251 if (!fail) {
2252 int rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
2253
2254 int nrepeats = PyArray_SIZE(data)(*(npy_intp (*)(npy_intp const *, int)) PyArray_API[158])(PyArray_DIMS
(data), PyArray_NDIM(data))
/npts;
2255 double *rptr = (double *)PyArray_DATA(ret),
2256 *dptr = (double *)PyArray_DATA(data);
2257
2258 Py_BEGIN_ALLOW_THREADS{ PyThreadState *_save; _save = PyEval_SaveThread();;
2259 plan = make_rfft_plan(npts);
2260 if (!plan) fail=1;
2261 if (!fail)
2262 for (int i = 0; i < nrepeats; i++) {
2263 rptr[rstep-1] = 0.0;
2264 memcpy((char *)(rptr+1), dptr, npts*sizeof(double));
2265 if (rfft_forward(plan, rptr+1, fct)!=0) {fail=1; break;}
2266 rptr[0] = rptr[1];
2267 rptr[1] = 0.0;
2268 rptr += rstep;
2269 dptr += npts;
2270 }
2271 if (plan) destroy_rfft_plan(plan);
2272 Py_END_ALLOW_THREADSPyEval_RestoreThread(_save); };
2273 }
2274 if (fail) {
2275 Py_XDECREF(data)_Py_XDECREF(((PyObject*)(data)));
2276 Py_XDECREF(ret)_Py_XDECREF(((PyObject*)(ret)));
2277 return PyErr_NoMemory();
2278 }
2279 Py_DECREF(data)_Py_DECREF(((PyObject*)(data)));
2280 return (PyObject *)ret;
2281}
2282static PyObject *
2283execute_real_backward(PyObject *a1, double fct)
2284{
2285 rfft_plan plan=NULL((void*)0);
2286 PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int,
PyObject *)) PyArray_API[69])
(a1,
2287 PyArray_DescrFromType(*(PyArray_Descr * (*)(int)) PyArray_API[45])(NPY_CDOUBLE), 1, 0,
2288 NPY_ARRAY_DEFAULT((0x0001 | (0x0100 | 0x0400))) | NPY_ARRAY_ENSUREARRAY0x0040 | NPY_ARRAY_FORCECAST0x0010,
2289 NULL((void*)0));
2290 if (!data) return NULL((void*)0);
2291 int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
2292 PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(*(PyObject * (*)(int, npy_intp const *, PyArray_Descr *, int
)) PyArray_API[184])
(PyArray_NDIM(data),
2293 PyArray_DIMS(data), PyArray_DescrFromType(*(PyArray_Descr * (*)(int)) PyArray_API[45])(NPY_DOUBLE), 0);
2294 int fail = 0;
2295 if (!ret) fail=1;
2296 if (!fail) {
2297 int nrepeats = PyArray_SIZE(ret)(*(npy_intp (*)(npy_intp const *, int)) PyArray_API[158])(PyArray_DIMS
(ret), PyArray_NDIM(ret))
/npts;
2298 double *rptr = (double *)PyArray_DATA(ret),
2299 *dptr = (double *)PyArray_DATA(data);
2300
2301 Py_BEGIN_ALLOW_THREADS{ PyThreadState *_save; _save = PyEval_SaveThread();;
2302 plan = make_rfft_plan(npts);
2303 if (!plan) fail=1;
2304 if (!fail) {
2305 for (int i = 0; i < nrepeats; i++) {
2306 memcpy((char *)(rptr + 1), (dptr + 2), (npts - 1)*sizeof(double));
2307 rptr[0] = dptr[0];
2308 if (rfft_backward(plan, rptr, fct)!=0) {fail=1; break;}
2309 rptr += npts;
2310 dptr += npts*2;
2311 }
2312 }
2313 if (plan) destroy_rfft_plan(plan);
2314 Py_END_ALLOW_THREADSPyEval_RestoreThread(_save); };
2315 }
2316 if (fail) {
2317 Py_XDECREF(data)_Py_XDECREF(((PyObject*)(data)));
2318 Py_XDECREF(ret)_Py_XDECREF(((PyObject*)(ret)));
2319 return PyErr_NoMemory();
2320 }
2321 Py_DECREF(data)_Py_DECREF(((PyObject*)(data)));
2322 return (PyObject *)ret;
2323}
2324
2325static PyObject *
2326execute_real(PyObject *a1, int is_forward, double fct)
2327{
2328 return is_forward ? execute_real_forward(a1, fct)
2329 : execute_real_backward(a1, fct);
2330}
2331
2332static const char execute__doc__[] = "";
2333
2334static PyObject *
2335execute(PyObject *NPY_UNUSED(self)(__NPY_UNUSED_TAGGEDself) __attribute__ ((__unused__)), PyObject *args)
2336{
2337 PyObject *a1;
2338 int is_real, is_forward;
2339 double fct;
2340
2341 if(!PyArg_ParseTuple(args, "Oiid:execute", &a1, &is_real, &is_forward, &fct)) {
2342 return NULL((void*)0);
2343 }
2344
2345 return is_real ? execute_real(a1, is_forward, fct)
2346 : execute_complex(a1, is_forward, fct);
2347}
2348
2349/* List of methods defined in the module */
2350
2351static struct PyMethodDef methods[] = {
2352 {"execute", execute, 1, execute__doc__},
2353 {NULL((void*)0), NULL((void*)0), 0, NULL((void*)0)} /* sentinel */
2354};
2355
2356static struct PyModuleDef moduledef = {
2357 PyModuleDef_HEAD_INIT{ { 1, ((void*)0) }, ((void*)0), 0, ((void*)0), },
2358 "_pocketfft_internal",
2359 NULL((void*)0),
2360 -1,
2361 methods,
2362 NULL((void*)0),
2363 NULL((void*)0),
2364 NULL((void*)0),
2365 NULL((void*)0)
2366};
2367
2368/* Initialization function for the module */
2369PyMODINIT_FUNCPyObject* PyInit__pocketfft_internal(void)
2370{
2371 PyObject *m;
2372 m = PyModule_Create(&moduledef)PyModule_Create2(&moduledef, 1013);
1
Calling 'PyModule_Create2'
3
Returning from 'PyModule_Create2'
7
PyObject ownership leak with reference count of 1
2373 if (m == NULL((void*)0)) {
4
Assuming 'm' is not equal to NULL
5
Taking false branch
2374 return NULL((void*)0);
2375 }
2376
2377 /* Import the array object */
2378 import_array(){if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(
PyExc_ImportError, "numpy.core.multiarray failed to import");
return ((void*)0); } }
;
6
Taking true branch
2379
2380 /* XXXX Add constants here */
2381
2382 return m;
2383}

/opt/pyrefcon/lib/pyrefcon/models/models/PyModule_Create2.model

1#ifndef PyModule_Create2
2struct _object;
3typedef struct _object PyObject;
4PyObject* clang_analyzer_PyObject_New_Reference();
5PyObject* PyModule_Create2(PyModuleDef *def, int module_api_version) {
6 return clang_analyzer_PyObject_New_Reference();
2
Setting reference count to 1
7}
8#else
9#warning "API PyModule_Create2 is defined as a macro."
10#endif