Rev 3 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
2 | pj | 1 | /* |
2 | * Copyright (c) 1997-1999 Massachusetts Institute of Technology |
||
3 | * |
||
4 | * This program is free software; you can redistribute it and/or modify |
||
5 | * it under the terms of the GNU General Public License as published by |
||
6 | * the Free Software Foundation; either version 2 of the License, or |
||
7 | * (at your option) any later version. |
||
8 | * |
||
9 | * This program is distributed in the hope that it will be useful, |
||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
12 | * GNU General Public License for more details. |
||
13 | * |
||
14 | * You should have received a copy of the GNU General Public License |
||
15 | * along with this program; if not, write to the Free Software |
||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
||
17 | * |
||
18 | */ |
||
19 | |||
107 | pj | 20 | /* $Id: fftwnd.c,v 1.2 2003-03-24 11:14:50 pj Exp $ */ |
2 | pj | 21 | |
107 | pj | 22 | #include <fftw-int.h> |
2 | pj | 23 | |
24 | /* the number of buffers to use for buffered transforms: */ |
||
25 | #define FFTWND_NBUFFERS 8 |
||
26 | |||
27 | /* the default number of buffers to use: */ |
||
28 | #define FFTWND_DEFAULT_NBUFFERS 0 |
||
29 | |||
30 | /* the number of "padding" elements between consecutive buffer lines */ |
||
31 | #define FFTWND_BUFFER_PADDING 8 |
||
32 | |||
33 | static void destroy_plan_array(int rank, fftw_plan *plans); |
||
34 | |||
35 | static void init_test_array(fftw_complex *arr, int stride, int n) |
||
36 | { |
||
37 | int j; |
||
38 | |||
39 | for (j = 0; j < n; ++j) { |
||
40 | c_re(arr[stride * j]) = 0.0; |
||
41 | c_im(arr[stride * j]) = 0.0; |
||
42 | } |
||
43 | } |
||
44 | |||
45 | /* |
||
46 | * Same as fftw_measure_runtime, except for fftwnd plan. |
||
47 | */ |
||
48 | double fftwnd_measure_runtime(fftwnd_plan plan, |
||
49 | fftw_complex *in, int istride, |
||
50 | fftw_complex *out, int ostride) |
||
51 | { |
||
52 | fftw_time begin, end, start; |
||
53 | double t, tmax, tmin; |
||
54 | int i, iter; |
||
55 | int n; |
||
56 | int repeat; |
||
57 | |||
58 | if (plan->rank == 0) |
||
59 | return 0.0; |
||
60 | |||
61 | n = 1; |
||
62 | for (i = 0; i < plan->rank; ++i) |
||
63 | n *= plan->n[i]; |
||
64 | |||
65 | iter = 1; |
||
66 | |||
67 | for (;;) { |
||
68 | tmin = 1.0E10; |
||
69 | tmax = -1.0E10; |
||
70 | init_test_array(in, istride, n); |
||
71 | |||
72 | start = fftw_get_time(); |
||
73 | /* repeat the measurement FFTW_TIME_REPEAT times */ |
||
74 | for (repeat = 0; repeat < FFTW_TIME_REPEAT; ++repeat) { |
||
75 | begin = fftw_get_time(); |
||
76 | for (i = 0; i < iter; ++i) { |
||
77 | fftwnd(plan, 1, in, istride, 0, out, ostride, 0); |
||
78 | } |
||
79 | end = fftw_get_time(); |
||
80 | |||
81 | t = fftw_time_to_sec(fftw_time_diff(end, begin)); |
||
82 | if (t < tmin) |
||
83 | tmin = t; |
||
84 | if (t > tmax) |
||
85 | tmax = t; |
||
86 | |||
87 | /* do not run for too long */ |
||
88 | t = fftw_time_to_sec(fftw_time_diff(end, start)); |
||
89 | if (t > FFTW_TIME_LIMIT) |
||
90 | break; |
||
91 | } |
||
92 | |||
93 | if (tmin >= FFTW_TIME_MIN) |
||
94 | break; |
||
95 | |||
96 | iter *= 2; |
||
97 | } |
||
98 | |||
99 | tmin /= (double) iter; |
||
100 | tmax /= (double) iter; |
||
101 | |||
102 | return tmin; |
||
103 | } |
||
104 | |||
105 | /********************** Initializing the FFTWND Plan ***********************/ |
||
106 | |||
107 | /* Initialize everything except for the 1D plans and the work array: */ |
||
108 | fftwnd_plan fftwnd_create_plan_aux(int rank, const int *n, |
||
109 | fftw_direction dir, int flags) |
||
110 | { |
||
111 | int i; |
||
112 | fftwnd_plan p; |
||
113 | |||
114 | if (rank < 0) |
||
115 | return 0; |
||
116 | |||
117 | for (i = 0; i < rank; ++i) |
||
118 | if (n[i] <= 0) |
||
119 | return 0; |
||
120 | |||
121 | p = (fftwnd_plan) fftw_malloc(sizeof(fftwnd_data)); |
||
122 | p->n = 0; |
||
123 | p->n_before = 0; |
||
124 | p->n_after = 0; |
||
125 | p->plans = 0; |
||
126 | p->work = 0; |
||
127 | p->dir = dir; |
||
128 | |||
129 | p->rank = rank; |
||
130 | p->is_in_place = flags & FFTW_IN_PLACE; |
||
131 | |||
132 | p->nwork = 0; |
||
133 | p->nbuffers = 0; |
||
134 | |||
135 | if (rank == 0) |
||
136 | return 0; |
||
137 | |||
138 | p->n = (int *) fftw_malloc(sizeof(int) * rank); |
||
139 | p->n_before = (int *) fftw_malloc(sizeof(int) * rank); |
||
140 | p->n_after = (int *) fftw_malloc(sizeof(int) * rank); |
||
141 | p->n_before[0] = 1; |
||
142 | p->n_after[rank - 1] = 1; |
||
143 | |||
144 | for (i = 0; i < rank; ++i) { |
||
145 | p->n[i] = n[i]; |
||
146 | |||
147 | if (i) { |
||
148 | p->n_before[i] = p->n_before[i - 1] * n[i - 1]; |
||
149 | p->n_after[rank - 1 - i] = p->n_after[rank - i] * n[rank - i]; |
||
150 | } |
||
151 | } |
||
152 | |||
153 | return p; |
||
154 | } |
||
155 | |||
156 | /* create an empty new array of rank 1d plans */ |
||
157 | fftw_plan *fftwnd_new_plan_array(int rank) |
||
158 | { |
||
159 | fftw_plan *plans; |
||
160 | int i; |
||
161 | |||
162 | plans = (fftw_plan *) fftw_malloc(rank * sizeof(fftw_plan)); |
||
163 | if (!plans) |
||
164 | return 0; |
||
165 | for (i = 0; i < rank; ++i) |
||
166 | plans[i] = 0; |
||
167 | return plans; |
||
168 | } |
||
169 | |||
170 | /* |
||
171 | * create an array of plans using the ordinary 1d fftw_create_plan, |
||
172 | * which allocates its own array and creates plans optimized for |
||
173 | * contiguous data. |
||
174 | */ |
||
175 | fftw_plan *fftwnd_create_plans_generic(fftw_plan *plans, |
||
176 | int rank, const int *n, |
||
177 | fftw_direction dir, int flags) |
||
178 | { |
||
179 | if (rank <= 0) |
||
180 | return 0; |
||
181 | |||
182 | if (plans) { |
||
183 | int i, j; |
||
184 | int cur_flags; |
||
185 | |||
186 | for (i = 0; i < rank; ++i) { |
||
187 | if (i < rank - 1 || (flags & FFTW_IN_PLACE)) { |
||
188 | /* |
||
189 | * fft's except the last dimension are always in-place |
||
190 | */ |
||
191 | cur_flags = flags | FFTW_IN_PLACE; |
||
192 | for (j = i - 1; j >= 0 && n[i] != n[j]; --j); |
||
193 | } else { |
||
194 | cur_flags = flags; |
||
195 | /* |
||
196 | * we must create a separate plan for the last |
||
197 | * dimension |
||
198 | */ |
||
199 | j = -1; |
||
200 | } |
||
201 | |||
202 | if (j >= 0) { |
||
203 | /* |
||
204 | * If a plan already exists for this size |
||
205 | * array, reuse it: |
||
206 | */ |
||
207 | plans[i] = plans[j]; |
||
208 | } else { |
||
209 | /* generate a new plan: */ |
||
210 | plans[i] = fftw_create_plan(n[i], dir, cur_flags); |
||
211 | if (!plans[i]) { |
||
212 | destroy_plan_array(rank, plans); |
||
213 | return 0; |
||
214 | } |
||
215 | } |
||
216 | } |
||
217 | } |
||
218 | return plans; |
||
219 | } |
||
220 | |||
221 | static int get_maxdim(int rank, const int *n, int flags) |
||
222 | { |
||
223 | int i; |
||
224 | int maxdim = 0; |
||
225 | |||
226 | for (i = 0; i < rank - 1; ++i) |
||
227 | if (n[i] > maxdim) |
||
228 | maxdim = n[i]; |
||
229 | if (rank > 0 && flags & FFTW_IN_PLACE && n[rank - 1] > maxdim) |
||
230 | maxdim = n[rank - 1]; |
||
231 | |||
232 | return maxdim; |
||
233 | } |
||
234 | |||
235 | /* compute number of elements required for work array (has to |
||
236 | be big enough to hold ncopies of the largest dimension in |
||
237 | n that will need an in-place transform. */ |
||
238 | int fftwnd_work_size(int rank, const int *n, int flags, int ncopies) |
||
239 | { |
||
240 | return (ncopies * get_maxdim(rank, n, flags) |
||
241 | + (ncopies - 1) * FFTWND_BUFFER_PADDING); |
||
242 | } |
||
243 | |||
244 | /* |
||
245 | * create plans using the fftw_create_plan_specific planner, which |
||
246 | * allows us to create plans for each dimension that are specialized |
||
247 | * for the strides that we are going to use. |
||
248 | */ |
||
249 | fftw_plan *fftwnd_create_plans_specific(fftw_plan *plans, |
||
250 | int rank, const int *n, |
||
251 | const int *n_after, |
||
252 | fftw_direction dir, int flags, |
||
253 | fftw_complex *in, int istride, |
||
254 | fftw_complex *out, int ostride) |
||
255 | { |
||
256 | if (rank <= 0) |
||
257 | return 0; |
||
258 | |||
259 | if (plans) { |
||
260 | int i, stride, cur_flags; |
||
261 | fftw_complex *work = 0; |
||
262 | int nwork; |
||
263 | |||
264 | nwork = fftwnd_work_size(rank, n, flags, 1); |
||
265 | if (nwork) |
||
266 | work = (fftw_complex*)fftw_malloc(nwork * sizeof(fftw_complex)); |
||
267 | |||
268 | for (i = 0; i < rank; ++i) { |
||
269 | /* fft's except the last dimension are always in-place */ |
||
270 | if (i < rank - 1) |
||
271 | cur_flags = flags | FFTW_IN_PLACE; |
||
272 | else |
||
273 | cur_flags = flags; |
||
274 | |||
275 | /* stride for transforming ith dimension */ |
||
276 | stride = n_after[i]; |
||
277 | |||
278 | if (cur_flags & FFTW_IN_PLACE) |
||
279 | plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags, |
||
280 | in, istride * stride, |
||
281 | work, 1); |
||
282 | else |
||
283 | plans[i] = fftw_create_plan_specific(n[i], dir, cur_flags, |
||
284 | in, istride * stride, |
||
285 | out, ostride * stride); |
||
286 | if (!plans[i]) { |
||
287 | destroy_plan_array(rank, plans); |
||
288 | fftw_free(work); |
||
289 | return 0; |
||
290 | } |
||
291 | } |
||
292 | |||
293 | if (work) |
||
294 | fftw_free(work); |
||
295 | } |
||
296 | return plans; |
||
297 | } |
||
298 | |||
299 | /* |
||
300 | * Create an fftwnd_plan specialized for specific arrays. (These |
||
301 | * arrays are ignored, however, if they are NULL or if the flags do |
||
302 | * not include FFTW_MEASURE.) The main advantage of being provided |
||
303 | * arrays like this is that we can do runtime timing measurements of |
||
304 | * our options, without worrying about allocating excessive scratch |
||
305 | * space. |
||
306 | */ |
||
307 | fftwnd_plan fftwnd_create_plan_specific(int rank, const int *n, |
||
308 | fftw_direction dir, int flags, |
||
309 | fftw_complex *in, int istride, |
||
310 | fftw_complex *out, int ostride) |
||
311 | { |
||
312 | fftwnd_plan p; |
||
313 | |||
314 | if (!(p = fftwnd_create_plan_aux(rank, n, dir, flags))) |
||
315 | return 0; |
||
316 | |||
317 | if (!(flags & FFTW_MEASURE) || in == 0 |
||
318 | || (!p->is_in_place && out == 0)) { |
||
319 | |||
320 | /**** use default plan ****/ |
||
321 | |||
322 | p->plans = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank), |
||
323 | rank, n, dir, flags); |
||
324 | if (!p->plans) { |
||
325 | fftwnd_destroy_plan(p); |
||
326 | return 0; |
||
327 | } |
||
328 | if (flags & FFTWND_FORCE_BUFFERED) |
||
329 | p->nbuffers = FFTWND_NBUFFERS; |
||
330 | else |
||
331 | p->nbuffers = FFTWND_DEFAULT_NBUFFERS; |
||
332 | |||
333 | p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1); |
||
334 | if (p->nwork && !(flags & FFTW_THREADSAFE)) { |
||
335 | p->work = (fftw_complex*) fftw_malloc(p->nwork |
||
336 | * sizeof(fftw_complex)); |
||
337 | if (!p->work) { |
||
338 | fftwnd_destroy_plan(p); |
||
339 | return 0; |
||
340 | } |
||
341 | } |
||
342 | } else { |
||
343 | /**** use runtime measurements to pick plan ****/ |
||
344 | |||
345 | fftw_plan *plans_buf, *plans_nobuf; |
||
346 | double t_buf, t_nobuf; |
||
347 | |||
348 | p->nwork = fftwnd_work_size(rank, n, flags, FFTWND_NBUFFERS + 1); |
||
349 | if (p->nwork && !(flags & FFTW_THREADSAFE)) { |
||
350 | p->work = (fftw_complex*) fftw_malloc(p->nwork |
||
351 | * sizeof(fftw_complex)); |
||
352 | if (!p->work) { |
||
353 | fftwnd_destroy_plan(p); |
||
354 | return 0; |
||
355 | } |
||
356 | } |
||
357 | else |
||
358 | p->work = (fftw_complex*) NULL; |
||
359 | |||
360 | /* two possible sets of 1D plans: */ |
||
361 | plans_buf = fftwnd_create_plans_generic(fftwnd_new_plan_array(rank), |
||
362 | rank, n, dir, flags); |
||
363 | plans_nobuf = |
||
364 | fftwnd_create_plans_specific(fftwnd_new_plan_array(rank), |
||
365 | rank, n, p->n_after, dir, |
||
366 | flags, in, istride, |
||
367 | out, ostride); |
||
368 | if (!plans_buf || !plans_nobuf) { |
||
369 | destroy_plan_array(rank, plans_nobuf); |
||
370 | destroy_plan_array(rank, plans_buf); |
||
371 | fftwnd_destroy_plan(p); |
||
372 | return 0; |
||
373 | } |
||
374 | /* time the two possible plans */ |
||
375 | p->plans = plans_nobuf; |
||
376 | p->nbuffers = 0; |
||
377 | p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1); |
||
378 | t_nobuf = fftwnd_measure_runtime(p, in, istride, out, ostride); |
||
379 | p->plans = plans_buf; |
||
380 | p->nbuffers = FFTWND_NBUFFERS; |
||
381 | p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1); |
||
382 | t_buf = fftwnd_measure_runtime(p, in, istride, out, ostride); |
||
383 | |||
384 | /* pick the better one: */ |
||
385 | if (t_nobuf < t_buf) { /* use unbuffered transform */ |
||
386 | p->plans = plans_nobuf; |
||
387 | p->nbuffers = 0; |
||
388 | |||
389 | /* work array is unnecessarily large */ |
||
390 | if (p->work) |
||
391 | fftw_free(p->work); |
||
392 | p->work = 0; |
||
393 | |||
394 | destroy_plan_array(rank, plans_buf); |
||
395 | |||
396 | /* allocate a work array of the correct size: */ |
||
397 | p->nwork = fftwnd_work_size(rank, n, flags, p->nbuffers + 1); |
||
398 | if (p->nwork && !(flags & FFTW_THREADSAFE)) { |
||
399 | p->work = (fftw_complex*) fftw_malloc(p->nwork |
||
400 | * sizeof(fftw_complex)); |
||
401 | if (!p->work) { |
||
402 | fftwnd_destroy_plan(p); |
||
403 | return 0; |
||
404 | } |
||
405 | } |
||
406 | } else { /* use buffered transform */ |
||
407 | destroy_plan_array(rank, plans_nobuf); |
||
408 | } |
||
409 | } |
||
410 | |||
411 | return p; |
||
412 | } |
||
413 | |||
414 | fftwnd_plan fftw2d_create_plan_specific(int nx, int ny, |
||
415 | fftw_direction dir, int flags, |
||
416 | fftw_complex *in, int istride, |
||
417 | fftw_complex *out, int ostride) |
||
418 | { |
||
419 | int n[2]; |
||
420 | |||
421 | n[0] = nx; |
||
422 | n[1] = ny; |
||
423 | |||
424 | return fftwnd_create_plan_specific(2, n, dir, flags, |
||
425 | in, istride, out, ostride); |
||
426 | } |
||
427 | |||
428 | fftwnd_plan fftw3d_create_plan_specific(int nx, int ny, int nz, |
||
429 | fftw_direction dir, int flags, |
||
430 | fftw_complex *in, int istride, |
||
431 | fftw_complex *out, int ostride) |
||
432 | { |
||
433 | int n[3]; |
||
434 | |||
435 | n[0] = nx; |
||
436 | n[1] = ny; |
||
437 | n[2] = nz; |
||
438 | |||
439 | return fftwnd_create_plan_specific(3, n, dir, flags, |
||
440 | in, istride, out, ostride); |
||
441 | } |
||
442 | |||
443 | /* Create a generic fftwnd plan: */ |
||
444 | |||
445 | fftwnd_plan fftwnd_create_plan(int rank, const int *n, |
||
446 | fftw_direction dir, int flags) |
||
447 | { |
||
448 | return fftwnd_create_plan_specific(rank, n, dir, flags, 0, 1, 0, 1); |
||
449 | } |
||
450 | |||
451 | fftwnd_plan fftw2d_create_plan(int nx, int ny, |
||
452 | fftw_direction dir, int flags) |
||
453 | { |
||
454 | return fftw2d_create_plan_specific(nx, ny, dir, flags, 0, 1, 0, 1); |
||
455 | } |
||
456 | |||
457 | fftwnd_plan fftw3d_create_plan(int nx, int ny, int nz, |
||
458 | fftw_direction dir, int flags) |
||
459 | { |
||
460 | return fftw3d_create_plan_specific(nx, ny, nz, dir, flags, 0, 1, 0, 1); |
||
461 | } |
||
462 | |||
463 | /************************ Freeing the FFTWND Plan ************************/ |
||
464 | |||
465 | static void destroy_plan_array(int rank, fftw_plan *plans) |
||
466 | { |
||
467 | if (plans) { |
||
468 | int i, j; |
||
469 | |||
470 | for (i = 0; i < rank; ++i) { |
||
471 | for (j = i - 1; |
||
472 | j >= 0 && plans[i] != plans[j]; |
||
473 | --j); |
||
474 | if (j < 0 && plans[i]) |
||
475 | fftw_destroy_plan(plans[i]); |
||
476 | } |
||
477 | fftw_free(plans); |
||
478 | } |
||
479 | } |
||
480 | |||
481 | void fftwnd_destroy_plan(fftwnd_plan plan) |
||
482 | { |
||
483 | if (plan) { |
||
484 | destroy_plan_array(plan->rank, plan->plans); |
||
485 | |||
486 | if (plan->n) |
||
487 | fftw_free(plan->n); |
||
488 | |||
489 | if (plan->n_before) |
||
490 | fftw_free(plan->n_before); |
||
491 | |||
492 | if (plan->n_after) |
||
493 | fftw_free(plan->n_after); |
||
494 | |||
495 | if (plan->work) |
||
496 | fftw_free(plan->work); |
||
497 | |||
498 | fftw_free(plan); |
||
499 | } |
||
500 | } |
||
501 | |||
502 | /************************ Printing the FFTWND Plan ************************/ |
||
503 | /* |
||
504 | void fftwnd_fprint_plan(FILE *f, fftwnd_plan plan) |
||
505 | { |
||
506 | if (plan) { |
||
507 | int i, j; |
||
508 | |||
509 | if (plan->rank == 0) { |
||
510 | fprintf(f, "plan for rank 0 (null) transform.\n"); |
||
511 | return; |
||
512 | } |
||
513 | fprintf(f, "plan for "); |
||
514 | for (i = 0; i < plan->rank; ++i) |
||
515 | fprintf(f, "%s%d", i ? "x" : "", plan->n[i]); |
||
516 | fprintf(f, " transform:\n"); |
||
517 | |||
518 | if (plan->nbuffers > 0) |
||
519 | fprintf(f, " -- using buffered transforms (%d buffers)\n", |
||
520 | plan->nbuffers); |
||
521 | else |
||
522 | fprintf(f, " -- using unbuffered transform\n"); |
||
523 | |||
524 | for (i = 0; i < plan->rank; ++i) { |
||
525 | fprintf(f, "* dimension %d (size %d) ", i, plan->n[i]); |
||
526 | |||
527 | for (j = i - 1; j >= 0; --j) |
||
528 | if (plan->plans[j] == plan->plans[i]) |
||
529 | break; |
||
530 | |||
531 | if (j < 0) |
||
532 | fftw_fprint_plan(f, plan->plans[i]); |
||
533 | else |
||
534 | fprintf(f, "plan is same as dimension %d plan.\n", j); |
||
535 | } |
||
536 | } |
||
537 | } |
||
538 | |||
539 | void fftwnd_print_plan(fftwnd_plan plan) |
||
540 | { |
||
541 | fftwnd_fprint_plan(stdout, plan); |
||
542 | } |
||
543 | */ |
||
544 | /********************* Buffered FFTW (in-place) *********************/ |
||
545 | |||
546 | void fftw_buffered(fftw_plan p, int howmany, |
||
547 | fftw_complex *in, int istride, int idist, |
||
548 | fftw_complex *work, |
||
549 | int nbuffers, fftw_complex *buffers) |
||
550 | { |
||
551 | int i = 0, n, nb; |
||
552 | |||
553 | n = p->n; |
||
554 | nb = n + FFTWND_BUFFER_PADDING; |
||
555 | |||
556 | do { |
||
557 | for (; i <= howmany - nbuffers; i += nbuffers) { |
||
558 | fftw_complex *cur_in = in + i * idist; |
||
559 | int j, buf; |
||
560 | |||
561 | /* |
||
562 | * First, copy nbuffers strided arrays to the |
||
563 | * contiguous buffer arrays (reading consecutive |
||
564 | * locations, assuming that idist is 1): |
||
565 | */ |
||
566 | for (j = 0; j < n; ++j) { |
||
567 | fftw_complex *cur_in2 = cur_in + j * istride; |
||
568 | fftw_complex *cur_buffers = buffers + j; |
||
569 | |||
570 | for (buf = 0; buf <= nbuffers - 4; buf += 4) { |
||
571 | *cur_buffers = *cur_in2; |
||
572 | *(cur_buffers += nb) = *(cur_in2 += idist); |
||
573 | *(cur_buffers += nb) = *(cur_in2 += idist); |
||
574 | *(cur_buffers += nb) = *(cur_in2 += idist); |
||
575 | cur_buffers += nb; |
||
576 | cur_in2 += idist; |
||
577 | } |
||
578 | for (; buf < nbuffers; ++buf) { |
||
579 | *cur_buffers = *cur_in2; |
||
580 | cur_buffers += nb; |
||
581 | cur_in2 += idist; |
||
582 | } |
||
583 | } |
||
584 | |||
585 | /* |
||
586 | * Now, compute the FFTs in the buffers (in-place |
||
587 | * using work): |
||
588 | */ |
||
589 | fftw(p, nbuffers, buffers, 1, nb, work, 1, 0); |
||
590 | |||
591 | /* |
||
592 | * Finally, copy the results back from the contiguous |
||
593 | * buffers to the strided arrays (writing consecutive |
||
594 | * locations): |
||
595 | */ |
||
596 | for (j = 0; j < n; ++j) { |
||
597 | fftw_complex *cur_in2 = cur_in + j * istride; |
||
598 | fftw_complex *cur_buffers = buffers + j; |
||
599 | |||
600 | for (buf = 0; buf <= nbuffers - 4; buf += 4) { |
||
601 | *cur_in2 = *cur_buffers; |
||
602 | *(cur_in2 += idist) = *(cur_buffers += nb); |
||
603 | *(cur_in2 += idist) = *(cur_buffers += nb); |
||
604 | *(cur_in2 += idist) = *(cur_buffers += nb); |
||
605 | cur_buffers += nb; |
||
606 | cur_in2 += idist; |
||
607 | } |
||
608 | for (; buf < nbuffers; ++buf) { |
||
609 | *cur_in2 = *cur_buffers; |
||
610 | cur_buffers += nb; |
||
611 | cur_in2 += idist; |
||
612 | } |
||
613 | } |
||
614 | } |
||
615 | |||
616 | /* |
||
617 | * we skip howmany % nbuffers ffts at the end of the loop, |
||
618 | * so we have to go back and do them: |
||
619 | */ |
||
620 | nbuffers = howmany - i; |
||
621 | } while (i < howmany); |
||
622 | } |
||
623 | |||
624 | /********************* Computing the N-Dimensional FFT *********************/ |
||
625 | |||
626 | void fftwnd_aux(fftwnd_plan p, int cur_dim, |
||
627 | fftw_complex *in, int istride, |
||
628 | fftw_complex *out, int ostride, |
||
629 | fftw_complex *work) |
||
630 | { |
||
631 | int n_after = p->n_after[cur_dim], n = p->n[cur_dim]; |
||
632 | |||
633 | if (cur_dim == p->rank - 2) { |
||
634 | /* just do the last dimension directly: */ |
||
635 | if (p->is_in_place) |
||
636 | fftw(p->plans[p->rank - 1], n, |
||
637 | in, istride, n_after * istride, |
||
638 | work, 1, 0); |
||
639 | else |
||
640 | fftw(p->plans[p->rank - 1], n, |
||
641 | in, istride, n_after * istride, |
||
642 | out, ostride, n_after * ostride); |
||
643 | } else { /* we have at least two dimensions to go */ |
||
644 | int i; |
||
645 | |||
646 | /* |
||
647 | * process the subsequent dimensions recursively, in hyperslabs, |
||
648 | * to get maximum locality: |
||
649 | */ |
||
650 | for (i = 0; i < n; ++i) |
||
651 | fftwnd_aux(p, cur_dim + 1, |
||
652 | in + i * n_after * istride, istride, |
||
653 | out + i * n_after * ostride, ostride, work); |
||
654 | } |
||
655 | |||
656 | /* do the current dimension (in-place): */ |
||
657 | if (p->nbuffers == 0) { |
||
658 | fftw(p->plans[cur_dim], n_after, |
||
659 | out, n_after * ostride, ostride, |
||
660 | work, 1, 0); |
||
661 | } else /* using contiguous copy buffers: */ |
||
662 | fftw_buffered(p->plans[cur_dim], n_after, |
||
663 | out, n_after * ostride, ostride, |
||
664 | work, p->nbuffers, work + n); |
||
665 | } |
||
666 | |||
667 | /* |
||
668 | * alternate version of fftwnd_aux -- this version pushes the howmany |
||
669 | * loop down to the leaves of the computation, for greater locality in |
||
670 | * cases where dist < stride |
||
671 | */ |
||
672 | void fftwnd_aux_howmany(fftwnd_plan p, int cur_dim, |
||
673 | int howmany, |
||
674 | fftw_complex *in, int istride, int idist, |
||
675 | fftw_complex *out, int ostride, int odist, |
||
676 | fftw_complex *work) |
||
677 | { |
||
678 | int n_after = p->n_after[cur_dim], n = p->n[cur_dim]; |
||
679 | int k; |
||
680 | |||
681 | if (cur_dim == p->rank - 2) { |
||
682 | /* just do the last dimension directly: */ |
||
683 | if (p->is_in_place) |
||
684 | for (k = 0; k < n; ++k) |
||
685 | fftw(p->plans[p->rank - 1], howmany, |
||
686 | in + k * n_after * istride, istride, idist, |
||
687 | work, 1, 0); |
||
688 | else |
||
689 | for (k = 0; k < n; ++k) |
||
690 | fftw(p->plans[p->rank - 1], howmany, |
||
691 | in + k * n_after * istride, istride, idist, |
||
692 | out + k * n_after * ostride, ostride, odist); |
||
693 | } else { /* we have at least two dimensions to go */ |
||
694 | int i; |
||
695 | |||
696 | /* |
||
697 | * process the subsequent dimensions recursively, in |
||
698 | * hyperslabs, to get maximum locality: |
||
699 | */ |
||
700 | for (i = 0; i < n; ++i) |
||
701 | fftwnd_aux_howmany(p, cur_dim + 1, howmany, |
||
702 | in + i * n_after * istride, istride, idist, |
||
703 | out + i * n_after * ostride, ostride, odist, |
||
704 | work); |
||
705 | } |
||
706 | |||
707 | /* do the current dimension (in-place): */ |
||
708 | if (p->nbuffers == 0) |
||
709 | for (k = 0; k < n_after; ++k) |
||
710 | fftw(p->plans[cur_dim], howmany, |
||
711 | out + k * ostride, n_after * ostride, odist, |
||
712 | work, 1, 0); |
||
713 | else /* using contiguous copy buffers: */ |
||
714 | for (k = 0; k < n_after; ++k) |
||
715 | fftw_buffered(p->plans[cur_dim], howmany, |
||
716 | out + k * ostride, n_after * ostride, odist, |
||
717 | work, p->nbuffers, work + n); |
||
718 | } |
||
719 | |||
720 | void fftwnd(fftwnd_plan p, int howmany, |
||
721 | fftw_complex *in, int istride, int idist, |
||
722 | fftw_complex *out, int ostride, int odist) |
||
723 | { |
||
724 | fftw_complex *work; |
||
725 | |||
726 | #ifdef FFTW_DEBUG |
||
727 | if (p->rank > 0 && (p->plans[0]->flags & FFTW_THREADSAFE) |
||
728 | && p->nwork && p->work) |
||
729 | fftw_die("bug with FFTW_THREADSAFE flag"); |
||
730 | #endif |
||
731 | |||
732 | if (p->nwork && !p->work) |
||
733 | work = (fftw_complex *) fftw_malloc(p->nwork * sizeof(fftw_complex)); |
||
734 | else |
||
735 | work = p->work; |
||
736 | |||
737 | switch (p->rank) { |
||
738 | case 0: |
||
739 | break; |
||
740 | case 1: |
||
741 | if (p->is_in_place) /* fft is in-place */ |
||
742 | fftw(p->plans[0], howmany, in, istride, idist, |
||
743 | work, 1, 0); |
||
744 | else |
||
745 | fftw(p->plans[0], howmany, in, istride, idist, |
||
746 | out, ostride, odist); |
||
747 | break; |
||
748 | default: /* rank >= 2 */ |
||
749 | { |
||
750 | if (p->is_in_place) { |
||
751 | out = in; |
||
752 | ostride = istride; |
||
753 | odist = idist; |
||
754 | } |
||
755 | if (howmany > 1 && odist < ostride) |
||
756 | fftwnd_aux_howmany(p, 0, howmany, |
||
757 | in, istride, idist, |
||
758 | out, ostride, odist, |
||
759 | work); |
||
760 | else { |
||
761 | int i; |
||
762 | |||
763 | for (i = 0; i < howmany; ++i) |
||
764 | fftwnd_aux(p, 0, |
||
765 | in + i * idist, istride, |
||
766 | out + i * odist, ostride, |
||
767 | work); |
||
768 | } |
||
769 | } |
||
770 | } |
||
771 | |||
772 | if (p->nwork && !p->work) |
||
773 | fftw_free(work); |
||
774 | |||
775 | } |
||
776 | |||
777 | void fftwnd_one(fftwnd_plan p, fftw_complex *in, fftw_complex *out) |
||
778 | { |
||
779 | fftwnd(p, 1, in, 1, 1, out, 1, 1); |
||
780 | } |