NFFT 3.5.3alpha
nfft.c
1/*
2 * Copyright (c) 2002, 2017 Jens Keiner, Stefan Kunis, Daniel Potts
3 *
4 * This program is free software; you can redistribute it and/or modify it under
5 * the terms of the GNU General Public License as published by the Free Software
6 * Foundation; either version 2 of the License, or (at your option) any later
7 * version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12 * details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 51
16 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 */
18
19/* Nonequispaced FFT */
20
21/* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */
22
23/* configure header */
24#include "config.h"
25
26/* complex datatype (maybe) */
27#ifdef HAVE_COMPLEX_H
28#include<complex.h>
29#endif
30
31/* NFFT headers */
32#include "nfft3.h"
33#include "infft.h"
34
35#ifdef _OPENMP
36#include <omp.h>
37#endif
38
39#ifdef OMP_ASSERT
40#include <assert.h>
41#endif
42
/* Drop any earlier definition of X and rebind it so that X(name) expands
 * through NFFT(name) for the remainder of this file. */
43#undef X
44#define X(name) NFFT(name)
45
47static inline INT intprod(const INT *vec, const INT a, const INT d)
48{
49 INT t, p;
50
51 p = 1;
52 for (t = 0; t < d; t++)
53 p *= vec[t] - a;
54
55 return p;
56}
57
58/* handy shortcuts */
/* BASE(x) is what the direct transforms in this file apply to the phase
 * arguments (+/- II * omega); it simply forwards to CEXP. */
59#define BASE(x) CEXP(x)
60
75static inline void sort0(const INT d, const INT *n, const INT m,
76 const INT local_x_num, const R *local_x, INT *ar_x)
77{
78 INT u_j[d], i, j, help, rhigh;
79 INT *ar_x_temp;
80 INT nprod;
81
82 for (i = 0; i < local_x_num; i++)
83 {
84 ar_x[2 * i] = 0;
85 ar_x[2 *i + 1] = i;
86 for (j = 0; j < d; j++)
87 {
88 help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
89 u_j[j] = (help % n[j] + n[j]) % n[j];
90
91 ar_x[2 * i] += u_j[j];
92 if (j + 1 < d)
93 ar_x[2 * i] *= n[j + 1];
94 }
95 }
96
97 for (j = 0, nprod = 1; j < d; j++)
98 nprod *= n[j];
99
100 rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
101
102 ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) * sizeof(INT));
103 Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
104#ifdef OMP_ASSERT
105 for (i = 1; i < local_x_num; i++)
106 assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
107#endif
108 Y(free)(ar_x_temp);
109}
110
119static inline void sort(const X(plan) *ths)
120{
121 if (ths->flags & NFFT_SORT_NODES)
122 sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
123}
124
145void X(trafo_direct)(const X(plan) *ths)
146{
147 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
148
149 memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
150
151 if (ths->d == 1)
152 {
153 /* specialize for univariate case, rationale: faster */
154 INT j;
155#ifdef _OPENMP
156 #pragma omp parallel for default(shared) private(j)
157#endif
158 for (j = 0; j < ths->M_total; j++)
159 {
160 INT k_L;
161 for (k_L = 0; k_L < ths->N_total; k_L++)
162 {
163 R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
164 f[j] += f_hat[k_L] * BASE(-II * omega);
165 }
166 }
167 }
168 else
169 {
170 /* multivariate case */
171 INT j;
172#ifdef _OPENMP
173 #pragma omp parallel for default(shared) private(j)
174#endif
175 for (j = 0; j < ths->M_total; j++)
176 {
177 R x[ths->d], omega, Omega[ths->d + 1];
178 INT t, t2, k_L, k[ths->d];
179 Omega[0] = K(0.0);
180 for (t = 0; t < ths->d; t++)
181 {
182 k[t] = -ths->N[t]/2;
183 x[t] = K2PI * ths->x[j * ths->d + t];
184 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
185 }
186 omega = Omega[ths->d];
187
188 for (k_L = 0; k_L < ths->N_total; k_L++)
189 {
190 f[j] += f_hat[k_L] * BASE(-II * omega);
191 {
192 for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
193 k[t]-= ths->N[t]-1;
194
195 k[t]++;
196
197 for (t2 = t; t2 < ths->d; t2++)
198 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
199
200 omega = Omega[ths->d];
201 }
202 }
203 }
204 }
205}
206
207void X(adjoint_direct)(const X(plan) *ths)
208{
209 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
210
211 memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
212
213 if (ths->d == 1)
214 {
215 /* specialize for univariate case, rationale: faster */
216#ifdef _OPENMP
217 INT k_L;
218 #pragma omp parallel for default(shared) private(k_L)
219 for (k_L = 0; k_L < ths->N_total; k_L++)
220 {
221 INT j;
222 for (j = 0; j < ths->M_total; j++)
223 {
224 R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
225 f_hat[k_L] += f[j] * BASE(II * omega);
226 }
227 }
228#else
229 INT j;
230 for (j = 0; j < ths->M_total; j++)
231 {
232 INT k_L;
233 for (k_L = 0; k_L < ths->N_total; k_L++)
234 {
235 R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
236 f_hat[k_L] += f[j] * BASE(II * omega);
237 }
238 }
239#endif
240 }
241 else
242 {
243 /* multivariate case */
244 INT j, k_L;
245#ifdef _OPENMP
246 #pragma omp parallel for default(shared) private(j, k_L)
247 for (k_L = 0; k_L < ths->N_total; k_L++)
248 {
249 INT k[ths->d], k_temp, t;
250
251 k_temp = k_L;
252
253 for (t = ths->d - 1; t >= 0; t--)
254 {
255 k[t] = k_temp % ths->N[t] - ths->N[t]/2;
256 k_temp /= ths->N[t];
257 }
258
259 for (j = 0; j < ths->M_total; j++)
260 {
261 R omega = K(0.0);
262 for (t = 0; t < ths->d; t++)
263 omega += k[t] * K2PI * ths->x[j * ths->d + t];
264 f_hat[k_L] += f[j] * BASE(II * omega);
265 }
266 }
267#else
268 for (j = 0; j < ths->M_total; j++)
269 {
270 R x[ths->d], omega, Omega[ths->d+1];
271 INT t, t2, k[ths->d];
272 Omega[0] = K(0.0);
273 for (t = 0; t < ths->d; t++)
274 {
275 k[t] = -ths->N[t]/2;
276 x[t] = K2PI * ths->x[j * ths->d + t];
277 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
278 }
279 omega = Omega[ths->d];
280 for (k_L = 0; k_L < ths->N_total; k_L++)
281 {
282 f_hat[k_L] += f[j] * BASE(II * omega);
283
284 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
285 k[t]-= ths->N[t]-1;
286
287 k[t]++;
288
289 for (t2 = t; t2 < ths->d; t2++)
290 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
291
292 omega = Omega[ths->d];
293 }
294 }
295#endif
296 }
297}
298
324static inline void uo(const X(plan) *ths, const INT j, INT *up, INT *op,
325 const INT act_dim)
326{
327 const R xj = ths->x[j * ths->d + act_dim];
328 INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
329
330 (*up) = c - (ths->m);
331 (*op) = c + 1 + (ths->m);
332}
333
334static inline void uo2(INT *u, INT *o, const R x, const INT n, const INT m)
335{
336 INT c = LRINT(FLOOR(x * (R)(n)));
337
338 *u = (c - m + n) % n;
339 *o = (c + 1 + m + n) % n;
340}
341
/* Building blocks for MACRO_D. All of them rely on locals of the function
 * they are expanded in (ths, f_hat, g_hat, t, t2, kp, k, ks, k_plain,
 * ks_plain, c_phi_inv_k). */

/* A step: copy one coefficient f_hat -> g_hat, scaled by the accumulated
 * factor c_phi_inv_k[ths->d]. */
342#define MACRO_D_compute_A \
343{ \
344 g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
345}
346
/* T step: the same scaling in the opposite direction, g_hat -> f_hat. */
347#define MACRO_D_compute_T \
348{ \
349 f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
350}
351
/* Zero the destination array of the A step (n_total entries of g_hat). */
352#define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
353
/* Zero the destination array of the T step (N_total entries of f_hat). */
354#define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
355
/* Factor suffix used when the table ths->c_phi_inv is available. Note that
 * the expansion already ends in a semicolon. */
356#define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
357
/* Factor suffix used without the table: divide by PHI_HUT evaluated at the
 * shifted index ks[t2] - N[t2]/2. Also ends in a semicolon. */
358#define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
359
/* Reset all multi indices: kp = k = 0 and ks = N/2 in every dimension.
 * The trailing t++ leaves t == 0 (the loop exits with t == -1). */
360#define MACRO_init_k_ks \
361{ \
362 for (t = ths->d-1; 0 <= t; t--) \
363 { \
364 kp[t] = k[t] = 0; \
365 ks[t] = ths->N[t]/2; \
366 } \
367 t++; \
368}
369
/* Recompute the prefix data from dimension t onwards: the running product
 * c_phi_inv_k and the row-major linear indices ks_plain (over N) and
 * k_plain (over n). which_one selects one of the two factor suffixes above. */
370#define MACRO_update_c_phi_inv_k(which_one) \
371{ \
372 for (t2 = t; t2 < ths->d; t2++) \
373 { \
374 c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
375 ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
376 k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
377 } \
378}
379
/* Odometer step on the multi indices (last dimension fastest). Trailing
 * dimensions whose kp reached N-1 are reset; the remaining dimension t is
 * incremented. When kp[t] reaches N[t]/2 the g_hat index k[t] jumps to
 * n[t] - N[t]/2 and the f_hat index ks[t] wraps to 0. */
380#define MACRO_count_k_ks \
381{ \
382 for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
383 { \
384 kp[t] = k[t] = 0; \
385 ks[t]= ths->N[t]/2; \
386 } \
387\
388 kp[t]++; k[t]++; ks[t]++; \
389 if(kp[t] == ths->N[t]/2) \
390 { \
391 k[t] = ths->n[t] - ths->N[t]/2; \
392 ks[t] = 0; \
393 } \
394} \
395
396/* sub routines for the fast transforms matrix vector multiplication with D, D^T */
/* MACRO_D(which_one) defines the serial function D_serial_<which_one>
 * (which_one is A or T). The generated function
 *   1. zeroes the destination array (MACRO_D_init_result_<which_one>),
 *   2. resets the multi indices (MACRO_init_k_ks),
 *   3. loops N_total times; each pass refreshes the prefix product and the
 *      linear indices, performs one scaled copy
 *      (MACRO_D_compute_<which_one>) and advances the multi index.
 * The two loops differ only in how the scaling factor is obtained: from the
 * ths->c_phi_inv table when PRE_PHI_HUT is set, via PHI_HUT otherwise.
 * k_L is used purely as a loop counter. */
397#define MACRO_D(which_one) \
398static inline void D_serial_ ## which_one (X(plan) *ths) \
399{ \
400 C *f_hat, *g_hat; /* local copy */ \
401 R c_phi_inv_k[ths->d+1]; /* postfix product of PHI_HUT */ \
402 INT t, t2; /* index dimensions */ \
403 INT k_L; /* plain index */ \
404 INT kp[ths->d]; /* multi index (simple) */ \
405 INT k[ths->d]; /* multi index in g_hat */ \
406 INT ks[ths->d]; /* multi index in f_hat, c_phi_inv*/ \
407 INT k_plain[ths->d+1]; /* postfix plain index */ \
408 INT ks_plain[ths->d+1]; /* postfix plain index */ \
409 \
410 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
411 MACRO_D_init_result_ ## which_one; \
412\
413 c_phi_inv_k[0] = K(1.0); \
414 k_plain[0] = 0; \
415 ks_plain[0] = 0; \
416\
417 MACRO_init_k_ks; \
418\
419 if (ths->flags & PRE_PHI_HUT) \
420 { \
421 for (k_L = 0; k_L < ths->N_total; k_L++) \
422 { \
423 MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
424 MACRO_D_compute_ ## which_one; \
425 MACRO_count_k_ks; \
426 } \
427 } \
428 else \
429 { \
430 for (k_L = 0; k_L < ths->N_total; k_L++) \
431 { \
432 MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
433 MACRO_D_compute_ ## which_one; \
434 MACRO_count_k_ks; \
435 } \
436 } \
437}
438
439#ifdef _OPENMP
440static inline void D_openmp_A(X(plan) *ths)
441{
442 C *f_hat, *g_hat;
443 INT k_L;
445 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
446 memset(g_hat, 0, ths->n_total * sizeof(C));
447
448 if (ths->flags & PRE_PHI_HUT)
449 {
450 #pragma omp parallel for default(shared) private(k_L)
451 for (k_L = 0; k_L < ths->N_total; k_L++)
452 {
453 INT kp[ths->d]; //0..N-1
454 INT k[ths->d];
455 INT ks[ths->d];
456 R c_phi_inv_k_val = K(1.0);
457 INT k_plain_val = 0;
458 INT ks_plain_val = 0;
459 INT t;
460 INT k_temp = k_L;
461
462 for (t = ths->d-1; t >= 0; t--)
463 {
464 kp[t] = k_temp % ths->N[t];
465 if (kp[t] >= ths->N[t]/2)
466 k[t] = ths->n[t] - ths->N[t] + kp[t];
467 else
468 k[t] = kp[t];
469 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
470 k_temp /= ths->N[t];
471 }
472
473 for (t = 0; t < ths->d; t++)
474 {
475 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
476 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
477 k_plain_val = k_plain_val*ths->n[t] + k[t];
478 }
479
480 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
481 } /* for(k_L) */
482 } /* if(PRE_PHI_HUT) */
483 else
484 {
485 #pragma omp parallel for default(shared) private(k_L)
486 for (k_L = 0; k_L < ths->N_total; k_L++)
487 {
488 INT kp[ths->d]; //0..N-1
489 INT k[ths->d];
490 INT ks[ths->d];
491 R c_phi_inv_k_val = K(1.0);
492 INT k_plain_val = 0;
493 INT ks_plain_val = 0;
494 INT t;
495 INT k_temp = k_L;
496
497 for (t = ths->d-1; t >= 0; t--)
498 {
499 kp[t] = k_temp % ths->N[t];
500 if (kp[t] >= ths->N[t]/2)
501 k[t] = ths->n[t] - ths->N[t] + kp[t];
502 else
503 k[t] = kp[t];
504 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
505 k_temp /= ths->N[t];
506 }
507
508 for (t = 0; t < ths->d; t++)
509 {
510 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
511 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
512 k_plain_val = k_plain_val*ths->n[t] + k[t];
513 }
514
515 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
516 } /* for(k_L) */
517 } /* else(PRE_PHI_HUT) */
518}
519#endif
520
521#ifndef _OPENMP
522MACRO_D(A)
523#endif
524
525static inline void D_A(X(plan) *ths)
526{
527#ifdef _OPENMP
528 D_openmp_A(ths);
529#else
530 D_serial_A(ths);
531#endif
532}
533
534#ifdef _OPENMP
535static void D_openmp_T(X(plan) *ths)
536{
537 C *f_hat, *g_hat;
538 INT k_L;
540 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
541 memset(f_hat, 0, ths->N_total * sizeof(C));
542
543 if (ths->flags & PRE_PHI_HUT)
544 {
545 #pragma omp parallel for default(shared) private(k_L)
546 for (k_L = 0; k_L < ths->N_total; k_L++)
547 {
548 INT kp[ths->d]; //0..N-1
549 INT k[ths->d];
550 INT ks[ths->d];
551 R c_phi_inv_k_val = K(1.0);
552 INT k_plain_val = 0;
553 INT ks_plain_val = 0;
554 INT t;
555 INT k_temp = k_L;
556
557 for (t = ths->d - 1; t >= 0; t--)
558 {
559 kp[t] = k_temp % ths->N[t];
560 if (kp[t] >= ths->N[t]/2)
561 k[t] = ths->n[t] - ths->N[t] + kp[t];
562 else
563 k[t] = kp[t];
564 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
565 k_temp /= ths->N[t];
566 }
567
568 for (t = 0; t < ths->d; t++)
569 {
570 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
571 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
572 k_plain_val = k_plain_val*ths->n[t] + k[t];
573 }
574
575 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
576 } /* for(k_L) */
577 } /* if(PRE_PHI_HUT) */
578 else
579 {
580 #pragma omp parallel for default(shared) private(k_L)
581 for (k_L = 0; k_L < ths->N_total; k_L++)
582 {
583 INT kp[ths->d]; //0..N-1
584 INT k[ths->d];
585 INT ks[ths->d];
586 R c_phi_inv_k_val = K(1.0);
587 INT k_plain_val = 0;
588 INT ks_plain_val = 0;
589 INT t;
590 INT k_temp = k_L;
591
592 for (t = ths->d-1; t >= 0; t--)
593 {
594 kp[t] = k_temp % ths->N[t];
595 if (kp[t] >= ths->N[t]/2)
596 k[t] = ths->n[t] - ths->N[t] + kp[t];
597 else
598 k[t] = kp[t];
599 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
600 k_temp /= ths->N[t];
601 }
602
603 for (t = 0; t < ths->d; t++)
604 {
605 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
606 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
607 k_plain_val = k_plain_val*ths->n[t] + k[t];
608 }
609
610 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
611 } /* for(k_L) */
612 } /* else(PRE_PHI_HUT) */
613}
614#endif
615
616#ifndef _OPENMP
617MACRO_D(T)
618#endif
619
620static void D_T(X(plan) *ths)
621{
622#ifdef _OPENMP
623 D_openmp_T(ths);
624#else
625 D_serial_T(ths);
626#endif
627}
628
629/* sub routines for the fast transforms matrix vector multiplication with B, B^T */
/* Building blocks for MACRO_B / MACRO_B_COMPUTE_ONE_NODE. They rely on
 * locals of the function they are expanded in (ths, j, t, t2, u, o, lj,
 * l_all, ll_plain, phi_prod, fg_psi, psij_const, ix, fj, g). */

/* Zero the destination of the A step (ths->f) resp. the T step (ths->g). */
630#define MACRO_B_init_result_A memset(ths->f, 0, (size_t)(ths->M_total) * sizeof(C));
631#define MACRO_B_init_result_T memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C));
632
/* PRE_FULL_PSI, A step: accumulate one stored matrix entry into *fj. */
633#define MACRO_B_PRE_FULL_PSI_compute_A \
634{ \
635 (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
636}
637
/* PRE_FULL_PSI, T step: scatter *fj into g with the same stored entry. */
638#define MACRO_B_PRE_FULL_PSI_compute_T \
639{ \
640 g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
641}
642
/* Generic A step: add one grid value, weighted by the full window product
 * phi_prod[ths->d], to f[j]. */
643#define MACRO_B_compute_A \
644{ \
645 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \
646}
647
/* Generic T step: add f[j], weighted by the same product, to one grid value. */
648#define MACRO_B_compute_T \
649{ \
650 ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \
651}
652
/* Per-dimension window value, read from the per-node table fg_psi. */
653#define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
654
/* Per-dimension window value, read from ths->psi, which is indexed as
 * (node, dimension, lj) with 2*m+2 entries per (node, dimension). */
655#define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
656
/* Per-dimension window value, read from the per-node table psij_const. */
657#define MACRO_without_PRE_PSI_improved psij_const[t2 * (2*ths->m+2) + lj[t2]]
658
/* Per-dimension window value, evaluated on the fly with PHI. */
659#define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \
660 - ((R) (lj[t2]+u[t2]))/((R)ths->n[t2]), t2)
661
/* Per-node setup. Declares the array l_all in the enclosing scope, then for
 * every dimension: obtains the window bounds u[t], o[t] via uo(), stores the
 * periodically wrapped grid indices (u[t] + lj_t + n[t]) % n[t] for
 * lj_t = 0..2*m+1 in l_all, and resets lj[t]. The trailing t++ leaves
 * t == 0. */
662#define MACRO_init_uo_l_lj_t \
663INT l_all[ths->d*(2*ths->m+2)]; \
664{ \
665 for (t = ths->d-1; t >= 0; t--) \
666 { \
667 uo(ths,j,&u[t],&o[t],t); \
668 INT lj_t; \
669 for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \
670 l_all[t*(2*ths->m+2) + lj_t] = (u[t] + lj_t + ths->n[t]) % ths->n[t]; \
671 lj[t] = 0; \
672 } \
673 t++; \
674}
675
/* Recompute, from dimension t onwards, the prefix products of the window
 * values (phi_prod) and the row-major grid index (ll_plain). which_one
 * selects one of the MACRO_with_... / MACRO_without_... accessors. */
676#define MACRO_update_phi_prod_ll_plain(which_one) { \
677 for (t2 = t; t2 < ths->d; t2++) \
678 { \
679 phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
680 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
681 } \
682}
683
/* Odometer step on lj (last dimension fastest): trailing dimensions that
 * reached o[t]-u[t] are reset to 0, the remaining dimension t is
 * incremented. */
684#define MACRO_count_uo_l_lj_t \
685{ \
686 for (t = ths->d-1; (t > 0) && (lj[t] == o[t]-u[t]); t--) \
687 { \
688 lj[t] = 0; \
689 } \
690 \
691 lj[t]++; \
692}
693
/* Map the flag names passed to the COMPUTE macros to the accessor that
 * supplies the per-dimension window value. */
694#define MACRO_COMPUTE_with_PRE_PSI MACRO_with_PRE_PSI
695#define MACRO_COMPUTE_with_PRE_FG_PSI MACRO_with_FG_PSI
696#define MACRO_COMPUTE_with_FG_PSI MACRO_with_FG_PSI
697#define MACRO_COMPUTE_with_PRE_LIN_PSI MACRO_with_FG_PSI
698#define MACRO_COMPUTE_without_PRE_PSI MACRO_without_PRE_PSI_improved
699#define MACRO_COMPUTE_without_PRE_PSI_improved MACRO_without_PRE_PSI_improved
700
/* MACRO_B_COMPUTE_ONE_NODE(whichone_AT, whichone_FLAGS) handles all window
 * entries of the current node j.
 *   whichone_AT    - A or T; selects MACRO_B_compute_A / MACRO_B_compute_T
 *   whichone_FLAGS - selects the window accessor
 * For d == 4 and d == 5 the loops over the 2*m+2 window positions per
 * dimension are written out as nested loops; each level updates its own
 * prefix product phi_prod[t2+1] and grid index ll_plain[t2+1]. For every
 * other d a single loop over lprod entries is used, driven by the odometer
 * macros. Expects the locals set up by MACRO_init_uo_l_lj_t and the
 * enclosing function (lj, l_all, phi_prod, ll_plain, t, t2, l_L, lprod). */
701#define MACRO_B_COMPUTE_ONE_NODE(whichone_AT,whichone_FLAGS) \
702 if (ths->d == 4) \
703 { \
704 INT l0, l1, l2, l3; \
705 for (l0 = 0; l0 < 2*ths->m+2; l0++) \
706 { \
707 lj[0] = l0; \
708 t2 = 0; \
709 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
710 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
711 for (l1 = 0; l1 < 2*ths->m+2; l1++) \
712 { \
713 lj[1] = l1; \
714 t2 = 1; \
715 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
716 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
717 for (l2 = 0; l2 < 2*ths->m+2; l2++) \
718 { \
719 lj[2] = l2; \
720 t2 = 2; \
721 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
722 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
723 for (l3 = 0; l3 < 2*ths->m+2; l3++) \
724 { \
725 lj[3] = l3; \
726 t2 = 3; \
727 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
728 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
729 \
730 MACRO_B_compute_ ## whichone_AT; \
731 } \
732 } \
733 } \
734 } \
735 } /* if(d==4) */ \
736 else if (ths->d == 5) \
737 { \
738 INT l0, l1, l2, l3, l4; \
739 for (l0 = 0; l0 < 2*ths->m+2; l0++) \
740 { \
741 lj[0] = l0; \
742 t2 = 0; \
743 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
744 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
745 for (l1 = 0; l1 < 2*ths->m+2; l1++) \
746 { \
747 lj[1] = l1; \
748 t2 = 1; \
749 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
750 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
751 for (l2 = 0; l2 < 2*ths->m+2; l2++) \
752 { \
753 lj[2] = l2; \
754 t2 = 2; \
755 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
756 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
757 for (l3 = 0; l3 < 2*ths->m+2; l3++) \
758 { \
759 lj[3] = l3; \
760 t2 = 3; \
761 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
762 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
763 for (l4 = 0; l4 < 2*ths->m+2; l4++) \
764 { \
765 lj[4] = l4; \
766 t2 = 4; \
767 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone_FLAGS; \
768 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
769 \
770 MACRO_B_compute_ ## whichone_AT; \
771 } \
772 } \
773 } \
774 } \
775 } \
776 } /* if(d==5) */ \
777 else { \
778 for (l_L = 0; l_L < lprod; l_L++) \
779 { \
780 MACRO_update_phi_prod_ll_plain(whichone_FLAGS); \
781 \
782 MACRO_B_compute_ ## whichone_AT; \
783 \
784 MACRO_count_uo_l_lj_t; \
785 } /* for(l_L) */ \
786 }
787
788#define MACRO_B(which_one) \
789static inline void B_serial_ ## which_one (X(plan) *ths) \
790{ \
791 INT lprod; /* 'regular bandwidth' of matrix B */ \
792 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
793 INT t, t2; /* index dimensions */ \
794 INT k; /* index nodes */ \
795 INT l_L, ix; /* index one row of B */ \
796 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
797 INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
798 R phi_prod[ths->d+1]; /* postfix product of PHI */ \
799 R y[ths->d]; \
800 R fg_psi[ths->d][2*ths->m+2]; \
801 R fg_exp_l[ths->d][2*ths->m+2]; \
802 INT l_fg,lj_fg; \
803 R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
804 R ip_w; \
805 INT ip_u; \
806 INT ip_s = ths->K/(ths->m+2); \
807 \
808 MACRO_B_init_result_ ## which_one; \
809 \
810 if (ths->flags & PRE_FULL_PSI) \
811 { \
812 INT j; \
813 C *f, *g; /* local copy */ \
814 C *fj; /* local copy */ \
815 f = (C*)ths->f; g = (C*)ths->g; \
816 \
817 for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
818 { \
819 for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
820 { \
821 MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
822 } \
823 } \
824 return; \
825 } \
826\
827 phi_prod[0] = K(1.0); \
828 ll_plain[0] = 0; \
829\
830 for (t = 0, lprod = 1; t < ths->d; t++) \
831 lprod *= (2 * ths->m + 2); \
832\
833 if (ths->flags & PRE_PSI) \
834 { \
835 sort(ths); \
836 \
837 for (k = 0; k < ths->M_total; k++) \
838 { \
839 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
840 \
841 MACRO_init_uo_l_lj_t; \
842 \
843 MACRO_B_COMPUTE_ONE_NODE(which_one,with_PRE_PSI); \
844 } /* for(j) */ \
845 return; \
846 } /* if(PRE_PSI) */ \
847 \
848 if (ths->flags & PRE_FG_PSI) \
849 { \
850 sort(ths); \
851 \
852 for(t2 = 0; t2 < ths->d; t2++) \
853 { \
854 tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
855 tmpEXP2sq = tmpEXP2*tmpEXP2; \
856 tmp2 = K(1.0); \
857 tmp3 = K(1.0); \
858 fg_exp_l[t2][0] = K(1.0); \
859 for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
860 { \
861 tmp3 = tmp2*tmpEXP2; \
862 tmp2 *= tmpEXP2sq; \
863 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
864 } \
865 } \
866 for (k = 0; k < ths->M_total; k++) \
867 { \
868 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
869 \
870 MACRO_init_uo_l_lj_t; \
871 \
872 for (t2 = 0; t2 < ths->d; t2++) \
873 { \
874 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
875 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
876 tmp1 = K(1.0); \
877 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
878 { \
879 tmp1 *= tmpEXP1; \
880 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
881 } \
882 } \
883 \
884 MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \
885 } /* for(j) */ \
886 return; \
887 } /* if(PRE_FG_PSI) */ \
888 \
889 if (ths->flags & FG_PSI) \
890 { \
891 sort(ths); \
892 \
893 for (t2 = 0; t2 < ths->d; t2++) \
894 { \
895 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
896 tmpEXP2sq = tmpEXP2*tmpEXP2; \
897 tmp2 = K(1.0); \
898 tmp3 = K(1.0); \
899 fg_exp_l[t2][0] = K(1.0); \
900 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
901 { \
902 tmp3 = tmp2*tmpEXP2; \
903 tmp2 *= tmpEXP2sq; \
904 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
905 } \
906 } \
907 for (k = 0; k < ths->M_total; k++) \
908 { \
909 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
910 \
911 MACRO_init_uo_l_lj_t; \
912 \
913 for (t2 = 0; t2 < ths->d; t2++) \
914 { \
915 fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
916 \
917 tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
918 /ths->b[t2]); \
919 tmp1 = K(1.0); \
920 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
921 { \
922 tmp1 *= tmpEXP1; \
923 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
924 } \
925 } \
926 \
927 MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \
928 } /* for(j) */ \
929 return; \
930 } /* if(FG_PSI) */ \
931 \
932 if (ths->flags & PRE_LIN_PSI) \
933 { \
934 sort(ths); \
935 \
936 for (k = 0; k<ths->M_total; k++) \
937 { \
938 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
939 \
940 MACRO_init_uo_l_lj_t; \
941 \
942 for (t2 = 0; t2 < ths->d; t2++) \
943 { \
944 y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
945 * ((R)(ths->K))) / (R)(ths->m + 2); \
946 ip_u = LRINT(FLOOR(y[t2])); \
947 ip_w = y[t2]-ip_u; \
948 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
949 { \
950 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
951 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
952 * (ip_w); \
953 } \
954 } \
955 \
956 MACRO_B_COMPUTE_ONE_NODE(which_one,with_FG_PSI); \
957 } /* for(j) */ \
958 return; \
959 } /* if(PRE_LIN_PSI) */ \
960 \
961 sort(ths); \
962 \
963 /* no precomputed psi at all */ \
964 for (k = 0; k < ths->M_total; k++) \
965 { \
966 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
967 \
968 R psij_const[ths->d * (2*ths->m+2)]; \
969 \
970 MACRO_init_uo_l_lj_t; \
971 \
972 for (t2 = 0; t2 < ths->d; t2++) \
973 { \
974 INT lj_t; \
975 for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \
976 psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \
977 - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \
978 } \
979 \
980 MACRO_B_COMPUTE_ONE_NODE(which_one,without_PRE_PSI_improved); \
981 } /* for(j) */ \
982} /* nfft_B */ \
983
984#ifndef _OPENMP
985MACRO_B(A)
986#endif
987
988#ifdef _OPENMP
/* PRE_PSI needs no per-node preparation, hence the empty BEFORE_LOOP hook;
 * the UPDATE hook refreshes phi_prod / ll_plain from ths->psi. */
989#define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_PSI
990#define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_PSI \
991 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
992
/* Fills the table fg_exp_l of the enclosing function (declared there as
 * fg_exp_l[ths->d][2*ths->m+2]) with fg_exp_l[t2][l] = EXP(-l*l/b[t2]),
 * built incrementally from EXP(-1/b[t2]). Uses the loop variable t2 of the
 * enclosing scope.
 * Only l = 0 .. 2*m+1 are valid column indices and only l = 1 .. 2*m+1 are
 * read by the BEFORE_LOOP macros, so the fill loop stops before 2*m+2;
 * writing that index would run past the end of the row and, for the last
 * dimension, past the end of the array. */
#define MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI \
  for (t2 = 0; t2 < ths->d; t2++) \
  { \
    INT lj_fg; \
    R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
    R tmpEXP2sq = tmpEXP2*tmpEXP2; \
    R tmp2 = K(1.0); \
    R tmp3 = K(1.0); \
    fg_exp_l[t2][0] = K(1.0); \
    for(lj_fg = 1; lj_fg < (2*ths->m+2); lj_fg++) \
    { \
      tmp3 = tmp2*tmpEXP2; \
      tmp2 *= tmpEXP2sq; \
      fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
    } \
  }
/* Hooks used by MACRO_B_openmp_A_COMPUTE. For each strategy there is a
 * BEFORE_LOOP hook, run once per node after u[] / o[] are known, and an
 * UPDATE hook, run for every window entry in the generic-d loop. All of them
 * rely on locals of the expanding scope (ths, j, t, t2, u, o, fg_psi,
 * fg_exp_l, tmpEXP1, tmp1, l_fg, lj_fg, y, ip_u, ip_w, ip_s, psij_const). */

/* PRE_FG_PSI: rebuild the per-node window table fg_psi from the two values
 * stored per (node, dimension) in ths->psi and from fg_exp_l. */
1009#define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_FG_PSI \
1010 for (t2 = 0; t2 < ths->d; t2++) \
1011 { \
1012 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
1013 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
1014 tmp1 = K(1.0); \
1015 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
1016 { \
1017 tmp1 *= tmpEXP1; \
1018 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
1019 } \
1020 }
1021#define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_FG_PSI \
1022 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1023
/* FG_PSI: as above, but the two starting values are computed here with PHI
 * and EXP instead of being read from ths->psi. */
1024#define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_FG_PSI \
1025 for (t2 = 0; t2 < ths->d; t2++) \
1026 { \
1027 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/((R)ths->n[t2])),t2)); \
1028 \
1029 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2]) \
1030 /ths->b[t2]); \
1031 tmp1 = K(1.0); \
1032 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
1033 { \
1034 tmp1 *= tmpEXP1; \
1035 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
1036 } \
1037 }
1038#define MACRO_B_openmp_A_COMPUTE_UPDATE_with_FG_PSI \
1039 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1040
/* PRE_LIN_PSI: fill fg_psi by linear interpolation (weights 1-ip_w and
 * ip_w) between two neighbouring entries of the table ths->psi, which holds
 * K+1 entries per dimension. */
1041#define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_with_PRE_LIN_PSI \
1042 for (t2 = 0; t2 < ths->d; t2++) \
1043 { \
1044 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2]) \
1045 * ((R)ths->K))/(ths->m+2); \
1046 ip_u = LRINT(FLOOR(y[t2])); \
1047 ip_w = y[t2]-ip_u; \
1048 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
1049 { \
1050 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
1051 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
1052 * (ip_w); \
1053 } \
1054 }
1055#define MACRO_B_openmp_A_COMPUTE_UPDATE_with_PRE_LIN_PSI \
1056 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1057
/* No precomputation: evaluate PHI at all 2*m+2 window positions of every
 * dimension once per node and keep the values in psij_const. */
1058#define MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_without_PRE_PSI \
1059 for (t2 = 0; t2 < ths->d; t2++) \
1060 { \
1061 INT lj_t; \
1062 for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \
1063 psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \
1064 - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \
1065 }
1066#define MACRO_B_openmp_A_COMPUTE_UPDATE_without_PRE_PSI \
1067 MACRO_update_phi_prod_ll_plain(without_PRE_PSI_improved);
1068
/* MACRO_B_openmp_A_COMPUTE(whichone) is the body of one iteration of the
 * parallel node loops in B_openmp_A. It declares its own per-node working
 * arrays, so every iteration is self-contained:
 *   1. pick node j (through ths->index_x when NFFT_SORT_NODES is set),
 *   2. set up u, o, l_all and lj (MACRO_init_uo_l_lj_t),
 *   3. run the strategy's BEFORE_LOOP hook,
 *   4. accumulate into ths->f[j]: nested loops written out for d == 4 and
 *      d == 5, a single odometer-driven loop over lprod entries otherwise.
 * The loop counter k, lprod, t and t2 as well as any tables the hooks need
 * must be provided by the expanding scope. */
1069#define MACRO_B_openmp_A_COMPUTE(whichone) \
1070{ \
1071 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
1072 INT l_L; /* index one row of B */ \
1073 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
1074 INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
1075 R phi_prod[ths->d+1]; /* postfix product of PHI */ \
1076 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k; \
1077 \
1078 phi_prod[0] = K(1.0); \
1079 ll_plain[0] = 0; \
1080 \
1081 MACRO_init_uo_l_lj_t; \
1082 \
1083 MACRO_B_openmp_A_COMPUTE_BEFORE_LOOP_ ##whichone \
1084 \
1085 if (ths->d == 4) \
1086 { \
1087 INT l0, l1, l2, l3; \
1088 for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1089 { \
1090 lj[0] = l0; \
1091 t2 = 0; \
1092 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1093 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1094 for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1095 { \
1096 lj[1] = l1; \
1097 t2 = 1; \
1098 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1099 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1100 for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1101 { \
1102 lj[2] = l2; \
1103 t2 = 2; \
1104 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1105 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1106 for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1107 { \
1108 lj[3] = l3; \
1109 t2 = 3; \
1110 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1111 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1112 \
1113 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \
1114 } \
1115 } \
1116 } \
1117 } \
1118 } /* if(d==4) */ \
1119 else if (ths->d == 5) \
1120 { \
1121 INT l0, l1, l2, l3, l4; \
1122 for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1123 { \
1124 lj[0] = l0; \
1125 t2 = 0; \
1126 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1127 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1128 for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1129 { \
1130 lj[1] = l1; \
1131 t2 = 1; \
1132 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1133 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1134 for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1135 { \
1136 lj[2] = l2; \
1137 t2 = 2; \
1138 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1139 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1140 for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1141 { \
1142 lj[3] = l3; \
1143 t2 = 3; \
1144 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1145 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1146 for (l4 = 0; l4 < 2*ths->m+2; l4++) \
1147 { \
1148 lj[4] = l4; \
1149 t2 = 4; \
1150 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1151 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1152 \
1153 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \
1154 } \
1155 } \
1156 } \
1157 } \
1158 } \
1159 } /* if(d==5) */ \
1160 else { \
1161 for (l_L = 0; l_L < lprod; l_L++) \
1162 { \
1163 MACRO_B_openmp_A_COMPUTE_UPDATE_ ##whichone \
1164 \
1165 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]]; \
1166 \
1167 MACRO_count_uo_l_lj_t; \
1168 } /* for(l_L) */ \
1169 } \
1170}
1171
/* OpenMP version of the multiplication with B: computes ths->f from ths->g.
 *
 * ths->f is zeroed, lprod = (2*m+2)^d is computed, and then exactly one
 * strategy is used, chosen by the plan flags in this order: PRE_FULL_PSI,
 * PRE_PSI, PRE_FG_PSI, FG_PSI, PRE_LIN_PSI, no precomputation. Every
 * strategy runs a parallel loop over the nodes; an iteration writes only
 * f[j] of its own node.
 *
 * NOTE(review): the FG_PSI, PRE_LIN_PSI and no-precomputation branches call
 * sort(ths) before ths->index_x is read, the PRE_FULL_PSI, PRE_PSI and
 * PRE_FG_PSI branches do not. Presumably the index was already built when
 * psi was precomputed - confirm against the precompute routines. */
1172static inline void B_openmp_A (X(plan) *ths)
1173{
1174 INT lprod; /* 'regular bandwidth' of matrix B */
1175 INT k;
1176
1177 memset(ths->f, 0, ths->M_total * sizeof(C));
1178
 /* k doubles as the dimension counter here and as the node counter below */
1179 for (k = 0, lprod = 1; k < ths->d; k++)
1180 lprod *= (2*ths->m+2);
1181
1182 if (ths->flags & PRE_FULL_PSI)
1183 {
1184 #pragma omp parallel for default(shared) private(k)
1185 for (k = 0; k < ths->M_total; k++)
1186 {
1187 INT l;
1188 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1189 ths->f[j] = K(0.0);
 /* this indexing uses exactly lprod stored entries per node */
1190 for (l = 0; l < lprod; l++)
1191 ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
1192 }
1193 return;
1194 }
1195
1196 if (ths->flags & PRE_PSI)
1197 {
1198 #pragma omp parallel for default(shared) private(k)
1199 for (k = 0; k < ths->M_total; k++)
1200 {
1201 INT t, t2; /* index dimensions */
1202 MACRO_B_openmp_A_COMPUTE(with_PRE_PSI);
1203 } /* for(j) */
1204 return;
1205 } /* if(PRE_PSI) */
1206
1207 if (ths->flags & PRE_FG_PSI)
1208 {
1209 INT t, t2; /* index dimensions */
1210 R fg_exp_l[ths->d][2*ths->m+2];
1211
 /* fill fg_exp_l once, before the parallel loop reads it */
1212 MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI
1213
1214 #pragma omp parallel for default(shared) private(k,t,t2)
1215 for (k = 0; k < ths->M_total; k++)
1216 {
1217 R fg_psi[ths->d][2*ths->m+2];
1218 R tmpEXP1, tmp1;
1219 INT l_fg,lj_fg;
1220
1221 MACRO_B_openmp_A_COMPUTE(with_PRE_FG_PSI);
1222 } /* for(j) */
1223 return;
1224 } /* if(PRE_FG_PSI) */
1225
1226 if (ths->flags & FG_PSI)
1227 {
1228 INT t, t2; /* index dimensions */
1229 R fg_exp_l[ths->d][2*ths->m+2];
1230
1231 sort(ths);
1232
 /* fill fg_exp_l once, before the parallel loop reads it */
1233 MACRO_B_openmp_A_COMPUTE_INIT_FG_PSI
1234
1235 #pragma omp parallel for default(shared) private(k,t,t2)
1236 for (k = 0; k < ths->M_total; k++)
1237 {
1238 R fg_psi[ths->d][2*ths->m+2];
1239 R tmpEXP1, tmp1;
1240 INT l_fg,lj_fg;
1241
1242 MACRO_B_openmp_A_COMPUTE(with_FG_PSI);
1243 } /* for(j) */
1244 return;
1245 } /* if(FG_PSI) */
1246
1247 if (ths->flags & PRE_LIN_PSI)
1248 {
1249 sort(ths);
1250
1251 #pragma omp parallel for default(shared) private(k)
1252 for (k = 0; k<ths->M_total; k++)
1253 {
1254 INT t, t2; /* index dimensions */
1255 R y[ths->d];
1256 R fg_psi[ths->d][2*ths->m+2];
1257 INT l_fg,lj_fg;
1258 R ip_w;
1259 INT ip_u;
1260 INT ip_s = ths->K/(ths->m+2);
1261
1262 MACRO_B_openmp_A_COMPUTE(with_PRE_LIN_PSI);
1263 } /* for(j) */
1264 return;
1265 } /* if(PRE_LIN_PSI) */
1266
1267 /* no precomputed psi at all */
1268 sort(ths);
1269
1270 #pragma omp parallel for default(shared) private(k)
1271 for (k = 0; k < ths->M_total; k++)
1272 {
1273 INT t, t2; /* index dimensions */
1274 R psij_const[ths->d * (2*ths->m+2)];
1275
1276 MACRO_B_openmp_A_COMPUTE(without_PRE_PSI);
1277 } /* for(j) */
1278}
1279#endif
1280
1281static void B_A(X(plan) *ths)
1282{
1283#ifdef _OPENMP
1284 B_openmp_A(ths);
1285#else
1286 B_serial_A(ths);
1287#endif
1288}
1289
1290#ifdef _OPENMP
1306static inline INT index_x_binary_search(const INT *ar_x, const INT len, const INT key)
1307{
1308 INT left = 0, right = len - 1;
1309
1310 if (len == 1)
1311 return 0;
1312
1313 while (left < right - 1)
1314 {
1315 INT i = (left + right) / 2;
1316 if (ar_x[2*i] >= key)
1317 right = i;
1318 else if (ar_x[2*i] < key)
1319 left = i;
1320 }
1321
1322 if (ar_x[2*left] < key && left != len-1)
1323 return left+1;
1324
1325 return left;
1326}
1327#endif
1328
1329#ifdef _OPENMP
1345static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1346 INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b, const INT d,
1347 const INT *n, const INT m)
1348{
1349 const INT n0 = n[0];
1350 INT k;
1351 INT nthreads = omp_get_num_threads();
1352 INT nthreads_used = MIN(nthreads, n0);
1353 INT size_per_thread = n0 / nthreads_used;
1354 INT size_left = n0 - size_per_thread * nthreads_used;
1355 INT size_g[nthreads_used];
1356 INT offset_g[nthreads_used];
1357 INT my_id = omp_get_thread_num();
1358 INT n_prod_rest = 1;
1359
1360 for (k = 1; k < d; k++)
1361 n_prod_rest *= n[k];
1362
1363 *min_u_a = -1;
1364 *max_u_a = -1;
1365 *min_u_b = -1;
1366 *max_u_b = -1;
1367 *my_u0 = -1;
1368 *my_o0 = -1;
1369
1370 if (my_id < nthreads_used)
1371 {
1372 const INT m22 = 2 * m + 2;
1373
1374 offset_g[0] = 0;
1375 for (k = 0; k < nthreads_used; k++)
1376 {
1377 if (k > 0)
1378 offset_g[k] = offset_g[k-1] + size_g[k-1];
1379 size_g[k] = size_per_thread;
1380 if (size_left > 0)
1381 {
1382 size_g[k]++;
1383 size_left--;
1384 }
1385 }
1386
1387 *my_u0 = offset_g[my_id];
1388 *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1389
1390 if (nthreads_used > 1)
1391 {
1392 *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1393 *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1394 }
1395 else
1396 {
1397 *min_u_a = 0;
1398 *max_u_a = n_prod_rest * n0 - 1;
1399 }
1400
1401 if (*min_u_a < 0)
1402 {
1403 *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1404 *max_u_b = n_prod_rest * n0 - 1;
1405 *min_u_a = 0;
1406 }
1407
1408 if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1409 {
1410 *max_u_a = *max_u_b;
1411 *min_u_b = -1;
1412 *max_u_b = -1;
1413 }
1414#ifdef OMP_ASSERT
1415 assert(*min_u_a <= *max_u_a);
1416 assert(*min_u_b <= *max_u_b);
1417 assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1418#endif
1419 }
1420}
1421#endif
1422
1431static void nfft_adjoint_B_compute_full_psi(C *g, const INT *psi_index_g,
1432 const R *psi, const C *f, const INT M, const INT d, const INT *n,
1433 const INT m, const unsigned flags, const INT *index_x)
1434{
1435 INT k;
1436 INT lprod;
1437#ifdef _OPENMP
1438 INT lprod_m1;
1439#endif
1440#ifndef _OPENMP
1441 UNUSED(n);
1442#endif
1443 {
1444 INT t;
1445 for(t = 0, lprod = 1; t < d; t++)
1446 lprod *= 2 * m + 2;
1447 }
1448#ifdef _OPENMP
1449 lprod_m1 = lprod / (2 * m + 2);
1450#endif
1451
1452#ifdef _OPENMP
1453 if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1454 {
1455 #pragma omp parallel private(k)
1456 {
1457 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1458 const INT *ar_x = index_x;
1459 INT n_prod_rest = 1;
1460
1461 for (k = 1; k < d; k++)
1462 n_prod_rest *= n[k];
1463
1464 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1465
1466 if (min_u_a != -1)
1467 {
1468 k = index_x_binary_search(ar_x, M, min_u_a);
1469#ifdef OMP_ASSERT
1470 assert(ar_x[2*k] >= min_u_a || k == M-1);
1471 if (k > 0)
1472 assert(ar_x[2*k-2] < min_u_a);
1473#endif
1474 while (k < M)
1475 {
1476 INT l0, lrest;
1477 INT u_prod = ar_x[2*k];
1478 INT j = ar_x[2*k+1];
1479
1480 if (u_prod < min_u_a || u_prod > max_u_a)
1481 break;
1482
1483 for (l0 = 0; l0 < 2 * m + 2; l0++)
1484 {
1485 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1486
1487 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1488 continue;
1489
1490 for (lrest = 0; lrest < lprod_m1; lrest++)
1491 {
1492 const INT l = l0 * lprod_m1 + lrest;
1493 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1494 }
1495 }
1496
1497 k++;
1498 }
1499 }
1500
1501 if (min_u_b != -1)
1502 {
1503 k = index_x_binary_search(ar_x, M, min_u_b);
1504#ifdef OMP_ASSERT
1505 assert(ar_x[2*k] >= min_u_b || k == M-1);
1506 if (k > 0)
1507 assert(ar_x[2*k-2] < min_u_b);
1508#endif
1509 while (k < M)
1510 {
1511 INT l0, lrest;
1512 INT u_prod = ar_x[2*k];
1513 INT j = ar_x[2*k+1];
1514
1515 if (u_prod < min_u_b || u_prod > max_u_b)
1516 break;
1517
1518 for (l0 = 0; l0 < 2 * m + 2; l0++)
1519 {
1520 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1521
1522 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1523 continue;
1524 for (lrest = 0; lrest < lprod_m1; lrest++)
1525 {
1526 const INT l = l0 * lprod_m1 + lrest;
1527 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1528 }
1529 }
1530
1531 k++;
1532 }
1533 }
1534 } /* omp parallel */
1535 return;
1536 } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */
1537#endif
1538
1539#ifdef _OPENMP
1540 #pragma omp parallel for default(shared) private(k)
1541#endif
1542 for (k = 0; k < M; k++)
1543 {
1544 INT l;
1545 INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1546
1547 for (l = 0; l < lprod; l++)
1548 {
1549#ifdef _OPENMP
1550 C val = psi[j * lprod + l] * f[j];
1551 C *gref = g + psi_index_g[j * lprod + l];
1552 R *gref_real = (R*) gref;
1553
1554 #pragma omp atomic
1555 gref_real[0] += CREAL(val);
1556
1557 #pragma omp atomic
1558 gref_real[1] += CIMAG(val);
1559#else
1560 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1561#endif
1562 }
1563 }
1564}
1565
/* Serial builds only: instantiate MACRO_B for the adjoint case T.
 * NOTE(review): presumably this expands to the definition of B_serial_T that
 * B_T() calls when _OPENMP is not defined -- confirm against MACRO_B. */
1566#ifndef _OPENMP
1567MACRO_B(T)
1568#endif
1569
1570
1571#ifdef _OPENMP
1572
/*
 * Debug checks for the result k of index_x_binary_search() on the key table
 * ar_x, for range a (min_u_a) and range b (min_u_b) respectively: entry k must
 * have a key >= the range start unless k is the last entry, and the entry
 * before k must have a smaller key.  Both expand to nothing unless OMP_ASSERT
 * is defined.
 * The expansion site has to provide ar_x, k, M and min_u_a / min_u_b under
 * exactly these names.
 */
1573#ifdef OMP_ASSERT
1574#define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
1575{ \
1576 assert(ar_x[2*k] >= min_u_a || k == M-1); \
1577 if (k > 0) \
1578 assert(ar_x[2*k-2] < min_u_a); \
1579}
1580#else
1581#define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A
1582#endif
1583
1584#ifdef OMP_ASSERT
1585#define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
1586{ \
1587 assert(ar_x[2*k] >= min_u_b || k == M-1); \
1588 if (k > 0) \
1589 assert(ar_x[2*k-2] < min_u_b); \
1590}
1591#else
1592#define MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B
1593#endif
1594
/*
 * Per-precomputation-mode building blocks for the OpenMP adjoint kernels.
 * For each mode there is a pair of macros:
 *   ..._BEFORE_LOOP_<mode>  declares and fills the per-node window tables that
 *                           are needed before the loop over the window starts
 *   ..._UPDATE_<mode>       the matching MACRO_update_phi_prod_ll_plain call
 *                           used inside the generic loop over the window
 * All of them rely on names from the expansion site (ths, j, u, o, t2 and,
 * for the fast-Gaussian modes, fg_exp_l).
 */

/* PRE_PSI: nothing to prepare, the weights are read by the update macro. */
1595#define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_PSI
1596#define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_PSI \
1597 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1598
/* PRE_FG_PSI: for every dimension t2 two values are read from ths->psi
 * (at 2*(j*d+t2) and the slot after it); entry lj_fg of the table is
 * first * second^lj_fg * fg_exp_l[t2][lj_fg], for lj_fg = 0..o[t2]-u[t2]. */
1599#define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_FG_PSI \
1600 R fg_psi[ths->d][2*ths->m+2]; \
1601 R tmpEXP1, tmp1; \
1602 INT l_fg,lj_fg; \
1603 for (t2 = 0; t2 < ths->d; t2++) \
1604 { \
1605 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
1606 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
1607 tmp1 = K(1.0); \
1608 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
1609 { \
1610 tmp1 *= tmpEXP1; \
1611 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
1612 } \
1613 }
/* The PRE_FG_PSI update reads the fg_psi table, hence the with_FG_PSI tag. */
1614#define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_FG_PSI \
1615 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1616
/* FG_PSI: same table as above, but the two starting values are evaluated on
 * the fly: PHI at the first window point and
 * EXP(2*(n[t2]*x - u[t2])/b[t2]). */
1617#define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_FG_PSI \
1618 R fg_psi[ths->d][2*ths->m+2]; \
1619 R tmpEXP1, tmp1; \
1620 INT l_fg,lj_fg; \
1621 for (t2 = 0; t2 < ths->d; t2++) \
1622 { \
1623 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/((R)ths->n[t2])),t2)); \
1624 \
1625 tmpEXP1 = EXP(K(2.0)*((R)ths->n[t2]*ths->x[j*ths->d+t2] - (R)u[t2]) \
1626 /ths->b[t2]); \
1627 tmp1 = K(1.0); \
1628 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
1629 { \
1630 tmp1 *= tmpEXP1; \
1631 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
1632 } \
1633 }
1634#define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_FG_PSI \
1635 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1636
/* PRE_LIN_PSI: the window values are linearly interpolated from the table
 * ths->psi, which is addressed with (K+1)*t2 as the offset of dimension t2;
 * the results are stored in fg_psi so the with_FG_PSI update can be reused.
 * ip_s = K/(m+2) is the table step between neighbouring window points. */
1637#define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_with_PRE_LIN_PSI \
1638 R y[ths->d]; \
1639 R fg_psi[ths->d][2*ths->m+2]; \
1640 INT l_fg,lj_fg; \
1641 R ip_w; \
1642 INT ip_u; \
1643 INT ip_s = ths->K/(ths->m+2); \
1644 for (t2 = 0; t2 < ths->d; t2++) \
1645 { \
1646 y[t2] = ((((R)ths->n[t2])*ths->x[j*ths->d+t2]-(R)u[t2]) \
1647 * ((R)ths->K))/((R)ths->m+2); \
1648 ip_u = LRINT(FLOOR(y[t2])); \
1649 ip_w = y[t2]-ip_u; \
1650 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
1651 { \
1652 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
1653 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
1654 * (ip_w); \
1655 } \
1656 }
1657#define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_with_PRE_LIN_PSI \
1658 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1659
/* No precomputation: evaluate PHI at all 2*m+2 window points of every
 * dimension and store them row-wise in psij_const. */
1660#define MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_without_PRE_PSI \
1661 R psij_const[ths->d * (2*ths->m+2)]; \
1662 for (t2 = 0; t2 < ths->d; t2++) \
1663 { \
1664 INT lj_t; \
1665 for (lj_t = 0; lj_t < 2*ths->m+2; lj_t++) \
1666 psij_const[t2 * (2*ths->m+2) + lj_t] = PHI(ths->n[t2], ths->x[j*ths->d+t2] \
1667 - ((R) (lj_t+u[t2]))/((R)ths->n[t2]), t2); \
1668 }
1669#define MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_without_PRE_PSI \
1670 MACRO_update_phi_prod_ll_plain(without_PRE_PSI_improved);
1671
/*
 * Blockwise adjoint update for one node j: adds phi_prod * ths->f[j] to the
 * entries of ths->g covered by the node's window, restricted to window rows
 * whose first-dimension grid index l_all[lj[0]] lies in [my_u0, my_o0].  The
 * additions are plain (non-atomic) "+=".
 *
 * d == 4 and d == 5 use explicitly nested loops in which the slab test is done
 * once per l0.  Every other d uses one flat loop over l_L < lprod; when the
 * first-dimension index is outside the slab (tested while t == 0) the loop
 * jumps ahead by lprodrest entries at once.
 *
 * Names taken from the expansion site: ths, j, lprod, lprodrest, my_u0, my_o0.
 * NOTE(review): l_all is not declared here; presumably it comes from
 * MACRO_init_uo_l_lj_t, which is defined elsewhere -- confirm.
 */
1672#define MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \
1673{ \
1674 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
1675 INT t, t2; /* index dimensions */ \
1676 INT l_L; /* index one row of B */ \
1677 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
1678 INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
1679 R phi_prod[ths->d+1]; /* postfix product of PHI */ \
1680 \
1681 phi_prod[0] = K(1.0); \
1682 ll_plain[0] = 0; \
1683 \
1684 MACRO_init_uo_l_lj_t; \
1685 \
1686 MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_ ##whichone \
1687 \
1688 if (ths->d == 4) \
1689 { \
1690 INT l0, l1, l2, l3; \
1691 for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1692 { \
1693 lj[0] = l0; \
1694 t2 = 0; \
1695 if (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0) \
1696 continue; \
1697 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1698 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1699 for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1700 { \
1701 lj[1] = l1; \
1702 t2 = 1; \
1703 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1704 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1705 for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1706 { \
1707 lj[2] = l2; \
1708 t2 = 2; \
1709 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1710 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1711 for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1712 { \
1713 lj[3] = l3; \
1714 t2 = 3; \
1715 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1716 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1717 \
1718 ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \
1719 } \
1720 } \
1721 } \
1722 } \
1723 } /* if(d==4) */ \
1724 else if (ths->d == 5) \
1725 { \
1726 INT l0, l1, l2, l3, l4; \
1727 for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1728 { \
1729 lj[0] = l0; \
1730 t2 = 0; \
1731 if (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0) \
1732 continue; \
1733 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1734 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1735 for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1736 { \
1737 lj[1] = l1; \
1738 t2 = 1; \
1739 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1740 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1741 for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1742 { \
1743 lj[2] = l2; \
1744 t2 = 2; \
1745 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1746 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1747 for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1748 { \
1749 lj[3] = l3; \
1750 t2 = 3; \
1751 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1752 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1753 for (l4 = 0; l4 < 2*ths->m+2; l4++) \
1754 { \
1755 lj[4] = l4; \
1756 t2 = 4; \
1757 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1758 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1759 \
1760 ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \
1761 } \
1762 } \
1763 } \
1764 } \
1765 } \
1766 } /* if(d==5) */ \
1767 else { \
1768 l_L = 0; \
1769 while (l_L < lprod) \
1770 { \
1771 if (t == 0 && (l_all[lj[0]] < my_u0 || l_all[lj[0]] > my_o0)) \
1772 { \
1773 lj[0]++; \
1774 l_L += lprodrest; \
1775 continue; \
1776 } \
1777 MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_ ##whichone \
1778 ths->g[ll_plain[ths->d]] += phi_prod[ths->d] * ths->f[j]; \
1779 MACRO_count_uo_l_lj_t; \
1780 l_L++; \
1781 } /* for(l_L) */ \
1782 } \
1783}
1784
/*
 * Blockwise driver for the d-dimensional adjoint.  Does nothing unless
 * NFFT_OMP_BLOCKWISE_ADJOINT is set in ths->flags; if it is set, the macro
 * handles all nodes and then executes "return", i.e. it leaves the function
 * it was expanded in.
 *
 * Inside a parallel region every thread asks
 * nfft_adjoint_B_omp_blockwise_init() for its slab [my_u0, my_o0] and for up
 * to two key ranges (a and b).  For each range that is not -1 it locates the
 * first entry of ths->index_x with a key >= the range start by binary search
 * and runs MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE for successive entries
 * until a key falls outside the range.
 *
 * lprodrest is (2*m+2)^(d-1).  The expansion site must provide ths and k; the
 * range-b part declares its own inner k.
 */
1785#define MACRO_adjoint_nd_B_OMP_BLOCKWISE(whichone) \
1786{ \
1787 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
1788 { \
1789 INT lprodrest = 1; \
1790 for (k = 1; k < ths->d; k++) \
1791 lprodrest *= (2*ths->m+2); \
1792 _Pragma("omp parallel private(k)") \
1793 { \
1794 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
1795 INT *ar_x = ths->index_x; \
1796 \
1797 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
1798 &min_u_b, &max_u_b, ths->d, ths->n, ths->m); \
1799 \
1800 if (min_u_a != -1) \
1801 { \
1802 k = index_x_binary_search(ar_x, ths->M_total, min_u_a); \
1803 \
1804 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
1805 \
1806 while (k < ths->M_total) \
1807 { \
1808 INT u_prod = ar_x[2*k]; \
1809 INT j = ar_x[2*k+1]; \
1810 \
1811 if (u_prod < min_u_a || u_prod > max_u_a) \
1812 break; \
1813 \
1814 MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \
1815 \
1816 k++; \
1817 } \
1818 } \
1819 \
1820 if (min_u_b != -1) \
1821 { \
1822 INT k = index_x_binary_search(ar_x, ths->M_total, min_u_b); \
1823 \
1824 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
1825 \
1826 while (k < ths->M_total) \
1827 { \
1828 INT u_prod = ar_x[2*k]; \
1829 INT j = ar_x[2*k+1]; \
1830 \
1831 if (u_prod < min_u_b || u_prod > max_u_b) \
1832 break; \
1833 \
1834 MACRO_adjoint_nd_B_OMP_BLOCKWISE_COMPUTE(whichone) \
1835 \
1836 k++; \
1837 } \
1838 } \
1839 } /* omp parallel */ \
1840 return; \
1841 } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
1842}
1843
/*
 * Atomic adjoint update for one node j: adds phi_prod * ths->f[j] to every
 * entry of ths->g covered by the node's window.  Because several threads may
 * hit the same entry, the target element is viewed as two R values and the
 * real and the imaginary part are each added under their own "omp atomic".
 *
 * d == 4 and d == 5 use explicitly nested loops, every other d one flat loop
 * over l_L < lprod driven by the UPDATE and MACRO_count_uo_l_lj_t macros.
 *
 * Names taken from the expansion site: ths, j, lprod, t and t2 (unlike the
 * blockwise variant, t and t2 are not declared here).
 */
1844#define MACRO_adjoint_nd_B_OMP_COMPUTE(whichone) \
1845{ \
1846 INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
1847 INT l_L; /* index one row of B */ \
1848 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
1849 INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
1850 R phi_prod[ths->d+1]; /* postfix product of PHI */ \
1851 \
1852 phi_prod[0] = K(1.0); \
1853 ll_plain[0] = 0; \
1854 \
1855 MACRO_init_uo_l_lj_t; \
1856 \
1857 MACRO_adjoint_nd_B_OMP_COMPUTE_BEFORE_LOOP_ ## whichone \
1858 \
1859 if (ths->d == 4) \
1860 { \
1861 INT l0, l1, l2, l3; \
1862 for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1863 { \
1864 lj[0] = l0; \
1865 t2 = 0; \
1866 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1867 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1868 for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1869 { \
1870 lj[1] = l1; \
1871 t2 = 1; \
1872 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1873 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1874 for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1875 { \
1876 lj[2] = l2; \
1877 t2 = 2; \
1878 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1879 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1880 for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1881 { \
1882 lj[3] = l3; \
1883 t2 = 3; \
1884 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1885 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1886 \
1887 C *lhs = ths->g + ll_plain[ths->d]; \
1888 R *lhs_real = (R*)lhs; \
1889 C val = phi_prod[ths->d] * ths->f[j]; \
1890 \
1891 _Pragma("omp atomic") \
1892 lhs_real[0] += CREAL(val); \
1893 \
1894 _Pragma("omp atomic") \
1895 lhs_real[1] += CIMAG(val); \
1896 } \
1897 } \
1898 } \
1899 } \
1900 } /* if(d==4) */ \
1901 else if (ths->d == 5) \
1902 { \
1903 INT l0, l1, l2, l3, l4; \
1904 for (l0 = 0; l0 < 2*ths->m+2; l0++) \
1905 { \
1906 lj[0] = l0; \
1907 t2 = 0; \
1908 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1909 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1910 for (l1 = 0; l1 < 2*ths->m+2; l1++) \
1911 { \
1912 lj[1] = l1; \
1913 t2 = 1; \
1914 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1915 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1916 for (l2 = 0; l2 < 2*ths->m+2; l2++) \
1917 { \
1918 lj[2] = l2; \
1919 t2 = 2; \
1920 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1921 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1922 for (l3 = 0; l3 < 2*ths->m+2; l3++) \
1923 { \
1924 lj[3] = l3; \
1925 t2 = 3; \
1926 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1927 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1928 for (l4 = 0; l4 < 2*ths->m+2; l4++) \
1929 { \
1930 lj[4] = l4; \
1931 t2 = 4; \
1932 phi_prod[t2+1] = phi_prod[t2] * MACRO_COMPUTE_ ## whichone; \
1933 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + l_all[t2*(2*ths->m+2) + lj[t2]]; \
1934 \
1935 C *lhs = ths->g + ll_plain[ths->d]; \
1936 R *lhs_real = (R*)lhs; \
1937 C val = phi_prod[ths->d] * ths->f[j]; \
1938 \
1939 _Pragma("omp atomic") \
1940 lhs_real[0] += CREAL(val); \
1941 \
1942 _Pragma("omp atomic") \
1943 lhs_real[1] += CIMAG(val); \
1944 } \
1945 } \
1946 } \
1947 } \
1948 } \
1949 } /* if(d==5) */ \
1950 else { \
1951 for (l_L = 0; l_L < lprod; l_L++) \
1952 { \
1953 C *lhs; \
1954 R *lhs_real; \
1955 C val; \
1956 \
1957 MACRO_adjoint_nd_B_OMP_COMPUTE_UPDATE_ ## whichone \
1958 \
1959 lhs = ths->g + ll_plain[ths->d]; \
1960 lhs_real = (R*)lhs; \
1961 val = phi_prod[ths->d] * ths->f[j]; \
1962 \
1963 _Pragma("omp atomic") \
1964 lhs_real[0] += CREAL(val); \
1965 \
1966 _Pragma("omp atomic") \
1967 lhs_real[1] += CIMAG(val); \
1968 \
1969 MACRO_count_uo_l_lj_t; \
1970 } /* for(l_L) */ \
1971 } \
1972}
1973
1974static inline void B_openmp_T(X(plan) *ths)
1975{
1976 INT lprod; /* 'regular bandwidth' of matrix B */
1977 INT k;
1978
1979 memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C));
1980
1981 for (k = 0, lprod = 1; k < ths->d; k++)
1982 lprod *= (2*ths->m+2);
1983
1984 if (ths->flags & PRE_FULL_PSI)
1985 {
1986 nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1987 ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1988 return;
1989 }
1990
1991 if (ths->flags & PRE_PSI)
1992 {
1993 MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_PSI);
1994
1995 #pragma omp parallel for default(shared) private(k)
1996 for (k = 0; k < ths->M_total; k++)
1997 {
1998 INT t, t2; /* index dimensions */ \
1999 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2000 MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_PSI);
2001 } /* for(j) */
2002 return;
2003 } /* if(PRE_PSI) */
2004
2005 if (ths->flags & PRE_FG_PSI)
2006 {
2007 INT t, t2; /* index dimensions */
2008 R fg_exp_l[ths->d][2*ths->m+2];
2009 for(t2 = 0; t2 < ths->d; t2++)
2010 {
2011 INT lj_fg;
2012 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
2013 R tmpEXP2sq = tmpEXP2*tmpEXP2;
2014 R tmp2 = K(1.0);
2015 R tmp3 = K(1.0);
2016 fg_exp_l[t2][0] = K(1.0);
2017 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
2018 {
2019 tmp3 = tmp2*tmpEXP2;
2020 tmp2 *= tmpEXP2sq;
2021 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
2022 }
2023 }
2024
2025 MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_FG_PSI);
2026
2027 #pragma omp parallel for default(shared) private(k,t,t2)
2028 for (k = 0; k < ths->M_total; k++)
2029 {
2030 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2031 MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_FG_PSI);
2032 } /* for(j) */
2033 return;
2034 } /* if(PRE_FG_PSI) */
2035
2036 if (ths->flags & FG_PSI)
2037 {
2038 INT t, t2; /* index dimensions */
2039 R fg_exp_l[ths->d][2*ths->m+2];
2040
2041 sort(ths);
2042
2043 for (t2 = 0; t2 < ths->d; t2++)
2044 {
2045 INT lj_fg;
2046 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
2047 R tmpEXP2sq = tmpEXP2*tmpEXP2;
2048 R tmp2 = K(1.0);
2049 R tmp3 = K(1.0);
2050 fg_exp_l[t2][0] = K(1.0);
2051 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
2052 {
2053 tmp3 = tmp2*tmpEXP2;
2054 tmp2 *= tmpEXP2sq;
2055 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
2056 }
2057 }
2058
2059 MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_FG_PSI);
2060
2061 #pragma omp parallel for default(shared) private(k,t,t2)
2062 for (k = 0; k < ths->M_total; k++)
2063 {
2064 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2065 MACRO_adjoint_nd_B_OMP_COMPUTE(with_FG_PSI);
2066 } /* for(j) */
2067 return;
2068 } /* if(FG_PSI) */
2069
2070 if (ths->flags & PRE_LIN_PSI)
2071 {
2072 sort(ths);
2073
2074 MACRO_adjoint_nd_B_OMP_BLOCKWISE(with_PRE_LIN_PSI);
2075
2076 #pragma omp parallel for default(shared) private(k)
2077 for (k = 0; k<ths->M_total; k++)
2078 {
2079 INT t, t2; /* index dimensions */
2080 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2081 MACRO_adjoint_nd_B_OMP_COMPUTE(with_PRE_LIN_PSI);
2082 } /* for(j) */
2083 return;
2084 } /* if(PRE_LIN_PSI) */
2085
2086 /* no precomputed psi at all */
2087 sort(ths);
2088
2089 MACRO_adjoint_nd_B_OMP_BLOCKWISE(without_PRE_PSI);
2090
2091 #pragma omp parallel for default(shared) private(k)
2092 for (k = 0; k < ths->M_total; k++)
2093 {
2094 INT t, t2; /* index dimensions */
2095 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2096 MACRO_adjoint_nd_B_OMP_COMPUTE(without_PRE_PSI);
2097 } /* for(j) */
2098}
2099#endif
2100
2101static void B_T(X(plan) *ths)
2102{
2103#ifdef _OPENMP
2104 B_openmp_T(ths);
2105#else
2106 B_serial_T(ths);
2107#endif
2108}
2109
2110/* ## specialized version for d=1 ########################################### */
2111
2112static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
2113{
2114 const INT tmp2 = 2*m+2;
2115 INT l;
2116 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2117
2118 fg_exp_b0 = EXP(K(-1.0)/b);
2119 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2120 fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
2121
2122 for (l = 1; l < tmp2; l++)
2123 {
2124 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2125 fg_exp_b1 *= fg_exp_b0_sq;
2126 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2127 }
2128}
2129
2130
2131static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
2132 const R *xj, const INT n, const INT m)
2133{
2134 INT u, o, l;
2135 const C *gj;
2136 const R *psij;
2137 psij = psij_const;
2138
2139 uo2(&u, &o, *xj, n, m);
2140
2141 if (u < o)
2142 {
2143 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
2144 (*fj) += (*psij++) * (*gj++);
2145 }
2146 else
2147 {
2148 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
2149 (*fj) += (*psij++) * (*gj++);
2150 for (l = 0, gj = g; l <= o; l++)
2151 (*fj) += (*psij++) * (*gj++);
2152 }
2153}
2154
2155#ifndef _OPENMP
2156static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,
2157 const R *psij_const, const R *xj, const INT n, const INT m)
2158{
2159 INT u,o,l;
2160 C *gj;
2161 const R *psij;
2162 psij = psij_const;
2163
2164 uo2(&u,&o,*xj, n, m);
2165
2166 if (u < o)
2167 {
2168 for (l = 0, gj = g+u; l <= 2*m+1; l++)
2169 (*gj++) += (*psij++) * (*fj);
2170 }
2171 else
2172 {
2173 for (l = 0, gj = g+u; l < 2*m+1-o; l++)
2174 (*gj++) += (*psij++) * (*fj);
2175 for (l = 0, gj = g; l <= o; l++)
2176 (*gj++) += (*psij++) * (*fj);
2177 }
2178}
2179#endif
2180
2181#ifdef _OPENMP
2182/* adjoint NFFT one-dimensional case with OpenMP atomic operations */
2183static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,
2184 const R *psij_const, const R *xj, const INT n, const INT m)
2185{
2186 INT u,o,l;
2187 C *gj;
2188 INT index_temp[2*m+2];
2189
2190 uo2(&u,&o,*xj, n, m);
2191
2192 for (l=0; l<=2*m+1; l++)
2193 index_temp[l] = (l+u)%n;
2194
2195 for (l = 0, gj = g+u; l <= 2*m+1; l++)
2196 {
2197 INT i = index_temp[l];
2198 C *lhs = g+i;
2199 R *lhs_real = (R*)lhs;
2200 C val = psij_const[l] * f;
2201 #pragma omp atomic
2202 lhs_real[0] += CREAL(val);
2203
2204 #pragma omp atomic
2205 lhs_real[1] += CIMAG(val);
2206 }
2207}
2208#endif
2209
2210#ifdef _OPENMP
2226static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,
2227 const R *psij_const, const R *xj, const INT n, const INT m,
2228 const INT my_u0, const INT my_o0)
2229{
2230 INT ar_u,ar_o,l;
2231
2232 uo2(&ar_u,&ar_o,*xj, n, m);
2233
2234 if (ar_u < ar_o)
2235 {
2236 INT u = MAX(my_u0,ar_u);
2237 INT o = MIN(my_o0,ar_o);
2238 INT offset_psij = u-ar_u;
2239#ifdef OMP_ASSERT
2240 assert(offset_psij >= 0);
2241 assert(o-u <= 2*m+1);
2242 assert(offset_psij+o-u <= 2*m+1);
2243#endif
2244
2245 for (l = 0; l <= o-u; l++)
2246 g[u+l] += psij_const[offset_psij+l] * f;
2247 }
2248 else
2249 {
2250 INT u = MAX(my_u0,ar_u);
2251 INT o = my_o0;
2252 INT offset_psij = u-ar_u;
2253#ifdef OMP_ASSERT
2254 assert(offset_psij >= 0);
2255 assert(o-u <= 2*m+1);
2256 assert(offset_psij+o-u <= 2*m+1);
2257#endif
2258
2259 for (l = 0; l <= o-u; l++)
2260 g[u+l] += psij_const[offset_psij+l] * f;
2261
2262 u = my_u0;
2263 o = MIN(my_o0,ar_o);
2264 offset_psij += my_u0-ar_u+n;
2265
2266#ifdef OMP_ASSERT
2267 if (u <= o)
2268 {
2269 assert(o-u <= 2*m+1);
2270 if (offset_psij+o-u > 2*m+1)
2271 {
2272 fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
2273 }
2274 assert(offset_psij+o-u <= 2*m+1);
2275 }
2276#endif
2277 for (l = 0; l <= o-u; l++)
2278 g[u+l] += psij_const[offset_psij+l] * f;
2279 }
2280}
2281#endif
2282
2283static void nfft_trafo_1d_B(X(plan) *ths)
2284{
2285 const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
2286 const C *g = (C*)ths->g;
2287
2288 if (ths->flags & PRE_FULL_PSI)
2289 {
2290 INT k;
2291#ifdef _OPENMP
2292 #pragma omp parallel for default(shared) private(k)
2293#endif
2294 for (k = 0; k < M; k++)
2295 {
2296 INT l;
2297 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2298 ths->f[j] = K(0.0);
2299 for (l = 0; l < m2p2; l++)
2300 ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
2301 }
2302 return;
2303 } /* if(PRE_FULL_PSI) */
2304
2305 if (ths->flags & PRE_PSI)
2306 {
2307 INT k;
2308#ifdef _OPENMP
2309 #pragma omp parallel for default(shared) private(k)
2310#endif
2311 for (k = 0; k < M; k++)
2312 {
2313 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2314 nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
2315 &ths->x[j], n, m);
2316 }
2317 return;
2318 } /* if(PRE_PSI) */
2319
2320 if (ths->flags & PRE_FG_PSI)
2321 {
2322 INT k;
2323 R fg_exp_l[m2p2];
2324
2325 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2326
2327#ifdef _OPENMP
2328 #pragma omp parallel for default(shared) private(k)
2329#endif
2330 for (k = 0; k < M; k++)
2331 {
2332 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2333 const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2334 R fg_psij2 = K(1.0);
2335 R psij_const[m2p2];
2336 INT l;
2337
2338 psij_const[0] = fg_psij0;
2339
2340 for (l = 1; l < m2p2; l++)
2341 {
2342 fg_psij2 *= fg_psij1;
2343 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2344 }
2345
2346 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2347 }
2348
2349 return;
2350 } /* if(PRE_FG_PSI) */
2351
2352 if (ths->flags & FG_PSI)
2353 {
2354 INT k;
2355 R fg_exp_l[m2p2];
2356
2357 sort(ths);
2358
2359 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2360
2361#ifdef _OPENMP
2362 #pragma omp parallel for default(shared) private(k)
2363#endif
2364 for (k = 0; k < M; k++)
2365 {
2366 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2367 INT u, o, l;
2368 R fg_psij0, fg_psij1, fg_psij2;
2369 R psij_const[m2p2];
2370
2371 uo(ths, (INT)j, &u, &o, (INT)0);
2372 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2373 fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2374 fg_psij2 = K(1.0);
2375
2376 psij_const[0] = fg_psij0;
2377
2378 for (l = 1; l < m2p2; l++)
2379 {
2380 fg_psij2 *= fg_psij1;
2381 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2382 }
2383
2384 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2385 }
2386 return;
2387 } /* if(FG_PSI) */
2388
2389 if (ths->flags & PRE_LIN_PSI)
2390 {
2391 const INT K = ths->K, ip_s = K / (m + 2);
2392 INT k;
2393
2394 sort(ths);
2395
2396#ifdef _OPENMP
2397 #pragma omp parallel for default(shared) private(k)
2398#endif
2399 for (k = 0; k < M; k++)
2400 {
2401 INT u, o, l;
2402 R ip_y, ip_w;
2403 INT ip_u;
2404 R psij_const[m2p2];
2405 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2406
2407 uo(ths, (INT)j, &u, &o, (INT)0);
2408
2409 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2410 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2411 ip_w = ip_y - (R)(ip_u);
2412
2413 for (l = 0; l < m2p2; l++)
2414 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2415 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2416
2417 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2418 }
2419 return;
2420 } /* if(PRE_LIN_PSI) */
2421 else
2422 {
2423 /* no precomputed psi at all */
2424 INT k;
2425
2426 sort(ths);
2427
2428#ifdef _OPENMP
2429 #pragma omp parallel for default(shared) private(k)
2430#endif
2431 for (k = 0; k < M; k++)
2432 {
2433 R psij_const[m2p2];
2434 INT u, o, l;
2435 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2436
2437 uo(ths, (INT)j, &u, &o, (INT)0);
2438
2439 for (l = 0; l < m2p2; l++)
2440 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2441
2442 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2443 }
2444 }
2445}
2446
2447
2448#define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2449{ \
2450 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2451 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2452}
2453
2454#define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2455{ \
2456 R psij_const[2 * m + 2]; \
2457 INT l; \
2458 R fg_psij0 = ths->psi[2 * j]; \
2459 R fg_psij1 = ths->psi[2 * j + 1]; \
2460 R fg_psij2 = K(1.0); \
2461 \
2462 psij_const[0] = fg_psij0; \
2463 for (l = 1; l <= 2 * m + 1; l++) \
2464 { \
2465 fg_psij2 *= fg_psij1; \
2466 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2467 } \
2468 \
2469 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2470 ths->x + j, n, m, my_u0, my_o0); \
2471}
2472
/*
 * Blockwise 1d adjoint, FG_PSI variant: nothing is stored per node, so the
 * starting value (PHI at the first window index returned by uo) and the
 * per-step ratio are computed here; the remaining window values follow from
 * the same recurrence as in the PRE_FG_PSI variant.  Relies on ths, g, j, n,
 * m, fg_exp_l, my_u0 and my_o0 being visible where the macro is expanded.
 */
#define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
{ \
  R fg_window[2 * m + 2]; \
  R fg_base, fg_ratio, fg_power; \
  INT fg_lo, fg_hi, fg_idx; \
  \
  uo(ths, j, &fg_lo, &fg_hi, (INT)0); \
  fg_base = (PHI(ths->n[0],ths->x[j]-((R)fg_lo)/((R)n),0)); \
  fg_ratio = EXP(K(2.0) * (((R)n) * (ths->x[j]) - (R)fg_lo) / ths->b[0]); \
  fg_power = K(1.0); \
  \
  fg_window[0] = fg_base; \
  for (fg_idx = 1; fg_idx < 2 * m + 2; fg_idx++) \
  { \
    fg_power *= fg_ratio; \
    fg_window[fg_idx] = fg_base * fg_power * fg_exp_l[fg_idx]; \
  } \
  \
  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, fg_window, \
      ths->x + j, n, m, my_u0, my_o0); \
}
2493
/*
 * Blockwise 1d adjoint, PRE_LIN_PSI variant: each of the 2m+2 window values
 * of node j is obtained by linear interpolation between two neighbouring
 * entries of the lookup table ths->psi (ip_s table steps per grid step).
 * Relies on ths, g, j, n, m, ip_s, my_u0 and my_o0 being visible where the
 * macro is expanded.
 */
#define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
{ \
  R lin_window[2 * m + 2]; \
  R lin_y, lin_w_hi, lin_w_lo; \
  INT lin_cell, lin_lo, lin_hi, lin_idx; \
  \
  uo(ths, j, &lin_lo, &lin_hi, (INT)0); \
  \
  /* position in table units, split into integer cell and fraction */ \
  lin_y = FABS(((R)n) * ths->x[j] - (R)lin_lo) * ((R)ip_s); \
  lin_cell = LRINT(FLOOR(lin_y)); \
  lin_w_hi = lin_y - lin_cell; \
  lin_w_lo = K(1.0) - lin_w_hi; \
  \
  for (lin_idx = 0; lin_idx <= 2 * m + 1; lin_idx++) \
  { \
    const INT lin_pos = lin_cell - lin_idx * ip_s; \
    \
    lin_window[lin_idx] = ths->psi[ABS(lin_pos)] * lin_w_lo \
        + ths->psi[ABS(lin_pos + 1)] * lin_w_hi; \
  } \
  \
  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, lin_window, \
      ths->x + j, n, m, my_u0, my_o0); \
}
2514
/*
 * Blockwise 1d adjoint, variant without any precomputation: all 2m+2 window
 * values of node j are evaluated with PHI, starting at the first window
 * index returned by uo.  Relies on ths, g, j, n, m, my_u0 and my_o0 being
 * visible where the macro is expanded.
 */
#define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
{ \
  R direct_window[2 * m + 2]; \
  INT direct_lo, direct_hi, direct_idx; \
  \
  uo(ths, j, &direct_lo, &direct_hi, (INT)0); \
  \
  for (direct_idx = 0; direct_idx < 2 * m + 2; direct_idx++) \
    direct_window[direct_idx] = (PHI(ths->n[0], \
        ths->x[j]-((R)((direct_lo+direct_idx)))/((R)n),0)); \
  \
  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, direct_window, \
      ths->x + j, n, m, my_u0, my_o0); \
}
2528
/*
 * Driver for the blockwise OpenMP 1d adjoint.  Does nothing unless the plan
 * has NFFT_OMP_BLOCKWISE_ADJOINT set; otherwise it opens a parallel region,
 * lets nfft_adjoint_B_omp_blockwise_init() hand each thread its grid block
 * (my_u0..my_o0) and up to two key ranges (a and b, -1 meaning "unused"),
 * processes all entries of ths->index_x whose key lies in those ranges with
 * the COMPUTE macro selected by `whichone`, and then returns from the
 * enclosing function.  ths->index_x is read as (key, node) pairs and is
 * presumably sorted by key, since the start position comes from
 * index_x_binary_search().  Relies on ths, g, k, n, m and M being visible
 * where the macro is expanded.
 */
#define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
{ \
  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
  { \
    _Pragma("omp parallel private(k)") \
    { \
      INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
      INT *ar_x = ths->index_x; \
      \
      nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
          &min_u_b, &max_u_b, 1, &n, m); \
      \
      /* first key range */ \
      if (min_u_a != -1) \
      { \
        k = index_x_binary_search(ar_x, M, min_u_a); \
        \
        MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
        \
        for (; k < M; k++) \
        { \
          const INT u_prod = ar_x[2*k]; \
          INT j = ar_x[2*k+1]; \
          \
          if (!(u_prod >= min_u_a && u_prod <= max_u_a)) \
            break; \
          \
          MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
        } \
      } \
      \
      /* second key range */ \
      if (min_u_b != -1) \
      { \
        k = index_x_binary_search(ar_x, M, min_u_b); \
        \
        MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
        \
        for (; k < M; k++) \
        { \
          const INT u_prod = ar_x[2*k]; \
          INT j = ar_x[2*k+1]; \
          \
          if (!(u_prod >= min_u_b && u_prod <= max_u_b)) \
            break; \
          \
          MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
        } \
      } \
    } /* omp parallel */ \
    return; \
  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
}
2584
2585static void nfft_adjoint_1d_B(X(plan) *ths)
2586{
2587 const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2588 INT k;
2589 C *g = (C*)ths->g;
2590
2591 memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
2592
2593 if (ths->flags & PRE_FULL_PSI)
2594 {
2595 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2596 (INT)1, ths->n, m, ths->flags, ths->index_x);
2597 return;
2598 } /* if(PRE_FULL_PSI) */
2599
2600 if (ths->flags & PRE_PSI)
2601 {
2602#ifdef _OPENMP
2603 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2604#endif
2605
2606#ifdef _OPENMP
2607 #pragma omp parallel for default(shared) private(k)
2608#endif
2609 for (k = 0; k < M; k++)
2610 {
2611 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2612#ifdef _OPENMP
2613 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2614#else
2615 nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2616#endif
2617 }
2618
2619 return;
2620 } /* if(PRE_PSI) */
2621
2622 if (ths->flags & PRE_FG_PSI)
2623 {
2624 R fg_exp_l[2 * m + 2];
2625
2626 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2627
2628#ifdef _OPENMP
2629 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2630#endif
2631
2632
2633#ifdef _OPENMP
2634 #pragma omp parallel for default(shared) private(k)
2635#endif
2636 for (k = 0; k < M; k++)
2637 {
2638 R psij_const[2 * m + 2];
2639 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2640 INT l;
2641 R fg_psij0 = ths->psi[2 * j];
2642 R fg_psij1 = ths->psi[2 * j + 1];
2643 R fg_psij2 = K(1.0);
2644
2645 psij_const[0] = fg_psij0;
2646 for (l = 1; l <= 2 * m + 1; l++)
2647 {
2648 fg_psij2 *= fg_psij1;
2649 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2650 }
2651
2652#ifdef _OPENMP
2653 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2654#else
2655 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2656#endif
2657 }
2658
2659 return;
2660 } /* if(PRE_FG_PSI) */
2661
2662 if (ths->flags & FG_PSI)
2663 {
2664 R fg_exp_l[2 * m + 2];
2665
2666 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2667
2668 sort(ths);
2669
2670#ifdef _OPENMP
2671 MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2672#endif
2673
2674#ifdef _OPENMP
2675 #pragma omp parallel for default(shared) private(k)
2676#endif
2677 for (k = 0; k < M; k++)
2678 {
2679 INT u,o,l;
2680 R psij_const[2 * m + 2];
2681 R fg_psij0, fg_psij1, fg_psij2;
2682 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2683
2684 uo(ths, j, &u, &o, (INT)0);
2685 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2686 fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2687 fg_psij2 = K(1.0);
2688 psij_const[0] = fg_psij0;
2689 for (l = 1; l <= 2 * m + 1; l++)
2690 {
2691 fg_psij2 *= fg_psij1;
2692 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2693 }
2694
2695#ifdef _OPENMP
2696 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2697#else
2698 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2699#endif
2700 }
2701
2702 return;
2703 } /* if(FG_PSI) */
2704
2705 if (ths->flags & PRE_LIN_PSI)
2706 {
2707 const INT K = ths->K;
2708 const INT ip_s = K / (m + 2);
2709
2710 sort(ths);
2711
2712#ifdef _OPENMP
2713 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2714#endif
2715
2716#ifdef _OPENMP
2717 #pragma omp parallel for default(shared) private(k)
2718#endif
2719 for (k = 0; k < M; k++)
2720 {
2721 INT u,o,l;
2722 INT ip_u;
2723 R ip_y, ip_w;
2724 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2725 R psij_const[2 * m + 2];
2726
2727 uo(ths, j, &u, &o, (INT)0);
2728
2729 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2730 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2731 ip_w = ip_y - (R)(ip_u);
2732 for (l = 0; l < 2 * m + 2; l++)
2733 psij_const[l]
2734 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2735 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2736
2737#ifdef _OPENMP
2738 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2739#else
2740 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2741#endif
2742 }
2743 return;
2744 } /* if(PRE_LIN_PSI) */
2745
2746 /* no precomputed psi at all */
2747 sort(ths);
2748
2749#ifdef _OPENMP
2750 MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2751#endif
2752
2753#ifdef _OPENMP
2754 #pragma omp parallel for default(shared) private(k)
2755#endif
2756 for (k = 0; k < M; k++)
2757 {
2758 INT u,o,l;
2759 R psij_const[2 * m + 2];
2760 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2761
2762 uo(ths, j, &u, &o, (INT)0);
2763
2764 for (l = 0; l <= 2 * m + 1; l++)
2765 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2766
2767#ifdef _OPENMP
2768 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2769#else
2770 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2771#endif
2772 }
2773}
2774
2775void X(trafo_1d)(X(plan) *ths)
2776{
2777 if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2778 {
2779 X(trafo_direct)(ths);
2780 return;
2781 }
2782
2783 const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2784 C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2785
2786 ths->g_hat = ths->g1;
2787 ths->g = ths->g2;
2788
2789 {
2790 C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2791 R *c_phi_inv1, *c_phi_inv2;
2792
2793 TIC(0)
2794#ifdef _OPENMP
2795 {
2796 INT k;
2797 #pragma omp parallel for default(shared) private(k)
2798 for (k = 0; k < ths->n_total; k++)
2799 ths->g_hat[k] = 0.0;
2800 }
2801#else
2802 memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
2803#endif
2804 if(ths->flags & PRE_PHI_HUT)
2805 {
2806 INT k;
2807 c_phi_inv1 = ths->c_phi_inv[0];
2808 c_phi_inv2 = &ths->c_phi_inv[0][N2];
2809
2810#ifdef _OPENMP
2811 #pragma omp parallel for default(shared) private(k)
2812#endif
2813 for (k = 0; k < N2; k++)
2814 {
2815 g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2816 g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2817 }
2818 }
2819 else
2820 {
2821 INT k;
2822#ifdef _OPENMP
2823 #pragma omp parallel for default(shared) private(k)
2824#endif
2825 for (k = 0; k < N2; k++)
2826 {
2827 g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2828 g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2829 }
2830 }
2831 TOC(0)
2832
2833 TIC_FFTW(1)
2834 FFTW(execute)(ths->my_fftw_plan1);
2835 TOC_FFTW(1);
2836
2837 TIC(2);
2838 nfft_trafo_1d_B(ths);
2839 TOC(2);
2840 }
2841}
2842
2843void X(adjoint_1d)(X(plan) *ths)
2844{
2845 if((ths->N[0] <= ths->m) || (ths->n[0] <= 2*ths->m+2))
2846 {
2847 X(adjoint_direct)(ths);
2848 return;
2849 }
2850
2851 INT n,N;
2852 C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2853 R *c_phi_inv1, *c_phi_inv2;
2854
2855 N=ths->N[0];
2856 n=ths->n[0];
2857
2858 ths->g_hat=ths->g1;
2859 ths->g=ths->g2;
2860
2861 f_hat1=(C*)ths->f_hat;
2862 f_hat2=(C*)&ths->f_hat[N/2];
2863 g_hat1=(C*)&ths->g_hat[n-N/2];
2864 g_hat2=(C*)ths->g_hat;
2865
2866 TIC(2)
2867 nfft_adjoint_1d_B(ths);
2868 TOC(2)
2869
2870 TIC_FFTW(1)
2871 FFTW(execute)(ths->my_fftw_plan2);
2872 TOC_FFTW(1);
2873
2874 TIC(0)
2875 if(ths->flags & PRE_PHI_HUT)
2876 {
2877 INT k;
2878 c_phi_inv1=ths->c_phi_inv[0];
2879 c_phi_inv2=&ths->c_phi_inv[0][N/2];
2880
2881#ifdef _OPENMP
2882 #pragma omp parallel for default(shared) private(k)
2883#endif
2884 for (k = 0; k < N/2; k++)
2885 {
2886 f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2887 f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2888 }
2889 }
2890 else
2891 {
2892 INT k;
2893
2894#ifdef _OPENMP
2895 #pragma omp parallel for default(shared) private(k)
2896#endif
2897 for (k = 0; k < N/2; k++)
2898 {
2899 f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2900 f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2901 }
2902 }
2903 TOC(0)
2904}
2905
2906
2907/* ################################################ SPECIFIC VERSIONS FOR d=2 */
2908
2909static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
2910{
2911 INT l;
2912 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2913
2914 fg_exp_b0 = EXP(K(-1.0)/b);
2915 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2916 fg_exp_b1 = K(1.0);
2917 fg_exp_b2 = K(1.0);
2918 fg_exp_l[0] = K(1.0);
2919 for(l=1; l <= 2*m+1; l++)
2920 {
2921 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2922 fg_exp_b1 *= fg_exp_b0_sq;
2923 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2924 }
2925}
2926
2927static void nfft_trafo_2d_compute(C *fj, const C *g, const R *psij_const0,
2928 const R *psij_const1, const R *xj0, const R *xj1, const INT n0,
2929 const INT n1, const INT m)
2930{
2931 INT u0,o0,l0,u1,o1,l1;
2932 const C *gj;
2933 const R *psij0,*psij1;
2934
2935 psij0=psij_const0;
2936 psij1=psij_const1;
2937
2938 uo2(&u0,&o0,*xj0, n0, m);
2939 uo2(&u1,&o1,*xj1, n1, m);
2940
2941 *fj=0;
2942
2943 if (u0 < o0)
2944 if(u1 < o1)
2945 for(l0=0; l0<=2*m+1; l0++,psij0++)
2946 {
2947 psij1=psij_const1;
2948 gj=g+(u0+l0)*n1+u1;
2949 for(l1=0; l1<=2*m+1; l1++)
2950 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2951 }
2952 else
2953 for(l0=0; l0<=2*m+1; l0++,psij0++)
2954 {
2955 psij1=psij_const1;
2956 gj=g+(u0+l0)*n1+u1;
2957 for(l1=0; l1<2*m+1-o1; l1++)
2958 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2959 gj=g+(u0+l0)*n1;
2960 for(l1=0; l1<=o1; l1++)
2961 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2962 }
2963 else
2964 if(u1<o1)
2965 {
2966 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2967 {
2968 psij1=psij_const1;
2969 gj=g+(u0+l0)*n1+u1;
2970 for(l1=0; l1<=2*m+1; l1++)
2971 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2972 }
2973 for(l0=0; l0<=o0; l0++,psij0++)
2974 {
2975 psij1=psij_const1;
2976 gj=g+l0*n1+u1;
2977 for(l1=0; l1<=2*m+1; l1++)
2978 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2979 }
2980 }
2981 else
2982 {
2983 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2984 {
2985 psij1=psij_const1;
2986 gj=g+(u0+l0)*n1+u1;
2987 for(l1=0; l1<2*m+1-o1; l1++)
2988 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2989 gj=g+(u0+l0)*n1;
2990 for(l1=0; l1<=o1; l1++)
2991 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2992 }
2993 for(l0=0; l0<=o0; l0++,psij0++)
2994 {
2995 psij1=psij_const1;
2996 gj=g+l0*n1+u1;
2997 for(l1=0; l1<2*m+1-o1; l1++)
2998 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2999 gj=g+l0*n1;
3000 for(l1=0; l1<=o1; l1++)
3001 (*fj) += (*psij0) * (*psij1++) * (*gj++);
3002 }
3003 }
3004}
3005
#ifdef _OPENMP
/*
 * Adjoint NFFT, two-dimensional case, OpenMP version with atomic updates.
 *
 * Adds psij_const0[l0] * psij_const1[l1] * f to every grid entry of the
 * (2m+2) x (2m+2) window around the node (*xj0, *xj1).  Grid indices are
 * reduced modulo n0 / n1, and the real and imaginary part of each entry are
 * updated with separate "omp atomic" operations so that several threads may
 * write to the same grid entry.
 */
static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
    const R *psij_const0, const R *psij_const1, const R *xj0,
    const R *xj1, const INT n0, const INT n1, const INT m)
{
  const INT win = 2*m+2;
  INT u0, o0, u1, o1, r, c;
  INT row_offset[2*m+2]; /* start of each window row inside g */
  INT col_index[2*m+2];  /* column of each window entry       */

  uo2(&u0,&o0,*xj0, n0, m);
  uo2(&u1,&o1,*xj1, n1, m);

  for (r = 0; r < win; r++)
    row_offset[r] = ((u0+r)%n0) * n1;

  for (c = 0; c < win; c++)
    col_index[c] = (u1+c)%n1;

  for (r = 0; r < win; r++)
  {
    for (c = 0; c < win; c++)
    {
      /* view the complex grid entry as two consecutive reals */
      R *cell = (R*)(g + (row_offset[r] + col_index[c]));
      const C val = psij_const0[r] * psij_const1[c] * f;

      #pragma omp atomic
      cell[0] += CREAL(val);

      #pragma omp atomic
      cell[1] += CIMAG(val);
    }
  }
}
#endif
3044
#ifdef _OPENMP
/*
 * Adds the contribution of one node to the grid rows u0..o0 (inclusive;
 * nothing happens when o0 < u0).  Row u0+l0 is weighted with
 * psij_const0[offset_psij+l0]; within a row all 2m+2 columns listed in
 * index_temp1 are updated with the matching entry of psij_const1.
 */
static void nfft_adjoint_2d_omp_blockwise_rows(const C f, C *g,
    const R *psij_const0, const R *psij_const1, const INT *index_temp1,
    const INT offset_psij, const INT u0, const INT o0, const INT n1,
    const INT m)
{
  INT l0, l1;

  for (l0 = 0; l0 <= o0-u0; l0++)
  {
    C *g_row = g + (u0+l0) * n1;
    /* the weight is real; keeping it real avoids a complex product */
    const R val0 = psij_const0[offset_psij+l0];

    for (l1 = 0; l1 <= 2*m+1; l1++)
      g_row[index_temp1[l1]] += val0 * psij_const1[l1] * f;
  }
}

/*
 * Adjoint NFFT, two-dimensional case, blockwise OpenMP version.
 *
 * Adds the contribution of the node (*xj0, *xj1) with value f to the grid g,
 * but only to the rows my_u0..my_o0; no atomic operations are used here.
 * Columns are not restricted; their indices are reduced modulo n1.
 *
 * uo2() supplies the node's first and last row index (ar_u0, ar_o0).  When
 * ar_u0 < ar_o0 the row window is contiguous and is clipped to the block.
 * Otherwise it wraps around the end of the grid and is handled as two
 * segments: rows from ar_u0 up to my_o0, and rows from my_u0 up to ar_o0.
 *
 * f                         sample value of the node
 * g                         grid, row-major with row length n1
 * psij_const0, psij_const1  2m+2 window values per dimension
 * xj0, xj1                  node coordinates
 * n0, n1, m                 grid sizes and window cut-off
 * my_u0, my_o0              first and last row of the block to update
 */
static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
    const R *psij_const0, const R *psij_const1, const R *xj0,
    const R *xj1, const INT n0, const INT n1, const INT m,
    const INT my_u0, const INT my_o0)
{
  INT ar_u0, ar_o0, u1, o1, l1;
  INT u0, o0, offset_psij;
  INT index_temp1[2*m+2];

  uo2(&ar_u0,&ar_o0,*xj0, n0, m);
  uo2(&u1,&o1,*xj1, n1, m);

  for (l1 = 0; l1 <= 2*m+1; l1++)
    index_temp1[l1] = (u1+l1)%n1;

  if (ar_u0 < ar_o0)
  {
    /* contiguous row window: intersect it with the block */
    u0 = MAX(my_u0,ar_u0);
    o0 = MIN(my_o0,ar_o0);
    offset_psij = u0-ar_u0;
#ifdef OMP_ASSERT
    assert(offset_psij >= 0);
    assert(o0-u0 <= 2*m+1);
    assert(offset_psij+o0-u0 <= 2*m+1);
#endif

    nfft_adjoint_2d_omp_blockwise_rows(f, g, psij_const0, psij_const1,
        index_temp1, offset_psij, u0, o0, n1, m);
    return;
  }

  /* wrapped row window, first segment: rows from ar_u0 towards the block end */
  u0 = MAX(my_u0,ar_u0);
  o0 = my_o0;
  offset_psij = u0-ar_u0;
#ifdef OMP_ASSERT
  assert(offset_psij >= 0);
  assert(o0-u0 <= 2*m+1);
  assert(offset_psij+o0-u0 <= 2*m+1);
#endif

  nfft_adjoint_2d_omp_blockwise_rows(f, g, psij_const0, psij_const1,
      index_temp1, offset_psij, u0, o0, n1, m);

  /*
   * Second segment: rows my_u0..ar_o0 after the wrap.  Row u has window
   * index u-ar_u0+n0, so the offset is assigned directly.  The previous
   * "+=" form added the first segment's offset on top, which only gave the
   * same value because that offset is 0 whenever this segment is non-empty
   * (my_u0 <= ar_o0 <= ar_u0).
   */
  u0 = my_u0;
  o0 = MIN(my_o0,ar_o0);
  offset_psij = my_u0-ar_u0+n0;

#ifdef OMP_ASSERT
  if (u0<=o0)
  {
    assert(o0-u0 <= 2*m+1);
    assert(offset_psij+o0-u0 <= 2*m+1);
  }
#endif

  nfft_adjoint_2d_omp_blockwise_rows(f, g, psij_const0, psij_const1,
      index_temp1, offset_psij, u0, o0, n1, m);
}
#endif
3141
3142#ifndef _OPENMP
3143static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
3144 const R *psij_const0, const R *psij_const1, const R *xj0,
3145 const R *xj1, const INT n0, const INT n1, const INT m)
3146{
3147 INT u0,o0,l0,u1,o1,l1;
3148 C *gj;
3149 const R *psij0,*psij1;
3150
3151 psij0=psij_const0;
3152 psij1=psij_const1;
3153
3154 uo2(&u0,&o0,*xj0, n0, m);
3155 uo2(&u1,&o1,*xj1, n1, m);
3156
3157 if(u0<o0)
3158 if(u1<o1)
3159 for(l0=0; l0<=2*m+1; l0++,psij0++)
3160 {
3161 psij1=psij_const1;
3162 gj=g+(u0+l0)*n1+u1;
3163 for(l1=0; l1<=2*m+1; l1++)
3164 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3165 }
3166 else
3167 for(l0=0; l0<=2*m+1; l0++,psij0++)
3168 {
3169 psij1=psij_const1;
3170 gj=g+(u0+l0)*n1+u1;
3171 for(l1=0; l1<2*m+1-o1; l1++)
3172 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3173 gj=g+(u0+l0)*n1;
3174 for(l1=0; l1<=o1; l1++)
3175 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3176 }
3177 else
3178 if(u1<o1)
3179 {
3180 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3181 {
3182 psij1=psij_const1;
3183 gj=g+(u0+l0)*n1+u1;
3184 for(l1=0; l1<=2*m+1; l1++)
3185 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3186 }
3187 for(l0=0; l0<=o0; l0++,psij0++)
3188 {
3189 psij1=psij_const1;
3190 gj=g+l0*n1+u1;
3191 for(l1=0; l1<=2*m+1; l1++)
3192 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3193 }
3194 }
3195 else
3196 {
3197 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3198 {
3199 psij1=psij_const1;
3200 gj=g+(u0+l0)*n1+u1;
3201 for(l1=0; l1<2*m+1-o1; l1++)
3202 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3203 gj=g+(u0+l0)*n1;
3204 for(l1=0; l1<=o1; l1++)
3205 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3206 }
3207 for(l0=0; l0<=o0; l0++,psij0++)
3208 {
3209 psij1=psij_const1;
3210 gj=g+l0*n1+u1;
3211 for(l1=0; l1<2*m+1-o1; l1++)
3212 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3213 gj=g+l0*n1;
3214 for(l1=0; l1<=o1; l1++)
3215 (*gj++) += (*psij0) * (*psij1++) * (*fj);
3216 }
3217 }
3218}
3219#endif
3220
3221static void nfft_trafo_2d_B(X(plan) *ths)
3222{
3223 const C *g = (C*)ths->g;
3224 const INT n0 = ths->n[0];
3225 const INT n1 = ths->n[1];
3226 const INT M = ths->M_total;
3227 const INT m = ths->m;
3228
3229 INT k;
3230
3231 if(ths->flags & PRE_FULL_PSI)
3232 {
3233 const INT lprod = (2*m+2) * (2*m+2);
3234#ifdef _OPENMP
3235 #pragma omp parallel for default(shared) private(k)
3236#endif
3237 for (k = 0; k < M; k++)
3238 {
3239 INT l;
3240 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3241 ths->f[j] = K(0.0);
3242 for (l = 0; l < lprod; l++)
3243 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
3244 }
3245 return;
3246 } /* if(PRE_FULL_PSI) */
3247
3248 if(ths->flags & PRE_PSI)
3249 {
3250#ifdef _OPENMP
3251 #pragma omp parallel for default(shared) private(k)
3252#endif
3253 for (k = 0; k < M; k++)
3254 {
3255 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3256 nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3257 }
3258
3259 return;
3260 } /* if(PRE_PSI) */
3261
3262 if(ths->flags & PRE_FG_PSI)
3263 {
3264 R fg_exp_l[2*(2*m+2)];
3265
3266 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3267 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3268
3269#ifdef _OPENMP
3270 #pragma omp parallel for default(shared) private(k)
3271#endif
3272 for (k = 0; k < M; k++)
3273 {
3274 R psij_const[2*(2*m+2)];
3275 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3276 INT l;
3277 R fg_psij0 = ths->psi[2*j*2];
3278 R fg_psij1 = ths->psi[2*j*2+1];
3279 R fg_psij2 = K(1.0);
3280
3281 psij_const[0] = fg_psij0;
3282 for (l = 1; l <= 2*m+1; l++)
3283 {
3284 fg_psij2 *= fg_psij1;
3285 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3286 }
3287
3288 fg_psij0 = ths->psi[2*(j*2+1)];
3289 fg_psij1 = ths->psi[2*(j*2+1)+1];
3290 fg_psij2 = K(1.0);
3291 psij_const[2*m+2] = fg_psij0;
3292 for (l = 1; l <= 2*m+1; l++)
3293 {
3294 fg_psij2 *= fg_psij1;
3295 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3296 }
3297
3298 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3299 }
3300
3301 return;
3302 } /* if(PRE_FG_PSI) */
3303
3304 if(ths->flags & FG_PSI)
3305 {
3306 R fg_exp_l[2*(2*m+2)];
3307
3308 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3309 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3310
3311 sort(ths);
3312
3313#ifdef _OPENMP
3314 #pragma omp parallel for default(shared) private(k)
3315#endif
3316 for (k = 0; k < M; k++)
3317 {
3318 INT u, o, l;
3319 R fg_psij0, fg_psij1, fg_psij2;
3320 R psij_const[2*(2*m+2)];
3321 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3322
3323 uo(ths, j, &u, &o, (INT)0);
3324 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3325 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3326 fg_psij2 = K(1.0);
3327 psij_const[0] = fg_psij0;
3328 for (l = 1; l <= 2*m+1; l++)
3329 {
3330 fg_psij2 *= fg_psij1;
3331 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3332 }
3333
3334 uo(ths,j,&u,&o, (INT)1);
3335 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3336 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3337 fg_psij2 = K(1.0);
3338 psij_const[2*m+2] = fg_psij0;
3339 for(l=1; l<=2*m+1; l++)
3340 {
3341 fg_psij2 *= fg_psij1;
3342 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3343 }
3344
3345 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3346 }
3347
3348 return;
3349 } /* if(FG_PSI) */
3350
3351 if(ths->flags & PRE_LIN_PSI)
3352 {
3353 const INT K = ths->K, ip_s = K / (m + 2);
3354
3355 sort(ths);
3356
3357#ifdef _OPENMP
3358 #pragma omp parallel for default(shared) private(k)
3359#endif
3360 for (k = 0; k < M; k++)
3361 {
3362 INT u, o, l;
3363 R ip_y, ip_w;
3364 INT ip_u;
3365 R psij_const[2*(2*m+2)];
3366 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3367
3368 uo(ths,j,&u,&o,(INT)0);
3369 ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3370 ip_u = (INT)LRINT(FLOOR(ip_y));
3371 ip_w = ip_y - (R)(ip_u);
3372 for (l = 0; l < 2*m+2; l++)
3373 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3374
3375 uo(ths,j,&u,&o,(INT)1);
3376 ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3377 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3378 ip_w = ip_y - (R)(ip_u);
3379 for (l = 0; l < 2*m+2; l++)
3380 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3381
3382 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3383 }
3384 return;
3385 } /* if(PRE_LIN_PSI) */
3386
3387 /* no precomputed psi at all */
3388
3389 sort(ths);
3390
3391#ifdef _OPENMP
3392 #pragma omp parallel for default(shared) private(k)
3393#endif
3394 for (k = 0; k < M; k++)
3395 {
3396 R psij_const[2*(2*m+2)];
3397 INT u, o, l;
3398 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3399
3400 uo(ths,j,&u,&o,(INT)0);
3401 for (l = 0; l <= 2*m+1; l++)
3402 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3403
3404 uo(ths,j,&u,&o,(INT)1);
3405 for (l = 0; l <= 2*m+1; l++)
3406 psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3407
3408 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3409 }
3410}
3411
/*
 * Blockwise 2d adjoint, PRE_PSI variant: the 2m+2 window values of node j
 * for dimension 0 and dimension 1 lie one after the other in ths->psi and
 * are handed to the blockwise kernel unchanged.  Relies on ths, g, j, n0,
 * n1, m, my_u0 and my_o0 being visible where the macro is expanded.
 */
#define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
{ \
  const R *pre_psij0 = ths->psi+(j*2)*(2*m+2); \
  const R *pre_psij1 = ths->psi+(j*2+1)*(2*m+2); \
  \
  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
      pre_psij0, pre_psij1, ths->x+2*j, ths->x+2*j+1, \
      n0, n1, m, my_u0, my_o0); \
}
3416
/*
 * Blockwise 2d adjoint, PRE_FG_PSI variant: for each of the two dimensions
 * the 2m+2 window values of node j are rebuilt from the two values stored in
 * ths->psi and the matching half of the shared table fg_exp_l.  Relies on
 * ths, g, j, n0, n1, m, fg_exp_l, my_u0 and my_o0 being visible where the
 * macro is expanded.
 */
#define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
{ \
  R fg_window[2*(2*m+2)]; \
  INT fg_t, fg_idx; \
  \
  for (fg_t = 0; fg_t < 2; fg_t++) \
  { \
    const R fg_base = ths->psi[2*(j*2+fg_t)]; \
    const R fg_ratio = ths->psi[2*(j*2+fg_t)+1]; \
    const R *fg_tab = fg_exp_l + fg_t*(2*m+2); \
    R *fg_out = fg_window + fg_t*(2*m+2); \
    R fg_power = K(1.0); \
    \
    fg_out[0] = fg_base; \
    for (fg_idx = 1; fg_idx < 2*m+2; fg_idx++) \
    { \
      fg_power *= fg_ratio; \
      fg_out[fg_idx] = fg_base*fg_power*fg_tab[fg_idx]; \
    } \
  } \
  \
  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
      fg_window, fg_window+2*m+2, ths->x+2*j, ths->x+2*j+1, \
      n0, n1, m, my_u0, my_o0); \
}
3446
/*
 * Blockwise 2d adjoint, FG_PSI variant: nothing is stored per node, so for
 * each dimension the starting value (PHI at the first window index returned
 * by uo) and the per-step ratio are computed here; the remaining window
 * values follow from the same recurrence as in the PRE_FG_PSI variant.
 * Relies on ths, g, j, n0, n1, m, fg_exp_l, my_u0 and my_o0 being visible
 * where the macro is expanded.
 */
#define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
{ \
  R fg_window[2*(2*m+2)]; \
  INT fg_t, fg_idx; \
  \
  for (fg_t = 0; fg_t < 2; fg_t++) \
  { \
    const INT fg_n = ths->n[fg_t]; \
    const R *fg_tab = fg_exp_l + fg_t*(2*m+2); \
    R *fg_out = fg_window + fg_t*(2*m+2); \
    R fg_base, fg_ratio, fg_power; \
    INT fg_lo, fg_hi; \
    \
    uo(ths,j,&fg_lo,&fg_hi,fg_t); \
    fg_base = (PHI(ths->n[fg_t],ths->x[2*j+fg_t]-((R)fg_lo)/((R)fg_n),fg_t)); \
    fg_ratio = EXP(K(2.0)*(((R)fg_n)*(ths->x[2*j+fg_t]) - (R)fg_lo)/ths->b[fg_t]); \
    fg_power = K(1.0); \
    \
    fg_out[0] = fg_base; \
    for (fg_idx = 1; fg_idx < 2*m+2; fg_idx++) \
    { \
      fg_power *= fg_ratio; \
      fg_out[fg_idx] = fg_base*fg_power*fg_tab[fg_idx]; \
    } \
  } \
  \
  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
      fg_window, fg_window+2*m+2, ths->x+2*j, ths->x+2*j+1, \
      n0, n1, m, my_u0, my_o0); \
}
3479
/*
 * Blockwise 2d adjoint, PRE_LIN_PSI variant: for each dimension the 2m+2
 * window values of node j are obtained by linear interpolation between two
 * neighbouring entries of that dimension's lookup table; dimension t uses
 * the K+1 entries of ths->psi starting at t*(K+1).  Relies on ths, g, j, n0,
 * n1, m, K, ip_s, my_u0 and my_o0 being visible where the macro is expanded.
 */
#define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
{ \
  R lin_window[2*(2*m+2)]; \
  INT lin_t, lin_idx; \
  \
  for (lin_t = 0; lin_t < 2; lin_t++) \
  { \
    const INT lin_n = ths->n[lin_t]; \
    const R *lin_lut = ths->psi + lin_t*(K+1); \
    R *lin_out = lin_window + lin_t*(2*m+2); \
    R lin_y, lin_w; \
    INT lin_cell, lin_lo, lin_hi; \
    \
    uo(ths,j,&lin_lo,&lin_hi,lin_t); \
    lin_y = FABS(((R)lin_n)*(ths->x[2*j+lin_t]) - (R)lin_lo)*((R)ip_s); \
    lin_cell = LRINT(FLOOR(lin_y)); \
    lin_w = lin_y-lin_cell; \
    \
    for (lin_idx = 0; lin_idx <= 2*m+1; lin_idx++) \
    { \
      const INT lin_pos = lin_cell-lin_idx*ip_s; \
      \
      lin_out[lin_idx] = lin_lut[ABS(lin_pos)]*(K(1.0)-lin_w) + \
          lin_lut[ABS(lin_pos+1)]*(lin_w); \
    } \
  } \
  \
  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
      lin_window, lin_window+2*m+2, ths->x+2*j, ths->x+2*j+1, \
      n0, n1, m, my_u0, my_o0); \
}
3507
/*
 * Blockwise 2d adjoint, variant without any precomputation: for each
 * dimension all 2m+2 window values of node j are evaluated with PHI,
 * starting at the first window index returned by uo.  Relies on ths, g, j,
 * n0, n1, m, my_u0 and my_o0 being visible where the macro is expanded.
 */
#define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
{ \
  R direct_window[2*(2*m+2)]; \
  INT direct_t, direct_idx; \
  \
  for (direct_t = 0; direct_t < 2; direct_t++) \
  { \
    const INT direct_n = ths->n[direct_t]; \
    R *direct_out = direct_window + direct_t*(2*m+2); \
    INT direct_lo, direct_hi; \
    \
    uo(ths,j,&direct_lo,&direct_hi,direct_t); \
    for (direct_idx = 0; direct_idx < 2*m+2; direct_idx++) \
      direct_out[direct_idx] = (PHI(ths->n[direct_t], \
          ths->x[2*j+direct_t]-((R)((direct_lo+direct_idx)))/((R)direct_n), \
          direct_t)); \
  } \
  \
  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
      direct_window, direct_window+2*m+2, ths->x+2*j, ths->x+2*j+1, \
      n0, n1, m, my_u0, my_o0); \
}
3525
/*
 * Driver for the blockwise OpenMP 2d adjoint.  Does nothing unless the plan
 * has NFFT_OMP_BLOCKWISE_ADJOINT set; otherwise it opens a parallel region,
 * lets nfft_adjoint_B_omp_blockwise_init() hand each thread its block of
 * grid rows (my_u0..my_o0) and up to two key ranges (a and b, -1 meaning
 * "unused"), processes all entries of ths->index_x whose key lies in those
 * ranges with the COMPUTE macro selected by `whichone`, and then returns
 * from the enclosing function.  ths->index_x is read as (key, node) pairs
 * and is presumably sorted by key, since the start position comes from
 * index_x_binary_search().  Relies on ths, g, k, n0, n1, m and M being
 * visible where the macro is expanded.
 *
 * Both ranges use the thread-private k of the enclosing function; the second
 * range used to declare its own shadowing "INT k", unlike the first range
 * and the 1d driver.
 */
#define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
{ \
  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
  { \
    _Pragma("omp parallel private(k)") \
    { \
      INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
      INT *ar_x = ths->index_x; \
      \
      nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
          &min_u_b, &max_u_b, 2, ths->n, m); \
      \
      /* first key range */ \
      if (min_u_a != -1) \
      { \
        k = index_x_binary_search(ar_x, M, min_u_a); \
        \
        MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
        \
        while (k < M) \
        { \
          INT u_prod = ar_x[2*k]; \
          INT j = ar_x[2*k+1]; \
          \
          if (u_prod < min_u_a || u_prod > max_u_a) \
            break; \
          \
          MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
          \
          k++; \
        } \
      } \
      \
      /* second key range */ \
      if (min_u_b != -1) \
      { \
        k = index_x_binary_search(ar_x, M, min_u_b); \
        \
        MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
        \
        while (k < M) \
        { \
          INT u_prod = ar_x[2*k]; \
          INT j = ar_x[2*k+1]; \
          \
          if (u_prod < min_u_b || u_prod > max_u_b) \
            break; \
          \
          MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
          \
          k++; \
        } \
      } \
    } /* omp parallel */ \
    return; \
  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
}
3581
3582
3583static void nfft_adjoint_2d_B(X(plan) *ths)
3584{
3585 const INT n0 = ths->n[0];
3586 const INT n1 = ths->n[1];
3587 const INT M = ths->M_total;
3588 const INT m = ths->m;
3589 C* g = (C*) ths->g;
3590 INT k;
3591
3592 memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
3593
3594 if(ths->flags & PRE_FULL_PSI)
3595 {
3596 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3597 (INT)2, ths->n, m, ths->flags, ths->index_x);
3598 return;
3599 } /* if(PRE_FULL_PSI) */
3600
3601 if(ths->flags & PRE_PSI)
3602 {
3603#ifdef _OPENMP
3604 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3605#endif
3606
3607#ifdef _OPENMP
3608 #pragma omp parallel for default(shared) private(k)
3609#endif
3610 for (k = 0; k < M; k++)
3611 {
3612 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3613#ifdef _OPENMP
3614 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3615#else
3616 nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3617#endif
3618 }
3619 return;
3620 } /* if(PRE_PSI) */
3621
3622 if(ths->flags & PRE_FG_PSI)
3623 {
3624 R fg_exp_l[2*(2*m+2)];
3625
3626 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3627 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3628
3629#ifdef _OPENMP
3630 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3631#endif
3632
3633
3634#ifdef _OPENMP
3635 #pragma omp parallel for default(shared) private(k)
3636#endif
3637 for (k = 0; k < M; k++)
3638 {
3639 R psij_const[2*(2*m+2)];
3640 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3641 INT l;
3642 R fg_psij0 = ths->psi[2*j*2];
3643 R fg_psij1 = ths->psi[2*j*2+1];
3644 R fg_psij2 = K(1.0);
3645
3646 psij_const[0] = fg_psij0;
3647 for(l=1; l<=2*m+1; l++)
3648 {
3649 fg_psij2 *= fg_psij1;
3650 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3651 }
3652
3653 fg_psij0 = ths->psi[2*(j*2+1)];
3654 fg_psij1 = ths->psi[2*(j*2+1)+1];
3655 fg_psij2 = K(1.0);
3656 psij_const[2*m+2] = fg_psij0;
3657 for(l=1; l<=2*m+1; l++)
3658 {
3659 fg_psij2 *= fg_psij1;
3660 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3661 }
3662
3663#ifdef _OPENMP
3664 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3665#else
3666 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3667#endif
3668 }
3669
3670 return;
3671 } /* if(PRE_FG_PSI) */
3672
3673 if(ths->flags & FG_PSI)
3674 {
3675 R fg_exp_l[2*(2*m+2)];
3676
3677 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3678 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3679
3680 sort(ths);
3681
3682#ifdef _OPENMP
3683 MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3684#endif
3685
3686#ifdef _OPENMP
3687 #pragma omp parallel for default(shared) private(k)
3688#endif
3689 for (k = 0; k < M; k++)
3690 {
3691 INT u, o, l;
3692 R fg_psij0, fg_psij1, fg_psij2;
3693 R psij_const[2*(2*m+2)];
3694 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3695
3696 uo(ths,j,&u,&o,(INT)0);
3697 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3698 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3699 fg_psij2 = K(1.0);
3700 psij_const[0] = fg_psij0;
3701 for(l=1; l<=2*m+1; l++)
3702 {
3703 fg_psij2 *= fg_psij1;
3704 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3705 }
3706
3707 uo(ths,j,&u,&o,(INT)1);
3708 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3709 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3710 fg_psij2 = K(1.0);
3711 psij_const[2*m+2] = fg_psij0;
3712 for(l=1; l<=2*m+1; l++)
3713 {
3714 fg_psij2 *= fg_psij1;
3715 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3716 }
3717
3718#ifdef _OPENMP
3719 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3720#else
3721 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3722#endif
3723 }
3724
3725 return;
3726 } /* if(FG_PSI) */
3727
3728 if(ths->flags & PRE_LIN_PSI)
3729 {
3730 const INT K = ths->K;
3731 const INT ip_s = K / (m + 2);
3732
3733 sort(ths);
3734
3735#ifdef _OPENMP
3736 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3737#endif
3738
3739#ifdef _OPENMP
3740 #pragma omp parallel for default(shared) private(k)
3741#endif
3742 for (k = 0; k < M; k++)
3743 {
3744 INT u,o,l;
3745 INT ip_u;
3746 R ip_y, ip_w;
3747 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3748 R psij_const[2*(2*m+2)];
3749
3750 uo(ths,j,&u,&o,(INT)0);
3751 ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3752 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3753 ip_w = ip_y - (R)(ip_u);
3754 for(l=0; l < 2*m+2; l++)
3755 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3756 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3757
3758 uo(ths,j,&u,&o,(INT)1);
3759 ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3760 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3761 ip_w = ip_y - (R)(ip_u);
3762 for(l=0; l < 2*m+2; l++)
3763 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3764 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3765
3766#ifdef _OPENMP
3767 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3768#else
3769 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3770#endif
3771 }
3772 return;
3773 } /* if(PRE_LIN_PSI) */
3774
3775 /* no precomputed psi at all */
3776 sort(ths);
3777
3778#ifdef _OPENMP
3779 MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3780#endif
3781
3782#ifdef _OPENMP
3783 #pragma omp parallel for default(shared) private(k)
3784#endif
3785 for (k = 0; k < M; k++)
3786 {
3787 INT u,o,l;
3788 R psij_const[2*(2*m+2)];
3789 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3790
3791 uo(ths,j,&u,&o,(INT)0);
3792 for(l=0;l<=2*m+1;l++)
3793 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3794
3795 uo(ths,j,&u,&o,(INT)1);
3796 for(l=0;l<=2*m+1;l++)
3797 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3798
3799#ifdef _OPENMP
3800 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3801#else
3802 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3803#endif
3804 }
3805}
3806
3807
3808void X(trafo_2d)(X(plan) *ths)
3809{
3810 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3811 {
3812 X(trafo_direct)(ths);
3813 return;
3814 }
3815
3816 INT k0,k1,n0,n1,N0,N1;
3817 C *g_hat,*f_hat;
3818 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3819 R ck01, ck02, ck11, ck12;
3820 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3821
3822 ths->g_hat=ths->g1;
3823 ths->g=ths->g2;
3824
3825 N0=ths->N[0];
3826 N1=ths->N[1];
3827 n0=ths->n[0];
3828 n1=ths->n[1];
3829
3830 f_hat=(C*)ths->f_hat;
3831 g_hat=(C*)ths->g_hat;
3832
3833 TIC(0)
3834#ifdef _OPENMP
3835 #pragma omp parallel for default(shared) private(k0)
3836 for (k0 = 0; k0 < ths->n_total; k0++)
3837 ths->g_hat[k0] = 0.0;
3838#else
3839 memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
3840#endif
3841 if(ths->flags & PRE_PHI_HUT)
3842 {
3843 c_phi_inv01=ths->c_phi_inv[0];
3844 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3845
3846#ifdef _OPENMP
3847 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3848#endif
3849 for(k0=0;k0<N0/2;k0++)
3850 {
3851 ck01=c_phi_inv01[k0];
3852 ck02=c_phi_inv02[k0];
3853
3854 c_phi_inv11=ths->c_phi_inv[1];
3855 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3856
3857 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3858 f_hat11=f_hat + k0*N1;
3859 g_hat21=g_hat + k0*n1+n1-(N1/2);
3860 f_hat21=f_hat + ((N0/2)+k0)*N1;
3861 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3862 f_hat12=f_hat + k0*N1+(N1/2);
3863 g_hat22=g_hat + k0*n1;
3864 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3865
3866 for(k1=0;k1<N1/2;k1++)
3867 {
3868 ck11=c_phi_inv11[k1];
3869 ck12=c_phi_inv12[k1];
3870
3871 g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3872 g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3873 g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3874 g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3875 }
3876 }
3877 }
3878 else
3879#ifdef _OPENMP
3880 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3881#endif
3882 for(k0=0;k0<N0/2;k0++)
3883 {
3884 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3885 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3886 for(k1=0;k1<N1/2;k1++)
3887 {
3888 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3889 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3890 g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3891 g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3892 g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3893 g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3894 }
3895 }
3896
3897 TOC(0)
3898
3899 TIC_FFTW(1)
3900 FFTW(execute)(ths->my_fftw_plan1);
3901 TOC_FFTW(1);
3902
3903 TIC(2);
3904 nfft_trafo_2d_B(ths);
3905 TOC(2);
3906}
3907
3908void X(adjoint_2d)(X(plan) *ths)
3909{
3910 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2))
3911 {
3912 X(adjoint_direct)(ths);
3913 return;
3914 }
3915
3916 INT k0,k1,n0,n1,N0,N1;
3917 C *g_hat,*f_hat;
3918 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3919 R ck01, ck02, ck11, ck12;
3920 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3921
3922 ths->g_hat=ths->g1;
3923 ths->g=ths->g2;
3924
3925 N0=ths->N[0];
3926 N1=ths->N[1];
3927 n0=ths->n[0];
3928 n1=ths->n[1];
3929
3930 f_hat=(C*)ths->f_hat;
3931 g_hat=(C*)ths->g_hat;
3932
3933 TIC(2);
3934 nfft_adjoint_2d_B(ths);
3935 TOC(2);
3936
3937 TIC_FFTW(1)
3938 FFTW(execute)(ths->my_fftw_plan2);
3939 TOC_FFTW(1);
3940
3941 TIC(0)
3942 if(ths->flags & PRE_PHI_HUT)
3943 {
3944 c_phi_inv01=ths->c_phi_inv[0];
3945 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3946
3947#ifdef _OPENMP
3948 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3949#endif
3950 for(k0=0;k0<N0/2;k0++)
3951 {
3952 ck01=c_phi_inv01[k0];
3953 ck02=c_phi_inv02[k0];
3954
3955 c_phi_inv11=ths->c_phi_inv[1];
3956 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3957
3958 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3959 f_hat11=f_hat + k0*N1;
3960 g_hat21=g_hat + k0*n1+n1-(N1/2);
3961 f_hat21=f_hat + ((N0/2)+k0)*N1;
3962 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3963 f_hat12=f_hat + k0*N1+(N1/2);
3964 g_hat22=g_hat + k0*n1;
3965 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3966
3967 for(k1=0;k1<N1/2;k1++)
3968 {
3969 ck11=c_phi_inv11[k1];
3970 ck12=c_phi_inv12[k1];
3971
3972 f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3973 f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3974 f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3975 f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3976 }
3977 }
3978 }
3979 else
3980#ifdef _OPENMP
3981 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3982#endif
3983 for(k0=0;k0<N0/2;k0++)
3984 {
3985 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3986 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3987 for(k1=0;k1<N1/2;k1++)
3988 {
3989 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3990 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3991 f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3992 f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3993 f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3994 f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3995 }
3996 }
3997 TOC(0)
3998}
3999
4000/* ################################################ SPECIFIC VERSIONS FOR d=3 */
4001
4002static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
4003{
4004 INT l;
4005 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
4006
4007 fg_exp_b0 = EXP(-K(1.0) / b);
4008 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
4009 fg_exp_b1 = K(1.0);
4010 fg_exp_b2 = K(1.0);
4011 fg_exp_l[0] = K(1.0);
4012 for(l=1; l <= 2*m+1; l++)
4013 {
4014 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
4015 fg_exp_b1 *= fg_exp_b0_sq;
4016 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
4017 }
4018}
4019
4020static void nfft_trafo_3d_compute(C *fj, const C *g, const R *psij_const0,
4021 const R *psij_const1, const R *psij_const2, const R *xj0, const R *xj1,
4022 const R *xj2, const INT n0, const INT n1, const INT n2, const INT m)
4023{
4024 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4025 const C *gj;
4026 const R *psij0, *psij1, *psij2;
4027
4028 psij0 = psij_const0;
4029 psij1 = psij_const1;
4030 psij2 = psij_const2;
4031
4032 uo2(&u0, &o0, *xj0, n0, m);
4033 uo2(&u1, &o1, *xj1, n1, m);
4034 uo2(&u2, &o2, *xj2, n2, m);
4035
4036 *fj = 0;
4037
4038 if (u0 < o0)
4039 if (u1 < o1)
4040 if (u2 < o2)
4041 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4042 {
4043 psij1 = psij_const1;
4044 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4045 {
4046 psij2 = psij_const2;
4047 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4048 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4049 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4050 }
4051 }
4052 else
4053 /* asserts (u2>o2)*/
4054 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4055 {
4056 psij1 = psij_const1;
4057 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4058 {
4059 psij2 = psij_const2;
4060 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4061 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4062 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4063 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4064 for (l2 = 0; l2 <= o2; l2++)
4065 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4066 }
4067 }
4068 else /* asserts (u1>o1)*/
4069 if (u2 < o2)
4070 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4071 {
4072 psij1 = psij_const1;
4073 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4074 {
4075 psij2 = psij_const2;
4076 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4077 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4078 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4079 }
4080 for (l1 = 0; l1 <= o1; l1++, psij1++)
4081 {
4082 psij2 = psij_const2;
4083 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4084 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4085 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4086 }
4087 }
4088 else/* asserts (u2>o2) */
4089 {
4090 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4091 {
4092 psij1 = psij_const1;
4093 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4094 {
4095 psij2 = psij_const2;
4096 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4097 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4098 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4099 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4100 for (l2 = 0; l2 <= o2; l2++)
4101 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4102 }
4103 for (l1 = 0; l1 <= o1; l1++, psij1++)
4104 {
4105 psij2 = psij_const2;
4106 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4107 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4108 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4109 gj = g + ((u0 + l0) * n1 + l1) * n2;
4110 for (l2 = 0; l2 <= o2; l2++)
4111 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4112 }
4113 }
4114 }
4115 else /* asserts (u0>o0) */
4116 if (u1 < o1)
4117 if (u2 < o2)
4118 {
4119 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4120 {
4121 psij1 = psij_const1;
4122 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4123 {
4124 psij2 = psij_const2;
4125 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4126 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4127 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4128 }
4129 }
4130
4131 for (l0 = 0; l0 <= o0; l0++, psij0++)
4132 {
4133 psij1 = psij_const1;
4134 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4135 {
4136 psij2 = psij_const2;
4137 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4138 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4139 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4140 }
4141 }
4142 } else/* asserts (u2>o2) */
4143 {
4144 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4145 {
4146 psij1 = psij_const1;
4147 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4148 {
4149 psij2 = psij_const2;
4150 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4151 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4152 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4153 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4154 for (l2 = 0; l2 <= o2; l2++)
4155 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4156 }
4157 }
4158
4159 for (l0 = 0; l0 <= o0; l0++, psij0++)
4160 {
4161 psij1 = psij_const1;
4162 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4163 {
4164 psij2 = psij_const2;
4165 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4166 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4167 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4168 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4169 for (l2 = 0; l2 <= o2; l2++)
4170 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4171 }
4172 }
4173 }
4174 else /* asserts (u1>o1) */
4175 if (u2 < o2)
4176 {
4177 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4178 {
4179 psij1 = psij_const1;
4180 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4181 {
4182 psij2 = psij_const2;
4183 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4184 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4185 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4186 }
4187 for (l1 = 0; l1 <= o1; l1++, psij1++)
4188 {
4189 psij2 = psij_const2;
4190 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4191 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4192 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4193 }
4194 }
4195 for (l0 = 0; l0 <= o0; l0++, psij0++)
4196 {
4197 psij1 = psij_const1;
4198 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4199 {
4200 psij2 = psij_const2;
4201 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4202 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4203 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4204 }
4205 for (l1 = 0; l1 <= o1; l1++, psij1++)
4206 {
4207 psij2 = psij_const2;
4208 gj = g + (l0 * n1 + l1) * n2 + u2;
4209 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4210 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4211 }
4212 }
4213 } else/* asserts (u2>o2) */
4214 {
4215 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4216 {
4217 psij1 = psij_const1;
4218 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4219 {
4220 psij2 = psij_const2;
4221 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4222 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4223 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4224 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4225 for (l2 = 0; l2 <= o2; l2++)
4226 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4227 }
4228 for (l1 = 0; l1 <= o1; l1++, psij1++)
4229 {
4230 psij2 = psij_const2;
4231 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4232 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4233 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4234 gj = g + ((u0 + l0) * n1 + l1) * n2;
4235 for (l2 = 0; l2 <= o2; l2++)
4236 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4237 }
4238 }
4239
4240 for (l0 = 0; l0 <= o0; l0++, psij0++)
4241 {
4242 psij1 = psij_const1;
4243 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4244 {
4245 psij2 = psij_const2;
4246 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4247 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4248 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4249 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4250 for (l2 = 0; l2 <= o2; l2++)
4251 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4252 }
4253 for (l1 = 0; l1 <= o1; l1++, psij1++)
4254 {
4255 psij2 = psij_const2;
4256 gj = g + (l0 * n1 + l1) * n2 + u2;
4257 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4258 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4259 gj = g + (l0 * n1 + l1) * n2;
4260 for (l2 = 0; l2 <= o2; l2++)
4261 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
4262 }
4263 }
4264 }
4265}
4266
#ifdef _OPENMP
/*
 * Adjoint 3D spreading of one sample f onto the grid g, limited along the
 * first axis to rows inside [my_u0, my_o0].
 *
 * uo2() delivers (ar_u0, ar_o0) for the first axis.  If ar_u0 < ar_o0 a
 * single row range is intersected with [my_u0, my_o0]; otherwise two ranges
 * are processed, one starting at max(my_u0, ar_u0) and one starting at
 * my_u0.  Second- and third-axis indices are reduced modulo n1 and n2.
 * Updates of g are plain (non-atomic) additions; presumably the caller
 * ensures that concurrent calls use disjoint [my_u0, my_o0] -- confirm
 * against the call site.
 */
static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
    const R *psij_const0, const R *psij_const1, const R *psij_const2,
    const R *xj0, const R *xj1, const R *xj2,
    const INT n0, const INT n1, const INT n2, const INT m,
    const INT my_u0, const INT my_o0)
{
  INT ar_u0, ar_o0, u1, o1, u2, o2;
  INT seg_first[2], seg_last[2], seg_psi[2];
  INT segments, s, t;

  INT index_temp1[2*m+2];
  INT index_temp2[2*m+2];

  uo2(&ar_u0, &ar_o0, *xj0, n0, m);
  uo2(&u1, &o1, *xj1, n1, m);
  uo2(&u2, &o2, *xj2, n2, m);

  for (t = 0; t <= 2*m+1; t++)
  {
    index_temp1[t] = (u1+t)%n1;
    index_temp2[t] = (u2+t)%n2;
  }

  /* First (or only) row range. */
  seg_first[0] = MAX(my_u0,ar_u0);
  seg_psi[0] = seg_first[0]-ar_u0;

  if (ar_u0 < ar_o0)
  {
    segments = 1;
    seg_last[0] = MIN(my_o0,ar_o0);
  }
  else
  {
    /* Second row range, restarting at my_u0. */
    segments = 2;
    seg_last[0] = my_o0;
    seg_first[1] = my_u0;
    seg_last[1] = MIN(my_o0,ar_o0);
    seg_psi[1] = seg_psi[0] + (my_u0-ar_u0+n0);
  }

  for (s = 0; s < segments; s++)
  {
    const INT u0 = seg_first[s];
    const INT o0 = seg_last[s];
    const INT offset_psij = seg_psi[s];
    INT l0, l1, l2;

#ifdef OMP_ASSERT
    if (s == 0)
    {
      assert(offset_psij >= 0);
      assert(o0-u0 <= 2*m+1);
      assert(offset_psij+o0-u0 <= 2*m+1);
    }
    else if (u0<=o0)
    {
      assert(o0-u0 <= 2*m+1);
      assert(offset_psij+o0-u0 <= 2*m+1);
    }
#endif

    for (l0 = 0; l0 <= o0-u0; l0++)
    {
      const INT i0 = (u0+l0) * n1;
      const C val0 = psij_const0[offset_psij+l0];

      for (l1 = 0; l1 <= 2*m+1; l1++)
      {
        const INT i1 = (i0 + index_temp1[l1]) * n2;
        const C val1 = psij_const1[l1];

        for (l2 = 0; l2 <= 2*m+1; l2++)
          g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
      }
    }
  }
}
#endif
4390
#ifdef _OPENMP
/*
 * Adjoint 3D spreading of one sample f onto the grid g using OpenMP atomic
 * updates.
 *
 * For each axis the 2m+2 grid indices (u+l) mod n, l = 0..2m+1, are
 * tabulated first, with u taken from uo2().  Every grid cell in the
 * resulting (2m+2)^3 window then receives psij_const0[l0] * psij_const1[l1]
 * * psij_const2[l2] * f.  The real and imaginary parts are added with two
 * separate atomic operations, addressing the complex cell as two
 * consecutive R values.
 */
static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
    const R *psij_const0, const R *psij_const1, const R *psij_const2,
    const R *xj0, const R *xj1, const R *xj2,
    const INT n0, const INT n1, const INT n2, const INT m)
{
  INT u0, o0, u1, o1, u2, o2;
  INT a, b, c;

  INT wrapped0[2*m+2];
  INT wrapped1[2*m+2];
  INT wrapped2[2*m+2];

  uo2(&u0, &o0, *xj0, n0, m);
  uo2(&u1, &o1, *xj1, n1, m);
  uo2(&u2, &o2, *xj2, n2, m);

  for (a = 0; a <= 2*m+1; a++)
  {
    wrapped0[a] = (u0+a)%n0;
    wrapped1[a] = (u1+a)%n1;
    wrapped2[a] = (u2+a)%n2;
  }

  for (a = 0; a <= 2*m+1; a++)
  {
    for (b = 0; b <= 2*m+1; b++)
    {
      const INT plane = (wrapped0[a] * n1 + wrapped1[b]) * n2;

      for (c = 0; c <= 2*m+1; c++)
      {
        R *cell = (R*)(g + plane + wrapped2[c]);
        const C val = psij_const0[a] * psij_const1[b] * psij_const2[c] * f;

#pragma omp atomic
        cell[0] += CREAL(val);

#pragma omp atomic
        cell[1] += CIMAG(val);
      }
    }
  }
}
#endif
4438
4439#ifndef _OPENMP
4440static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
4441 const R *psij_const0, const R *psij_const1, const R *psij_const2, const R *xj0,
4442 const R *xj1, const R *xj2, const INT n0, const INT n1, const INT n2,
4443 const INT m)
4444{
4445 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4446 C *gj;
4447 const R *psij0, *psij1, *psij2;
4448
4449 psij0 = psij_const0;
4450 psij1 = psij_const1;
4451 psij2 = psij_const2;
4452
4453 uo2(&u0, &o0, *xj0, n0, m);
4454 uo2(&u1, &o1, *xj1, n1, m);
4455 uo2(&u2, &o2, *xj2, n2, m);
4456
4457 if (u0 < o0)
4458 if (u1 < o1)
4459 if (u2 < o2)
4460 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4461 {
4462 psij1 = psij_const1;
4463 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4464 {
4465 psij2 = psij_const2;
4466 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4467 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4468 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4469 }
4470 }
4471 else
4472 /* asserts (u2>o2)*/
4473 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4474 {
4475 psij1 = psij_const1;
4476 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4477 {
4478 psij2 = psij_const2;
4479 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4480 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4481 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4482 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4483 for (l2 = 0; l2 <= o2; l2++)
4484 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4485 }
4486 }
4487 else /* asserts (u1>o1)*/
4488 if (u2 < o2)
4489 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4490 {
4491 psij1 = psij_const1;
4492 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4493 {
4494 psij2 = psij_const2;
4495 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4496 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4497 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4498 }
4499 for (l1 = 0; l1 <= o1; l1++, psij1++)
4500 {
4501 psij2 = psij_const2;
4502 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4503 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4504 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4505 }
4506 }
4507 else/* asserts (u2>o2) */
4508 {
4509 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4510 {
4511 psij1 = psij_const1;
4512 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4513 {
4514 psij2 = psij_const2;
4515 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4516 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4517 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4518 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4519 for (l2 = 0; l2 <= o2; l2++)
4520 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4521 }
4522 for (l1 = 0; l1 <= o1; l1++, psij1++)
4523 {
4524 psij2 = psij_const2;
4525 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4526 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4527 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4528 gj = g + ((u0 + l0) * n1 + l1) * n2;
4529 for (l2 = 0; l2 <= o2; l2++)
4530 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4531 }
4532 }
4533 }
4534 else /* asserts (u0>o0) */
4535 if (u1 < o1)
4536 if (u2 < o2)
4537 {
4538 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4539 {
4540 psij1 = psij_const1;
4541 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4542 {
4543 psij2 = psij_const2;
4544 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4545 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4546 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4547 }
4548 }
4549
4550 for (l0 = 0; l0 <= o0; l0++, psij0++)
4551 {
4552 psij1 = psij_const1;
4553 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4554 {
4555 psij2 = psij_const2;
4556 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4557 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4558 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4559 }
4560 }
4561 } else/* asserts (u2>o2) */
4562 {
4563 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4564 {
4565 psij1 = psij_const1;
4566 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4567 {
4568 psij2 = psij_const2;
4569 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4570 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4571 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4572 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4573 for (l2 = 0; l2 <= o2; l2++)
4574 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4575 }
4576 }
4577
4578 for (l0 = 0; l0 <= o0; l0++, psij0++)
4579 {
4580 psij1 = psij_const1;
4581 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4582 {
4583 psij2 = psij_const2;
4584 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4585 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4586 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4587 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4588 for (l2 = 0; l2 <= o2; l2++)
4589 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4590 }
4591 }
4592 }
4593 else /* asserts (u1>o1) */
4594 if (u2 < o2)
4595 {
4596 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4597 {
4598 psij1 = psij_const1;
4599 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4600 {
4601 psij2 = psij_const2;
4602 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4603 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4604 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4605 }
4606 for (l1 = 0; l1 <= o1; l1++, psij1++)
4607 {
4608 psij2 = psij_const2;
4609 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4610 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4611 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4612 }
4613 }
4614 for (l0 = 0; l0 <= o0; l0++, psij0++)
4615 {
4616 psij1 = psij_const1;
4617 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4618 {
4619 psij2 = psij_const2;
4620 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4621 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4622 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4623 }
4624 for (l1 = 0; l1 <= o1; l1++, psij1++)
4625 {
4626 psij2 = psij_const2;
4627 gj = g + (l0 * n1 + l1) * n2 + u2;
4628 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4629 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4630 }
4631 }
4632 } else/* asserts (u2>o2) */
4633 {
4634 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4635 {
4636 psij1 = psij_const1;
4637 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4638 {
4639 psij2 = psij_const2;
4640 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4641 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4642 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4643 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4644 for (l2 = 0; l2 <= o2; l2++)
4645 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4646 }
4647 for (l1 = 0; l1 <= o1; l1++, psij1++)
4648 {
4649 psij2 = psij_const2;
4650 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4651 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4652 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4653 gj = g + ((u0 + l0) * n1 + l1) * n2;
4654 for (l2 = 0; l2 <= o2; l2++)
4655 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4656 }
4657 }
4658
4659 for (l0 = 0; l0 <= o0; l0++, psij0++)
4660 {
4661 psij1 = psij_const1;
4662 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4663 {
4664 psij2 = psij_const2;
4665 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4666 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4667 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4668 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4669 for (l2 = 0; l2 <= o2; l2++)
4670 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4671 }
4672 for (l1 = 0; l1 <= o1; l1++, psij1++)
4673 {
4674 psij2 = psij_const2;
4675 gj = g + (l0 * n1 + l1) * n2 + u2;
4676 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4677 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4678 gj = g + (l0 * n1 + l1) * n2;
4679 for (l2 = 0; l2 <= o2; l2++)
4680 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4681 }
4682 }
4683 }
4684}
4685#endif
4686
4687static void nfft_trafo_3d_B(X(plan) *ths)
4688{
4689 const INT n0 = ths->n[0];
4690 const INT n1 = ths->n[1];
4691 const INT n2 = ths->n[2];
4692 const INT M = ths->M_total;
4693 const INT m = ths->m;
4694
4695 const C* g = (C*) ths->g;
4696
4697 INT k;
4698
4699 if(ths->flags & PRE_FULL_PSI)
4700 {
4701 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4702#ifdef _OPENMP
4703 #pragma omp parallel for default(shared) private(k)
4704#endif
4705 for (k = 0; k < M; k++)
4706 {
4707 INT l;
4708 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4709 ths->f[j] = K(0.0);
4710 for (l = 0; l < lprod; l++)
4711 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4712 }
4713 return;
4714 } /* if(PRE_FULL_PSI) */
4715
4716 if(ths->flags & PRE_PSI)
4717 {
4718#ifdef _OPENMP
4719 #pragma omp parallel for default(shared) private(k)
4720#endif
4721 for (k = 0; k < M; k++)
4722 {
4723 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4724 nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4725 }
4726 return;
4727 } /* if(PRE_PSI) */
4728
4729 if(ths->flags & PRE_FG_PSI)
4730 {
4731 R fg_exp_l[3*(2*m+2)];
4732
4733 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4734 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4735 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4736
4737#ifdef _OPENMP
4738 #pragma omp parallel for default(shared) private(k)
4739#endif
4740 for (k = 0; k < M; k++)
4741 {
4742 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4743 INT l;
4744 R psij_const[3*(2*m+2)];
4745 R fg_psij0 = ths->psi[2*j*3];
4746 R fg_psij1 = ths->psi[2*j*3+1];
4747 R fg_psij2 = K(1.0);
4748
4749 psij_const[0] = fg_psij0;
4750 for(l=1; l<=2*m+1; l++)
4751 {
4752 fg_psij2 *= fg_psij1;
4753 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4754 }
4755
4756 fg_psij0 = ths->psi[2*(j*3+1)];
4757 fg_psij1 = ths->psi[2*(j*3+1)+1];
4758 fg_psij2 = K(1.0);
4759 psij_const[2*m+2] = fg_psij0;
4760 for(l=1; l<=2*m+1; l++)
4761 {
4762 fg_psij2 *= fg_psij1;
4763 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4764 }
4765
4766 fg_psij0 = ths->psi[2*(j*3+2)];
4767 fg_psij1 = ths->psi[2*(j*3+2)+1];
4768 fg_psij2 = K(1.0);
4769 psij_const[2*(2*m+2)] = fg_psij0;
4770 for(l=1; l<=2*m+1; l++)
4771 {
4772 fg_psij2 *= fg_psij1;
4773 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4774 }
4775
4776 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4777 }
4778
4779 return;
4780 } /* if(PRE_FG_PSI) */
4781
4782 if(ths->flags & FG_PSI)
4783 {
4784 R fg_exp_l[3*(2*m+2)];
4785
4786 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4787 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4788 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4789
4790 sort(ths);
4791
4792#ifdef _OPENMP
4793 #pragma omp parallel for default(shared) private(k)
4794#endif
4795 for (k = 0; k < M; k++)
4796 {
4797 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4798 INT u, o, l;
4799 R psij_const[3*(2*m+2)];
4800 R fg_psij0, fg_psij1, fg_psij2;
4801
4802 uo(ths,j,&u,&o,(INT)0);
4803 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4804 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4805 fg_psij2 = K(1.0);
4806 psij_const[0] = fg_psij0;
4807 for(l=1; l<=2*m+1; l++)
4808 {
4809 fg_psij2 *= fg_psij1;
4810 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4811 }
4812
4813 uo(ths,j,&u,&o,(INT)1);
4814 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4815 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4816 fg_psij2 = K(1.0);
4817 psij_const[2*m+2] = fg_psij0;
4818 for(l=1; l<=2*m+1; l++)
4819 {
4820 fg_psij2 *= fg_psij1;
4821 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4822 }
4823
4824 uo(ths,j,&u,&o,(INT)2);
4825 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4826 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4827 fg_psij2 = K(1.0);
4828 psij_const[2*(2*m+2)] = fg_psij0;
4829 for(l=1; l<=2*m+1; l++)
4830 {
4831 fg_psij2 *= fg_psij1;
4832 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4833 }
4834
4835 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4836 }
4837
4838 return;
4839 } /* if(FG_PSI) */
4840
4841 if(ths->flags & PRE_LIN_PSI)
4842 {
4843 const INT K = ths->K, ip_s = K / (m + 2);
4844
4845 sort(ths);
4846
4847#ifdef _OPENMP
4848 #pragma omp parallel for default(shared) private(k)
4849#endif
4850 for (k = 0; k < M; k++)
4851 {
4852 INT u, o, l;
4853 R ip_y, ip_w;
4854 INT ip_u;
4855 R psij_const[3*(2*m+2)];
4856 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4857
4858 uo(ths,j,&u,&o,(INT)0);
4859 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4860 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4861 ip_w = ip_y - (R)(ip_u);
4862 for(l=0; l < 2*m+2; l++)
4863 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4864 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4865
4866 uo(ths,j,&u,&o,(INT)1);
4867 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4868 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4869 ip_w = ip_y - (R)(ip_u);
4870 for(l=0; l < 2*m+2; l++)
4871 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4872 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4873
4874 uo(ths,j,&u,&o,(INT)2);
4875 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4876 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4877 ip_w = ip_y - (R)(ip_u);
4878 for(l=0; l < 2*m+2; l++)
4879 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4880 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4881
4882 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4883 }
4884 return;
4885 } /* if(PRE_LIN_PSI) */
4886
4887 /* no precomputed psi at all */
4888
4889 sort(ths);
4890
4891#ifdef _OPENMP
4892 #pragma omp parallel for default(shared) private(k)
4893#endif
4894 for (k = 0; k < M; k++)
4895 {
4896 R psij_const[3*(2*m+2)];
4897 INT u, o, l;
4898 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4899
4900 uo(ths,j,&u,&o,(INT)0);
4901 for(l=0;l<=2*m+1;l++)
4902 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4903
4904 uo(ths,j,&u,&o,(INT)1);
4905 for(l=0;l<=2*m+1;l++)
4906 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4907
4908 uo(ths,j,&u,&o,(INT)2);
4909 for(l=0;l<=2*m+1;l++)
4910 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4911
4912 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4913 }
4914}
4915
/*
 * Per-node kernel of the blockwise OpenMP adjoint for the PRE_PSI case.
 *
 * Expands to one call of nfft_adjoint_3d_compute_omp_blockwise() for node j.
 * The three window-value rows are taken straight from ths->psi; the row for
 * dimension t of node j starts at offset (j*3+t)*(2*m+2).
 *
 * The expansion site must provide: ths, j, g, m, n0, n1, n2, my_u0, my_o0.
 * The macro already ends with a semicolon.
 */
4916#define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4917 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4918 ths->psi+j*3*(2*m+2), \
4919 ths->psi+(j*3+1)*(2*m+2), \
4920 ths->psi+(j*3+2)*(2*m+2), \
4921 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4922 n0, n1, n2, m, my_u0, my_o0);
4923
/*
 * Per-node kernel of the blockwise OpenMP adjoint for the PRE_FG_PSI case.
 *
 * For each of the three dimensions t, two values are read from ths->psi at
 * 2*(j*3+t) and 2*(j*3+t)+1 (called fg_psij0 and fg_psij1 below).  The
 * 2*m+2 window values of that dimension are then rebuilt as
 *   psij_const[off]   = fg_psij0
 *   psij_const[off+l] = fg_psij0 * fg_psij1^l * fg_exp_l[off+l],  l = 1..2*m+1
 * with off = t*(2*m+2); the power is accumulated in fg_psij2.
 * The three rows are finally passed to nfft_adjoint_3d_compute_omp_blockwise().
 *
 * The expansion site must provide: ths, j, g, m, n0, n1, n2, my_u0, my_o0
 * and the table fg_exp_l with 3*(2*m+2) entries.
 */
4924#define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4925{ \
4926 INT l; \
4927 R psij_const[3*(2*m+2)]; \
4928 R fg_psij0 = ths->psi[2*j*3]; \
4929 R fg_psij1 = ths->psi[2*j*3+1]; \
4930 R fg_psij2 = K(1.0); \
4931 \
4932 psij_const[0] = fg_psij0; \
4933 for(l=1; l<=2*m+1; l++) \
4934 { \
4935 fg_psij2 *= fg_psij1; \
4936 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4937 } \
4938 \
4939 fg_psij0 = ths->psi[2*(j*3+1)]; \
4940 fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4941 fg_psij2 = K(1.0); \
4942 psij_const[2*m+2] = fg_psij0; \
4943 for(l=1; l<=2*m+1; l++) \
4944 { \
4945 fg_psij2 *= fg_psij1; \
4946 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4947 } \
4948 \
4949 fg_psij0 = ths->psi[2*(j*3+2)]; \
4950 fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4951 fg_psij2 = K(1.0); \
4952 psij_const[2*(2*m+2)] = fg_psij0; \
4953 for(l=1; l<=2*m+1; l++) \
4954 { \
4955 fg_psij2 *= fg_psij1; \
4956 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4957 } \
4958 \
4959 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4960 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4961 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4962 n0, n1, n2, m, my_u0, my_o0); \
4963}
4964
/*
 * Per-node kernel of the blockwise OpenMP adjoint for the FG_PSI case.
 *
 * Same recurrence as the PRE_FG_PSI kernel, but the two starting values are
 * not read from ths->psi; they are computed here for every dimension t:
 *   fg_psij0 = PHI(ths->n[t], x - u/n_t, t)
 *   fg_psij1 = EXP(2*(n_t*x - u)/ths->b[t])
 * where u is obtained from uo(ths,j,&u,&o,t) and x = ths->x[3*j+t].
 * The resulting rows psij_const[t*(2*m+2) ...] are passed to
 * nfft_adjoint_3d_compute_omp_blockwise().
 *
 * The expansion site must provide: ths, j, g, m, n0, n1, n2, my_u0, my_o0
 * and the table fg_exp_l with 3*(2*m+2) entries.
 */
4965#define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4966{ \
4967 INT u, o, l; \
4968 R psij_const[3*(2*m+2)]; \
4969 R fg_psij0, fg_psij1, fg_psij2; \
4970 \
4971 uo(ths,j,&u,&o,(INT)0); \
4972 fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/((R)n0),0)); \
4973 fg_psij1 = EXP(K(2.0)*(((R)n0)*(ths->x[3*j]) - (R)u)/ths->b[0]); \
4974 fg_psij2 = K(1.0); \
4975 psij_const[0] = fg_psij0; \
4976 for(l=1; l<=2*m+1; l++) \
4977 { \
4978 fg_psij2 *= fg_psij1; \
4979 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4980 } \
4981 \
4982 uo(ths,j,&u,&o,(INT)1); \
4983 fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/((R)n1),1)); \
4984 fg_psij1 = EXP(K(2.0)*(((R)n1)*(ths->x[3*j+1]) - (R)u)/ths->b[1]); \
4985 fg_psij2 = K(1.0); \
4986 psij_const[2*m+2] = fg_psij0; \
4987 for(l=1; l<=2*m+1; l++) \
4988 { \
4989 fg_psij2 *= fg_psij1; \
4990 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4991 } \
4992 \
4993 uo(ths,j,&u,&o,(INT)2); \
4994 fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/((R)n2),2)); \
4995 fg_psij1 = EXP(K(2.0)*(((R)n2)*(ths->x[3*j+2]) - (R)u)/ths->b[2]); \
4996 fg_psij2 = K(1.0); \
4997 psij_const[2*(2*m+2)] = fg_psij0; \
4998 for(l=1; l<=2*m+1; l++) \
4999 { \
5000 fg_psij2 *= fg_psij1; \
5001 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
5002 } \
5003 \
5004 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
5005 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
5006 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
5007 n0, n1, n2, m, my_u0, my_o0); \
5008}
5009
/*
 * Per-node kernel of the blockwise OpenMP adjoint for the PRE_LIN_PSI case.
 *
 * For each dimension t the window values are obtained by linear
 * interpolation in a lookup table stored in ths->psi; the table of
 * dimension t starts at offset t*(K+1).  With u from uo(ths,j,&u,&o,t):
 *   ip_y = |n_t * x - u| * ip_s      (position in table units)
 *   ip_u = floor(ip_y)               (lower table index)
 *   ip_w = ip_y - ip_u               (interpolation weight)
 * and for l = 0..2*m+1 the value is
 *   table[|ip_u - l*ip_s|]*(1-ip_w) + table[|ip_u - l*ip_s + 1|]*ip_w.
 * The three rows are passed to nfft_adjoint_3d_compute_omp_blockwise().
 *
 * The expansion site must provide: ths, j, g, m, n0, n1, n2, my_u0, my_o0,
 * plus the integers K and ip_s.  Note that K is used here both as that
 * integer and as the function-like constant macro K(1.0).
 */
5010#define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
5011{ \
5012 INT u, o, l; \
5013 R psij_const[3*(2*m+2)]; \
5014 INT ip_u; \
5015 R ip_y, ip_w; \
5016 \
5017 uo(ths,j,&u,&o,(INT)0); \
5018 ip_y = FABS(((R)n0)*ths->x[3*j+0] - (R)u)*((R)ip_s); \
5019 ip_u = LRINT(FLOOR(ip_y)); \
5020 ip_w = ip_y-ip_u; \
5021 for(l=0; l < 2*m+2; l++) \
5022 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
5023 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
5024 \
5025 uo(ths,j,&u,&o,(INT)1); \
5026 ip_y = FABS(((R)n1)*ths->x[3*j+1] - (R)u)*((R)ip_s); \
5027 ip_u = LRINT(FLOOR(ip_y)); \
5028 ip_w = ip_y-ip_u; \
5029 for(l=0; l < 2*m+2; l++) \
5030 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
5031 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
5032 \
5033 uo(ths,j,&u,&o,(INT)2); \
5034 ip_y = FABS(((R)n2)*ths->x[3*j+2] - (R)u)*((R)ip_s); \
5035 ip_u = LRINT(FLOOR(ip_y)); \
5036 ip_w = ip_y-ip_u; \
5037 for(l=0; l < 2*m+2; l++) \
5038 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
5039 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
5040 \
5041 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
5042 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
5043 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
5044 n0, n1, n2, m, my_u0, my_o0); \
5045}
5046
/*
 * Per-node kernel of the blockwise OpenMP adjoint when nothing is
 * precomputed.
 *
 * For each dimension t, u is obtained from uo(ths,j,&u,&o,t) and the window
 * is evaluated directly for the 2*m+2 offsets l = 0..2*m+1:
 *   psij_const[t*(2*m+2)+l] = PHI(ths->n[t], x - (u+l)/n_t, t)
 * with x = ths->x[3*j+t].  The three rows are then passed to
 * nfft_adjoint_3d_compute_omp_blockwise().
 *
 * The expansion site must provide: ths, j, g, m, n0, n1, n2, my_u0, my_o0.
 */
5047#define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
5048{ \
5049 INT u, o, l; \
5050 R psij_const[3*(2*m+2)]; \
5051 \
5052 uo(ths,j,&u,&o,(INT)0); \
5053 for(l=0;l<=2*m+1;l++) \
5054 psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/((R) n0),0)); \
5055 \
5056 uo(ths,j,&u,&o,(INT)1); \
5057 for(l=0;l<=2*m+1;l++) \
5058 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/((R) n1),1)); \
5059 \
5060 uo(ths,j,&u,&o,(INT)2); \
5061 for(l=0;l<=2*m+1;l++) \
5062 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/((R) n2),2)); \
5063 \
5064 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
5065 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
5066 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
5067 n0, n1, n2, m, my_u0, my_o0); \
5068}
5069
/*
 * Driver of the blockwise OpenMP adjoint; `whichone` selects the per-node
 * kernel MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_<whichone> by token pasting.
 *
 * Nothing happens unless NFFT_OMP_BLOCKWISE_ADJOINT is set in ths->flags.
 * If it is set, an "omp parallel" region is opened in which every thread
 *   1. calls nfft_adjoint_B_omp_blockwise_init() to obtain my_u0, my_o0 and
 *      two key ranges [min_u_a,max_u_a] and [min_u_b,max_u_b]
 *      (a range whose lower bound is -1 is skipped);
 *   2. for each range, locates a start position in ar_x = ths->index_x with
 *      index_x_binary_search() and walks forward while the key ar_x[2*k]
 *      stays inside the range, running the selected kernel for the node
 *      index j = ar_x[2*k+1].
 * After the parallel region the macro executes `return`, i.e. it leaves the
 * ENCLOSING function, so code following the macro only runs when the flag is
 * not set.
 *
 * The first range uses the enclosing function's k (made private by the
 * pragma); the second range declares its own block-local k.
 *
 * The expansion site must provide: ths, k, M, m and whatever the selected
 * kernel needs.  MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A/_B are defined
 * elsewhere in the file (presumably debug assertions - confirm there).
 */
5070#define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
5071{ \
5072 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
5073 { \
5074 _Pragma("omp parallel private(k)") \
5075 { \
5076 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
5077 INT *ar_x = ths->index_x; \
5078 \
5079 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
5080 &min_u_b, &max_u_b, 3, ths->n, m); \
5081 \
5082 if (min_u_a != -1) \
5083 { \
5084 k = index_x_binary_search(ar_x, M, min_u_a); \
5085 \
5086 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_A \
5087 \
5088 while (k < M) \
5089 { \
5090 INT u_prod = ar_x[2*k]; \
5091 INT j = ar_x[2*k+1]; \
5092 \
5093 if (u_prod < min_u_a || u_prod > max_u_a) \
5094 break; \
5095 \
5096 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
5097 \
5098 k++; \
5099 } \
5100 } \
5101 \
5102 if (min_u_b != -1) \
5103 { \
5104 INT k = index_x_binary_search(ar_x, M, min_u_b); \
5105 \
5106 MACRO_adjoint_nd_B_OMP_BLOCKWISE_ASSERT_B \
5107 \
5108 while (k < M) \
5109 { \
5110 INT u_prod = ar_x[2*k]; \
5111 INT j = ar_x[2*k+1]; \
5112 \
5113 if (u_prod < min_u_b || u_prod > max_u_b) \
5114 break; \
5115 \
5116 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
5117 \
5118 k++; \
5119 } \
5120 } \
5121 } /* omp parallel */ \
5122 return; \
5123 } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
5124}
5125
/*
 * Adjoint of the sparse "B" step for three-dimensional plans: the samples
 * ths->f are spread onto the grid ths->g.
 *
 * The grid is zeroed first.  Then exactly one branch runs, chosen by the
 * first matching precomputation flag in this order: PRE_FULL_PSI, PRE_PSI,
 * PRE_FG_PSI, FG_PSI, PRE_LIN_PSI, and finally "nothing precomputed".
 * Every branch except PRE_FULL_PSI builds (or points to) three rows of
 * 2*m+2 window values per node and hands them to a 3d compute routine.
 *
 * Build variants visible in this function:
 *  - with _OPENMP, MACRO_adjoint_3d_B_OMP_BLOCKWISE is expanded first; when
 *    NFFT_OMP_BLOCKWISE_ADJOINT is set it does all the work and returns from
 *    this function.  Otherwise a "parallel for" over the nodes calls
 *    nfft_adjoint_3d_compute_omp_atomic(), which receives the sample by
 *    value (ths->f[j]);
 *  - without _OPENMP, nfft_adjoint_3d_compute_serial() is called and
 *    receives a pointer to the sample (ths->f+j).
 *
 * sort(ths) is called in the FG_PSI, PRE_LIN_PSI and no-precomputation
 * branches only.  When NFFT_SORT_NODES is set, nodes are visited in the
 * order given by ths->index_x[2*k+1].
 *
 * The local names k, g, m, M, n0, n1, n2, fg_exp_l, K and ip_s are used by
 * the macros expanded here and must keep these exact names.
 */
5126static void nfft_adjoint_3d_B(X(plan) *ths)
5127{
5128 INT k;
5129 const INT n0 = ths->n[0];
5130 const INT n1 = ths->n[1];
5131 const INT n2 = ths->n[2];
5132 const INT M = ths->M_total;
5133 const INT m = ths->m;
5134
5135 C* g = (C*) ths->g;
5136
 /* start from an all-zero grid; every branch below accumulates into it */
5137 memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
5138
5139 if(ths->flags & PRE_FULL_PSI)
5140 {
5141 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
5142 (INT)3, ths->n, m, ths->flags, ths->index_x);
5143 return;
5144 } /* if(PRE_FULL_PSI) */
5145
5146 if(ths->flags & PRE_PSI)
5147 {
5148#ifdef _OPENMP
5149 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
5150#endif
5151
5152#ifdef _OPENMP
5153 #pragma omp parallel for default(shared) private(k)
5154#endif
5155 for (k = 0; k < M; k++)
5156 {
5157 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
 /* window rows are read directly from ths->psi, (2*m+2) values each */
5158#ifdef _OPENMP
5159 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5160#else
5161 nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5162#endif
5163 }
5164 return;
5165 } /* if(PRE_PSI) */
5166
5167 if(ths->flags & PRE_FG_PSI)
5168 {
 /* one table of 2*m+2 factors per dimension, filled from ths->b[t] */
5169 R fg_exp_l[3*(2*m+2)];
5170
5171 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
5172 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
5173 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
5174
5175#ifdef _OPENMP
5176 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
5177#endif
5178
5179#ifdef _OPENMP
5180 #pragma omp parallel for default(shared) private(k)
5181#endif
5182 for (k = 0; k < M; k++)
5183 {
5184 R psij_const[3*(2*m+2)];
5185 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5186 INT l;
 /* two stored values per (node, dimension); fg_psij2 accumulates powers of fg_psij1 */
5187 R fg_psij0 = ths->psi[2*j*3];
5188 R fg_psij1 = ths->psi[2*j*3+1];
5189 R fg_psij2 = K(1.0);
5190
5191 psij_const[0] = fg_psij0;
5192 for(l=1; l<=2*m+1; l++)
5193 {
5194 fg_psij2 *= fg_psij1;
5195 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
5196 }
5197
5198 fg_psij0 = ths->psi[2*(j*3+1)];
5199 fg_psij1 = ths->psi[2*(j*3+1)+1];
5200 fg_psij2 = K(1.0);
5201 psij_const[2*m+2] = fg_psij0;
5202 for(l=1; l<=2*m+1; l++)
5203 {
5204 fg_psij2 *= fg_psij1;
5205 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5206 }
5207
5208 fg_psij0 = ths->psi[2*(j*3+2)];
5209 fg_psij1 = ths->psi[2*(j*3+2)+1];
5210 fg_psij2 = K(1.0);
5211 psij_const[2*(2*m+2)] = fg_psij0;
5212 for(l=1; l<=2*m+1; l++)
5213 {
5214 fg_psij2 *= fg_psij1;
5215 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5216 }
5217
5218#ifdef _OPENMP
5219 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5220#else
5221 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5222#endif
5223 }
5224
5225 return;
5226 } /* if(PRE_FG_PSI) */
5227
5228 if(ths->flags & FG_PSI)
5229 {
5230 R fg_exp_l[3*(2*m+2)];
5231
5232 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
5233 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
5234 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
5235
5236 sort(ths);
5237
5238#ifdef _OPENMP
5239 MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
5240#endif
5241
5242#ifdef _OPENMP
5243 #pragma omp parallel for default(shared) private(k)
5244#endif
5245 for (k = 0; k < M; k++)
5246 {
5247 INT u,o,l;
5248 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5249 R psij_const[3*(2*m+2)];
5250 R fg_psij0, fg_psij1, fg_psij2;
5251
 /* same recurrence as PRE_FG_PSI, but the two start values are computed here */
5252 uo(ths,j,&u,&o,(INT)0);
5253 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
5254 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
5255 fg_psij2 = K(1.0);
5256 psij_const[0] = fg_psij0;
5257 for(l=1; l<=2*m+1; l++)
5258 {
5259 fg_psij2 *= fg_psij1;
5260 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
5261 }
5262
5263 uo(ths,j,&u,&o,(INT)1);
5264 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
5265 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
5266 fg_psij2 = K(1.0);
5267 psij_const[2*m+2] = fg_psij0;
5268 for(l=1; l<=2*m+1; l++)
5269 {
5270 fg_psij2 *= fg_psij1;
5271 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5272 }
5273
5274 uo(ths,j,&u,&o,(INT)2);
5275 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5276 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5277 fg_psij2 = K(1.0);
5278 psij_const[2*(2*m+2)] = fg_psij0;
5279 for(l=1; l<=2*m+1; l++)
5280 {
5281 fg_psij2 *= fg_psij1;
5282 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5283 }
5284
5285#ifdef _OPENMP
5286 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5287#else
5288 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5289#endif
5290 }
5291
5292 return;
5293 } /* if(FG_PSI) */
5294
5295 if(ths->flags & PRE_LIN_PSI)
5296 {
 /* K is the integer ths->K here; K(1.0) below is still the constant macro */
5297 const INT K = ths->K;
5298 const INT ip_s = K / (m + 2);
5299
5300 sort(ths);
5301
5302#ifdef _OPENMP
5303 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5304#endif
5305
5306#ifdef _OPENMP
5307 #pragma omp parallel for default(shared) private(k)
5308#endif
5309 for (k = 0; k < M; k++)
5310 {
5311 INT u,o,l;
5312 INT ip_u;
5313 R ip_y, ip_w;
5314 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5315 R psij_const[3*(2*m+2)];
5316
 /* linear interpolation in the lookup table; table of dimension t starts at t*(K+1) */
5317 uo(ths,j,&u,&o,(INT)0);
5318 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5319 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5320 ip_w = ip_y - (R)(ip_u);
5321 for(l=0; l < 2*m+2; l++)
5322 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5323 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5324
5325 uo(ths,j,&u,&o,(INT)1);
5326 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5327 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5328 ip_w = ip_y - (R)(ip_u);
5329 for(l=0; l < 2*m+2; l++)
5330 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5331 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5332
5333 uo(ths,j,&u,&o,(INT)2);
5334 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5335 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5336 ip_w = ip_y - (R)(ip_u);
5337 for(l=0; l < 2*m+2; l++)
5338 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5339 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5340
5341#ifdef _OPENMP
5342 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5343#else
5344 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5345#endif
5346 }
5347 return;
5348 } /* if(PRE_LIN_PSI) */
5349
5350 /* no precomputed psi at all */
5351 sort(ths);
5352
5353#ifdef _OPENMP
5354 MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5355#endif
5356
5357#ifdef _OPENMP
5358 #pragma omp parallel for default(shared) private(k)
5359#endif
5360 for (k = 0; k < M; k++)
5361 {
5362 INT u,o,l;
5363 R psij_const[3*(2*m+2)];
5364 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5365
 /* evaluate the window directly for each of the 2*m+2 offsets per dimension */
5366 uo(ths,j,&u,&o,(INT)0);
5367 for(l=0;l<=2*m+1;l++)
5368 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5369
5370 uo(ths,j,&u,&o,(INT)1);
5371 for(l=0;l<=2*m+1;l++)
5372 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5373
5374 uo(ths,j,&u,&o,(INT)2);
5375 for(l=0;l<=2*m+1;l++)
5376 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5377
5378#ifdef _OPENMP
5379 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5380#else
5381 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5382#endif
5383 }
5384}
5385
5386
/*
 * Three-dimensional forward transform.
 *
 * If any N[t] <= m or any n[t] <= 2*m+2, the work is delegated to
 * X(trafo_direct) and the function returns.
 *
 * Otherwise three stages run (timed with TIC/TOC slots 0, 1 and 2):
 *  0. ths->g_hat (set to ths->g1) is zeroed and then filled from
 *     ths->f_hat: each coefficient is multiplied by the product of three
 *     per-dimension factors and written to one of the eight corner blocks
 *     of the n0 x n1 x n2 array.  In each dimension, the first half of the
 *     f_hat range (factor argument k-N/2) goes to offset n-N/2+k, the
 *     second half (factor argument k) goes to offset k.
 *     With PRE_PHI_HUT the factors are read from ths->c_phi_inv, otherwise
 *     they are computed as 1/PHI_HUT(...) on the fly.
 *  1. ths->my_fftw_plan1 is executed.
 *  2. nfft_trafo_3d_B(ths) is called.
 *
 * ths->g is set to ths->g2 before the stages run.
 */
5387void X(trafo_3d)(X(plan) *ths)
5388{
5389 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5390 {
5391 X(trafo_direct)(ths);
5392 return;
5393 }
5394
5395 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5396 C *g_hat,*f_hat;
5397 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5398 R ck01, ck02, ck11, ck12, ck21, ck22;
5399 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5400 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5401
5402 ths->g_hat=ths->g1;
5403 ths->g=ths->g2;
5404
5405 N0=ths->N[0];
5406 N1=ths->N[1];
5407 N2=ths->N[2];
5408 n0=ths->n[0];
5409 n1=ths->n[1];
5410 n2=ths->n[2];
5411
5412 f_hat=(C*)ths->f_hat;
5413 g_hat=(C*)ths->g_hat;
5414
5415 TIC(0)
 /* clear the whole oversampled array; only the eight corner blocks are written below */
5416#ifdef _OPENMP
5417 #pragma omp parallel for default(shared) private(k0)
5418 for (k0 = 0; k0 < ths->n_total; k0++)
5419 ths->g_hat[k0] = 0.0;
5420#else
5421 memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
5422#endif
5423
5424 if(ths->flags & PRE_PHI_HUT)
5425 {
 /* ..1 pointers address the first N/2 table entries, ..2 pointers the second N/2 */
5426 c_phi_inv01=ths->c_phi_inv[0];
5427 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5428
5429#ifdef _OPENMP
5430 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5431#endif
5432 for(k0=0;k0<N0/2;k0++)
5433 {
5434 ck01=c_phi_inv01[k0];
5435 ck02=c_phi_inv02[k0];
5436 c_phi_inv11=ths->c_phi_inv[1];
5437 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5438
5439 for(k1=0;k1<N1/2;k1++)
5440 {
5441 ck11=c_phi_inv11[k1];
5442 ck12=c_phi_inv12[k1];
5443 c_phi_inv21=ths->c_phi_inv[2];
5444 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5445
 /* row base pointers for the eight (half0, half1, half2) combinations; suffix digit 1 = first half, 2 = second half */
5446 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5447 f_hat111=f_hat + (k0*N1+k1)*N2;
5448 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5449 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5450 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5451 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5452 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5453 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5454
5455 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5456 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5457 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5458 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5459 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5460 f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5461 g_hat222=g_hat + (k0*n1+k1)*n2;
5462 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5463
5464 for(k2=0;k2<N2/2;k2++)
5465 {
5466 ck21=c_phi_inv21[k2];
5467 ck22=c_phi_inv22[k2];
5468
5469 g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5470 g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5471 g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5472 g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5473
5474 g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5475 g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5476 g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5477 g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5478 }
5479 }
5480 }
5481 }
5482 else
 /* no table: the same eight assignments with factors computed as 1/PHI_HUT */
5483#ifdef _OPENMP
5484 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5485#endif
5486 for(k0=0;k0<N0/2;k0++)
5487 {
5488 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5489 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5490 for(k1=0;k1<N1/2;k1++)
5491 {
5492 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5493 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5494
5495 for(k2=0;k2<N2/2;k2++)
5496 {
5497 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5498 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5499
5500 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5501 g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5502 g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5503 g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5504
5505 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5506 g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5507 g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5508 g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5509 }
5510 }
5511 }
5512
5513 TOC(0)
5514
5515 TIC_FFTW(1)
5516 FFTW(execute)(ths->my_fftw_plan1);
5517 TOC_FFTW(1);
5518
5519 TIC(2);
5520 nfft_trafo_3d_B(ths);
5521 TOC(2);
5522}
5523
/*
 * Three-dimensional adjoint transform.
 *
 * If any N[t] <= m or any n[t] <= 2*m+2, the work is delegated to
 * X(adjoint_direct) and the function returns.
 *
 * Otherwise the three stages of the forward transform run in reverse order
 * (timed with TIC/TOC slots 2, 1 and 0):
 *  2. nfft_adjoint_3d_B(ths) is called.
 *  1. ths->my_fftw_plan2 is executed.
 *  0. ths->f_hat is filled from ths->g_hat (set to ths->g1): each of the
 *     eight corner blocks of the n0 x n1 x n2 array is read, multiplied by
 *     the product of three per-dimension factors and stored in f_hat.  In
 *     each dimension, offset n-N/2+k maps to the first half of the f_hat
 *     range (factor argument k-N/2) and offset k maps to the second half
 *     (factor argument k).  With PRE_PHI_HUT the factors are read from
 *     ths->c_phi_inv, otherwise they are computed as 1/PHI_HUT(...).
 *
 * ths->g is set to ths->g2 before the stages run.
 */
5524void X(adjoint_3d)(X(plan) *ths)
5525{
5526 if((ths->N[0] <= ths->m) || (ths->N[1] <= ths->m) || (ths->N[2] <= ths->m) || (ths->n[0] <= 2*ths->m+2) || (ths->n[1] <= 2*ths->m+2) || (ths->n[2] <= 2*ths->m+2))
5527 {
5528 X(adjoint_direct)(ths);
5529 return;
5530 }
5531
5532 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5533 C *g_hat,*f_hat;
5534 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5535 R ck01, ck02, ck11, ck12, ck21, ck22;
5536 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5537 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5538
5539 ths->g_hat=ths->g1;
5540 ths->g=ths->g2;
5541
5542 N0=ths->N[0];
5543 N1=ths->N[1];
5544 N2=ths->N[2];
5545 n0=ths->n[0];
5546 n1=ths->n[1];
5547 n2=ths->n[2];
5548
5549 f_hat=(C*)ths->f_hat;
5550 g_hat=(C*)ths->g_hat;
5551
5552 TIC(2);
5553 nfft_adjoint_3d_B(ths);
5554 TOC(2);
5555
5556 TIC_FFTW(1)
5557 FFTW(execute)(ths->my_fftw_plan2);
5558 TOC_FFTW(1);
5559
5560 TIC(0)
5561 if(ths->flags & PRE_PHI_HUT)
5562 {
 /* ..1 pointers address the first N/2 table entries, ..2 pointers the second N/2 */
5563 c_phi_inv01=ths->c_phi_inv[0];
5564 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5565
5566#ifdef _OPENMP
5567 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5568#endif
5569 for(k0=0;k0<N0/2;k0++)
5570 {
5571 ck01=c_phi_inv01[k0];
5572 ck02=c_phi_inv02[k0];
5573 c_phi_inv11=ths->c_phi_inv[1];
5574 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5575
5576 for(k1=0;k1<N1/2;k1++)
5577 {
5578 ck11=c_phi_inv11[k1];
5579 ck12=c_phi_inv12[k1];
5580 c_phi_inv21=ths->c_phi_inv[2];
5581 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5582
 /* row base pointers for the eight (half0, half1, half2) combinations; suffix digit 1 = first half, 2 = second half */
5583 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5584 f_hat111=f_hat + (k0*N1+k1)*N2;
5585 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5586 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5587 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5588 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5589 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5590 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5591
5592 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5593 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5594 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5595 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5596 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5597 f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5598 g_hat222=g_hat + (k0*n1+k1)*n2;
5599 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5600
5601 for(k2=0;k2<N2/2;k2++)
5602 {
5603 ck21=c_phi_inv21[k2];
5604 ck22=c_phi_inv22[k2];
5605
5606 f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5607 f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5608 f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5609 f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5610
5611 f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5612 f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5613 f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5614 f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5615 }
5616 }
5617 }
5618 }
5619 else
 /* no table: the same eight assignments with factors computed as 1/PHI_HUT */
5620#ifdef _OPENMP
5621 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5622#endif
5623 for(k0=0;k0<N0/2;k0++)
5624 {
5625 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5626 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5627 for(k1=0;k1<N1/2;k1++)
5628 {
5629 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5630 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5631
5632 for(k2=0;k2<N2/2;k2++)
5633 {
5634 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5635 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5636
5637 f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5638 f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5639 f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5640 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5641
5642 f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5643 f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5644 f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5645 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5646 }
5647 }
5648 }
5649
5650 TOC(0)
5651}
5652
5655void X(trafo)(X(plan) *ths)
5656{
5657 /* use direct transform if degree N is too low */
5658 for (int j = 0; j < ths->d; j++)
5659 {
5660 if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5661 {
5662 X(trafo_direct)(ths);
5663 return;
5664 }
5665 }
5666
5667 switch(ths->d)
5668 {
5669 case 1: X(trafo_1d)(ths); break;
5670 case 2: X(trafo_2d)(ths); break;
5671 case 3: X(trafo_3d)(ths); break;
5672 default:
5673 {
5674 /* use ths->my_fftw_plan1 */
5675 ths->g_hat = ths->g1;
5676 ths->g = ths->g2;
5677
5681 TIC(0)
5682 D_A(ths);
5683 TOC(0)
5684
5689 TIC_FFTW(1)
5690 FFTW(execute)(ths->my_fftw_plan1);
5691 TOC_FFTW(1)
5692
5696 TIC(2)
5697 B_A(ths);
5698 TOC(2)
5699 }
5700 }
5701} /* nfft_trafo */
5702
5703void X(adjoint)(X(plan) *ths)
5704{
5705 /* use direct transform if degree N is too low */
5706 for (int j = 0; j < ths->d; j++)
5707 {
5708 if((ths->N[j] <= ths->m) || (ths->n[j] <= 2*ths->m+2))
5709 {
5710 X(adjoint_direct)(ths);
5711 return;
5712 }
5713 }
5714
5715 switch(ths->d)
5716 {
5717 case 1: X(adjoint_1d)(ths); break;
5718 case 2: X(adjoint_2d)(ths); break;
5719 case 3: X(adjoint_3d)(ths); break;
5720 default:
5721 {
5722 /* use ths->my_fftw_plan2 */
5723 ths->g_hat=ths->g1;
5724 ths->g=ths->g2;
5725
5729 TIC(2)
5730 B_T(ths);
5731 TOC(2)
5732
5737 TIC_FFTW(1)
5738 FFTW(execute)(ths->my_fftw_plan2);
5739 TOC_FFTW(1)
5740
5744 TIC(0)
5745 D_T(ths);
5746 TOC(0)
5747 }
5748 }
5749} /* nfft_adjoint */
5750
5751
5754static void precompute_phi_hut(X(plan) *ths)
5755{
5756 INT ks[ths->d]; /* index over all frequencies */
5757 INT t; /* index over all dimensions */
5758
5759 ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) * sizeof(R*));
5760
5761 for (t = 0; t < ths->d; t++)
5762 {
5763 ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) * sizeof(R));
5764
5765 for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5766 {
5767 ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5768 }
5769 }
5770} /* nfft_phi_hut */
5771
5776void X(precompute_lin_psi)(X(plan) *ths)
5777{
5778 INT t;
5779 INT j;
5780 R step;
5782 for (t=0; t<ths->d; t++)
5783 {
5784 step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5785 for(j = 0;j <= ths->K; j++)
5786 {
5787 ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5788 } /* for(j) */
5789 } /* for(t) */
5790}
5791
5792void X(precompute_fg_psi)(X(plan) *ths)
5793{
5794 INT t;
5795 INT u, o;
5797 sort(ths);
5798
5799 for (t=0; t<ths->d; t++)
5800 {
5801 INT j;
5802#ifdef _OPENMP
5803 #pragma omp parallel for default(shared) private(j,u,o)
5804#endif
5805 for (j = 0; j < ths->M_total; j++)
5806 {
5807 uo(ths,j,&u,&o,t);
5808
5809 ths->psi[2*(j*ths->d+t)]=
5810 (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5811
5812 ths->psi[2*(j*ths->d+t)+1]=
5813 EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5814 } /* for(j) */
5815 }
5816 /* for(t) */
5817} /* nfft_precompute_fg_psi */
5818
5819void X(precompute_psi)(X(plan) *ths)
5820{
5821 INT t; /* index over all dimensions */
5822 INT l; /* index u<=l<=o */
5823 INT lj; /* index 0<=lj<u+o+1 */
5824 INT u, o; /* depends on x_j */
5825
5826 sort(ths);
5827
5828 for (t=0; t<ths->d; t++)
5829 {
5830 INT j;
5831#ifdef _OPENMP
5832 #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5833#endif
5834 for (j = 0; j < ths->M_total; j++)
5835 {
5836 uo(ths,j,&u,&o,t);
5837
5838 for(l = u, lj = 0; l <= o; l++, lj++)
5839 ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5840 (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5841 } /* for(j) */
5842 }
5843 /* for(t) */
5844} /* nfft_precompute_psi */
5845
/*
 * OpenMP variant of the PRE_FULL_PSI precomputation (compiled only when
 * _OPENMP is defined).
 *
 * lprod = (2*m+2)^d entries are written per node.  Because the count is the
 * same for every node, node j owns the fixed slice starting at j*lprod, so
 * the loop over j can run as a "parallel for" with all working arrays
 * declared per iteration.  For each entry the grid index goes to
 * ths->psi_index_g[ix] and the window product to ths->psi[ix];
 * ths->psi_index_f[j] is set to lprod.
 *
 * MACRO_init_uo_l_lj_t, MACRO_update_phi_prod_ll_plain and
 * MACRO_count_uo_l_lj_t are defined elsewhere in the file and operate on
 * the locals declared here (t, t2, lj, ll_plain, u, o, phi_prod, ...), so
 * those names must not be changed.  Presumably they initialise, update and
 * advance the multi-index over the window support - confirm at their
 * definitions.
 */
5846#ifdef _OPENMP
5847static void nfft_precompute_full_psi_omp(X(plan) *ths)
5848{
5849 INT j;
5850 INT lprod;
5852 {
5853 INT t;
5854 for(t=0,lprod = 1; t<ths->d; t++)
5855 lprod *= 2*ths->m+2;
5856 }
5857
5858 #pragma omp parallel for default(shared) private(j)
5859 for(j=0; j<ths->M_total; j++)
5860 {
5861 INT t,t2;
5862 INT l_L;
5863 INT lj[ths->d];
5864 INT ll_plain[ths->d+1];
5866 INT u[ths->d], o[ths->d];
5868 R phi_prod[ths->d+1];
 /* first output slot of node j */
5869 INT ix = j*lprod;
5870
5871 phi_prod[0]=1;
5872 ll_plain[0]=0;
5873
5874 MACRO_init_uo_l_lj_t;
5875
5876 for(l_L=0; l_L<lprod; l_L++, ix++)
5877 {
5878 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5879
5880 ths->psi_index_g[ix]=ll_plain[ths->d];
5881 ths->psi[ix]=phi_prod[ths->d];
5882
5883 MACRO_count_uo_l_lj_t;
5884 } /* for(l_L) */
5885
5886 ths->psi_index_f[j]=lprod;
5887 } /* for(j) */
5888}
5889#endif
5890
5891void X(precompute_full_psi)(X(plan) *ths)
5892{
5893#ifdef _OPENMP
5894 sort(ths);
5895
5896 nfft_precompute_full_psi_omp(ths);
5897#else
5898 INT t, t2; /* index over all dimensions */
5899 INT j; /* index over all nodes */
5900 INT l_L; /* plain index 0 <= l_L < lprod */
5901 INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
5902 INT ll_plain[ths->d+1]; /* postfix plain index */
5903 INT lprod; /* 'bandwidth' of matrix B */
5904 INT u[ths->d], o[ths->d]; /* depends on x_j */
5905
5906 R phi_prod[ths->d+1];
5907
5908 INT ix, ix_old;
5909
5910 sort(ths);
5911
5912 phi_prod[0] = K(1.0);
5913 ll_plain[0] = 0;
5914
5915 for (t = 0, lprod = 1; t < ths->d; t++)
5916 lprod *= 2 * ths->m + 2;
5917
5918 for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5919 {
5920 MACRO_init_uo_l_lj_t;
5921
5922 for (l_L = 0; l_L < lprod; l_L++, ix++)
5923 {
5924 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5925
5926 ths->psi_index_g[ix] = ll_plain[ths->d];
5927 ths->psi[ix] = phi_prod[ths->d];
5928
5929 MACRO_count_uo_l_lj_t;
5930 } /* for(l_L) */
5931
5932 ths->psi_index_f[j] = ix - ix_old;
5933 ix_old = ix;
5934 } /* for(j) */
5935#endif
5936}
5937
5938void X(precompute_one_psi)(X(plan) *ths)
5939{
5940 if(ths->flags & PRE_LIN_PSI)
5941 X(precompute_lin_psi)(ths);
5942 if(ths->flags & PRE_FG_PSI)
5943 X(precompute_fg_psi)(ths);
5944 if(ths->flags & PRE_PSI)
5945 X(precompute_psi)(ths);
5946 if(ths->flags & PRE_FULL_PSI)
5947 X(precompute_full_psi)(ths);
5948}
5949
5950static void init_help(X(plan) *ths)
5951{
5952 INT t; /* index over all dimensions */
5953 INT lprod; /* 'bandwidth' of matrix B */
5954
5955 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5956 ths->flags |= NFFT_SORT_NODES;
5957
5958 ths->N_total = intprod(ths->N, 0, ths->d);
5959 ths->n_total = intprod(ths->n, 0, ths->d);
5960
5961 ths->sigma = (R*) Y(malloc)((size_t)(ths->d) * sizeof(R));
5962
5963 for(t = 0;t < ths->d; t++)
5964 ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5965
5966 WINDOW_HELP_INIT;
5967
5968 if(ths->flags & MALLOC_X)
5969 ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) * sizeof(R));
5970
5971 if(ths->flags & MALLOC_F_HAT)
5972 ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) * sizeof(C));
5973
5974 if(ths->flags & MALLOC_F)
5975 ths->f = (C*)Y(malloc)((size_t)(ths->M_total) * sizeof(C));
5976
5977 if(ths->flags & PRE_PHI_HUT)
5978 precompute_phi_hut(ths);
5979
5980 if (ths->flags & PRE_LIN_PSI)
5981 {
5982 if (ths->K == 0)
5983 {
5984 ths->K = Y(m2K)(ths->m);
5985 }
5986 ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) * sizeof(R));
5987 }
5988
5989 if(ths->flags & PRE_FG_PSI)
5990 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) * sizeof(R));
5991
5992 if(ths->flags & PRE_PSI)
5993 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) * sizeof(R));
5994
5995 if(ths->flags & PRE_FULL_PSI)
5996 {
5997 for (t = 0, lprod = 1; t < ths->d; t++)
5998 lprod *= 2 * ths->m + 2;
5999
6000 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(R));
6001
6002 ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) * sizeof(INT));
6003 ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(INT));
6004 }
6005
6006 if(ths->flags & FFTW_INIT)
6007 {
6008#ifdef _OPENMP
6009 INT nthreads = Y(get_num_threads)();
6010#endif
6011
6012 ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) * sizeof(C));
6013
6014 if(ths->flags & FFT_OUT_OF_PLACE)
6015 ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) * sizeof(C));
6016 else
6017 ths->g2 = ths->g1;
6018
6019#ifdef _OPENMP
6020#pragma omp critical (nfft_omp_critical_fftw_plan)
6021{
6022 FFTW(plan_with_nthreads)(nthreads);
6023#endif
6024 {
6025 int *_n = Y(malloc)((size_t)(ths->d) * sizeof(int));
6026
6027 for (t = 0; t < ths->d; t++)
6028 _n[t] = (int)(ths->n[t]);
6029
6030 ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
6031 ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
6032 Y(free)(_n);
6033 }
6034#ifdef _OPENMP
6035}
6036#endif
6037 }
6038
6039 if(ths->flags & NFFT_SORT_NODES)
6040 ths->index_x = (INT*) Y(malloc)(sizeof(INT) * 2U * (size_t)(ths->M_total));
6041 else
6042 ths->index_x = NULL;
6043
6044 ths->mv_trafo = (void (*) (void* ))X(trafo);
6045 ths->mv_adjoint = (void (*) (void* ))X(adjoint);
6046}
6047
6048void X(init)(X(plan) *ths, int d, int *N, int M_total)
6049{
6050 INT t; /* index over all dimensions */
6051
6052 ths->d = (INT)d;
6053
6054 ths->N = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
6055
6056 for (t = 0; t < d; t++)
6057 ths->N[t] = (INT)N[t];
6058
6059 ths->M_total = (INT)M_total;
6060
6061 ths->n = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
6062
6063 for (t = 0; t < d; t++)
6064 ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
6065
6066 ths->m = WINDOW_HELP_ESTIMATE_m;
6067
6068 if (d > 1)
6069 {
6070#ifdef _OPENMP
6071 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
6072 FFTW_INIT | NFFT_SORT_NODES |
6073 NFFT_OMP_BLOCKWISE_ADJOINT;
6074#else
6075 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
6076 FFTW_INIT | NFFT_SORT_NODES;
6077#endif
6078 }
6079 else
6080 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
6082
6083 ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
6084
6085 ths->K = 0;
6086 init_help(ths);
6087}
6088
6089void X(init_guru)(X(plan) *ths, int d, int *N, int M_total, int *n, int m,
6090 unsigned flags, unsigned fftw_flags)
6091{
6092 INT t; /* index over all dimensions */
6093
6094 ths->d = (INT)d;
6095 ths->M_total = (INT)M_total;
6096 ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
6097
6098 for (t = 0; t < d; t++)
6099 ths->N[t] = (INT)N[t];
6100
6101 ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
6102
6103 for (t = 0; t < d; t++)
6104 ths->n[t] = (INT)n[t];
6105
6106 ths->m = (INT)m;
6107
6108 ths->flags = flags;
6109 ths->fftw_flags = fftw_flags;
6110
6111 ths->K = 0;
6112 init_help(ths);
6113}
6114
6115void X(init_lin)(X(plan) *ths, int d, int *N, int M_total, int *n, int m, int K,
6116 unsigned flags, unsigned fftw_flags)
6117{
6118 INT t; /* index over all dimensions */
6119
6120 ths->d = (INT)d;
6121 ths->M_total = (INT)M_total;
6122 ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
6123
6124 for (t = 0; t < d; t++)
6125 ths->N[t] = (INT)N[t];
6126
6127 ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
6128
6129 for (t = 0; t < d; t++)
6130 ths->n[t] = (INT)n[t];
6131
6132 ths->m = (INT)m;
6133
6134 ths->flags = flags;
6135 ths->fftw_flags = fftw_flags;
6136
6137 ths->K = K;
6138 init_help(ths);
6139}
6140
6141void X(init_1d)(X(plan) *ths, int N1, int M_total)
6142{
6143 int N[1];
6144
6145 N[0] = N1;
6146
6147 X(init)(ths, 1, N, M_total);
6148}
6149
6150void X(init_2d)(X(plan) *ths, int N1, int N2, int M_total)
6151{
6152 int N[2];
6153
6154 N[0] = N1;
6155 N[1] = N2;
6156 X(init)(ths, 2, N, M_total);
6157}
6158
6159void X(init_3d)(X(plan) *ths, int N1, int N2, int N3, int M_total)
6160{
6161 int N[3];
6162
6163 N[0] = N1;
6164 N[1] = N2;
6165 N[2] = N3;
6166 X(init)(ths, 3, N, M_total);
6167}
6168
6169const char* X(check)(X(plan) *ths)
6170{
6171 INT j;
6172
6173 if (!ths->f)
6174 return "Member f not initialized.";
6175
6176 if (!ths->x)
6177 return "Member x not initialized.";
6178
6179 if (!ths->f_hat)
6180 return "Member f_hat not initialized.";
6181
6182 if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total)
6183 return "Number of nodes too small to use PRE_LIN_PSI.";
6184
6185 for (j = 0; j < ths->M_total * ths->d; j++)
6186 {
6187 if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
6188 {
6189 return "ths->x out of range [-0.5,0.5)";
6190 }
6191 }
6192
6193 for (j = 0; j < ths->d; j++)
6194 {
6195 if (ths->sigma[j] <= 1)
6196 return "Oversampling factor too small";
6197
6198 /* Automatically calls trafo_direct if
6199 if(ths->N[j] <= ths->m)
6200 return "Polynomial degree N is <= cut-off m";
6201 */
6202
6203 if(ths->N[j]%2 == 1)
6204 return "polynomial degree N has to be even";
6205 }
6206 return 0;
6207}
6208
6209void X(finalize)(X(plan) *ths)
6210{
6211 INT t; /* index over dimensions */
6212
6213 if(ths->flags & NFFT_SORT_NODES)
6214 Y(free)(ths->index_x);
6215
6216 if(ths->flags & FFTW_INIT)
6217 {
6218#ifdef _OPENMP
6219 #pragma omp critical (nfft_omp_critical_fftw_plan)
6220#endif
6221 FFTW(destroy_plan)(ths->my_fftw_plan2);
6222#ifdef _OPENMP
6223 #pragma omp critical (nfft_omp_critical_fftw_plan)
6224#endif
6225 FFTW(destroy_plan)(ths->my_fftw_plan1);
6226
6227 if(ths->flags & FFT_OUT_OF_PLACE)
6228 Y(free)(ths->g2);
6229
6230 Y(free)(ths->g1);
6231 }
6232
6233 if(ths->flags & PRE_FULL_PSI)
6234 {
6235 Y(free)(ths->psi_index_g);
6236 Y(free)(ths->psi_index_f);
6237 Y(free)(ths->psi);
6238 }
6239
6240 if(ths->flags & PRE_PSI)
6241 Y(free)(ths->psi);
6242
6243 if(ths->flags & PRE_FG_PSI)
6244 Y(free)(ths->psi);
6245
6246 if(ths->flags & PRE_LIN_PSI)
6247 Y(free)(ths->psi);
6248
6249 if(ths->flags & PRE_PHI_HUT)
6250 {
6251 for (t = 0; t < ths->d; t++)
6252 Y(free)(ths->c_phi_inv[t]);
6253 Y(free)(ths->c_phi_inv);
6254 }
6255
6256 if(ths->flags & MALLOC_F)
6257 Y(free)(ths->f);
6258
6259 if(ths->flags & MALLOC_F_HAT)
6260 Y(free)(ths->f_hat);
6261
6262 if(ths->flags & MALLOC_X)
6263 Y(free)(ths->x);
6264
6265 WINDOW_HELP_FINALIZE;
6266
6267 Y(free)(ths->sigma);
6268 Y(free)(ths->n);
6269 Y(free)(ths->N);
6270}
#define FG_PSI
Definition nfft3.h:188
#define MALLOC_F_HAT
Definition nfft3.h:194
#define MALLOC_X
Definition nfft3.h:193
#define PRE_FULL_PSI
Definition nfft3.h:192
#define FFT_OUT_OF_PLACE
Definition nfft3.h:196
#define PRE_PSI
Definition nfft3.h:191
#define PRE_FG_PSI
Definition nfft3.h:190
#define MALLOC_F
Definition nfft3.h:195
#define PRE_LIN_PSI
Definition nfft3.h:189
#define FFTW_INIT
Definition nfft3.h:197
#define PRE_PHI_HUT
Definition nfft3.h:187
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function.
Definition infft.h:1400
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Definition infft.h:1319
Internal header file for auxiliary definitions and functions.
Header file for the nfft3 library.