/* This file is part of the ducc FFT library Copyright (C) 2010-2023 Max-Planck-Society Copyright (C) 2019 Peter Bell For the odd-sized DCT-IV transforms: Copyright (C) 2003, 2007-14 Matteo Frigo Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology Authors: Martin Reinecke, Peter Bell */ /* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ /* All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this code; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef DUCC0_FFT1D_IMPL_H #define DUCC0_FFT1D_IMPL_H #include #include #include #include #include #include #include #include #include #include "ducc0/infra/useful_macros.h" #include "ducc0/math/cmplx.h" #include "ducc0/infra/error_handling.h" #include "ducc0/infra/aligned_array.h" #include "ducc0/infra/simd.h" #include "ducc0/infra/threading.h" #include "ducc0/math/unity_roots.h" #include "ducc0/fft/fft.h" namespace ducc0 { namespace detail_fft { using namespace std; // the next line is necessary to address some sloppy name choices in AdaptiveCpp using std::min, std::max; template constexpr inline size_t fft1d_simdlen = min(8, native_simd::size()); template<> constexpr inline size_t fft1d_simdlen = min(4, native_simd::size()); template<> constexpr inline size_t fft1d_simdlen = min(8, native_simd::size()); template using fft1d_simd = typename simd_select>::type; template constexpr inline bool fft1d_simd_exists = (fft1d_simdlen > 1); // Always use std:: for functions template T cos(T) = delete; template T sin(T) = delete; template T sqrt(T) = delete; template void ROTX90(Cmplx &a) { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? 
a.i : -a.i; a.i=tmp_; } #define POCKETFFT_EXEC_DISPATCH \ virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ bool fwd, size_t nthreads=1) const \ { \ static const auto tics = tidx(); \ if (ti==tics) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ if constexpr (fft1d_simdlen > 1) \ if constexpr (simd_exists>) \ { \ using Tfv = typename simd_select>::type; \ using Tcv = Cmplx; \ static const auto ticv = tidx(); \ if (ti==ticv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 2) \ if constexpr (simd_exists/2>) \ { \ using Tfv = typename simd_select/2>::type; \ using Tcv = Cmplx; \ static const auto ticv = tidx(); \ if (ti==ticv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 4) \ if constexpr (simd_exists/4>) \ { \ using Tfv = typename simd_select/4>::type; \ using Tcv = Cmplx; \ static const auto ticv = tidx(); \ if (ti==ticv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 8) \ if constexpr (simd_exists/8>) \ { \ using Tfv = typename simd_select/8>::type; \ using Tcv = Cmplx; \ static const auto ticv = tidx(); \ if (ti==ticv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? 
exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ MR_fail("impossible vector length requested"); \ } /* cfftp1: trivial pass object. bufsize() returns 0, needs_copy() returns false, and exec() ignores every argument except in, which it hands back unchanged. */ template class cfftp1: public cfftpass { public: cfftp1() {} virtual size_t bufsize() const { return 0; } virtual bool needs_copy() const { return false; } virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const { return in; } }; /* cfftp2: pass class with compile-time radix ip=2. Members: l1 and ido (extents used by the CH/CC index lambdas) and wa, which the constructor sizes to (ip-1)*(ido-1); WA(i) reads wa[i-1]. NOTE(review): in this copy of the file the text after many opening angle brackets seems to have been lost (template parameter lists, loop conditions and bodies, the end of this class), so the code is incomplete as shown - restore it from the pristine file before relying on it. */ template class cfftp2: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=2; aligned_array wa; auto WA(size_t i) const { return wa[i-1]; } /* exec_: reads from cc, writes to ch and returns ch; buf and nthreads are unused. Two index schemes are visible: ch[b+l1*c] with cc[b+ip*c] under ido==1, and ch[a+ido*(b+l1*c)] with cc[a+ido*(b+ip*c)] further down. */ template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { if (ido==1) { auto CH = [ch,this](size_t b, size_t c) -> Tcd& { return ch[b+l1*c]; }; auto CC = [cc](size_t b, size_t c) -> const Tcd& { return cc[b+ip*c]; }; for (size_t k=0; k Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; for (size_t k=0; k(CC(i,0,k)-CC(i,1,k),WA(i),CH(i,k,1)); } } return ch; } } public: /* Constructor: stores l1 and ido, allocates wa, computes N=ip*l1*ido and rfct=roots->size()/N, and asserts ("mismatch") that roots->size() is an exact multiple of N. */ cfftp2(size_t l1_, size_t ido_, const Troots &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i /* NOTE(review): the rest of the constructor loop and the end of cfftp2 appear to be missing here. cfftp3 below has the same member layout with ip=3; its WA(x,i) reads wa[x+(i-1)*(ip-1)]. */ class cfftp3: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=3; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tw1r=-0.5, tw1i= (fwd ? 
-1: 1) * Tfs(0.8660254037844386467637231707529362L); auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; #define POCKETFFT_PREP3(idx) \ Tcd t0 = CC(idx,0,k), t1, t2; \ PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ CH(idx,k,0)=t0+t1; #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ { \ Tcd ca=t0+t1*twr; \ Tcd cb{-t2.i*twi, t2.r*twi}; \ PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ } #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ { \ Tcd ca=t0+t1*twr; \ Tcd cb{-t2.i*twi, t2.r*twi}; \ special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ } if (ido==1) for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp4: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=4; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { if (ido==1) { auto CH = [ch,this](size_t b, size_t c) -> Tcd& { return ch[b+l1*c]; }; auto CC = [cc](size_t b, size_t c) -> const Tcd& { return cc[b+ip*c]; }; for (size_t k=0; k(t4); PM(CH(k,0),CH(k,2),t2,t3); PM(CH(k,1),CH(k,3),t1,t4); } } else { auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; for (size_t k=0; k(t4); PM(CH(0,k,0),CH(0,k,2),t2,t3); PM(CH(0,k,1),CH(0,k,3),t1,t4); } for (size_t i=1; i(t4); CH(i,k,0) = t2+t3; special_mul(t1+t4,WA(0,i),CH(i,k,1)); special_mul(t2-t3,WA(1,i),CH(i,k,2)); special_mul(t1-t4,WA(2,i),CH(i,k,3)); } } } return ch; } public: cfftp4(size_t l1_, size_t ido_, const Troots &roots) : l1(l1_), ido(ido_), 
wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp5: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=5; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tw1r= Tfs(0.3090169943749474241022934171828191L), tw1i= (fwd ? -1: 1) * Tfs(0.9510565162951535721164393333793821L), tw2r= Tfs(-0.8090169943749474241022934171828191L), tw2i= (fwd ? -1: 1) * Tfs(0.5877852522924731291687059546390728L); auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; #define POCKETFFT_PREP5(idx) \ Tcd t0 = CC(idx,0,k), t1, t2, t3, t4; \ PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ CH(idx,k,0).r=t0.r+t1.r+t2.r; \ CH(idx,k,0).i=t0.i+t1.i+t2.i; #define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ { \ Tcd ca,cb; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ } #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ { \ Tcd ca,cb,da,db; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ } if (ido==1) for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp7: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=7; aligned_array wa; auto WA(size_t x, size_t i) const { 
return wa[x+(i-1)*(ip-1)]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tw1r= Tfs(0.6234898018587335305250048840042398L), tw1i= (fwd ? -1 : 1) * Tfs(0.7818314824680298087084445266740578L), tw2r= Tfs(-0.2225209339563144042889025644967948L), tw2i= (fwd ? -1 : 1) * Tfs(0.9749279121818236070181316829939312L), tw3r= Tfs(-0.9009688679024191262361023195074451L), tw3i= (fwd ? -1 : 1) * Tfs(0.433883739117558120475768332848359L); auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; #define POCKETFFT_PREP7(idx) \ Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; #define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ { \ Tcd ca,cb; \ ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ cb.i=y1*t7.r y2*t6.r y3*t5.r; \ cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ PM(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) #define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ { \ Tcd da,db; \ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ } if (ido==1) for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp8: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=8; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template void ROTX45(T &a) const { constexpr Tfs 
hsqt2=Tfs(0.707106781186547524400844362104849L); if constexpr (fwd) { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); } else { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); } } template void ROTX135(T &a) const { constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); if constexpr (fwd) { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); } else { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); } } template Tcd *exec_ (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { if (l1==1) { auto CC = [cc,this](size_t a, size_t b) -> Tcd& { return cc[a+ido*b]; }; { Tcd a0, a1, a2, a3, a4, a5, a6, a7; PM(a1,a5,CC(0,1),CC(0,5)); PM(a3,a7,CC(0,3),CC(0,7)); PMINPLACE(a1,a3); ROTX90(a3); ROTX90(a7); PMINPLACE(a5,a7); ROTX45(a5); ROTX135(a7); PM(a0,a4,CC(0,0),CC(0,4)); PM(a2,a6,CC(0,2),CC(0,6)); PM(CC(0,0),CC(0,4),a0+a2,a1); PM(CC(0,2),CC(0,6),a0-a2,a3); ROTX90(a6); PM(CC(0,1),CC(0,5),a4+a6,a5); PM(CC(0,3),CC(0,7),a4-a6,a7); } for (size_t i=1; i(a7); PMINPLACE(a1,a3); ROTX90(a3); PMINPLACE(a5,a7); ROTX45(a5); ROTX135(a7); PM(a0,a4,CC(i,0),CC(i,4)); PM(a2,a6,CC(i,2),CC(i,6)); PMINPLACE(a0,a2); CC(i,0) = a0+a1; special_mul(a0-a1,WA(3,i),CC(i,4)); special_mul(a2+a3,WA(1,i),CC(i,2)); special_mul(a2-a3,WA(5,i),CC(i,6)); ROTX90(a6); PMINPLACE(a4,a6); special_mul(a4+a5,WA(0,i),CC(i,1)); special_mul(a4-a5,WA(4,i),CC(i,5)); special_mul(a6+a7,WA(2,i),CC(i,3)); special_mul(a6-a7,WA(6,i),CC(i,7)); } return cc; } if (ido==1) { auto CH = [ch,this](size_t b, size_t c) -> Tcd& { return ch[b+l1*c]; }; auto CC = [cc](size_t b, size_t c) -> const Tcd& { return cc[b+ip*c]; }; for (size_t k=0; k(a3); ROTX90(a7); PMINPLACE(a5,a7); ROTX45(a5); ROTX135(a7); PM(a0,a4,CC(0,k),CC(4,k)); PM(a2,a6,CC(2,k),CC(6,k)); PM(CH(k,0),CH(k,4),a0+a2,a1); PM(CH(k,2),CH(k,6),a0-a2,a3); ROTX90(a6); PM(CH(k,1),CH(k,5),a4+a6,a5); PM(CH(k,3),CH(k,7),a4-a6,a7); } } else { auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { 
return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; for (size_t k=0; k(a3); ROTX90(a7); PMINPLACE(a5,a7); ROTX45(a5); ROTX135(a7); PM(a0,a4,CC(0,0,k),CC(0,4,k)); PM(a2,a6,CC(0,2,k),CC(0,6,k)); PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); ROTX90(a6); PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); } for (size_t i=1; i(a7); PMINPLACE(a1,a3); ROTX90(a3); PMINPLACE(a5,a7); ROTX45(a5); ROTX135(a7); PM(a0,a4,CC(i,0,k),CC(i,4,k)); PM(a2,a6,CC(i,2,k),CC(i,6,k)); PMINPLACE(a0,a2); CH(i,k,0) = a0+a1; special_mul(a0-a1,WA(3,i),CH(i,k,4)); special_mul(a2+a3,WA(1,i),CH(i,k,2)); special_mul(a2-a3,WA(5,i),CH(i,k,6)); ROTX90(a6); PMINPLACE(a4,a6); special_mul(a4+a5,WA(0,i),CH(i,k,1)); special_mul(a4-a5,WA(4,i),CH(i,k,5)); special_mul(a6+a7,WA(2,i),CH(i,k,3)); special_mul(a6-a7,WA(6,i),CH(i,k,7)); } } } return ch; } public: cfftp8(size_t l1_, size_t ido_, const Troots &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i1; } POCKETFFT_EXEC_DISPATCH }; template class cfftp11: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=11; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template [[gnu::hot]] Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tw1r= Tfs(0.8412535328311811688618116489193677L), tw1i= (fwd ? -1 : 1) * Tfs(0.5406408174555975821076359543186917L), tw2r= Tfs(0.4154150130018864255292741492296232L), tw2i= (fwd ? -1 : 1) * Tfs(0.9096319953545183714117153830790285L), tw3r= Tfs(-0.1423148382732851404437926686163697L), tw3i= (fwd ? -1 : 1) * Tfs(0.9898214418809327323760920377767188L), tw4r= Tfs(-0.6548607339452850640569250724662936L), tw4i= (fwd ? 
-1 : 1) * Tfs(0.7557495743542582837740358439723444L), tw5r= Tfs(-0.9594929736144973898903680570663277L), tw5i= (fwd ? -1 : 1) * Tfs(0.2817325568414296977114179153466169L); auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; #define POCKETFFT_PREP11(idx) \ Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; #define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ { \ Tcd ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ cb; \ cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ PM(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) #define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ { \ Tcd da,db; \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ } if (ido==1) for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftpg: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; size_t ip; aligned_array wa; aligned_array csarr; auto WA(size_t x, size_t i) const { return wa[i-1+x*(ido-1)]; } template Tcd *exec_ (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { size_t ipph = (ip+1)/2; size_t idl1 = ido*l1; auto CH = [ch,this](size_t a, 
size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; auto CX = [cc,this](size_t a, size_t b, size_t c) -> Tcd& { return cc[a+ido*(b+l1*c)]; }; auto CX2 = [cc, idl1](size_t a, size_t b) -> Tcd& { return cc[a+idl1*b]; }; auto CH2 = [ch, idl1](size_t a, size_t b) -> const Tcd& { return ch[a+idl1*b]; }; for (size_t k=0; kip) iwal-=ip; Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; iwal+=l; if (iwal>ip) iwal-=ip; Tcs xwal2=fwd ? csarr[iwal].conj() : csarr[iwal]; for (size_t ik=0; ikip) iwal-=ip; Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; for (size_t ik=0; ik(x1,wa[idij],CX(i,k,j)); idij=(jc-1)*(ido-1)+i-1; special_mul(x2,wa[idij],CX(i,k,jc)); } } } return cc; } public: cfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots &roots) : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(ip) { MR_assert((ip&1)&&(ip>=5), "need an odd number >=5"); size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class cfftpblue: public cfftpass { private: using typename cfftpass::Tcs; const size_t l1, ido, ip; const size_t ip2; const Tcpass subplan; aligned_array wa, bk, bkf; size_t bufsz; bool need_cpy; auto WA(size_t x, size_t i) const { return wa[i-1+x*(ido-1)]; } template Tcd *exec_ (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * DUCC0_RESTRICT buf, size_t nthreads) const { static const auto ti=tidx(); Tcd *akf = &buf[0]; Tcd *akf2 = subplan->needs_copy() ? (&buf[ip2]) : akf; Tcd *subbuf = akf2+ip2; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tcd& { return cc[a+ido*(b+ip*c)]; }; //FIXME: parallelize here? 
for (size_t k=0; k(CC(i,m,k),bk[m],akf[m]); auto zero = akf[0]*Tfs(0); for (size_t m=ip; m(subplan->exec(ti,akf,akf2, subbuf, true, nthreads)); /* do the convolution */ res[0] = res[0].template special_mul(bkf[0]); for (size_t m=1; m<(ip2+1)/2; ++m) { res[m] = res[m].template special_mul(bkf[m]); res[ip2-m] = res[ip2-m].template special_mul(bkf[m]); } if ((ip2&1)==0) res[ip2/2] = res[ip2/2].template special_mul(bkf[ip2/2]); /* inverse FFT */ res = static_cast(subplan->exec(ti, res, (res==akf) ? akf2 : akf, subbuf, false, nthreads)); /* multiply by b_k and write to output buffer */ if (l1>1) { if (i==0) for (size_t m=0; m(bk[m]); else { CH(i,k,0) = res[0].template special_mul(bk[0]); for (size_t m=1; m(bk[m]*WA(m-1,i)); } } else { if (i==0) for (size_t m=0; m(bk[m]); else { CC(i,0,0) = res[0].template special_mul(bk[0]); for (size_t m=1; m(bk[m]*WA(m-1,i)); } } } return (l1>1) ? ch : cc; } public: cfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots &roots, bool vectorize=false) : l1(l1_), ido(ido_), ip(ip_), ip2(util1d::good_size_cmplx(ip*2-1)), subplan(cfftpass::make_pass(ip2, vectorize)), wa((ip-1)*(ido-1)), bk(ip), bkf(ip2/2+1) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; jsize()/(2*ip))*2*ip==roots->size()) ? roots : make_shared>(2*ip); size_t rfct2 = roots2->size()/(2*ip); for (size_t m=1; m=2*ip) coeff-=2*ip; bk[m] = (*roots2)[coeff*rfct2]; } /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. 
*/ aligned_array tbkf(ip2), tbkf2(ip2); Tfs xn2 = Tfs(1)/Tfs(ip2); tbkf[0] = bk[0]*xn2; for (size_t m=1; m buf(subplan->bufsize()); static const auto tics=tidx(); auto res = static_cast(subplan->exec(tics, tbkf.data(), tbkf2.data(), buf.data(), true)); for (size_t i=0; i1; bufsz = ip2*(1+subplan->needs_copy()) + subplan->bufsize(); } virtual size_t bufsize() const { return bufsz; } virtual bool needs_copy() const { return need_cpy; } POCKETFFT_EXEC_DISPATCH }; template class cfft_multipass: public cfftpass { private: using typename cfftpass::Tcs; static constexpr size_t bunchsize=8; const size_t l1, ido; size_t ip; vector> passes; size_t bufsz; bool need_cpy; size_t rfct; Troots myroots; // FIXME split into sub-functions. This is too long! template Cmplx *exec_(Cmplx *cc, Cmplx *ch, Cmplx *buf, size_t nthreads) const { using Tc = Cmplx; if ((l1==1) && (ido==1)) // no chance at vectorizing { static const auto tic=tidx(); Tc *p1=cc, *p2=ch; for(const auto &pass: passes) { auto res = static_cast(pass->exec(tic, p1, p2, buf, fwd, nthreads)); if (res==p2) swap (p1,p2); } return p1; } else { if constexpr(is_same::value && fft1d_simd_exists) // we can vectorize! 
{ using Tfv = fft1d_simd; using Tcv = Cmplx; constexpr size_t vlen = Tfv::size(); size_t nvtrans = (l1*ido + vlen-1)/vlen; // NOTE: removed "static" here, because it leads to trouble with gcc 7 // static const type_index ticv = tidx(); const type_index ticv = tidx(); if (ido==1) { auto CH = [ch,this](size_t b, size_t c) -> Tc& { return ch[b+l1*c]; }; auto CC = [cc,this](size_t b, size_t c) -> Tc& { return cc[b+ip*c]; }; execStatic(nvtrans, nthreads, 0, [&](auto &sched) { aligned_array tbuf(2*ip+32+bufsize()); auto cc2 = &tbuf[0]; auto ch2 = &tbuf[ip+16]; auto buf2 = &tbuf[2*ip+32]; while (auto rng=sched.getNext()) for(auto itrans=rng.lo; itrans(pass->exec(ticv, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } for (size_t m=0; m Tc& { return cc[a+ido*b]; }; execStatic(nvtrans, nthreads, 0, [&](auto &sched) { aligned_array tbuf(2*ip+32+bufsize()); auto cc2 = &tbuf[0]; auto ch2 = &tbuf[ip+16]; auto buf2 = &tbuf[2*ip+32]; while (auto rng=sched.getNext()) for(auto itrans=rng.lo; itrans(pass->exec(ticv, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } for (size_t m=0; m= ido) break; if (i==0) CC(0,m) = { p1[m].r[n], p1[m].i[n] }; else { if (m==0) CC(i,0) = { p1[0].r[n], p1[0].i[n] } ; else CC(i,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*m*i]); } } } }); return cc; } MR_fail("must not get here"); #if 0 //FIXME this code path is currently unused aligned_array tbuf(2*ip+bufsize()); auto cc2 = &tbuf[0]; auto ch2 = &tbuf[ip]; auto buf2 = &tbuf[2*ip]; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& { return cc[a+ido*(b+ip*c)]; }; //FIXME parallelize? 
for (size_t itrans=0; itrans ix, kx; size_t ixcur = (itrans*vlen)%ido; size_t kxcur = (itrans*vlen)/ido; for (size_t n=0; n(pass->exec(ticv, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } for (size_t m=0; m= l1*ido) break; if (i==0) CH(0,k,m) = { p1[m].r[n], p1[m].i[n] }; else { if (m==0) CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ; else CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*l1*m*i]); } } } return ch; #endif } else { static const auto tic = tidx *>(); if (ido==1) { // parallelize here! for (size_t n=0; n *p1=&cc[n*ip], *p2=ch; Cmplx *res = nullptr; for(const auto &pass: passes) { res = static_cast *>(pass->exec(tic, p1, p2, buf, fwd)); if (res==p2) swap (p1,p2); } if (res != &cc[n*ip]) copy(res, res+ip, cc+n*ip); } // transpose size_t nbunch = (l1*ido + bunchsize-1)/bunchsize; // parallelize here! for (size_t ibunch=0; ibunch Tc& { return cc[a+ido*b]; }; // parallelize here! for (size_t ibunch=0; ibunch *p1=&cc2[n*ip], *p2=ch2; Cmplx *res = nullptr; for(const auto &pass: passes) { res = static_cast *>(pass->exec(tic, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } if (res==&cc2[n*ip]) // no copying necessary { if (i!=0) { for (size_t m=1; m((*myroots)[rfct*m*i]); } } else { if (i==0) for (size_t m=0; m((*myroots)[rfct*m*i]); } } } for (size_t m=0; m Tc& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& { return cc[a+ido*(b+ip*c)]; }; // parallelize here! 
for (size_t ibunch=0; ibunch ix, kx; size_t ixcur = (ibunch*bunchsize)%ido; size_t kxcur = (ibunch*bunchsize)/ido; for (size_t n=0; n *p1=&cc2[n*ip], *p2=ch2; Cmplx *res = nullptr; for(const auto &pass: passes) { res = static_cast *>(pass->exec(tic, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } if (res==&cc2[n*ip]) // no copying necessary { if (i!=0) { for (size_t m=1; m((*myroots)[rfct*l1*m*i]); } } else { if (i==0) for (size_t m=0; m((*myroots)[rfct*l1*m*i]); } } } for (size_t m=0; m &roots, bool /*vectorize*/=false) : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), myroots(roots) { size_t N=ip*l1*ido; rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); // FIXME TBD // do we need the vectorize flag at all? size_t lim = 10000; //vectorize ? 10000 : 10000; if (ip<=lim) { auto factors = cfftpass::factorize(ip); size_t l1l=1; for (auto fct: factors) { passes.push_back(cfftpass::make_pass(l1l, ip/(fct*l1l), fct, roots, false)); l1l*=fct; } } else { vector packets(2,1); auto factors = util1d::prime_factors(ip); sort(factors.begin(), factors.end(), std::greater()); for (auto fct: factors) (packets[0]>packets[1]) ? 
packets[1]*=fct : packets[0]*=fct; size_t l1l=1; for (auto pkt: packets) { passes.push_back(cfftpass::make_pass(l1l, ip/(pkt*l1l), pkt, roots, false)); l1l*=pkt; } } for (const auto &pass: passes) { bufsz = max(bufsz, pass->bufsize()); need_cpy |= pass->needs_copy(); } if ((l1!=1)||(ido!=1)) { need_cpy=true; bufsz += (bunchsize+1)*ip; } } virtual size_t bufsize() const { return bufsz; } virtual bool needs_copy() const { return need_cpy; } POCKETFFT_EXEC_DISPATCH }; #undef POCKETFFT_EXEC_DISPATCH template class cfftp_vecpass: public cfftpass { private: static_assert(simd_exists, "bad vlen"); using typename cfftpass::Tcs; using Tfv=typename simd_select::type; using Tcv=Cmplx; size_t ip; Tcpass spass; Tcpass vpass; size_t bufsz; template Tcs *exec_ (Tcs *cc, Tcs * /*ch*/, Tcs *sbuf, size_t nthreads) const { char *xbuf = reinterpret_cast(sbuf); size_t misalign = reinterpret_cast(xbuf)&(sizeof(Tfv)-1); if (misalign != 0) xbuf += sizeof(Tfv)-misalign; Tcv *buf = reinterpret_cast(xbuf); auto * cc2 = buf; auto * ch2 = buf+ip/vlen+7; auto * buf2 = buf+2*ip/vlen+7+7; static const auto tics = tidx(); // run scalar pass auto res = static_cast(spass->exec(tics, cc, reinterpret_cast(ch2), reinterpret_cast(buf2), fwd, nthreads)); // arrange input in SIMD-friendly way, must be done out-of-place for (size_t i=0; i(); auto res2 = static_cast(vpass->exec(ticv, cc2, ch2, buf2, fwd, nthreads)); // de-SIMDify, can be done pseudo-inplace for (size_t i=0; i &roots) : ip(ip_), spass(cfftpass::make_pass(1, ip/vlen, vlen, roots)), vpass(cfftpass::make_pass(1, 1, ip/vlen, roots)), bufsz(0) { MR_assert((ip/vlen)*vlen==ip, "cannot vectorize this size"); bufsz = 2*(ip/vlen)+7+7; bufsz += max(vpass->bufsize(),(spass->bufsize()+vlen-1)/vlen); // buffers for subpasses bufsz *= vlen; // since we specify in terms of Tcs bufsz += vlen; // wiggle room for alignment shifts } virtual size_t bufsize() const { return bufsz; } virtual bool needs_copy() const { return false; } virtual void *exec(const 
type_index &ti, void *in, void *copy, void *buf, bool fwd, size_t nthreads=1) const { /* accepts exactly one type index (tics); any other ti trips the "bad input type" assertion. The fwd flag then selects between the two exec_ calls. */ static const auto tics = tidx(); MR_assert(ti==tics, "bad input type"); auto in1 = static_cast(in); auto copy1 = static_cast(copy); auto buf1 = static_cast(buf); return fwd ? exec_(in1, copy1, buf1, nthreads) : exec_(in1, copy1, buf1, nthreads); } }; /* make_pass: factory that returns a pass object for the given l1, ido and ip. Asserts ip>=1 ("no zero-sized FFTs"). Selection order as visible in the body: (1) when vectorize is set, ip is greater than 300 and at most 100000, l1==1 and ido==1, and ip is a multiple of vlen (fixed at 4) with a matching SIMD type available, an object built from (ip, roots) is returned; (2) ip==1 returns a default-constructed object; (3) when factorize(ip) yields exactly one factor, a switch on ip has dedicated cases for 2, 3, 4, 5, 7, 8 and 11, and its default case picks one type for ip below 110 and another (which additionally receives vectorize) otherwise; (4) with more than one factor an object built from (l1, ido, ip, roots, vectorize) is returned. NOTE(review): the concrete class names inside the make_shared calls and the template parameter list are missing in this copy of the file, so they are deliberately not named here - confirm against the pristine source. */ template Tcpass cfftpass::make_pass(size_t l1, size_t ido, size_t ip, const Troots &roots, bool vectorize) { MR_assert(ip>=1, "no zero-sized FFTs"); // do we have an 1D vectorizable FFT? if (vectorize && (ip>300)&& (ip<=100000) && (l1==1) && (ido==1)) { // constexpr auto vlen = native_simd::size(); // if constexpr(vlen>=4) constexpr auto vlen = 4; if constexpr(simd_exists) if ((ip&(vlen-1))==0) return make_shared>(ip, roots); } if (ip==1) return make_shared>(); auto factors=cfftpass::factorize(ip); if (factors.size()==1) { switch(ip) { case 2: return make_shared>(l1, ido, roots); case 3: return make_shared>(l1, ido, roots); case 4: return make_shared>(l1, ido, roots); case 5: return make_shared>(l1, ido, roots); case 7: return make_shared>(l1, ido, roots); case 8: return make_shared>(l1, ido, roots); case 11: return make_shared>(l1, ido, roots); default: if (ip<110) return make_shared>(l1, ido, ip, roots); else return make_shared>(l1, ido, ip, roots, vectorize); } } else // more than one factor, need a multipass return make_shared>(l1, ido, ip, roots, vectorize); } /* Second definition of POCKETFFT_EXEC_DISPATCH (the earlier one is removed with #undef further up). It generates an exec() override that compares ti with tifs first and then with tifv for a series of vector lengths, forwards to exec_ after casting in, copy and buf, and calls MR_fail("impossible vector length requested") when nothing matches. */ #define POCKETFFT_EXEC_DISPATCH \ virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ bool fwd, size_t nthreads) const \ { \ static const auto tifs=tidx(); \ if (ti==tifs) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? 
exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ if constexpr (fft1d_simdlen > 1) \ if constexpr (simd_exists>) \ { \ using Tfv = typename simd_select>::type; \ static const auto tifv=tidx(); \ if (ti==tifv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 2) \ if constexpr (simd_exists/2>) \ { \ using Tfv = typename simd_select/2>::type; \ static const auto tifv=tidx(); \ if (ti==tifv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 4) \ if constexpr (simd_exists/4>) \ { \ using Tfv = typename simd_select/4>::type; \ static const auto tifv=tidx(); \ if (ti==tifv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 8) \ if constexpr (simd_exists/8>) \ { \ using Tfv = typename simd_select/8>::type; \ static const auto tifv=tidx(); \ if (ti==tifv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? 
exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ MR_fail("impossible vector length requested"); \ } /* (a+ib) = conj(c+id) * (e+if) */ template inline void MULPM (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) { a=c*e+d*f; b=c*f-d*e; } template class rfftp1: public rfftpass { public: rfftp1() {} virtual size_t bufsize() const { return 0; } virtual bool needs_copy() const { return false; } virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const { return in; } }; template class rfftp2: public rfftpass { private: size_t l1, ido; static constexpr size_t ip=2; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { if constexpr(fwd) { auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+ip*c)]; }; for (size_t k=0; k const Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftp3: public rfftpass { private: size_t l1, ido; static constexpr size_t ip=3; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs taur=Tfs(-0.5), taui=Tfs(0.8660254037844386467637231707529362L); if constexpr(fwd) { auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+ip*c)]; }; for (size_t k=0; k const Tfd& { 
return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { MR_assert(ido&1, "ido must be odd"); size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftp4: public rfftpass { private: size_t l1, ido; static constexpr size_t ip=4; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { if constexpr(fwd) { constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+ip*c)]; }; for (size_t k=0; k const Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftp5: public rfftpass { private: size_t l1, ido; static constexpr size_t ip=5; aligned_array wa; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tr11= Tfs(0.3090169943749474241022934171828191L), ti11= Tfs(0.9510565162951535721164393333793821L), tr12= Tfs(-0.8090169943749474241022934171828191L), ti12= Tfs(0.5877852522924731291687059546390728L); if constexpr(fwd) { auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+ip*c)]; }; for (size_t k=0; k const Tfd& { return cc[a+ido*(b+ip*c)]; }; 
auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { MR_assert(ido&1, "ido must be odd"); size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftpg: public rfftpass { private: size_t l1, ido; size_t ip; aligned_array wa, csarr; template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { if constexpr(fwd) { size_t ipph=(ip+1)/2; size_t idl1 = ido*l1; auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> const Tfd& { return ch[a+ido*(b+l1*c)]; }; auto C1 = [cc,this] (size_t a, size_t b, size_t c) -> Tfd& { return cc[a+ido*(b+l1*c)]; }; auto C2 = [cc,idl1] (size_t a, size_t b) -> Tfd& { return cc[a+idl1*b]; }; auto CH2 = [ch,idl1] (size_t a, size_t b) -> Tfd& { return ch[a+idl1*b]; }; if (ido>1) { for (size_t j=1, jc=ip-1; j=ip) iang-=ip; Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; Tfs ar=csarr[2*iang], ai=csarr[2*iang+1]; for (size_t ik=0; ik const Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; auto C1 = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto C2 = [cc,idl1](size_t a, size_t b) -> Tfd& { return cc[a+idl1*b]; }; auto CH2 = [ch,idl1](size_t a, size_t b) -> Tfd& { return ch[a+idl1*b]; }; for (size_t k=0; kip) 
iang-=ip; Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; Tfs war=csarr[2*iang], wai=csarr[2*iang+1]; for (size_t ik=0; ik &roots) : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(2*ip) { MR_assert(ido&1, "ido must be odd"); size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftpblue: public rfftpass { private: const size_t l1, ido, ip; aligned_array wa; const Tcpass cplan; size_t bufsz; bool need_cpy; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * DUCC0_RESTRICT buf_, size_t nthreads) const { using Tcd = Cmplx; auto buf = reinterpret_cast