From 92ca9fc032ecaa7f9595f5c5f7d9d8df6c972bbe Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 19 May 2009 00:21:04 -0400
Subject: [PATCH 01/34] initial pass of FFT module -- includes complex 1-d case
 only

---
 unsupported/Eigen/FFT.h                       |  84 +++++
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 296 ++++++++++++++++++
 unsupported/test/CMakeLists.txt               |   1 +
 unsupported/test/FFT.cpp                      |  85 +++++
 4 files changed, 466 insertions(+)
 create mode 100644 unsupported/Eigen/FFT.h
 create mode 100644 unsupported/Eigen/src/FFT/simple_fft_traits.h
 create mode 100644 unsupported/test/FFT.cpp
diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT.h
new file mode 100644
index 000000000..03490d2c5
--- /dev/null
+++ b/unsupported/Eigen/FFT.h
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_FFT_H
+#define EIGEN_FFT_H
+
+// simple_fft_traits:  small, free, reasonably efficient default, derived from kissfft
+#include "src/FFT/simple_fft_traits.h"
+#define DEFAULT_FFT_TRAITS simple_fft_traits
+
+// FFTW: faster, GPL-not LGPL, bigger code size
+#ifdef FFTW_PATIENT  // definition of FFTW_PATIENT indicates the caller has included fftw3.h, we can use FFTW routines
+// TODO 
+// #include "src/FFT/fftw_traits.h"
+// #define DEFAULT_FFT_TRAITS fftw_traits
+#endif
+
+// intel Math Kernel Library: fastest, commerical
+#ifdef _MKL_DFTI_H_ // mkl_dfti.h has been included, we can use MKL FFT routines
+// TODO 
+// #include "src/FFT/imkl_traits.h"
+// #define DEFAULT_FFT_TRAITS imkl_traits
+#endif
+
+namespace Eigen {
+
+template <typename _Scalar,
+         typename _Traits=DEFAULT_FFT_TRAITS<_Scalar> 
+         >
+class FFT
+{
+  public:
+    typedef _Traits traits_type;
+    typedef typename traits_type::Scalar Scalar;
+    typedef typename traits_type::Complex Complex;
+
+    FFT(const traits_type & traits=traits_type() ) :m_traits(traits) { }
+
+    void fwd( Complex * dst, const Complex * src, int nfft)
+    {
+      m_traits.prepare(nfft,false,dst,src);
+      m_traits.exec(dst,src);
+      m_traits.postprocess(dst);
+    }
+
+    void inv( Complex * dst, const Complex * src, int nfft)
+    {
+      m_traits.prepare(nfft,true,dst,src);
+      m_traits.exec(dst,src);
+      m_traits.postprocess(dst);
+    }
+
+    // TODO: fwd,inv for Scalar
+    // TODO: multi-dimensional FFTs
+    // TODO: handle Eigen MatrixBase
+
+    traits_type & traits() {return m_traits;}
+  private:
+    traits_type m_traits;
+};
+#undef DEFAULT_FFT_TRAITS
+}
+#endif
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
new file mode 100644
index 000000000..fe9d24b84
--- /dev/null
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -0,0 +1,296 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include <complex>
+#include <vector>
+
+namespace Eigen {
+
+  template <typename _Scalar>
+  struct simple_fft_traits
+  {
+    typedef _Scalar Scalar;
+    typedef std::complex<Scalar> Complex;
+    simple_fft_traits() : m_nfft(0) {} 
+
+    void prepare(int nfft,bool inverse,Complex * dst,const Complex *src)
+    {
+      if (m_nfft == nfft) {
+        // reuse the twiddles, conjugate if necessary
+        if (inverse != m_inverse)
+          for (int i=0;i<nfft;++i)
+            m_twiddles[i] = conj( m_twiddles[i] );
+        m_inverse = inverse;
+        return;
+      }
+      m_nfft = nfft;
+      m_inverse = inverse;
+      m_twiddles.resize(nfft);
+      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+      for (int i=0;i<nfft;++i)
+        m_twiddles[i] = exp( Complex(0,i*phinc) );
+
+      m_stageRadix.resize(0);
+      m_stageRemainder.resize(0);
+      //factorize
+      //start factoring out 4's, then 2's, then 3,5,7,9,...
+      int n= nfft;
+      int p=4;
+      do {
+        while (n % p) {
+          switch (p) {
+            case 4: p = 2; break;
+            case 2: p = 3; break;
+            default: p += 2; break;
+          }
+          if (p*p>n)
+            p=n;// no more factors
+        }
+        n /= p;
+        m_stageRadix.push_back(p);
+        m_stageRemainder.push_back(n);
+      }while(n>1);
+    }
+
+    void exec(Complex * dst, const Complex * src)
+    {
+      work(0, dst, src, 1,1);
+    }
+
+    void postprocess(Complex *dst) 
+    {
+        if (m_inverse) {
+            Scalar scale = 1./m_nfft;
+            for (int k=0;k<m_nfft;++k)
+                dst[k] *= scale;
+        }
+    }
+  private:
+    void work( int stage,Complex * Fout, const Complex * f, size_t fstride,size_t in_stride)
+    {
+      int p = m_stageRadix[stage];
+      int m = m_stageRemainder[stage];
+      Complex * Fout_beg = Fout;
+      Complex * Fout_end = Fout + p*m;
+
+      if (m==1) {
+        do{
+          *Fout = *f;
+          f += fstride*in_stride;
+        }while(++Fout != Fout_end );
+      }else{
+        do{
+          // recursive call:
+          // DFT of size m*p performed by doing
+          // p instances of smaller DFTs of size m, 
+          // each one takes a decimated version of the input
+          work(stage+1, Fout , f, fstride*p,in_stride);
+          f += fstride*in_stride;
+        }while( (Fout += m) != Fout_end );
+      }
+
+      Fout=Fout_beg;
+
+      // recombine the p smaller DFTs 
+      switch (p) {
+        case 2: bfly2(Fout,fstride,m); break;
+        case 3: bfly3(Fout,fstride,m); break;
+        case 4: bfly4(Fout,fstride,m); break;
+        case 5: bfly5(Fout,fstride,m); break;
+        default: bfly_generic(Fout,fstride,m,p); break;
+      }
+    }
+
+    void bfly2( Complex * Fout, const size_t fstride, int m)
+    {
+      for (int k=0;k<m;++k) {
+        Complex t = Fout[m+k] * m_twiddles[k*fstride];
+        Fout[m+k] = Fout[k] - t;
+        Fout[k] += t;
+      }
+    }
+
+    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex scratch[7];
+      int negative_if_inverse = m_inverse * -2 +1;
+      for (size_t k=0;k<m;++k) {
+        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+        scratch[5] = Fout[k] - scratch[1];
+
+        Fout[k] += scratch[1];
+        scratch[3] = scratch[0] + scratch[2];
+        scratch[4] = scratch[0] - scratch[2];
+        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+        Fout[k+2*m]  = Fout[k] - scratch[3];
+        Fout[k] += scratch[3];
+        Fout[k+m] = scratch[5] + scratch[4];
+        Fout[k+3*m] = scratch[5] - scratch[4];
+      }
+    }
+
+    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      size_t k=m;
+      const size_t m2 = 2*m;
+      Complex *tw1,*tw2;
+      Complex scratch[5];
+      Complex epi3;
+      epi3 = m_twiddles[fstride*m];
+
+      tw1=tw2=&m_twiddles[0];
+
+      do{
+        scratch[1]=Fout[m] * *tw1;
+        scratch[2]=Fout[m2] * *tw2;
+
+        scratch[3]=scratch[1]+scratch[2];
+        scratch[0]=scratch[1]-scratch[2];
+        tw1 += fstride;
+        tw2 += fstride*2;
+
+        Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
+
+        scratch[0] *= epi3.imag();
+
+        *Fout += scratch[3];
+
+        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+
+        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+        ++Fout;
+      }while(--k);
+    }
+
+    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+      size_t u;
+      Complex scratch[13];
+      Complex * twiddles = &m_twiddles[0];
+      Complex *tw;
+      Complex ya,yb;
+      ya = twiddles[fstride*m];
+      yb = twiddles[fstride*2*m];
+
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      tw=twiddles;
+      for ( u=0; u<m; ++u ) {
+        scratch[0] = *Fout0;
+
+        scratch[1]  = *Fout1 * tw[u*fstride];
+        scratch[2]  = *Fout2 * tw[2*u*fstride];
+        scratch[3]  = *Fout3 * tw[3*u*fstride];
+        scratch[4]  = *Fout4 * tw[4*u*fstride];
+
+        scratch[7] = scratch[1] + scratch[4];
+        scratch[10] = scratch[1] - scratch[4];
+        scratch[8] = scratch[2] + scratch[3];
+        scratch[9] = scratch[2] - scratch[3];
+
+        *Fout0 +=  scratch[7];
+        *Fout0 +=  scratch[8];
+
+        scratch[5] = scratch[0] + Complex(
+            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
+            );
+
+        scratch[6] = Complex(
+            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+            );
+
+        *Fout1 = scratch[5] - scratch[6];
+        *Fout4 = scratch[5] + scratch[6];
+
+        scratch[11] = scratch[0] +
+          Complex(
+              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+              );
+
+        scratch[12] = Complex(
+            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+            );
+
+        *Fout2=scratch[11]+scratch[12];
+        *Fout3=scratch[11]-scratch[12];
+
+        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+    }
+
+    /* perform the butterfly for one stage of a mixed radix FFT */
+    void bfly_generic(
+        Complex * Fout,
+        const size_t fstride,
+        int m,
+        int p
+        )
+    {
+      int u,k,q1,q;
+      Complex * twiddles = &m_twiddles[0];
+      Complex t;
+      int Norig = m_nfft;
+      Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+
+      for ( u=0; u<m; ++u ) {
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          scratchbuf[q1] = Fout[ k  ];
+          k += m;
+        }
+
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          int twidx=0;
+          Fout[ k ] = scratchbuf[0];
+          for (q=1;q<p;++q ) {
+            twidx += fstride * k;
+            if (twidx>=Norig) twidx-=Norig;
+            t=scratchbuf[q] * twiddles[twidx];
+            Fout[ k ] += t;
+          }
+          k += m;
+        }
+      }
+    }
+
+    int m_nfft;
+    bool m_inverse;
+    std::vector<Complex> m_twiddles;
+    std::vector<int> m_stageRadix;
+    std::vector<int> m_stageRemainder;
+  };
+}
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index abfbb0185..49de7dfb2 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -19,3 +19,4 @@ ei_add_test(autodiff)
 ei_add_test(BVH)
 ei_add_test(matrixExponential)
 ei_add_test(alignedvector3)
+ei_add_test(FFT)
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
new file mode 100644
index 000000000..b13a7810f
--- /dev/null
+++ b/unsupported/test/FFT.cpp
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+#include <unsupported/Eigen/FFT.h>
+
+//#include <iostream>
+//#include <cstdlib>
+//#include <typeinfo>
+
+using namespace std;
+
+template <class T>
+void test_fft(int nfft)
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+
+    //cout << "type:" << typeid(T).name() << " nfft:" << nfft;
+
+    FFT<T> fft;
+
+    vector<Complex> inbuf(nfft);
+    vector<Complex> buf3(nfft);
+    vector<Complex> outbuf(nfft);
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= Complex( 
+                (T)(rand()/(double)RAND_MAX - .5),
+                (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( &outbuf[0] , &inbuf[0] ,nfft);
+    fft.inv( &buf3[0] , &outbuf[0] ,nfft);
+
+    long double totalpower=0;
+    long double difpower=0;
+    for (int k0=0;k0<nfft;++k0) {
+        complex<long double> acc = 0;
+        long double phinc = 2*k0* M_PIl / nfft;
+        for (int k1=0;k1<nfft;++k1) {
+            complex<long double> x(inbuf[k1].real(),inbuf[k1].imag()); 
+            acc += x * exp( complex<long double>(0,-k1*phinc) );
+        }
+        totalpower += norm(acc);
+        complex<long double> x(outbuf[k0].real(),outbuf[k0].imag()); 
+        complex<long double> dif = acc - x;
+        difpower += norm(dif);
+    }
+    long double rmse = sqrt(difpower/totalpower);
+    VERIFY( rmse < 1e-5 );// gross check
+
+    totalpower=0;
+    difpower=0;
+    for (int k=0;k<nfft;++k) {
+        totalpower += norm( inbuf[k] );
+        difpower += norm(inbuf[k] - buf3[k]);
+    }
+    rmse = sqrt(difpower/totalpower);
+    VERIFY( rmse < 1e-5 );// gross check
+}
+
+void test_FFT()
+{
+  CALL_SUBTEST(( test_fft<float>(32) )); CALL_SUBTEST(( test_fft<double>(32) )); CALL_SUBTEST(( test_fft<long double>(32) ));
+  CALL_SUBTEST(( test_fft<float>(1024) )); CALL_SUBTEST(( test_fft<double>(1024) )); CALL_SUBTEST(( test_fft<long double>(1024) ));
+  CALL_SUBTEST(( test_fft<float>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<double>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<long double>(2*3*4*5*7) ));
+}

From be1b2ad4ec1851aa990dc97337fc74e16edd1dd0 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 19 May 2009 00:26:19 -0400
Subject: [PATCH 02/34] removed unused code

---
 unsupported/test/FFT.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index b13a7810f..8347bb76b 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -25,10 +25,6 @@
 #include "main.h"
 #include <unsupported/Eigen/FFT.h>
 
-//#include <iostream>
-//#include <cstdlib>
-//#include <typeinfo>
-
 using namespace std;
 
 template <class T>
@@ -36,8 +32,6 @@ void test_fft(int nfft)
 {
     typedef typename Eigen::FFT<T>::Complex Complex;
 
-    //cout << "type:" << typeid(T).name() << " nfft:" << nfft;
-
     FFT<T> fft;
 
     vector<Complex> inbuf(nfft);

From 68cad98bc935e53102a9432560085b81c5766743 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 19 May 2009 00:34:38 -0400
Subject: [PATCH 03/34] indent level change

---
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index fe9d24b84..6fbbeac2e 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -80,13 +80,15 @@ namespace Eigen {
 
     void postprocess(Complex *dst) 
     {
-        if (m_inverse) {
-            Scalar scale = 1./m_nfft;
-            for (int k=0;k<m_nfft;++k)
-                dst[k] *= scale;
-        }
+      if (m_inverse) {
+        Scalar scale = 1./m_nfft;
+        for (int k=0;k<m_nfft;++k)
+          dst[k] *= scale;
+      }
     }
-  private:
+
+    private:
+
     void work( int stage,Complex * Fout, const Complex * f, size_t fstride,size_t in_stride)
     {
       int p = m_stageRadix[stage];

From 8b4afe3debb47bf15ea291a7f2d21d863d546536 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Fri, 22 May 2009 22:37:59 -0400
Subject: [PATCH 04/34] added non-optimized real forward fft (no inverse yet)

---
 unsupported/Eigen/FFT.h                       |  27 ++++-
 unsupported/Eigen/src/FFT/simple_fft_traits.h |  10 +-
 unsupported/test/FFT.cpp                      | 113 ++++++++++++------
 3 files changed, 106 insertions(+), 44 deletions(-)

diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT.h
index 03490d2c5..a1f87a609 100644
--- a/unsupported/Eigen/FFT.h
+++ b/unsupported/Eigen/FFT.h
@@ -57,21 +57,36 @@ class FFT
 
     FFT(const traits_type & traits=traits_type() ) :m_traits(traits) { }
 
-    void fwd( Complex * dst, const Complex * src, int nfft)
+    template <typename _Input>
+    void fwd( Complex * dst, const _Input * src, int nfft)
     {
       m_traits.prepare(nfft,false,dst,src);
       m_traits.exec(dst,src);
       m_traits.postprocess(dst);
     }
 
-    void inv( Complex * dst, const Complex * src, int nfft)
+    template <typename _Input>
+    void fwd( std::vector<Complex> & dst, const std::vector<_Input> & src) 
     {
-      m_traits.prepare(nfft,true,dst,src);
-      m_traits.exec(dst,src);
-      m_traits.postprocess(dst);
+        dst.resize( src.size() );
+        fwd( &dst[0],&src[0],src.size() );
+    }
+
+    template <typename _Output>
+    void inv( _Output * dst, const Complex * src, int nfft)
+    {
+        m_traits.prepare(nfft,true,dst,src);
+        m_traits.exec(dst,src);
+        m_traits.postprocess(dst);
+    }
+
+    template <typename _Output>
+    void inv( std::vector<_Output> & dst, const std::vector<Complex> & src) 
+    {
+        dst.resize( src.size() );
+        inv( &dst[0],&src[0],src.size() );
     }
 
-    // TODO: fwd,inv for Scalar
     // TODO: multi-dimensional FFTs
     // TODO: handle Eigen MatrixBase
 
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index 6fbbeac2e..5a910dd1f 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -34,7 +34,8 @@ namespace Eigen {
     typedef std::complex<Scalar> Complex;
     simple_fft_traits() : m_nfft(0) {} 
 
-    void prepare(int nfft,bool inverse,Complex * dst,const Complex *src)
+    template <typename _Src>
+    void prepare(int nfft,bool inverse,Complex * dst,const _Src *src)
     {
       if (m_nfft == nfft) {
         // reuse the twiddles, conjugate if necessary
@@ -73,7 +74,8 @@ namespace Eigen {
       }while(n>1);
     }
 
-    void exec(Complex * dst, const Complex * src)
+    template <typename _Src>
+    void exec(Complex * dst, const _Src * src)
     {
       work(0, dst, src, 1,1);
     }
@@ -89,7 +91,9 @@ namespace Eigen {
 
     private:
 
-    void work( int stage,Complex * Fout, const Complex * f, size_t fstride,size_t in_stride)
+   
+    template <typename _Src>
+    void work( int stage,Complex * Fout, const _Src * f, size_t fstride,size_t in_stride)
     {
       int p = m_stageRadix[stage];
       int m = m_stageRemainder[stage];
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 8347bb76b..ef03359e2 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -25,55 +25,98 @@
 #include "main.h"
 #include <unsupported/Eigen/FFT.h>
 
+
 using namespace std;
 
+template < typename T>
+complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
+
+complex<long double>  promote(float x) { return complex<long double>( x); }
+complex<long double>  promote(double x) { return complex<long double>( x); }
+complex<long double>  promote(long double x) { return complex<long double>( x); }
+    
+
+    template <typename T1,typename T2>
+    long double fft_rmse( const vector<T1> & fftbuf,const vector<T2> & timebuf)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        for (size_t k0=0;k0<fftbuf.size();++k0) {
+            complex<long double> acc = 0;
+            long double phinc = -2.*k0* M_PIl / timebuf.size();
+            for (size_t k1=0;k1<timebuf.size();++k1) {
+                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
+            }
+            totalpower += norm(acc);
+            complex<long double> x = promote(fftbuf[k0]); 
+            complex<long double> dif = acc - x;
+            difpower += norm(dif);
+            cerr << k0 << ":" << acc << " " <<  x << endl;
+        }
+        cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
+        return sqrt(difpower/totalpower);
+    }
+
+    template <typename T1,typename T2>
+    long double dif_rmse( const vector<T1> buf1,const vector<T2> buf2)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        size_t n = min( buf1.size(),buf2.size() );
+        for (size_t k=0;k<n;++k) {
+            totalpower += (norm( buf1[k] ) + norm(buf2[k]) )/2.;
+            difpower += norm(buf1[k] - buf2[k]);
+        }
+        return sqrt(difpower/totalpower);
+    }
+
 template <class T>
-void test_fft(int nfft)
+void test_scalar(int nfft)
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+    typedef typename Eigen::FFT<T>::Scalar Scalar;
+
+    FFT<T> fft;
+    vector<Scalar> inbuf(nfft);
+    vector<Complex> outbuf;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= (T)(rand()/(double)RAND_MAX - .5);
+    fft.fwd( outbuf,inbuf);
+    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+}
+
+template <class T>
+void test_complex(int nfft)
 {
     typedef typename Eigen::FFT<T>::Complex Complex;
 
     FFT<T> fft;
 
     vector<Complex> inbuf(nfft);
-    vector<Complex> buf3(nfft);
-    vector<Complex> outbuf(nfft);
+    vector<Complex> outbuf;
+    vector<Complex> buf3;
     for (int k=0;k<nfft;++k)
-        inbuf[k]= Complex( 
-                (T)(rand()/(double)RAND_MAX - .5),
-                (T)(rand()/(double)RAND_MAX - .5) );
-    fft.fwd( &outbuf[0] , &inbuf[0] ,nfft);
-    fft.inv( &buf3[0] , &outbuf[0] ,nfft);
+        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( outbuf , inbuf);
 
-    long double totalpower=0;
-    long double difpower=0;
-    for (int k0=0;k0<nfft;++k0) {
-        complex<long double> acc = 0;
-        long double phinc = 2*k0* M_PIl / nfft;
-        for (int k1=0;k1<nfft;++k1) {
-            complex<long double> x(inbuf[k1].real(),inbuf[k1].imag()); 
-            acc += x * exp( complex<long double>(0,-k1*phinc) );
-        }
-        totalpower += norm(acc);
-        complex<long double> x(outbuf[k0].real(),outbuf[k0].imag()); 
-        complex<long double> dif = acc - x;
-        difpower += norm(dif);
-    }
-    long double rmse = sqrt(difpower/totalpower);
-    VERIFY( rmse < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
 
-    totalpower=0;
-    difpower=0;
-    for (int k=0;k<nfft;++k) {
-        totalpower += norm( inbuf[k] );
-        difpower += norm(inbuf[k] - buf3[k]);
-    }
-    rmse = sqrt(difpower/totalpower);
-    VERIFY( rmse < 1e-5 );// gross check
+    fft.inv( buf3 , outbuf);
+
+    VERIFY( dif_rmse(inbuf,buf3) < 1e-5 );// gross check
 }
 
 void test_FFT()
 {
-  CALL_SUBTEST(( test_fft<float>(32) )); CALL_SUBTEST(( test_fft<double>(32) )); CALL_SUBTEST(( test_fft<long double>(32) ));
-  CALL_SUBTEST(( test_fft<float>(1024) )); CALL_SUBTEST(( test_fft<double>(1024) )); CALL_SUBTEST(( test_fft<long double>(1024) ));
-  CALL_SUBTEST(( test_fft<float>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<double>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<long double>(2*3*4*5*7) ));
+  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
+  CALL_SUBTEST( test_complex<float>(1024) ); CALL_SUBTEST( test_complex<double>(1024) ); CALL_SUBTEST( test_complex<long double>(1024) );
+  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
+  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
+  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+
+  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 }

From 9c0fcd0f6213143216710a5b215aa2bb4a857ce5 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 23 May 2009 10:09:48 -0400
Subject: [PATCH 05/34] started real optimization, added benchmark for FFT

---
 bench/benchFFT.cpp                            |  64 +++++++++
 unsupported/Eigen/FFT.h                       |   8 +-
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 125 +++++++++++++-----
 unsupported/test/FFT.cpp                      |   3 +-
 4 files changed, 157 insertions(+), 43 deletions(-)
 create mode 100644 bench/benchFFT.cpp

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
new file mode 100644
index 000000000..041576b75
--- /dev/null
+++ b/bench/benchFFT.cpp
@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include <complex>
+#include <vector>
+#include <Eigen/Core>
+#include <bench/BenchTimer.h>
+#include <unsupported/Eigen/FFT.h>
+
+using namespace Eigen;
+using namespace std;
+
+#ifndef NFFT
+#define NFFT 1024
+#endif
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+#ifndef NITS 
+#define NITS (10000000/NFFT)
+#endif
+
+int main() 
+{
+  vector<complex<TYPE> > inbuf(NFFT);
+  vector<complex<TYPE> > outbuf(NFFT);
+  Eigen::FFT<TYPE> fft;
+
+  fft.fwd( outbuf , inbuf);
+
+  BenchTimer timer;
+  timer.reset();
+  for (int k=0;k<8;++k) {
+      timer.start();
+      for(int i = 0; i < NITS; i++)
+          fft.fwd( outbuf , inbuf);
+      timer.stop();
+  }
+  double mflops = 5.*NFFT*log2((double)NFFT) / (1e6 * timer.value() / (double)NITS );
+  cout << "NFFT=" << NFFT << "  " << (double(1e-6*NFFT*NITS)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
+}
diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT.h
index a1f87a609..c466423b7 100644
--- a/unsupported/Eigen/FFT.h
+++ b/unsupported/Eigen/FFT.h
@@ -60,9 +60,7 @@ class FFT
     template <typename _Input>
     void fwd( Complex * dst, const _Input * src, int nfft)
     {
-      m_traits.prepare(nfft,false,dst,src);
-      m_traits.exec(dst,src);
-      m_traits.postprocess(dst);
+        m_traits.fwd(dst,src,nfft);
     }
 
     template <typename _Input>
@@ -75,9 +73,7 @@ class FFT
     template <typename _Output>
     void inv( _Output * dst, const Complex * src, int nfft)
     {
-        m_traits.prepare(nfft,true,dst,src);
-        m_traits.exec(dst,src);
-        m_traits.postprocess(dst);
+        m_traits.inv( dst,src,nfft );
     }
 
     template <typename _Output>
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index 5a910dd1f..33433ae03 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -35,7 +35,73 @@ namespace Eigen {
     simple_fft_traits() : m_nfft(0) {} 
 
     template <typename _Src>
-    void prepare(int nfft,bool inverse,Complex * dst,const _Src *src)
+    void fwd( Complex * dst,const _Src *src,int nfft)
+    {
+        prepare(nfft,false);
+        work(0, dst, src, 1,1);
+        scale(dst);
+    }
+
+    void fwd( Complex * dst,const Scalar * src,int nfft) 
+    {
+        if ( nfft&1 ) {
+            // use generic mode for odd
+            prepare(nfft,false);
+            work(0, dst, src, 1,1);
+            scale(dst);
+        }else{
+            int ncfft = nfft>>1;
+            // use optimized mode for even real
+            prepare(nfft,false);
+            work(0,dst, reinterpret_cast<const Complex*> (src),2,1);
+            Complex dc = dst[0].real() +  dst[0].imag();
+            Complex nyquist = dst[0].real() -  dst[0].imag();
+            
+            int k;
+            for ( k=1;k <= ncfft/2 ; ++k ) {
+/**
+        fpk    = st->tmpbuf[k];
+        fpnk.r =   st->tmpbuf[ncfft-k].r;
+        fpnk.i = - st->tmpbuf[ncfft-k].i;
+
+        C_ADD( f1k, fpk , fpnk );
+        C_SUB( f2k, fpk , fpnk );
+        C_MUL( tw , f2k , st->super_twiddles[k-1]);
+
+        freqdata[k].r = HALF_OF(f1k.r + tw.r);
+        freqdata[k].i = HALF_OF(f1k.i + tw.i);
+        freqdata[ncfft-k].r = HALF_OF(f1k.r - tw.r);
+        freqdata[ncfft-k].i = HALF_OF(tw.i - f1k.i);
+ */
+                Complex fpk = dst[k];
+                Complex fpnk = conj(dst[ncfft-k]);
+
+                Complex f1k = fpk + fpnk;
+                Complex f2k = fpnk - fpk;
+                //Complex tw = f2k * exp( Complex(0,-3.14159265358979323846264338327 * ((double)k / ncfft + 1) ) ); // TODO repalce this with index into twiddles
+                Complex tw = f2k * m_twiddles[2*k];;
+                
+                dst[k] =  (f1k + tw) * Scalar(.5);
+                dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
+            }
+            // place conjugate-symmetric half at the end for completeness
+            // TODO: make this configurable ( opt-out )
+            for ( k=1;k < ncfft ; ++k )
+                dst[nfft-k] = conj(dst[k]);
+
+            dst[0] = dc;
+            dst[ncfft] = nyquist;
+        }
+    }
+
+    void inv(Complex * dst,const Complex  *src,int nfft)
+    {
+        prepare(nfft,true);
+        work(0, dst, src, 1,1);
+        scale(dst);
+    }
+
+    void prepare(int nfft,bool inverse)
     {
       if (m_nfft == nfft) {
         // reuse the twiddles, conjugate if necessary
@@ -74,57 +140,49 @@ namespace Eigen {
       }while(n>1);
     }
 
-    template <typename _Src>
-    void exec(Complex * dst, const _Src * src)
-    {
-      work(0, dst, src, 1,1);
-    }
-
-    void postprocess(Complex *dst) 
+    void scale(Complex *dst) 
     {
       if (m_inverse) {
-        Scalar scale = 1./m_nfft;
+        Scalar s = 1./m_nfft;
         for (int k=0;k<m_nfft;++k)
-          dst[k] *= scale;
+          dst[k] *= s;
       }
     }
 
     private:
 
-   
     template <typename _Src>
-    void work( int stage,Complex * Fout, const _Src * f, size_t fstride,size_t in_stride)
+    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
     {
       int p = m_stageRadix[stage];
       int m = m_stageRemainder[stage];
-      Complex * Fout_beg = Fout;
-      Complex * Fout_end = Fout + p*m;
+      Complex * Fout_beg = xout;
+      Complex * Fout_end = xout + p*m;
 
-      if (m==1) {
-        do{
-          *Fout = *f;
-          f += fstride*in_stride;
-        }while(++Fout != Fout_end );
-      }else{
+      if (m>1) {
         do{
           // recursive call:
           // DFT of size m*p performed by doing
           // p instances of smaller DFTs of size m, 
           // each one takes a decimated version of the input
-          work(stage+1, Fout , f, fstride*p,in_stride);
-          f += fstride*in_stride;
-        }while( (Fout += m) != Fout_end );
+          work(stage+1, xout , xin, fstride*p,in_stride);
+          xin += fstride*in_stride;
+        }while( (xout += m) != Fout_end );
+      }else{
+          do{
+              *xout = *xin;
+              xin += fstride*in_stride;
+          }while(++xout != Fout_end );
       }
-
-      Fout=Fout_beg;
+      xout=Fout_beg;
 
       // recombine the p smaller DFTs 
       switch (p) {
-        case 2: bfly2(Fout,fstride,m); break;
-        case 3: bfly3(Fout,fstride,m); break;
-        case 4: bfly4(Fout,fstride,m); break;
-        case 5: bfly5(Fout,fstride,m); break;
-        default: bfly_generic(Fout,fstride,m,p); break;
+        case 2: bfly2(xout,fstride,m); break;
+        case 3: bfly3(xout,fstride,m); break;
+        case 4: bfly4(xout,fstride,m); break;
+        case 5: bfly5(xout,fstride,m); break;
+        default: bfly_generic(xout,fstride,m,p); break;
       }
     }
 
@@ -139,7 +197,7 @@ namespace Eigen {
 
     void bfly4( Complex * Fout, const size_t fstride, const size_t m)
     {
-      Complex scratch[7];
+      Complex scratch[6];
       int negative_if_inverse = m_inverse * -2 +1;
       for (size_t k=0;k<m;++k) {
         scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
@@ -178,15 +236,10 @@ namespace Eigen {
         scratch[0]=scratch[1]-scratch[2];
         tw1 += fstride;
         tw2 += fstride*2;
-
         Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
-
         scratch[0] *= epi3.imag();
-
         *Fout += scratch[3];
-
         Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-
         Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
         ++Fout;
       }while(--k);
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index ef03359e2..13e98ba77 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -115,8 +115,9 @@ void test_FFT()
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
-
+/*
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
   CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
+  */
 }

From 304798817268706463f3ff645c8c8b2c887c090a Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 23 May 2009 12:50:07 -0400
Subject: [PATCH 06/34] scalar forward FFT optimized for even size, converts to
 cpx for odd

---
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 156 ++++++++++--------
 unsupported/test/FFT.cpp                      |  10 +-
 2 files changed, 97 insertions(+), 69 deletions(-)

diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index 33433ae03..f7dd2b9cf 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -24,6 +24,7 @@
 
 #include <complex>
 #include <vector>
+#include <iostream>
 
 namespace Eigen {
 
@@ -39,51 +40,54 @@ namespace Eigen {
     {
         prepare(nfft,false);
         work(0, dst, src, 1,1);
-        scale(dst);
     }
 
+    // real-to-complex forward FFT
+    // perform two FFTs of src even and src odd
+    // then twiddle to recombine them into the half-spectrum format
+    // then fill in the conjugate symmetric half
     void fwd( Complex * dst,const Scalar * src,int nfft) 
     {
         if ( nfft&1 ) {
             // use generic mode for odd
             prepare(nfft,false);
             work(0, dst, src, 1,1);
-            scale(dst);
         }else{
             int ncfft = nfft>>1;
             // use optimized mode for even real
-            prepare(nfft,false);
-            work(0,dst, reinterpret_cast<const Complex*> (src),2,1);
+            fwd( dst, reinterpret_cast<const Complex*> (src),ncfft);
+            make_real_twiddles(nfft);
+            std::cerr << "dst[0] = " << dst[0] << "\n";
             Complex dc = dst[0].real() +  dst[0].imag();
             Complex nyquist = dst[0].real() -  dst[0].imag();
-            
             int k;
-            for ( k=1;k <= ncfft/2 ; ++k ) {
-/**
-        fpk    = st->tmpbuf[k];
-        fpnk.r =   st->tmpbuf[ncfft-k].r;
-        fpnk.i = - st->tmpbuf[ncfft-k].i;
-
-        C_ADD( f1k, fpk , fpnk );
-        C_SUB( f2k, fpk , fpnk );
-        C_MUL( tw , f2k , st->super_twiddles[k-1]);
-
-        freqdata[k].r = HALF_OF(f1k.r + tw.r);
-        freqdata[k].i = HALF_OF(f1k.i + tw.i);
-        freqdata[ncfft-k].r = HALF_OF(f1k.r - tw.r);
-        freqdata[ncfft-k].i = HALF_OF(tw.i - f1k.i);
- */
+#if 0
+            using namespace std;
+            cerr << "desired:\n";
+            for ( k=1;k <= (ncfft>>1) ; ++k ) {
+                Complex x = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
+                cerr << k << " " << x << "angle(x):" << arg(x) << "\n";
+            }
+            dc=0;
+            cerr << "twiddles:\n";
+            for (k=0;k<ncfft;++k) {
+                Complex x = m_twiddles[k];
+                cerr << k << " " << x << "angle(-x):" << arg(-x) << "\n";
+            }
+#endif
+            for ( k=1;k <= (ncfft>>1) ; ++k ) {
                 Complex fpk = dst[k];
                 Complex fpnk = conj(dst[ncfft-k]);
-
                 Complex f1k = fpk + fpnk;
-                Complex f2k = fpnk - fpk;
-                //Complex tw = f2k * exp( Complex(0,-3.14159265358979323846264338327 * ((double)k / ncfft + 1) ) ); // TODO repalce this with index into twiddles
-                Complex tw = f2k * m_twiddles[2*k];;
-                
+                Complex f2k = fpk - fpnk;
+                //Complex tw = f2k * exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
+                Complex tw= f2k * m_realTwiddles[k-1];
+
                 dst[k] =  (f1k + tw) * Scalar(.5);
                 dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
             }
+ 
+
             // place conjugate-symmetric half at the end for completeness
             // TODO: make this configurable ( opt-out )
             for ( k=1;k < ncfft ; ++k )
@@ -98,55 +102,74 @@ namespace Eigen {
     {
         prepare(nfft,true);
         work(0, dst, src, 1,1);
-        scale(dst);
+        scale(dst, Scalar(1)/m_nfft );
     }
 
     void prepare(int nfft,bool inverse)
     {
-      if (m_nfft == nfft) {
-        // reuse the twiddles, conjugate if necessary
-        if (inverse != m_inverse)
-          for (int i=0;i<nfft;++i)
-            m_twiddles[i] = conj( m_twiddles[i] );
-        m_inverse = inverse;
-        return;
-      }
-      m_nfft = nfft;
-      m_inverse = inverse;
-      m_twiddles.resize(nfft);
-      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-      for (int i=0;i<nfft;++i)
-        m_twiddles[i] = exp( Complex(0,i*phinc) );
-
-      m_stageRadix.resize(0);
-      m_stageRemainder.resize(0);
-      //factorize
-      //start factoring out 4's, then 2's, then 3,5,7,9,...
-      int n= nfft;
-      int p=4;
-      do {
-        while (n % p) {
-          switch (p) {
-            case 4: p = 2; break;
-            case 2: p = 3; break;
-            default: p += 2; break;
-          }
-          if (p*p>n)
-            p=n;// no more factors
-        }
-        n /= p;
-        m_stageRadix.push_back(p);
-        m_stageRemainder.push_back(n);
-      }while(n>1);
+        make_twiddles(nfft,inverse);
+        factorize(nfft);
     }
 
-    void scale(Complex *dst) 
+    void make_real_twiddles(int nfft)
+    {
+        int ncfft2 = nfft>>2;
+        if ( m_realTwiddles.size() != ncfft2) {
+            m_realTwiddles.resize(ncfft2);
+            int ncfft= nfft>>1;
+            for (int k=1;k<=ncfft2;++k) 
+                m_realTwiddles[k-1] = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
+        }
+    }
+
+    void make_twiddles(int nfft,bool inverse)
+    {
+        if ( m_twiddles.size() == nfft) {
+            // reuse the twiddles, conjugate if necessary
+            if (inverse != m_inverse)
+                for (int i=0;i<nfft;++i)
+                    m_twiddles[i] = conj( m_twiddles[i] );
+        }else{
+            m_twiddles.resize(nfft);
+            Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+            for (int i=0;i<nfft;++i)
+                m_twiddles[i] = exp( Complex(0,i*phinc) );
+        }
+        m_inverse = inverse;
+    }
+
+    void factorize(int nfft)
+    {
+        if (m_stageRadix.size()==0 || m_stageRadix[0] * m_stageRemainder[0] != nfft)
+        {
+            m_stageRadix.resize(0);
+            m_stageRemainder.resize(0);
+            //factorize
+            //start factoring out 4's, then 2's, then 3,5,7,9,...
+            int n= nfft;
+            int p=4;
+            do {
+                while (n % p) {
+                    switch (p) {
+                        case 4: p = 2; break;
+                        case 2: p = 3; break;
+                        default: p += 2; break;
+                    }
+                    if (p*p>n)
+                        p=n;// no more factors
+                }
+                n /= p;
+                m_stageRadix.push_back(p);
+                m_stageRemainder.push_back(n);
+            }while(n>1);
+        }
+        m_nfft = nfft;
+    }
+
+    void scale(Complex *dst,Scalar s) 
     {
-      if (m_inverse) {
-        Scalar s = 1./m_nfft;
         for (int k=0;k<m_nfft;++k)
-          dst[k] *= s;
-      }
+            dst[k] *= s;
     }
 
     private:
@@ -349,6 +372,7 @@ namespace Eigen {
     int m_nfft;
     bool m_inverse;
     std::vector<Complex> m_twiddles;
+    std::vector<Complex> m_realTwiddles;
     std::vector<int> m_stageRadix;
     std::vector<int> m_stageRemainder;
   };
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 13e98ba77..41c7fed7b 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -41,6 +41,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
     {
         long double totalpower=0;
         long double difpower=0;
+        cerr <<"idx\ttruth\t\tvalue\n";
         for (size_t k0=0;k0<fftbuf.size();++k0) {
             complex<long double> acc = 0;
             long double phinc = -2.*k0* M_PIl / timebuf.size();
@@ -51,7 +52,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
             complex<long double> x = promote(fftbuf[k0]); 
             complex<long double> dif = acc - x;
             difpower += norm(dif);
-            cerr << k0 << ":" << acc << " " <<  x << endl;
+            cerr << k0 << "\t" << acc << "\t" <<  x << endl;
         }
         cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
         return sqrt(difpower/totalpower);
@@ -108,6 +109,7 @@ void test_complex(int nfft)
 
 void test_FFT()
 {
+#if 0
   CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
   CALL_SUBTEST( test_complex<float>(1024) ); CALL_SUBTEST( test_complex<double>(1024) ); CALL_SUBTEST( test_complex<long double>(1024) );
   CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
@@ -115,9 +117,11 @@ void test_FFT()
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
-/*
+#endif
+
+#if 1
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
   CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
-  */
+#endif
 }

From 326ea773908c2d7e46101085af8f72d20b3f8cbc Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 23 May 2009 22:50:07 -0400
Subject: [PATCH 07/34] added FFT inverse complex-to-scalar interface (not yet
 optimized)

---
 bench/benchFFT.cpp                            | 73 ++++++++++++++-----
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 31 ++++----
 unsupported/test/FFT.cpp                      | 21 ++++--
 3 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index 041576b75..84cc49fe3 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -31,34 +31,67 @@
 using namespace Eigen;
 using namespace std;
 
-#ifndef NFFT
-#define NFFT 1024
-#endif
+
+template <typename T>
+string nameof();
+
+template <> string nameof<float>() {return "float";}
+template <> string nameof<double>() {return "double";}
+template <> string nameof<long double>() {return "long double";}
 
 #ifndef TYPE
 #define TYPE float
 #endif
 
-#ifndef NITS 
-#define NITS (10000000/NFFT)
+#ifndef NFFT
+#define NFFT 1024
+#endif
+#ifndef NDATA
+#define NDATA 1000000
 #endif
 
-int main() 
+using namespace Eigen;
+
+template <typename T>
+void bench(int nfft)
 {
-  vector<complex<TYPE> > inbuf(NFFT);
-  vector<complex<TYPE> > outbuf(NFFT);
-  Eigen::FFT<TYPE> fft;
+    typedef typename NumTraits<T>::Real Scalar;
+    typedef typename std::complex<Scalar> Complex;
+    int nits = NDATA/nfft;
+    vector<T> inbuf(nfft);
+    vector<Complex > outbuf(nfft);
+    FFT< Scalar > fft;
 
-  fft.fwd( outbuf , inbuf);
+    fft.fwd( outbuf , inbuf);
 
-  BenchTimer timer;
-  timer.reset();
-  for (int k=0;k<8;++k) {
-      timer.start();
-      for(int i = 0; i < NITS; i++)
-          fft.fwd( outbuf , inbuf);
-      timer.stop();
-  }
-  double mflops = 5.*NFFT*log2((double)NFFT) / (1e6 * timer.value() / (double)NITS );
-  cout << "NFFT=" << NFFT << "  " << (double(1e-6*NFFT*NITS)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
+    BenchTimer timer;
+    timer.reset();
+    for (int k=0;k<8;++k) {
+        timer.start();
+        for(int i = 0; i < nits; i++)
+            fft.fwd( outbuf , inbuf);
+        timer.stop();
+    }
+
+    cout << nameof<Scalar>() << " ";
+    double mflops = 5.*nfft*log2((double)nfft) / (1e6 * timer.value() / (double)nits );
+    if ( NumTraits<T>::IsComplex ) {
+        cout << "complex";
+    }else{
+        cout << "real   ";
+        mflops /= 2;
+    }
+
+    cout << " NFFT=" << nfft << "  " << (double(1e-6*nfft*nits)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
+}
+
+int main(int argc,char ** argv)
+{
+    bench<complex<float> >(NFFT);
+    bench<float>(NFFT);
+    bench<complex<double> >(NFFT);
+    bench<double>(NFFT);
+    bench<complex<long double> >(NFFT);
+    bench<long double>(NFFT);
+    return 0;
 }
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index f7dd2b9cf..1e2be8f79 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -54,28 +54,14 @@ namespace Eigen {
             work(0, dst, src, 1,1);
         }else{
             int ncfft = nfft>>1;
+            int ncfft2 = nfft>>2;
             // use optimized mode for even real
             fwd( dst, reinterpret_cast<const Complex*> (src),ncfft);
             make_real_twiddles(nfft);
-            std::cerr << "dst[0] = " << dst[0] << "\n";
             Complex dc = dst[0].real() +  dst[0].imag();
             Complex nyquist = dst[0].real() -  dst[0].imag();
             int k;
-#if 0
-            using namespace std;
-            cerr << "desired:\n";
-            for ( k=1;k <= (ncfft>>1) ; ++k ) {
-                Complex x = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
-                cerr << k << " " << x << "angle(x):" << arg(x) << "\n";
-            }
-            dc=0;
-            cerr << "twiddles:\n";
-            for (k=0;k<ncfft;++k) {
-                Complex x = m_twiddles[k];
-                cerr << k << " " << x << "angle(-x):" << arg(-x) << "\n";
-            }
-#endif
-            for ( k=1;k <= (ncfft>>1) ; ++k ) {
+            for ( k=1;k <= ncfft2 ; ++k ) {
                 Complex fpk = dst[k];
                 Complex fpnk = conj(dst[ncfft-k]);
                 Complex f1k = fpk + fpnk;
@@ -87,7 +73,6 @@ namespace Eigen {
                 dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
             }
  
-
             // place conjugate-symmetric half at the end for completeness
             // TODO: make this configurable ( opt-out )
             for ( k=1;k < ncfft ; ++k )
@@ -98,6 +83,16 @@ namespace Eigen {
         }
     }
 
+    // half-complex to scalar
+    void inv( Scalar * dst,const Complex * src,int nfft) 
+    {
+        // TODO add optimized version for even numbers
+        std::vector<Complex> tmp(nfft);
+        inv(&tmp[0],src,nfft);
+        for (int k=0;k<nfft;++k)
+            dst[k] = tmp[k].real();
+    }
+
     void inv(Complex * dst,const Complex  *src,int nfft)
     {
         prepare(nfft,true);
@@ -156,7 +151,7 @@ namespace Eigen {
                         default: p += 2; break;
                     }
                     if (p*p>n)
-                        p=n;// no more factors
+                        p=n;// impossible to have a factor > sqrt(n)
                 }
                 n /= p;
                 m_stageRadix.push_back(p);
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 41c7fed7b..75c33277d 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -28,6 +28,10 @@
 
 using namespace std;
 
+float norm(float x) {return x*x;}
+double norm(double x) {return x*x;}
+long double norm(long double x) {return x*x;}
+
 template < typename T>
 complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
 
@@ -83,7 +87,11 @@ void test_scalar(int nfft)
     for (int k=0;k<nfft;++k)
         inbuf[k]= (T)(rand()/(double)RAND_MAX - .5);
     fft.fwd( outbuf,inbuf);
-    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+
+    vector<Scalar> buf3;
+    fft.inv( buf3 , outbuf);
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
 }
 
 template <class T>
@@ -100,18 +108,18 @@ void test_complex(int nfft)
         inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
     fft.fwd( outbuf , inbuf);
 
-    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
 
     fft.inv( buf3 , outbuf);
 
-    VERIFY( dif_rmse(inbuf,buf3) < 1e-5 );// gross check
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
 }
 
 void test_FFT()
 {
-#if 0
+#if 1
   CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
-  CALL_SUBTEST( test_complex<float>(1024) ); CALL_SUBTEST( test_complex<double>(1024) ); CALL_SUBTEST( test_complex<long double>(1024) );
+  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); CALL_SUBTEST( test_complex<long double>(256) );
   CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
   CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
@@ -120,8 +128,9 @@ void test_FFT()
 #endif
 
 #if 1
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
-  CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
+  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 #endif
 }

From 210092d16c57ec2fd2f8f515151de284e8a737e3 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Mon, 25 May 2009 20:35:24 -0400
Subject: [PATCH 08/34] changed name from simple_fft_traits to ei_kissfft_impl

---
 bench/benchFFT.cpp                            |  2 +-
 unsupported/Eigen/{FFT.h => FFT}              | 18 ++++++------
 ...{simple_fft_traits.h => ei_kissfft_impl.h} | 29 +++++++++++++++++--
 unsupported/test/FFT.cpp                      |  3 +-
 4 files changed, 37 insertions(+), 15 deletions(-)
 rename unsupported/Eigen/{FFT.h => FFT} (86%)
 rename unsupported/Eigen/src/FFT/{simple_fft_traits.h => ei_kissfft_impl.h} (95%)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index 84cc49fe3..ffa4ffffc 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -26,7 +26,7 @@
 #include <vector>
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>
-#include <unsupported/Eigen/FFT.h>
+#include <unsupported/Eigen/FFT>
 
 using namespace Eigen;
 using namespace std;
diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT
similarity index 86%
rename from unsupported/Eigen/FFT.h
rename to unsupported/Eigen/FFT
index c466423b7..3d852f5a2 100644
--- a/unsupported/Eigen/FFT.h
+++ b/unsupported/Eigen/FFT
@@ -25,28 +25,28 @@
 #ifndef EIGEN_FFT_H
 #define EIGEN_FFT_H
 
-// simple_fft_traits:  small, free, reasonably efficient default, derived from kissfft
-#include "src/FFT/simple_fft_traits.h"
-#define DEFAULT_FFT_TRAITS simple_fft_traits
+// ei_kissfft_impl:  small, free, reasonably efficient default, derived from kissfft
+#include "src/FFT/ei_kissfft_impl.h"
+#define DEFAULT_FFT_IMPL ei_kissfft_impl
 
 // FFTW: faster, GPL-not LGPL, bigger code size
 #ifdef FFTW_PATIENT  // definition of FFTW_PATIENT indicates the caller has included fftw3.h, we can use FFTW routines
 // TODO 
-// #include "src/FFT/fftw_traits.h"
-// #define DEFAULT_FFT_TRAITS fftw_traits
+// #include "src/FFT/ei_fftw_impl.h"
+// #define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
 // intel Math Kernel Library: fastest, commerical
 #ifdef _MKL_DFTI_H_ // mkl_dfti.h has been included, we can use MKL FFT routines
 // TODO 
-// #include "src/FFT/imkl_traits.h"
-// #define DEFAULT_FFT_TRAITS imkl_traits
+// #include "src/FFT/ei_imkl_impl.h"
+// #define DEFAULT_FFT_IMPL ei_imkl_impl
 #endif
 
 namespace Eigen {
 
 template <typename _Scalar,
-         typename _Traits=DEFAULT_FFT_TRAITS<_Scalar> 
+         typename _Traits=DEFAULT_FFT_IMPL<_Scalar> 
          >
 class FFT
 {
@@ -90,6 +90,6 @@ class FFT
   private:
     traits_type m_traits;
 };
-#undef DEFAULT_FFT_TRAITS
+#undef DEFAULT_FFT_IMPL
 }
 #endif
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
similarity index 95%
rename from unsupported/Eigen/src/FFT/simple_fft_traits.h
rename to unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 1e2be8f79..ce2c9f16e 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -24,16 +24,15 @@
 
 #include <complex>
 #include <vector>
-#include <iostream>
 
 namespace Eigen {
 
   template <typename _Scalar>
-  struct simple_fft_traits
+  struct ei_kissfft_impl
   {
     typedef _Scalar Scalar;
     typedef std::complex<Scalar> Complex;
-    simple_fft_traits() : m_nfft(0) {} 
+    ei_kissfft_impl() : m_nfft(0) {} 
 
     template <typename _Src>
     void fwd( Complex * dst,const _Src *src,int nfft)
@@ -370,5 +369,29 @@ namespace Eigen {
     std::vector<Complex> m_realTwiddles;
     std::vector<int> m_stageRadix;
     std::vector<int> m_stageRemainder;
+/*
+    enum {FORWARD,INVERSE,REAL,COMPLEX};
+
+    struct PlanKey
+    {
+        PlanKey(int nfft,bool isinverse,bool iscomplex)
+        {
+            _key = (nfft<<2) | (isinverse<<1) | iscomplex;
+        }
+
+        bool operator<(const PlanKey & other) const
+        {
+            return this->_key < other._key;
+        }
+        int _key;
+    };
+
+    struct PlanData
+    {
+        std::vector<Complex> m_twiddles;
+    };
+
+    std::map<PlanKey,
+*/
   };
 }
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 75c33277d..daf397790 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -23,8 +23,7 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
 #include "main.h"
-#include <unsupported/Eigen/FFT.h>
-
+#include <unsupported/Eigen/FFT>
 
 using namespace std;
 

From 03ed6f9bfb63879d475f5bb8ea46cff96063d010 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Mon, 25 May 2009 23:06:49 -0400
Subject: [PATCH 09/34] refactored ei_kissfft_impl to maintain a cache of cpx
 fft plans

---
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 612 ++++++++++----------
 unsupported/test/FFT.cpp                    |   7 +-
 2 files changed, 320 insertions(+), 299 deletions(-)

diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index ce2c9f16e..3580e6c61 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -24,21 +24,279 @@
 
 #include <complex>
 #include <vector>
+#include <map>
 
 namespace Eigen {
 
+    template <typename _Scalar>
+    struct ei_kiss_cpx_fft
+    {
+        typedef  _Scalar Scalar;
+        typedef  std::complex<Scalar> Complex;
+        std::vector<Complex> m_twiddles;
+        std::vector<int> m_stageRadix;
+        std::vector<int> m_stageRemainder;
+        bool m_inverse;
+
+        ei_kiss_cpx_fft() { }
+
+        void make_twiddles(int nfft,bool inverse)
+        {
+            m_inverse = inverse;
+            m_twiddles.resize(nfft);
+            Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+            for (int i=0;i<nfft;++i)
+                m_twiddles[i] = exp( Complex(0,i*phinc) );
+        }
+
+        void invert()
+        {
+            m_inverse = !m_inverse;
+            for ( size_t i=0;i<m_twiddles.size() ;++i)
+                m_twiddles[i] = conj( m_twiddles[i] );
+        }
+
+        void factorize(int nfft)
+        {
+            if (m_stageRadix.size()==0 || m_stageRadix[0] * m_stageRemainder[0] != nfft)
+            {
+                m_stageRadix.resize(0);
+                m_stageRemainder.resize(0);
+                //factorize
+                //start factoring out 4's, then 2's, then 3,5,7,9,...
+                int n= nfft;
+                int p=4;
+                do {
+                    while (n % p) {
+                        switch (p) {
+                            case 4: p = 2; break;
+                            case 2: p = 3; break;
+                            default: p += 2; break;
+                        }
+                        if (p*p>n)
+                            p=n;// impossible to have a factor > sqrt(n)
+                    }
+                    n /= p;
+                    m_stageRadix.push_back(p);
+                    m_stageRemainder.push_back(n);
+                }while(n>1);
+            }
+        }
+
+        template <typename _Src>
+            void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+            {
+                int p = m_stageRadix[stage];
+                int m = m_stageRemainder[stage];
+                Complex * Fout_beg = xout;
+                Complex * Fout_end = xout + p*m;
+
+                if (m>1) {
+                    do{
+                        // recursive call:
+                        // DFT of size m*p performed by doing
+                        // p instances of smaller DFTs of size m, 
+                        // each one takes a decimated version of the input
+                        work(stage+1, xout , xin, fstride*p,in_stride);
+                        xin += fstride*in_stride;
+                    }while( (xout += m) != Fout_end );
+                }else{
+                    do{
+                        *xout = *xin;
+                        xin += fstride*in_stride;
+                    }while(++xout != Fout_end );
+                }
+                xout=Fout_beg;
+
+                // recombine the p smaller DFTs 
+                switch (p) {
+                    case 2: bfly2(xout,fstride,m); break;
+                    case 3: bfly3(xout,fstride,m); break;
+                    case 4: bfly4(xout,fstride,m); break;
+                    case 5: bfly5(xout,fstride,m); break;
+                    default: bfly_generic(xout,fstride,m,p); break;
+                }
+            }
+
+        void bfly2( Complex * Fout, const size_t fstride, int m)
+        {
+            for (int k=0;k<m;++k) {
+                Complex t = Fout[m+k] * m_twiddles[k*fstride];
+                Fout[m+k] = Fout[k] - t;
+                Fout[k] += t;
+            }
+        }
+
+        void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+        {
+            Complex scratch[6];
+            int negative_if_inverse = m_inverse * -2 +1;
+            for (size_t k=0;k<m;++k) {
+                scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+                scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+                scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+                scratch[5] = Fout[k] - scratch[1];
+
+                Fout[k] += scratch[1];
+                scratch[3] = scratch[0] + scratch[2];
+                scratch[4] = scratch[0] - scratch[2];
+                scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+                Fout[k+2*m]  = Fout[k] - scratch[3];
+                Fout[k] += scratch[3];
+                Fout[k+m] = scratch[5] + scratch[4];
+                Fout[k+3*m] = scratch[5] - scratch[4];
+            }
+        }
+
+        void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+        {
+            size_t k=m;
+            const size_t m2 = 2*m;
+            Complex *tw1,*tw2;
+            Complex scratch[5];
+            Complex epi3;
+            epi3 = m_twiddles[fstride*m];
+
+            tw1=tw2=&m_twiddles[0];
+
+            do{
+                scratch[1]=Fout[m] * *tw1;
+                scratch[2]=Fout[m2] * *tw2;
+
+                scratch[3]=scratch[1]+scratch[2];
+                scratch[0]=scratch[1]-scratch[2];
+                tw1 += fstride;
+                tw2 += fstride*2;
+                Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
+                scratch[0] *= epi3.imag();
+                *Fout += scratch[3];
+                Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+                Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+                ++Fout;
+            }while(--k);
+        }
+
+        void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+        {
+            Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+            size_t u;
+            Complex scratch[13];
+            Complex * twiddles = &m_twiddles[0];
+            Complex *tw;
+            Complex ya,yb;
+            ya = twiddles[fstride*m];
+            yb = twiddles[fstride*2*m];
+
+            Fout0=Fout;
+            Fout1=Fout0+m;
+            Fout2=Fout0+2*m;
+            Fout3=Fout0+3*m;
+            Fout4=Fout0+4*m;
+
+            tw=twiddles;
+            for ( u=0; u<m; ++u ) {
+                scratch[0] = *Fout0;
+
+                scratch[1]  = *Fout1 * tw[u*fstride];
+                scratch[2]  = *Fout2 * tw[2*u*fstride];
+                scratch[3]  = *Fout3 * tw[3*u*fstride];
+                scratch[4]  = *Fout4 * tw[4*u*fstride];
+
+                scratch[7] = scratch[1] + scratch[4];
+                scratch[10] = scratch[1] - scratch[4];
+                scratch[8] = scratch[2] + scratch[3];
+                scratch[9] = scratch[2] - scratch[3];
+
+                *Fout0 +=  scratch[7];
+                *Fout0 +=  scratch[8];
+
+                scratch[5] = scratch[0] + Complex(
+                        (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+                        (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
+                        );
+
+                scratch[6] = Complex(
+                        (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+                        -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+                        );
+
+                *Fout1 = scratch[5] - scratch[6];
+                *Fout4 = scratch[5] + scratch[6];
+
+                scratch[11] = scratch[0] +
+                    Complex(
+                            (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+                            (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+                           );
+
+                scratch[12] = Complex(
+                        -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+                        (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+                        );
+
+                *Fout2=scratch[11]+scratch[12];
+                *Fout3=scratch[11]-scratch[12];
+
+                ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+            }
+        }
+
+        /* perform the butterfly for one stage of a mixed radix FFT */
+        void bfly_generic(
+                Complex * Fout,
+                const size_t fstride,
+                int m,
+                int p
+                )
+        {
+            int u,k,q1,q;
+            Complex * twiddles = &m_twiddles[0];
+            Complex t;
+            int Norig = m_twiddles.size();
+            Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+
+            for ( u=0; u<m; ++u ) {
+                k=u;
+                for ( q1=0 ; q1<p ; ++q1 ) {
+                    scratchbuf[q1] = Fout[ k  ];
+                    k += m;
+                }
+
+                k=u;
+                for ( q1=0 ; q1<p ; ++q1 ) {
+                    int twidx=0;
+                    Fout[ k ] = scratchbuf[0];
+                    for (q=1;q<p;++q ) {
+                        twidx += fstride * k;
+                        if (twidx>=Norig) twidx-=Norig;
+                        t=scratchbuf[q] * twiddles[twidx];
+                        Fout[ k ] += t;
+                    }
+                    k += m;
+                }
+            }
+        }
+    };
+
+
   template <typename _Scalar>
   struct ei_kissfft_impl
   {
     typedef _Scalar Scalar;
     typedef std::complex<Scalar> Complex;
-    ei_kissfft_impl() : m_nfft(0) {} 
+    ei_kissfft_impl() {} 
+
+    void clear() 
+    {
+        m_plans.clear();
+        m_realTwiddles.clear();
+    }
 
     template <typename _Src>
     void fwd( Complex * dst,const _Src *src,int nfft)
     {
-        prepare(nfft,false);
-        work(0, dst, src, 1,1);
+        get_plan(nfft,false).work(0, dst, src, 1,1);
     }
 
     // real-to-complex forward FFT
@@ -47,16 +305,16 @@ namespace Eigen {
     // then fill in the conjugate symmetric half
     void fwd( Complex * dst,const Scalar * src,int nfft) 
     {
-        if ( nfft&1 ) {
+        if ( nfft&3  ) {
             // use generic mode for odd
-            prepare(nfft,false);
-            work(0, dst, src, 1,1);
+            get_plan(nfft,false).work(0, dst, src, 1,1);
         }else{
             int ncfft = nfft>>1;
             int ncfft2 = nfft>>2;
+            Complex * rtw = real_twiddles(ncfft2);
+
             // use optimized mode for even real
-            fwd( dst, reinterpret_cast<const Complex*> (src),ncfft);
-            make_real_twiddles(nfft);
+            fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
             Complex dc = dst[0].real() +  dst[0].imag();
             Complex nyquist = dst[0].real() -  dst[0].imag();
             int k;
@@ -65,8 +323,7 @@ namespace Eigen {
                 Complex fpnk = conj(dst[ncfft-k]);
                 Complex f1k = fpk + fpnk;
                 Complex f2k = fpk - fpnk;
-                //Complex tw = f2k * exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
-                Complex tw= f2k * m_realTwiddles[k-1];
+                Complex tw= f2k * rtw[k-1];
 
                 dst[k] =  (f1k + tw) * Scalar(.5);
                 dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
@@ -94,304 +351,67 @@ namespace Eigen {
 
     void inv(Complex * dst,const Complex  *src,int nfft)
     {
-        prepare(nfft,true);
-        work(0, dst, src, 1,1);
-        scale(dst, Scalar(1)/m_nfft );
-    }
-
-    void prepare(int nfft,bool inverse)
-    {
-        make_twiddles(nfft,inverse);
-        factorize(nfft);
-    }
-
-    void make_real_twiddles(int nfft)
-    {
-        int ncfft2 = nfft>>2;
-        if ( m_realTwiddles.size() != ncfft2) {
-            m_realTwiddles.resize(ncfft2);
-            int ncfft= nfft>>1;
-            for (int k=1;k<=ncfft2;++k) 
-                m_realTwiddles[k-1] = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
-        }
-    }
-
-    void make_twiddles(int nfft,bool inverse)
-    {
-        if ( m_twiddles.size() == nfft) {
-            // reuse the twiddles, conjugate if necessary
-            if (inverse != m_inverse)
-                for (int i=0;i<nfft;++i)
-                    m_twiddles[i] = conj( m_twiddles[i] );
-        }else{
-            m_twiddles.resize(nfft);
-            Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-            for (int i=0;i<nfft;++i)
-                m_twiddles[i] = exp( Complex(0,i*phinc) );
-        }
-        m_inverse = inverse;
-    }
-
-    void factorize(int nfft)
-    {
-        if (m_stageRadix.size()==0 || m_stageRadix[0] * m_stageRemainder[0] != nfft)
-        {
-            m_stageRadix.resize(0);
-            m_stageRemainder.resize(0);
-            //factorize
-            //start factoring out 4's, then 2's, then 3,5,7,9,...
-            int n= nfft;
-            int p=4;
-            do {
-                while (n % p) {
-                    switch (p) {
-                        case 4: p = 2; break;
-                        case 2: p = 3; break;
-                        default: p += 2; break;
-                    }
-                    if (p*p>n)
-                        p=n;// impossible to have a factor > sqrt(n)
-                }
-                n /= p;
-                m_stageRadix.push_back(p);
-                m_stageRemainder.push_back(n);
-            }while(n>1);
-        }
-        m_nfft = nfft;
-    }
-
-    void scale(Complex *dst,Scalar s) 
-    {
-        for (int k=0;k<m_nfft;++k)
-            dst[k] *= s;
+        get_plan(nfft,true).work(0, dst, src, 1,1);
+        scale(dst, nfft, Scalar(1)/nfft );
     }
 
     private:
 
-    template <typename _Src>
-    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+    typedef ei_kiss_cpx_fft<Scalar> PlanData;
+
+    typedef std::map<int,PlanData> PlanMap;
+    PlanMap m_plans;
+    std::map<int, std::vector<Complex> > m_realTwiddles;
+
+    int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
+
+    PlanData & get_plan(int nfft,bool inverse)
     {
-      int p = m_stageRadix[stage];
-      int m = m_stageRemainder[stage];
-      Complex * Fout_beg = xout;
-      Complex * Fout_end = xout + p*m;
-
-      if (m>1) {
-        do{
-          // recursive call:
-          // DFT of size m*p performed by doing
-          // p instances of smaller DFTs of size m, 
-          // each one takes a decimated version of the input
-          work(stage+1, xout , xin, fstride*p,in_stride);
-          xin += fstride*in_stride;
-        }while( (xout += m) != Fout_end );
-      }else{
-          do{
-              *xout = *xin;
-              xin += fstride*in_stride;
-          }while(++xout != Fout_end );
-      }
-      xout=Fout_beg;
-
-      // recombine the p smaller DFTs 
-      switch (p) {
-        case 2: bfly2(xout,fstride,m); break;
-        case 3: bfly3(xout,fstride,m); break;
-        case 4: bfly4(xout,fstride,m); break;
-        case 5: bfly5(xout,fstride,m); break;
-        default: bfly_generic(xout,fstride,m,p); break;
-      }
-    }
-
-    void bfly2( Complex * Fout, const size_t fstride, int m)
-    {
-      for (int k=0;k<m;++k) {
-        Complex t = Fout[m+k] * m_twiddles[k*fstride];
-        Fout[m+k] = Fout[k] - t;
-        Fout[k] += t;
-      }
-    }
-
-    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex scratch[6];
-      int negative_if_inverse = m_inverse * -2 +1;
-      for (size_t k=0;k<m;++k) {
-        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
-        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
-        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
-        scratch[5] = Fout[k] - scratch[1];
-
-        Fout[k] += scratch[1];
-        scratch[3] = scratch[0] + scratch[2];
-        scratch[4] = scratch[0] - scratch[2];
-        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
-
-        Fout[k+2*m]  = Fout[k] - scratch[3];
-        Fout[k] += scratch[3];
-        Fout[k+m] = scratch[5] + scratch[4];
-        Fout[k+3*m] = scratch[5] - scratch[4];
-      }
-    }
-
-    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      size_t k=m;
-      const size_t m2 = 2*m;
-      Complex *tw1,*tw2;
-      Complex scratch[5];
-      Complex epi3;
-      epi3 = m_twiddles[fstride*m];
-
-      tw1=tw2=&m_twiddles[0];
-
-      do{
-        scratch[1]=Fout[m] * *tw1;
-        scratch[2]=Fout[m2] * *tw2;
-
-        scratch[3]=scratch[1]+scratch[2];
-        scratch[0]=scratch[1]-scratch[2];
-        tw1 += fstride;
-        tw2 += fstride*2;
-        Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
-        scratch[0] *= epi3.imag();
-        *Fout += scratch[3];
-        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
-        ++Fout;
-      }while(--k);
-    }
-
-    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-      size_t u;
-      Complex scratch[13];
-      Complex * twiddles = &m_twiddles[0];
-      Complex *tw;
-      Complex ya,yb;
-      ya = twiddles[fstride*m];
-      yb = twiddles[fstride*2*m];
-
-      Fout0=Fout;
-      Fout1=Fout0+m;
-      Fout2=Fout0+2*m;
-      Fout3=Fout0+3*m;
-      Fout4=Fout0+4*m;
-
-      tw=twiddles;
-      for ( u=0; u<m; ++u ) {
-        scratch[0] = *Fout0;
-
-        scratch[1]  = *Fout1 * tw[u*fstride];
-        scratch[2]  = *Fout2 * tw[2*u*fstride];
-        scratch[3]  = *Fout3 * tw[3*u*fstride];
-        scratch[4]  = *Fout4 * tw[4*u*fstride];
-
-        scratch[7] = scratch[1] + scratch[4];
-        scratch[10] = scratch[1] - scratch[4];
-        scratch[8] = scratch[2] + scratch[3];
-        scratch[9] = scratch[2] - scratch[3];
-
-        *Fout0 +=  scratch[7];
-        *Fout0 +=  scratch[8];
-
-        scratch[5] = scratch[0] + Complex(
-            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
-            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
-            );
-
-        scratch[6] = Complex(
-            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
-            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
-            );
-
-        *Fout1 = scratch[5] - scratch[6];
-        *Fout4 = scratch[5] + scratch[6];
-
-        scratch[11] = scratch[0] +
-          Complex(
-              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
-              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
-              );
-
-        scratch[12] = Complex(
-            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
-            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
-            );
-
-        *Fout2=scratch[11]+scratch[12];
-        *Fout3=scratch[11]-scratch[12];
-
-        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-      }
-    }
-
-    /* perform the butterfly for one stage of a mixed radix FFT */
-    void bfly_generic(
-        Complex * Fout,
-        const size_t fstride,
-        int m,
-        int p
-        )
-    {
-      int u,k,q1,q;
-      Complex * twiddles = &m_twiddles[0];
-      Complex t;
-      int Norig = m_nfft;
-      Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
-
-      for ( u=0; u<m; ++u ) {
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          scratchbuf[q1] = Fout[ k  ];
-          k += m;
+        /* 
+         * for some reason this does not work
+         *
+        typedef typename std::map<int,PlanData>::iterator MapIt;
+        MapIt it;
+        it = m_plans.find( PlanKey(nfft,inverse) );
+        if (it == m_plans.end() ) {
+            // create new entry
+            it = m_plans.insert( make_pair( PlanKey(nfft,inverse) , PlanData() ) );
+            MapIt it2 = m_plans.find( PlanKey(nfft,!inverse) );
+            if (it2 != m_plans.end() ) {
+                it->second = it2.second;
+                it->second.invert();
+            }else{
+                it->second.make_twiddles(nfft,inverse);
+                it->second.factorize(nfft);
+            }
         }
-
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          int twidx=0;
-          Fout[ k ] = scratchbuf[0];
-          for (q=1;q<p;++q ) {
-            twidx += fstride * k;
-            if (twidx>=Norig) twidx-=Norig;
-            t=scratchbuf[q] * twiddles[twidx];
-            Fout[ k ] += t;
-          }
-          k += m;
+        return it->second;
+        */
+        PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
+        if ( pd.m_twiddles.size() == 0 ) {
+            pd.make_twiddles(nfft,inverse);
+            pd.factorize(nfft);
         }
-      }
+        return pd;
     }
 
-    int m_nfft;
-    bool m_inverse;
-    std::vector<Complex> m_twiddles;
-    std::vector<Complex> m_realTwiddles;
-    std::vector<int> m_stageRadix;
-    std::vector<int> m_stageRemainder;
-/*
-    enum {FORWARD,INVERSE,REAL,COMPLEX};
-
-    struct PlanKey
+    Complex * real_twiddles(int ncfft2)
     {
-        PlanKey(int nfft,bool isinverse,bool iscomplex)
-        {
-            _key = (nfft<<2) | (isinverse<<1) | iscomplex;
+        std::vector<Complex> & twidref = m_realTwiddles[ncfft2];// creates new if not there
+        if ( (int)twidref.size() != ncfft2 ) {
+            twidref.resize(ncfft2);
+            int ncfft= ncfft2<<1;
+            Scalar pi =  acos( Scalar(-1) );
+            for (int k=1;k<=ncfft2;++k) 
+                twidref[k-1] = exp( Complex(0,-pi * ((double) (k) / ncfft + .5) ) );
         }
+        return &twidref[0];
+    }
 
-        bool operator<(const PlanKey & other) const
-        {
-            return this->_key < other._key;
-        }
-        int _key;
-    };
-
-    struct PlanData
+    void scale(Complex *dst,int n,Scalar s) 
     {
-        std::vector<Complex> m_twiddles;
-    };
-
-    std::map<PlanKey,
-*/
+        for (int k=0;k<n;++k)
+            dst[k] *= s;
+    }
   };
 }
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index daf397790..32d1393d0 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -44,7 +44,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
     {
         long double totalpower=0;
         long double difpower=0;
-        cerr <<"idx\ttruth\t\tvalue\n";
+        cerr <<"idx\ttruth\t\tvalue\t|dif|=\n";
         for (size_t k0=0;k0<fftbuf.size();++k0) {
             complex<long double> acc = 0;
             long double phinc = -2.*k0* M_PIl / timebuf.size();
@@ -55,7 +55,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
             complex<long double> x = promote(fftbuf[k0]); 
             complex<long double> dif = acc - x;
             difpower += norm(dif);
-            cerr << k0 << "\t" << acc << "\t" <<  x << endl;
+            cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(norm(dif)) << endl;
         }
         cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
         return sqrt(difpower/totalpower);
@@ -127,8 +127,9 @@ void test_FFT()
 #endif
 
 #if 1
-  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
+  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); CALL_SUBTEST( test_scalar<long double>(50) );
   CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 #endif

From 09b47332553a79dab30516e6b1d410dea90cf9b7 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Mon, 25 May 2009 23:52:21 -0400
Subject: [PATCH 10/34] added real-optimized inverse FFT (NFFT must be multiple
 of 4)

---
 bench/benchFFT.cpp                          |  30 +-
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 688 ++++++++++----------
 2 files changed, 370 insertions(+), 348 deletions(-)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index ffa4ffffc..14f5063fb 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -53,7 +53,7 @@ template <> string nameof<long double>() {return "long double";}
 using namespace Eigen;
 
 template <typename T>
-void bench(int nfft)
+void bench(int nfft,bool fwd)
 {
     typedef typename NumTraits<T>::Real Scalar;
     typedef typename std::complex<Scalar> Complex;
@@ -69,7 +69,10 @@ void bench(int nfft)
     for (int k=0;k<8;++k) {
         timer.start();
         for(int i = 0; i < nits; i++)
-            fft.fwd( outbuf , inbuf);
+            if (fwd)
+                fft.fwd( outbuf , inbuf);
+            else
+                fft.inv(inbuf,outbuf);
         timer.stop();
     }
 
@@ -82,16 +85,27 @@ void bench(int nfft)
         mflops /= 2;
     }
 
+    if (fwd)
+        cout << " fwd";
+    else
+        cout << " inv";
+
     cout << " NFFT=" << nfft << "  " << (double(1e-6*nfft*nits)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
 }
 
 int main(int argc,char ** argv)
 {
-    bench<complex<float> >(NFFT);
-    bench<float>(NFFT);
-    bench<complex<double> >(NFFT);
-    bench<double>(NFFT);
-    bench<complex<long double> >(NFFT);
-    bench<long double>(NFFT);
+    bench<complex<float> >(NFFT,true);
+    bench<complex<float> >(NFFT,false);
+    bench<float>(NFFT,true);
+    bench<float>(NFFT,false);
+    bench<complex<double> >(NFFT,true);
+    bench<complex<double> >(NFFT,false);
+    bench<double>(NFFT,true);
+    bench<double>(NFFT,false);
+    bench<complex<long double> >(NFFT,true);
+    bench<complex<long double> >(NFFT,false);
+    bench<long double>(NFFT,true);
+    bench<long double>(NFFT,false);
     return 0;
 }
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 3580e6c61..453c7f6da 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -28,390 +28,398 @@
 
 namespace Eigen {
 
-    template <typename _Scalar>
-    struct ei_kiss_cpx_fft
+  template <typename _Scalar>
+  struct ei_kiss_cpx_fft
+  {
+    typedef  _Scalar Scalar;
+    typedef  std::complex<Scalar> Complex;
+    std::vector<Complex> m_twiddles;
+    std::vector<int> m_stageRadix;
+    std::vector<int> m_stageRemainder;
+    bool m_inverse;
+
+    void make_twiddles(int nfft,bool inverse)
     {
-        typedef  _Scalar Scalar;
-        typedef  std::complex<Scalar> Complex;
-        std::vector<Complex> m_twiddles;
-        std::vector<int> m_stageRadix;
-        std::vector<int> m_stageRemainder;
-        bool m_inverse;
+      m_inverse = inverse;
+      m_twiddles.resize(nfft);
+      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+      for (int i=0;i<nfft;++i)
+        m_twiddles[i] = exp( Complex(0,i*phinc) );
+    }
 
-        ei_kiss_cpx_fft() { }
+    void conjugate()
+    {
+      m_inverse = !m_inverse;
+      for ( size_t i=0;i<m_twiddles.size() ;++i)
+        m_twiddles[i] = conj( m_twiddles[i] );
+    }
 
-        void make_twiddles(int nfft,bool inverse)
-        {
-            m_inverse = inverse;
-            m_twiddles.resize(nfft);
-            Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-            for (int i=0;i<nfft;++i)
-                m_twiddles[i] = exp( Complex(0,i*phinc) );
+    void factorize(int nfft)
+    {
+      //start factoring out 4's, then 2's, then 3,5,7,9,...
+      int n= nfft;
+      int p=4;
+      do {
+        while (n % p) {
+          switch (p) {
+            case 4: p = 2; break;
+            case 2: p = 3; break;
+            default: p += 2; break;
+          }
+          if (p*p>n)
+            p=n;// impossible to have a factor > sqrt(n)
+        }
+        n /= p;
+        m_stageRadix.push_back(p);
+        m_stageRemainder.push_back(n);
+      }while(n>1);
+    }
+
+    template <typename _Src>
+    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+    {
+      int p = m_stageRadix[stage];
+      int m = m_stageRemainder[stage];
+      Complex * Fout_beg = xout;
+      Complex * Fout_end = xout + p*m;
+
+      if (m>1) {
+        do{
+          // recursive call:
+          // DFT of size m*p performed by doing
+          // p instances of smaller DFTs of size m, 
+          // each one takes a decimated version of the input
+          work(stage+1, xout , xin, fstride*p,in_stride);
+          xin += fstride*in_stride;
+        }while( (xout += m) != Fout_end );
+      }else{
+        do{
+          *xout = *xin;
+          xin += fstride*in_stride;
+        }while(++xout != Fout_end );
+      }
+      xout=Fout_beg;
+
+      // recombine the p smaller DFTs 
+      switch (p) {
+        case 2: bfly2(xout,fstride,m); break;
+        case 3: bfly3(xout,fstride,m); break;
+        case 4: bfly4(xout,fstride,m); break;
+        case 5: bfly5(xout,fstride,m); break;
+        default: bfly_generic(xout,fstride,m,p); break;
+      }
+    }
+
+    void bfly2( Complex * Fout, const size_t fstride, int m)
+    {
+      for (int k=0;k<m;++k) {
+        Complex t = Fout[m+k] * m_twiddles[k*fstride];
+        Fout[m+k] = Fout[k] - t;
+        Fout[k] += t;
+      }
+    }
+
+    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex scratch[6];
+      int negative_if_inverse = m_inverse * -2 +1;
+      for (size_t k=0;k<m;++k) {
+        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+        scratch[5] = Fout[k] - scratch[1];
+
+        Fout[k] += scratch[1];
+        scratch[3] = scratch[0] + scratch[2];
+        scratch[4] = scratch[0] - scratch[2];
+        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+        Fout[k+2*m]  = Fout[k] - scratch[3];
+        Fout[k] += scratch[3];
+        Fout[k+m] = scratch[5] + scratch[4];
+        Fout[k+3*m] = scratch[5] - scratch[4];
+      }
+    }
+
+    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      size_t k=m;
+      const size_t m2 = 2*m;
+      Complex *tw1,*tw2;
+      Complex scratch[5];
+      Complex epi3;
+      epi3 = m_twiddles[fstride*m];
+
+      tw1=tw2=&m_twiddles[0];
+
+      do{
+        scratch[1]=Fout[m] * *tw1;
+        scratch[2]=Fout[m2] * *tw2;
+
+        scratch[3]=scratch[1]+scratch[2];
+        scratch[0]=scratch[1]-scratch[2];
+        tw1 += fstride;
+        tw2 += fstride*2;
+        Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
+        scratch[0] *= epi3.imag();
+        *Fout += scratch[3];
+        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+        ++Fout;
+      }while(--k);
+    }
+
+    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+      size_t u;
+      Complex scratch[13];
+      Complex * twiddles = &m_twiddles[0];
+      Complex *tw;
+      Complex ya,yb;
+      ya = twiddles[fstride*m];
+      yb = twiddles[fstride*2*m];
+
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      tw=twiddles;
+      for ( u=0; u<m; ++u ) {
+        scratch[0] = *Fout0;
+
+        scratch[1]  = *Fout1 * tw[u*fstride];
+        scratch[2]  = *Fout2 * tw[2*u*fstride];
+        scratch[3]  = *Fout3 * tw[3*u*fstride];
+        scratch[4]  = *Fout4 * tw[4*u*fstride];
+
+        scratch[7] = scratch[1] + scratch[4];
+        scratch[10] = scratch[1] - scratch[4];
+        scratch[8] = scratch[2] + scratch[3];
+        scratch[9] = scratch[2] - scratch[3];
+
+        *Fout0 +=  scratch[7];
+        *Fout0 +=  scratch[8];
+
+        scratch[5] = scratch[0] + Complex(
+            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
+            );
+
+        scratch[6] = Complex(
+            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+            );
+
+        *Fout1 = scratch[5] - scratch[6];
+        *Fout4 = scratch[5] + scratch[6];
+
+        scratch[11] = scratch[0] +
+          Complex(
+              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+              );
+
+        scratch[12] = Complex(
+            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+            );
+
+        *Fout2=scratch[11]+scratch[12];
+        *Fout3=scratch[11]-scratch[12];
+
+        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+    }
+
+    /* perform the butterfly for one stage of a mixed radix FFT */
+    void bfly_generic(
+        Complex * Fout,
+        const size_t fstride,
+        int m,
+        int p
+        )
+    {
+      int u,k,q1,q;
+      Complex * twiddles = &m_twiddles[0];
+      Complex t;
+      int Norig = m_twiddles.size();
+      Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+
+      for ( u=0; u<m; ++u ) {
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          scratchbuf[q1] = Fout[ k  ];
+          k += m;
         }
 
-        void invert()
-        {
-            m_inverse = !m_inverse;
-            for ( size_t i=0;i<m_twiddles.size() ;++i)
-                m_twiddles[i] = conj( m_twiddles[i] );
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          int twidx=0;
+          Fout[ k ] = scratchbuf[0];
+          for (q=1;q<p;++q ) {
+            twidx += fstride * k;
+            if (twidx>=Norig) twidx-=Norig;
+            t=scratchbuf[q] * twiddles[twidx];
+            Fout[ k ] += t;
+          }
+          k += m;
         }
-
-        void factorize(int nfft)
-        {
-            if (m_stageRadix.size()==0 || m_stageRadix[0] * m_stageRemainder[0] != nfft)
-            {
-                m_stageRadix.resize(0);
-                m_stageRemainder.resize(0);
-                //factorize
-                //start factoring out 4's, then 2's, then 3,5,7,9,...
-                int n= nfft;
-                int p=4;
-                do {
-                    while (n % p) {
-                        switch (p) {
-                            case 4: p = 2; break;
-                            case 2: p = 3; break;
-                            default: p += 2; break;
-                        }
-                        if (p*p>n)
-                            p=n;// impossible to have a factor > sqrt(n)
-                    }
-                    n /= p;
-                    m_stageRadix.push_back(p);
-                    m_stageRemainder.push_back(n);
-                }while(n>1);
-            }
-        }
-
-        template <typename _Src>
-            void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
-            {
-                int p = m_stageRadix[stage];
-                int m = m_stageRemainder[stage];
-                Complex * Fout_beg = xout;
-                Complex * Fout_end = xout + p*m;
-
-                if (m>1) {
-                    do{
-                        // recursive call:
-                        // DFT of size m*p performed by doing
-                        // p instances of smaller DFTs of size m, 
-                        // each one takes a decimated version of the input
-                        work(stage+1, xout , xin, fstride*p,in_stride);
-                        xin += fstride*in_stride;
-                    }while( (xout += m) != Fout_end );
-                }else{
-                    do{
-                        *xout = *xin;
-                        xin += fstride*in_stride;
-                    }while(++xout != Fout_end );
-                }
-                xout=Fout_beg;
-
-                // recombine the p smaller DFTs 
-                switch (p) {
-                    case 2: bfly2(xout,fstride,m); break;
-                    case 3: bfly3(xout,fstride,m); break;
-                    case 4: bfly4(xout,fstride,m); break;
-                    case 5: bfly5(xout,fstride,m); break;
-                    default: bfly_generic(xout,fstride,m,p); break;
-                }
-            }
-
-        void bfly2( Complex * Fout, const size_t fstride, int m)
-        {
-            for (int k=0;k<m;++k) {
-                Complex t = Fout[m+k] * m_twiddles[k*fstride];
-                Fout[m+k] = Fout[k] - t;
-                Fout[k] += t;
-            }
-        }
-
-        void bfly4( Complex * Fout, const size_t fstride, const size_t m)
-        {
-            Complex scratch[6];
-            int negative_if_inverse = m_inverse * -2 +1;
-            for (size_t k=0;k<m;++k) {
-                scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
-                scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
-                scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
-                scratch[5] = Fout[k] - scratch[1];
-
-                Fout[k] += scratch[1];
-                scratch[3] = scratch[0] + scratch[2];
-                scratch[4] = scratch[0] - scratch[2];
-                scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
-
-                Fout[k+2*m]  = Fout[k] - scratch[3];
-                Fout[k] += scratch[3];
-                Fout[k+m] = scratch[5] + scratch[4];
-                Fout[k+3*m] = scratch[5] - scratch[4];
-            }
-        }
-
-        void bfly3( Complex * Fout, const size_t fstride, const size_t m)
-        {
-            size_t k=m;
-            const size_t m2 = 2*m;
-            Complex *tw1,*tw2;
-            Complex scratch[5];
-            Complex epi3;
-            epi3 = m_twiddles[fstride*m];
-
-            tw1=tw2=&m_twiddles[0];
-
-            do{
-                scratch[1]=Fout[m] * *tw1;
-                scratch[2]=Fout[m2] * *tw2;
-
-                scratch[3]=scratch[1]+scratch[2];
-                scratch[0]=scratch[1]-scratch[2];
-                tw1 += fstride;
-                tw2 += fstride*2;
-                Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
-                scratch[0] *= epi3.imag();
-                *Fout += scratch[3];
-                Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-                Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
-                ++Fout;
-            }while(--k);
-        }
-
-        void bfly5( Complex * Fout, const size_t fstride, const size_t m)
-        {
-            Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-            size_t u;
-            Complex scratch[13];
-            Complex * twiddles = &m_twiddles[0];
-            Complex *tw;
-            Complex ya,yb;
-            ya = twiddles[fstride*m];
-            yb = twiddles[fstride*2*m];
-
-            Fout0=Fout;
-            Fout1=Fout0+m;
-            Fout2=Fout0+2*m;
-            Fout3=Fout0+3*m;
-            Fout4=Fout0+4*m;
-
-            tw=twiddles;
-            for ( u=0; u<m; ++u ) {
-                scratch[0] = *Fout0;
-
-                scratch[1]  = *Fout1 * tw[u*fstride];
-                scratch[2]  = *Fout2 * tw[2*u*fstride];
-                scratch[3]  = *Fout3 * tw[3*u*fstride];
-                scratch[4]  = *Fout4 * tw[4*u*fstride];
-
-                scratch[7] = scratch[1] + scratch[4];
-                scratch[10] = scratch[1] - scratch[4];
-                scratch[8] = scratch[2] + scratch[3];
-                scratch[9] = scratch[2] - scratch[3];
-
-                *Fout0 +=  scratch[7];
-                *Fout0 +=  scratch[8];
-
-                scratch[5] = scratch[0] + Complex(
-                        (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
-                        (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
-                        );
-
-                scratch[6] = Complex(
-                        (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
-                        -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
-                        );
-
-                *Fout1 = scratch[5] - scratch[6];
-                *Fout4 = scratch[5] + scratch[6];
-
-                scratch[11] = scratch[0] +
-                    Complex(
-                            (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
-                            (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
-                           );
-
-                scratch[12] = Complex(
-                        -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
-                        (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
-                        );
-
-                *Fout2=scratch[11]+scratch[12];
-                *Fout3=scratch[11]-scratch[12];
-
-                ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-            }
-        }
-
-        /* perform the butterfly for one stage of a mixed radix FFT */
-        void bfly_generic(
-                Complex * Fout,
-                const size_t fstride,
-                int m,
-                int p
-                )
-        {
-            int u,k,q1,q;
-            Complex * twiddles = &m_twiddles[0];
-            Complex t;
-            int Norig = m_twiddles.size();
-            Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
-
-            for ( u=0; u<m; ++u ) {
-                k=u;
-                for ( q1=0 ; q1<p ; ++q1 ) {
-                    scratchbuf[q1] = Fout[ k  ];
-                    k += m;
-                }
-
-                k=u;
-                for ( q1=0 ; q1<p ; ++q1 ) {
-                    int twidx=0;
-                    Fout[ k ] = scratchbuf[0];
-                    for (q=1;q<p;++q ) {
-                        twidx += fstride * k;
-                        if (twidx>=Norig) twidx-=Norig;
-                        t=scratchbuf[q] * twiddles[twidx];
-                        Fout[ k ] += t;
-                    }
-                    k += m;
-                }
-            }
-        }
-    };
-
+      }
+    }
+  };
 
   template <typename _Scalar>
   struct ei_kissfft_impl
   {
-    typedef _Scalar Scalar;
-    typedef std::complex<Scalar> Complex;
-    ei_kissfft_impl() {} 
+      typedef _Scalar Scalar;
+      typedef std::complex<Scalar> Complex;
 
-    void clear() 
-    {
+      void clear() 
+      {
         m_plans.clear();
         m_realTwiddles.clear();
-    }
+      }
 
-    template <typename _Src>
-    void fwd( Complex * dst,const _Src *src,int nfft)
-    {
+      template <typename _Src>
+      void fwd( Complex * dst,const _Src *src,int nfft)
+      {
         get_plan(nfft,false).work(0, dst, src, 1,1);
-    }
+      }
 
-    // real-to-complex forward FFT
-    // perform two FFTs of src even and src odd
-    // then twiddle to recombine them into the half-spectrum format
-    // then fill in the conjugate symmetric half
-    void fwd( Complex * dst,const Scalar * src,int nfft) 
-    {
+      // real-to-complex forward FFT
+      // perform two FFTs of src even and src odd
+      // then twiddle to recombine them into the half-spectrum format
+      // then fill in the conjugate symmetric half
+      void fwd( Complex * dst,const Scalar * src,int nfft) 
+      {
         if ( nfft&3  ) {
-            // use generic mode for odd
-            get_plan(nfft,false).work(0, dst, src, 1,1);
+          // use generic mode for odd
+          get_plan(nfft,false).work(0, dst, src, 1,1);
         }else{
-            int ncfft = nfft>>1;
-            int ncfft2 = nfft>>2;
-            Complex * rtw = real_twiddles(ncfft2);
+          int ncfft = nfft>>1;
+          int ncfft2 = nfft>>2;
+          Complex * rtw = real_twiddles(ncfft2);
 
-            // use optimized mode for even real
-            fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
-            Complex dc = dst[0].real() +  dst[0].imag();
-            Complex nyquist = dst[0].real() -  dst[0].imag();
-            int k;
-            for ( k=1;k <= ncfft2 ; ++k ) {
-                Complex fpk = dst[k];
-                Complex fpnk = conj(dst[ncfft-k]);
-                Complex f1k = fpk + fpnk;
-                Complex f2k = fpk - fpnk;
-                Complex tw= f2k * rtw[k-1];
+          // use optimized mode for even real
+          fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
+          Complex dc = dst[0].real() +  dst[0].imag();
+          Complex nyquist = dst[0].real() -  dst[0].imag();
+          int k;
+          for ( k=1;k <= ncfft2 ; ++k ) {
+            Complex fpk = dst[k];
+            Complex fpnk = conj(dst[ncfft-k]);
+            Complex f1k = fpk + fpnk;
+            Complex f2k = fpk - fpnk;
+            Complex tw= f2k * rtw[k-1];
+            dst[k] =  (f1k + tw) * Scalar(.5);
+            dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
+          }
 
-                dst[k] =  (f1k + tw) * Scalar(.5);
-                dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
-            }
- 
-            // place conjugate-symmetric half at the end for completeness
-            // TODO: make this configurable ( opt-out )
-            for ( k=1;k < ncfft ; ++k )
-                dst[nfft-k] = conj(dst[k]);
-
-            dst[0] = dc;
-            dst[ncfft] = nyquist;
+          // place conjugate-symmetric half at the end for completeness
+          // TODO: make this configurable ( opt-out )
+          for ( k=1;k < ncfft ; ++k )
+            dst[nfft-k] = conj(dst[k]);
+          dst[0] = dc;
+          dst[ncfft] = nyquist;
         }
-    }
+      }
 
-    // half-complex to scalar
-    void inv( Scalar * dst,const Complex * src,int nfft) 
-    {
-        // TODO add optimized version for even numbers
-        std::vector<Complex> tmp(nfft);
-        inv(&tmp[0],src,nfft);
-        for (int k=0;k<nfft;++k)
-            dst[k] = tmp[k].real();
-    }
-
-    void inv(Complex * dst,const Complex  *src,int nfft)
-    {
+      // inverse complex-to-complex
+      void inv(Complex * dst,const Complex  *src,int nfft)
+      {
         get_plan(nfft,true).work(0, dst, src, 1,1);
         scale(dst, nfft, Scalar(1)/nfft );
-    }
+      }
 
-    private:
+      // half-complex to scalar
+      void inv( Scalar * dst,const Complex * src,int nfft) 
+      {
+        if (nfft&3) {
+          m_scratchBuf.resize(nfft);
+          inv(&m_scratchBuf[0],src,nfft);
+          for (int k=0;k<nfft;++k)
+            dst[k] = m_scratchBuf[k].real();
+        }else{
+          // optimized version for multiple of 4
+          int ncfft = nfft>>1;
+          int ncfft2 = nfft>>2;
+          Complex * rtw = real_twiddles(ncfft2);
+          m_scratchBuf.resize(ncfft);
+          m_scratchBuf[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
+          for (int k = 1; k <= ncfft / 2; ++k) {
+            Complex fk = src[k];
+            Complex fnkc = conj(src[ncfft-k]);
+            Complex fek = fk + fnkc;
+            Complex tmp = fk - fnkc;
+            Complex fok = tmp * conj(rtw[k-1]);
+            m_scratchBuf[k] = fek + fok;
+            m_scratchBuf[ncfft-k] = conj(fek - fok);
+          }
+          scale(&m_scratchBuf[0], ncfft, Scalar(1)/nfft );
+          get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_scratchBuf[0], 1,1);
+        }
+      }
 
-    typedef ei_kiss_cpx_fft<Scalar> PlanData;
+  private:
 
-    typedef std::map<int,PlanData> PlanMap;
-    PlanMap m_plans;
-    std::map<int, std::vector<Complex> > m_realTwiddles;
+      typedef ei_kiss_cpx_fft<Scalar> PlanData;
 
-    int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
+      typedef std::map<int,PlanData> PlanMap;
+      PlanMap m_plans;
+      std::map<int, std::vector<Complex> > m_realTwiddles;
+      std::vector<Complex> m_scratchBuf;
 
-    PlanData & get_plan(int nfft,bool inverse)
-    {
-        /* 
+      int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
+
+      PlanData & get_plan(int nfft,bool inverse)
+      {
+        /*  TODO: figure out why this does not work (g++ 4.3.2)
          * for some reason this does not work
          *
-        typedef typename std::map<int,PlanData>::iterator MapIt;
-        MapIt it;
-        it = m_plans.find( PlanKey(nfft,inverse) );
-        if (it == m_plans.end() ) {
-            // create new entry
-            it = m_plans.insert( make_pair( PlanKey(nfft,inverse) , PlanData() ) );
-            MapIt it2 = m_plans.find( PlanKey(nfft,!inverse) );
-            if (it2 != m_plans.end() ) {
-                it->second = it2.second;
-                it->second.invert();
-            }else{
-                it->second.make_twiddles(nfft,inverse);
-                it->second.factorize(nfft);
-            }
+         PlanMap::iterator it;
+         it = m_plans.find( PlanKey(nfft,inverse) );
+         if (it == m_plans.end() ) {
+        // create new entry
+        it = m_plans.insert( make_pair( PlanKey(nfft,inverse) , PlanData() ) );
+        MapIt it2 = m_plans.find( PlanKey(nfft,!inverse) );
+        if (it2 != m_plans.end() ) {
+        it->second = it2.second;
+        it->second.conjugate();
+        }else{
+        it->second.make_twiddles(nfft,inverse);
+        it->second.factorize(nfft);
+        }
         }
         return it->second;
         */
         PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
         if ( pd.m_twiddles.size() == 0 ) {
-            pd.make_twiddles(nfft,inverse);
-            pd.factorize(nfft);
+          pd.make_twiddles(nfft,inverse);
+          pd.factorize(nfft);
         }
         return pd;
-    }
+      }
 
-    Complex * real_twiddles(int ncfft2)
-    {
+      Complex * real_twiddles(int ncfft2)
+      {
         std::vector<Complex> & twidref = m_realTwiddles[ncfft2];// creates new if not there
         if ( (int)twidref.size() != ncfft2 ) {
-            twidref.resize(ncfft2);
-            int ncfft= ncfft2<<1;
-            Scalar pi =  acos( Scalar(-1) );
-            for (int k=1;k<=ncfft2;++k) 
-                twidref[k-1] = exp( Complex(0,-pi * ((double) (k) / ncfft + .5) ) );
+          twidref.resize(ncfft2);
+          int ncfft= ncfft2<<1;
+          Scalar pi =  acos( Scalar(-1) );
+          for (int k=1;k<=ncfft2;++k) 
+            twidref[k-1] = exp( Complex(0,-pi * ((double) (k) / ncfft + .5) ) );
         }
         return &twidref[0];
-    }
+      }
 
-    void scale(Complex *dst,int n,Scalar s) 
-    {
+      void scale(Complex *dst,int n,Scalar s) 
+      {
         for (int k=0;k<n;++k)
-            dst[k] *= s;
-    }
+          dst[k] *= s;
+      }
   };
 }

From f13e000b454fed0a9b0d71ca1481d6894bacd8eb Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 27 May 2009 21:32:42 -0400
Subject: [PATCH 11/34] various comment changes

---
 unsupported/Eigen/FFT                       |   6 +-
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 472 ++++++++++----------
 2 files changed, 231 insertions(+), 247 deletions(-)

diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 3d852f5a2..31d8c74c5 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -1,5 +1,5 @@
 // This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
+// for linear algebra. 
 //
 // Copyright (C) 2009 Mark Borgerding mark a borgerding net
 //
@@ -29,14 +29,14 @@
 #include "src/FFT/ei_kissfft_impl.h"
 #define DEFAULT_FFT_IMPL ei_kissfft_impl
 
-// FFTW: faster, GPL-not LGPL, bigger code size
+// FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #ifdef FFTW_PATIENT  // definition of FFTW_PATIENT indicates the caller has included fftw3.h, we can use FFTW routines
 // TODO 
 // #include "src/FFT/ei_fftw_impl.h"
 // #define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
-// intel Math Kernel Library: fastest, commerical
+// intel Math Kernel Library: fastest, commerical -- incompatible with Eigen in GPL form
 #ifdef _MKL_DFTI_H_ // mkl_dfti.h has been included, we can use MKL FFT routines
 // TODO 
 // #include "src/FFT/ei_imkl_impl.h"
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 453c7f6da..91fa5ca18 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -1,5 +1,5 @@
 // This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
+// for linear algebra.
 //
 // Copyright (C) 2009 Mark Borgerding mark a borgerding net
 //
@@ -28,252 +28,255 @@
 
 namespace Eigen {
 
+  // This FFT implementation was derived from kissfft http:sourceforge.net/projects/kissfft
+  // Copyright 2003-2009 Mark Borgerding
+
   template <typename _Scalar>
-  struct ei_kiss_cpx_fft
-  {
-    typedef  _Scalar Scalar;
-    typedef  std::complex<Scalar> Complex;
-    std::vector<Complex> m_twiddles;
-    std::vector<int> m_stageRadix;
-    std::vector<int> m_stageRemainder;
-    bool m_inverse;
-
-    void make_twiddles(int nfft,bool inverse)
+    struct ei_kiss_cpx_fft
     {
-      m_inverse = inverse;
-      m_twiddles.resize(nfft);
-      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-      for (int i=0;i<nfft;++i)
-        m_twiddles[i] = exp( Complex(0,i*phinc) );
-    }
+      typedef _Scalar Scalar;
+      typedef std::complex<Scalar> Complex;
+      std::vector<Complex> m_twiddles;
+      std::vector<int> m_stageRadix;
+      std::vector<int> m_stageRemainder;
+      bool m_inverse;
 
-    void conjugate()
-    {
-      m_inverse = !m_inverse;
-      for ( size_t i=0;i<m_twiddles.size() ;++i)
-        m_twiddles[i] = conj( m_twiddles[i] );
-    }
+      void make_twiddles(int nfft,bool inverse)
+      {
+        m_inverse = inverse;
+        m_twiddles.resize(nfft);
+        Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+        for (int i=0;i<nfft;++i)
+          m_twiddles[i] = exp( Complex(0,i*phinc) );
+      }
 
-    void factorize(int nfft)
-    {
-      //start factoring out 4's, then 2's, then 3,5,7,9,...
-      int n= nfft;
-      int p=4;
-      do {
-        while (n % p) {
-          switch (p) {
-            case 4: p = 2; break;
-            case 2: p = 3; break;
-            default: p += 2; break;
+      void conjugate()
+      {
+        m_inverse = !m_inverse;
+        for ( size_t i=0;i<m_twiddles.size() ;++i)
+          m_twiddles[i] = conj( m_twiddles[i] );
+      }
+
+      void factorize(int nfft)
+      {
+        //start factoring out 4's, then 2's, then 3,5,7,9,...
+        int n= nfft;
+        int p=4;
+        do {
+          while (n % p) {
+            switch (p) {
+              case 4: p = 2; break;
+              case 2: p = 3; break;
+              default: p += 2; break;
+            }
+            if (p*p>n)
+              p=n;// impossible to have a factor > sqrt(n)
+          }
+          n /= p;
+          m_stageRadix.push_back(p);
+          m_stageRemainder.push_back(n);
+        }while(n>1);
+      }
+
+      template <typename _Src>
+        void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+        {
+          int p = m_stageRadix[stage];
+          int m = m_stageRemainder[stage];
+          Complex * Fout_beg = xout;
+          Complex * Fout_end = xout + p*m;
+
+          if (m>1) {
+            do{
+              // recursive call:
+              // DFT of size m*p performed by doing
+              // p instances of smaller DFTs of size m, 
+              // each one takes a decimated version of the input
+              work(stage+1, xout , xin, fstride*p,in_stride);
+              xin += fstride*in_stride;
+            }while( (xout += m) != Fout_end );
+          }else{
+            do{
+              *xout = *xin;
+              xin += fstride*in_stride;
+            }while(++xout != Fout_end );
+          }
+          xout=Fout_beg;
+
+          // recombine the p smaller DFTs 
+          switch (p) {
+            case 2: bfly2(xout,fstride,m); break;
+            case 3: bfly3(xout,fstride,m); break;
+            case 4: bfly4(xout,fstride,m); break;
+            case 5: bfly5(xout,fstride,m); break;
+            default: bfly_generic(xout,fstride,m,p); break;
           }
-          if (p*p>n)
-            p=n;// impossible to have a factor > sqrt(n)
         }
-        n /= p;
-        m_stageRadix.push_back(p);
-        m_stageRemainder.push_back(n);
-      }while(n>1);
-    }
 
-    template <typename _Src>
-    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
-    {
-      int p = m_stageRadix[stage];
-      int m = m_stageRemainder[stage];
-      Complex * Fout_beg = xout;
-      Complex * Fout_end = xout + p*m;
+      void bfly2( Complex * Fout, const size_t fstride, int m)
+      {
+        for (int k=0;k<m;++k) {
+          Complex t = Fout[m+k] * m_twiddles[k*fstride];
+          Fout[m+k] = Fout[k] - t;
+          Fout[k] += t;
+        }
+      }
+
+      void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+      {
+        Complex scratch[6];
+        int negative_if_inverse = m_inverse * -2 +1;
+        for (size_t k=0;k<m;++k) {
+          scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+          scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+          scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+          scratch[5] = Fout[k] - scratch[1];
+
+          Fout[k] += scratch[1];
+          scratch[3] = scratch[0] + scratch[2];
+          scratch[4] = scratch[0] - scratch[2];
+          scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+          Fout[k+2*m]  = Fout[k] - scratch[3];
+          Fout[k] += scratch[3];
+          Fout[k+m] = scratch[5] + scratch[4];
+          Fout[k+3*m] = scratch[5] - scratch[4];
+        }
+      }
+
+      void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+      {
+        size_t k=m;
+        const size_t m2 = 2*m;
+        Complex *tw1,*tw2;
+        Complex scratch[5];
+        Complex epi3;
+        epi3 = m_twiddles[fstride*m];
+
+        tw1=tw2=&m_twiddles[0];
 
-      if (m>1) {
         do{
-          // recursive call:
-          // DFT of size m*p performed by doing
-          // p instances of smaller DFTs of size m, 
-          // each one takes a decimated version of the input
-          work(stage+1, xout , xin, fstride*p,in_stride);
-          xin += fstride*in_stride;
-        }while( (xout += m) != Fout_end );
-      }else{
-        do{
-          *xout = *xin;
-          xin += fstride*in_stride;
-        }while(++xout != Fout_end );
+          scratch[1]=Fout[m] * *tw1;
+          scratch[2]=Fout[m2] * *tw2;
+
+          scratch[3]=scratch[1]+scratch[2];
+          scratch[0]=scratch[1]-scratch[2];
+          tw1 += fstride;
+          tw2 += fstride*2;
+          Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
+          scratch[0] *= epi3.imag();
+          *Fout += scratch[3];
+          Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+          Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+          ++Fout;
+        }while(--k);
       }
-      xout=Fout_beg;
 
-      // recombine the p smaller DFTs 
-      switch (p) {
-        case 2: bfly2(xout,fstride,m); break;
-        case 3: bfly3(xout,fstride,m); break;
-        case 4: bfly4(xout,fstride,m); break;
-        case 5: bfly5(xout,fstride,m); break;
-        default: bfly_generic(xout,fstride,m,p); break;
-      }
-    }
+      void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+      {
+        Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+        size_t u;
+        Complex scratch[13];
+        Complex * twiddles = &m_twiddles[0];
+        Complex *tw;
+        Complex ya,yb;
+        ya = twiddles[fstride*m];
+        yb = twiddles[fstride*2*m];
 
-    void bfly2( Complex * Fout, const size_t fstride, int m)
-    {
-      for (int k=0;k<m;++k) {
-        Complex t = Fout[m+k] * m_twiddles[k*fstride];
-        Fout[m+k] = Fout[k] - t;
-        Fout[k] += t;
-      }
-    }
+        Fout0=Fout;
+        Fout1=Fout0+m;
+        Fout2=Fout0+2*m;
+        Fout3=Fout0+3*m;
+        Fout4=Fout0+4*m;
 
-    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex scratch[6];
-      int negative_if_inverse = m_inverse * -2 +1;
-      for (size_t k=0;k<m;++k) {
-        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
-        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
-        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
-        scratch[5] = Fout[k] - scratch[1];
+        tw=twiddles;
+        for ( u=0; u<m; ++u ) {
+          scratch[0] = *Fout0;
 
-        Fout[k] += scratch[1];
-        scratch[3] = scratch[0] + scratch[2];
-        scratch[4] = scratch[0] - scratch[2];
-        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+          scratch[1]  = *Fout1 * tw[u*fstride];
+          scratch[2]  = *Fout2 * tw[2*u*fstride];
+          scratch[3]  = *Fout3 * tw[3*u*fstride];
+          scratch[4]  = *Fout4 * tw[4*u*fstride];
 
-        Fout[k+2*m]  = Fout[k] - scratch[3];
-        Fout[k] += scratch[3];
-        Fout[k+m] = scratch[5] + scratch[4];
-        Fout[k+3*m] = scratch[5] - scratch[4];
-      }
-    }
+          scratch[7] = scratch[1] + scratch[4];
+          scratch[10] = scratch[1] - scratch[4];
+          scratch[8] = scratch[2] + scratch[3];
+          scratch[9] = scratch[2] - scratch[3];
 
-    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      size_t k=m;
-      const size_t m2 = 2*m;
-      Complex *tw1,*tw2;
-      Complex scratch[5];
-      Complex epi3;
-      epi3 = m_twiddles[fstride*m];
+          *Fout0 +=  scratch[7];
+          *Fout0 +=  scratch[8];
 
-      tw1=tw2=&m_twiddles[0];
-
-      do{
-        scratch[1]=Fout[m] * *tw1;
-        scratch[2]=Fout[m2] * *tw2;
-
-        scratch[3]=scratch[1]+scratch[2];
-        scratch[0]=scratch[1]-scratch[2];
-        tw1 += fstride;
-        tw2 += fstride*2;
-        Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
-        scratch[0] *= epi3.imag();
-        *Fout += scratch[3];
-        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
-        ++Fout;
-      }while(--k);
-    }
-
-    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-      size_t u;
-      Complex scratch[13];
-      Complex * twiddles = &m_twiddles[0];
-      Complex *tw;
-      Complex ya,yb;
-      ya = twiddles[fstride*m];
-      yb = twiddles[fstride*2*m];
-
-      Fout0=Fout;
-      Fout1=Fout0+m;
-      Fout2=Fout0+2*m;
-      Fout3=Fout0+3*m;
-      Fout4=Fout0+4*m;
-
-      tw=twiddles;
-      for ( u=0; u<m; ++u ) {
-        scratch[0] = *Fout0;
-
-        scratch[1]  = *Fout1 * tw[u*fstride];
-        scratch[2]  = *Fout2 * tw[2*u*fstride];
-        scratch[3]  = *Fout3 * tw[3*u*fstride];
-        scratch[4]  = *Fout4 * tw[4*u*fstride];
-
-        scratch[7] = scratch[1] + scratch[4];
-        scratch[10] = scratch[1] - scratch[4];
-        scratch[8] = scratch[2] + scratch[3];
-        scratch[9] = scratch[2] - scratch[3];
-
-        *Fout0 +=  scratch[7];
-        *Fout0 +=  scratch[8];
-
-        scratch[5] = scratch[0] + Complex(
-            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
-            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
-            );
-
-        scratch[6] = Complex(
-            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
-            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
-            );
-
-        *Fout1 = scratch[5] - scratch[6];
-        *Fout4 = scratch[5] + scratch[6];
-
-        scratch[11] = scratch[0] +
-          Complex(
-              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
-              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+          scratch[5] = scratch[0] + Complex(
+              (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+              (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
               );
 
-        scratch[12] = Complex(
-            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
-            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
-            );
+          scratch[6] = Complex(
+              (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+              -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+              );
 
-        *Fout2=scratch[11]+scratch[12];
-        *Fout3=scratch[11]-scratch[12];
+          *Fout1 = scratch[5] - scratch[6];
+          *Fout4 = scratch[5] + scratch[6];
 
-        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-      }
-    }
+          scratch[11] = scratch[0] +
+            Complex(
+                (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+                (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+                );
 
-    /* perform the butterfly for one stage of a mixed radix FFT */
-    void bfly_generic(
-        Complex * Fout,
-        const size_t fstride,
-        int m,
-        int p
-        )
-    {
-      int u,k,q1,q;
-      Complex * twiddles = &m_twiddles[0];
-      Complex t;
-      int Norig = m_twiddles.size();
-      Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+          scratch[12] = Complex(
+              -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+              (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+              );
 
-      for ( u=0; u<m; ++u ) {
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          scratchbuf[q1] = Fout[ k  ];
-          k += m;
+          *Fout2=scratch[11]+scratch[12];
+          *Fout3=scratch[11]-scratch[12];
+
+          ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
         }
+      }
 
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          int twidx=0;
-          Fout[ k ] = scratchbuf[0];
-          for (q=1;q<p;++q ) {
-            twidx += fstride * k;
-            if (twidx>=Norig) twidx-=Norig;
-            t=scratchbuf[q] * twiddles[twidx];
-            Fout[ k ] += t;
+      /* perform the butterfly for one stage of a mixed radix FFT */
+      void bfly_generic(
+          Complex * Fout,
+          const size_t fstride,
+          int m,
+          int p
+          )
+      {
+        int u,k,q1,q;
+        Complex * twiddles = &m_twiddles[0];
+        Complex t;
+        int Norig = m_twiddles.size();
+        Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+
+        for ( u=0; u<m; ++u ) {
+          k=u;
+          for ( q1=0 ; q1<p ; ++q1 ) {
+            scratchbuf[q1] = Fout[ k  ];
+            k += m;
+          }
+
+          k=u;
+          for ( q1=0 ; q1<p ; ++q1 ) {
+            int twidx=0;
+            Fout[ k ] = scratchbuf[0];
+            for (q=1;q<p;++q ) {
+              twidx += fstride * k;
+              if (twidx>=Norig) twidx-=Norig;
+              t=scratchbuf[q] * twiddles[twidx];
+              Fout[ k ] += t;
+            }
+            k += m;
           }
-          k += m;
         }
       }
-    }
-  };
+    };
 
   template <typename _Scalar>
-  struct ei_kissfft_impl
-  {
+    struct ei_kissfft_impl
+    {
       typedef _Scalar Scalar;
       typedef std::complex<Scalar> Complex;
 
@@ -284,10 +287,10 @@ namespace Eigen {
       }
 
       template <typename _Src>
-      void fwd( Complex * dst,const _Src *src,int nfft)
-      {
-        get_plan(nfft,false).work(0, dst, src, 1,1);
-      }
+        void fwd( Complex * dst,const _Src *src,int nfft)
+        {
+          get_plan(nfft,false).work(0, dst, src, 1,1);
+        }
 
       // real-to-complex forward FFT
       // perform two FFTs of src even and src odd
@@ -363,11 +366,10 @@ namespace Eigen {
         }
       }
 
-  private:
-
+      private:
       typedef ei_kiss_cpx_fft<Scalar> PlanData;
-
       typedef std::map<int,PlanData> PlanMap;
+
       PlanMap m_plans;
       std::map<int, std::vector<Complex> > m_realTwiddles;
       std::vector<Complex> m_scratchBuf;
@@ -376,25 +378,7 @@ namespace Eigen {
 
       PlanData & get_plan(int nfft,bool inverse)
       {
-        /*  TODO: figure out why this does not work (g++ 4.3.2)
-         * for some reason this does not work
-         *
-         PlanMap::iterator it;
-         it = m_plans.find( PlanKey(nfft,inverse) );
-         if (it == m_plans.end() ) {
-        // create new entry
-        it = m_plans.insert( make_pair( PlanKey(nfft,inverse) , PlanData() ) );
-        MapIt it2 = m_plans.find( PlanKey(nfft,!inverse) );
-        if (it2 != m_plans.end() ) {
-        it->second = it2.second;
-        it->second.conjugate();
-        }else{
-        it->second.make_twiddles(nfft,inverse);
-        it->second.factorize(nfft);
-        }
-        }
-        return it->second;
-        */
+        // TODO look for PlanKey(nfft, ! inverse) and conjugate the twiddles
         PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
         if ( pd.m_twiddles.size() == 0 ) {
           pd.make_twiddles(nfft,inverse);
@@ -421,5 +405,5 @@ namespace Eigen {
         for (int k=0;k<n;++k)
           dst[k] *= s;
       }
-  };
+    };
 }

From 1fd6dfe428e6b65383b84f8dfe8f67c70ec370b4 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 30 May 2009 17:55:47 -0400
Subject: [PATCH 12/34] added ei_fftw_impl

---
 unsupported/Eigen/FFT                       |   5 +-
 unsupported/Eigen/src/FFT/ei_fftw_impl.h    | 198 ++++++++++++++++++++
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h |  25 +--
 3 files changed, 214 insertions(+), 14 deletions(-)
 create mode 100644 unsupported/Eigen/src/FFT/ei_fftw_impl.h

diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 31d8c74c5..03f8504a4 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -30,9 +30,8 @@
 #define DEFAULT_FFT_IMPL ei_kissfft_impl
 
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
-#ifdef FFTW_PATIENT  // definition of FFTW_PATIENT indicates the caller has included fftw3.h, we can use FFTW routines
-// TODO 
-// #include "src/FFT/ei_fftw_impl.h"
+#ifdef FFTW_ESTIMATE  // definition of FFTW_ESTIMATE indicates the caller has included fftw3.h, we can use FFTW routines
+#include "src/FFT/ei_fftw_impl.h"
 // #define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
diff --git a/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
new file mode 100644
index 000000000..d592bbb20
--- /dev/null
+++ b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
@@ -0,0 +1,198 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. 
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+namespace Eigen {
+  // FFTW uses non-const arguments
+  // so we must use ugly const_cast calls for all the args it uses
+  //
+  // This should be safe as long as 
+  // 1. we use FFTW_ESTIMATE for all our planning
+  //       see the FFTW docs section 4.3.2 "Planner Flags"
+  // 2. fftw_complex is compatible with std::complex
+  //    This assumes std::complex<T> layout is array of size 2 with real,imag
+  template <typename T> 
+  T * ei_fftw_cast(const T* p) 
+  { 
+      return const_cast<T*>( p); 
+  }
+
+  fftw_complex * ei_fftw_cast( const std::complex<double> * p) 
+  { 
+      return const_cast<fftw_complex*>( reinterpret_cast<const fftw_complex*>(p) ); 
+  }
+
+  fftwf_complex * ei_fftw_cast( const std::complex<float> * p) 
+  { 
+      return const_cast<fftwf_complex*>( reinterpret_cast<const fftwf_complex*>(p) ); 
+  }
+
+  fftwl_complex * ei_fftw_cast( const std::complex<long double> * p) 
+  { 
+      return const_cast<fftwl_complex*>( reinterpret_cast<const fftwl_complex*>(p) ); 
+  }
+
+  template <typename T> 
+  struct ei_fftw_plan {};
+
+  template <> 
+  struct ei_fftw_plan<float>
+  {
+      typedef float scalar_type;
+      typedef fftwf_complex complex_type;
+      fftwf_plan m_plan;
+      ei_fftw_plan() :m_plan(NULL) {}
+      ~ei_fftw_plan() {if (m_plan) fftwf_destroy_plan(m_plan);}
+
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
+          fftwf_execute_dft( m_plan, src,dst);
+      }
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
+          fftwf_execute_dft( m_plan, src,dst);
+      }
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftwf_execute_dft_r2c( m_plan,src,dst);
+      }
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftwf_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftwf_execute_dft_c2r( m_plan, src,dst);
+      }
+  };
+  template <> 
+  struct ei_fftw_plan<double>
+  {
+      typedef double scalar_type;
+      typedef fftw_complex complex_type;
+      fftw_plan m_plan;
+      ei_fftw_plan() :m_plan(NULL) {}
+      ~ei_fftw_plan() {if (m_plan) fftw_destroy_plan(m_plan);}
+
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
+          fftw_execute_dft( m_plan, src,dst);
+      }
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
+          fftw_execute_dft( m_plan, src,dst);
+      }
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftw_execute_dft_r2c( m_plan,src,dst);
+      }
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftw_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftw_execute_dft_c2r( m_plan, src,dst);
+      }
+  };
+  template <> 
+  struct ei_fftw_plan<long double>
+  {
+      typedef long double scalar_type;
+      typedef fftwl_complex complex_type;
+      fftwl_plan m_plan;
+      ei_fftw_plan() :m_plan(NULL) {}
+      ~ei_fftw_plan() {if (m_plan) fftwl_destroy_plan(m_plan);}
+
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
+          fftwl_execute_dft( m_plan, src,dst);
+      }
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
+          fftwl_execute_dft( m_plan, src,dst);
+      }
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftwl_execute_dft_r2c( m_plan,src,dst);
+      }
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftwl_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftwl_execute_dft_c2r( m_plan, src,dst);
+      }
+  };
+
+  template <typename _Scalar>
+  struct ei_fftw_impl
+  {
+      typedef _Scalar Scalar;
+      typedef std::complex<Scalar> Complex;
+
+      void clear() 
+      {
+        m_plans.clear();
+      }
+
+      void fwd( Complex * dst,const Complex *src,int nfft)
+      {
+        get_plan(nfft,false,dst,src).fwd(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
+      }
+
+      // real-to-complex forward FFT
+      void fwd( Complex * dst,const Scalar * src,int nfft) 
+      {
+          get_plan(nfft,false,dst,src).fwd(ei_fftw_cast(dst), ei_fftw_cast(src) ,nfft);
+          int nhbins=(nfft>>1)+1;
+          for (int k=nhbins;k < nfft; ++k )
+              dst[k] = conj(dst[nfft-k]);
+      }
+
+      // inverse complex-to-complex
+      void inv(Complex * dst,const Complex  *src,int nfft)
+      {
+        get_plan(nfft,true,dst,src).inv(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
+        // scaling
+        Scalar s = 1./nfft;
+        for (int k=0;k<nfft;++k)
+          dst[k] *= s;
+      }
+
+      // half-complex to scalar
+      void inv( Scalar * dst,const Complex * src,int nfft) 
+      {
+        get_plan(nfft,true,dst,src).inv(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
+        Scalar s = 1./nfft;
+        for (int k=0;k<nfft;++k)
+          dst[k] *= s;
+      }
+
+  private:
+      typedef ei_fftw_plan<Scalar> PlanData;
+      typedef std::map<int,PlanData> PlanMap;
+
+      PlanMap m_plans;
+
+      PlanData & get_plan(int nfft,bool inverse,void * dst,const void * src)
+      {
+          bool inplace = (dst==src);
+          bool aligned = ( (reinterpret_cast<size_t>(src)&15) | (reinterpret_cast<size_t>(dst)&15) ) == 0;
+          int key = (nfft<<3 ) | (inverse<<2) | (inplace<<1) | aligned;
+          return m_plans[key];
+      }
+  };
+}
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 91fa5ca18..a84ac68a0 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -39,6 +39,7 @@ namespace Eigen {
       std::vector<Complex> m_twiddles;
       std::vector<int> m_stageRadix;
       std::vector<int> m_stageRemainder;
+      std::vector<Complex> m_scratchBuf;
       bool m_inverse;
 
       void make_twiddles(int nfft,bool inverse)
@@ -75,6 +76,8 @@ namespace Eigen {
           n /= p;
           m_stageRadix.push_back(p);
           m_stageRemainder.push_back(n);
+          if ( p > 5 )
+              m_scratchBuf.resize(p); // scratchbuf will be needed in bfly_generic
         }while(n>1);
       }
 
@@ -249,7 +252,7 @@ namespace Eigen {
         Complex * twiddles = &m_twiddles[0];
         Complex t;
         int Norig = m_twiddles.size();
-        Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+        Complex * scratchbuf = &m_scratchBuf[0];
 
         for ( u=0; u<m; ++u ) {
           k=u;
@@ -341,28 +344,28 @@ namespace Eigen {
       void inv( Scalar * dst,const Complex * src,int nfft) 
       {
         if (nfft&3) {
-          m_scratchBuf.resize(nfft);
-          inv(&m_scratchBuf[0],src,nfft);
+          m_tmpBuf.resize(nfft);
+          inv(&m_tmpBuf[0],src,nfft);
           for (int k=0;k<nfft;++k)
-            dst[k] = m_scratchBuf[k].real();
+            dst[k] = m_tmpBuf[k].real();
         }else{
           // optimized version for multiple of 4
           int ncfft = nfft>>1;
           int ncfft2 = nfft>>2;
           Complex * rtw = real_twiddles(ncfft2);
-          m_scratchBuf.resize(ncfft);
-          m_scratchBuf[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
+          m_tmpBuf.resize(ncfft);
+          m_tmpBuf[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
           for (int k = 1; k <= ncfft / 2; ++k) {
             Complex fk = src[k];
             Complex fnkc = conj(src[ncfft-k]);
             Complex fek = fk + fnkc;
             Complex tmp = fk - fnkc;
             Complex fok = tmp * conj(rtw[k-1]);
-            m_scratchBuf[k] = fek + fok;
-            m_scratchBuf[ncfft-k] = conj(fek - fok);
+            m_tmpBuf[k] = fek + fok;
+            m_tmpBuf[ncfft-k] = conj(fek - fok);
           }
-          scale(&m_scratchBuf[0], ncfft, Scalar(1)/nfft );
-          get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_scratchBuf[0], 1,1);
+          scale(&m_tmpBuf[0], ncfft, Scalar(1)/nfft );
+          get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_tmpBuf[0], 1,1);
         }
       }
 
@@ -372,7 +375,7 @@ namespace Eigen {
 
       PlanMap m_plans;
       std::map<int, std::vector<Complex> > m_realTwiddles;
-      std::vector<Complex> m_scratchBuf;
+      std::vector<Complex> m_tmpBuf;
 
       int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
 

From 1c543401741f1328b65d30a764f6273ddf9b60b6 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sun, 31 May 2009 15:44:57 -0400
Subject: [PATCH 13/34] more work on ei_fftw_impl

---
 bench/benchFFT.cpp       | 4 ++++
 unsupported/Eigen/FFT    | 3 ++-
 unsupported/test/FFT.cpp | 5 +++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index 14f5063fb..4b6cabb55 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -26,6 +26,10 @@
 #include <vector>
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>
+#ifdef USE_FFTW
+#include <fftw3.h>
+#endif
+
 #include <unsupported/Eigen/FFT>
 
 using namespace Eigen;
diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 03f8504a4..dc7e85908 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -32,7 +32,8 @@
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #ifdef FFTW_ESTIMATE  // definition of FFTW_ESTIMATE indicates the caller has included fftw3.h, we can use FFTW routines
 #include "src/FFT/ei_fftw_impl.h"
-// #define DEFAULT_FFT_IMPL ei_fftw_impl
+#undef DEFAULT_FFT_IMPL
+#define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
 // intel Math Kernel Library: fastest, commerical -- incompatible with Eigen in GPL form
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 32d1393d0..28230e1c4 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -23,6 +23,11 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
 #include "main.h"
+//#define USE_FFTW
+#ifdef USE_FFTW
+#include <fftw3.h>
+#endif
+
 #include <unsupported/Eigen/FFT>
 
 using namespace std;

From 4d6b962ba47ca2a6445bebd144b8122f3b8e96b7 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 10 Jun 2009 22:16:32 -0400
Subject: [PATCH 14/34] added FindFFTW, but I don't think it's right yet

---
 cmake/FindFFTW.cmake            | 24 ++++++++++++++++++++++++
 unsupported/test/CMakeLists.txt |  6 ++++++
 unsupported/test/FFT.cpp        | 12 +++---------
 3 files changed, 33 insertions(+), 9 deletions(-)
 create mode 100644 cmake/FindFFTW.cmake

diff --git a/cmake/FindFFTW.cmake b/cmake/FindFFTW.cmake
new file mode 100644
index 000000000..a56450b17
--- /dev/null
+++ b/cmake/FindFFTW.cmake
@@ -0,0 +1,24 @@
+
+if (FFTW_INCLUDES AND FFTW_LIBRARIES)
+  set(FFTW_FIND_QUIETLY TRUE)
+endif (FFTW_INCLUDES AND FFTW_LIBRARIES)
+
+find_path(FFTW_INCLUDES
+  NAMES
+  fftw3.h
+  PATHS
+  $ENV{FFTWDIR}
+  ${INCLUDE_INSTALL_DIR}
+)
+
+find_library(FFTWF_LIB NAMES fftw3f PATHS $ENV{FFTWDIR} ${LIB_INSTALL_DIR})
+find_library(FFTW_LIB NAMES fftw3 PATHS $ENV{FFTWDIR} ${LIB_INSTALL_DIR})
+find_library(FFTWL_LIB NAMES fftw3l  PATHS $ENV{FFTWDIR} ${LIB_INSTALL_DIR})
+set(FFTW_LIBRARIES "${FFTWF_LIB} ${FFTW_LIB} ${FFTWL_LIB}" )
+message(STATUS "FFTW ${FFTW_LIBRARIES}" )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(FFTW DEFAULT_MSG
+                                  FFTW_INCLUDES FFTW_LIBRARIES)
+
+mark_as_advanced(FFTW_INCLUDES FFTW_LIBRARIES)
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 49de7dfb2..23217811f 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -20,3 +20,9 @@ ei_add_test(BVH)
 ei_add_test(matrixExponential)
 ei_add_test(alignedvector3)
 ei_add_test(FFT)
+
+find_package(FFTW)
+if(FFTW_FOUND)
+  ei_add_test(FFTW  " " "${FFTW_LIBRARIES}" )
+endif(FFTW_FOUND)
+
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 28230e1c4..f0b9b68bf 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -23,11 +23,6 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
 #include "main.h"
-//#define USE_FFTW
-#ifdef USE_FFTW
-#include <fftw3.h>
-#endif
-
 #include <unsupported/Eigen/FFT>
 
 using namespace std;
@@ -121,7 +116,7 @@ void test_complex(int nfft)
 
 void test_FFT()
 {
-#if 1
+
   CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
   CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); CALL_SUBTEST( test_complex<long double>(256) );
   CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
@@ -129,13 +124,12 @@ void test_FFT()
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
-#endif
 
-#if 1
+
+
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
   CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
   CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); CALL_SUBTEST( test_scalar<long double>(50) );
   CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
-#endif
 }

From 218711e18b1adc518a2f861fe53d5ce9da0e98b7 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 10 Jun 2009 23:25:27 -0400
Subject: [PATCH 15/34] example file

---
 unsupported/doc/examples/FFT.cpp | 117 +++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 unsupported/doc/examples/FFT.cpp

diff --git a/unsupported/doc/examples/FFT.cpp b/unsupported/doc/examples/FFT.cpp
new file mode 100644
index 000000000..55e29585a
--- /dev/null
+++ b/unsupported/doc/examples/FFT.cpp
@@ -0,0 +1,117 @@
+//  To use the simple FFT implementation
+//  g++ -o demofft -I.. -Wall -O3 FFT.cpp 
+
+//  To use the FFTW implementation
+//  g++ -o demofft -I.. -DUSE_FFTW -Wall -O3 FFT.cpp -lfftw3 -lfftw3f -lfftw3l
+
+#ifdef USE_FFTW
+#include <fftw3.h>
+#endif
+
+#include <vector>
+#include <complex>
+#include <algorithm>
+#include <iterator>
+#include <Eigen/Core>
+#include <unsupported/Eigen/FFT>
+
+using namespace std;
+using namespace Eigen;
+
+template <typename T>
+T mag2(T a)
+{
+    return a*a;
+}
+template <typename T>
+T mag2(std::complex<T> a)
+{
+    return norm(a);
+}
+
+template <typename T>
+T mag2(const std::vector<T> & vec)
+{
+    T out=0;
+    for (size_t k=0;k<vec.size();++k)
+        out += mag2(vec[k]);
+    return out;
+}
+
+template <typename T>
+T mag2(const std::vector<std::complex<T> > & vec)
+{
+    T out=0;
+    for (size_t k=0;k<vec.size();++k)
+        out += mag2(vec[k]);
+    return out;
+}
+
+template <typename T>
+vector<T> operator-(const vector<T> & a,const vector<T> & b )
+{
+    vector<T> c(a);
+    for (size_t k=0;k<b.size();++k) 
+        c[k] -= b[k];
+    return c;
+}
+
+template <typename T>
+void RandomFill(std::vector<T> & vec)
+{
+    for (size_t k=0;k<vec.size();++k)
+        vec[k] = T( rand() )/T(RAND_MAX) - .5;
+}
+
+template <typename T>
+void RandomFill(std::vector<std::complex<T> > & vec)
+{
+    for (size_t k=0;k<vec.size();++k)
+        vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - .5, T( rand() )/T(RAND_MAX) - .5);
+}
+
+template <typename T_time,typename T_freq>
+void fwd_inv(size_t nfft)
+{
+    typedef typename NumTraits<T_freq>::Real Scalar;
+    vector<T_time> timebuf(nfft);
+    RandomFill(timebuf);
+
+    vector<T_freq> freqbuf;
+    static FFT<Scalar> fft;
+    fft.fwd(freqbuf,timebuf);
+
+    vector<T_time> timebuf2;
+    fft.inv(timebuf2,freqbuf);
+
+    long double rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
+    cout << "roundtrip rmse: " << rmse << endl;
+}
+
+template <typename T_scalar>
+void two_demos(int nfft)
+{
+    cout << "     scalar ";
+    fwd_inv<T_scalar,std::complex<T_scalar> >(nfft);
+    cout << "    complex ";
+    fwd_inv<std::complex<T_scalar>,std::complex<T_scalar> >(nfft);
+}
+
+void demo_all_types(int nfft)
+{
+    cout << "nfft=" << nfft << endl;
+    cout << "   float" << endl;
+    two_demos<float>(nfft);
+    cout << "   double" << endl;
+    two_demos<double>(nfft);
+    cout << "   long double" << endl;
+    two_demos<long double>(nfft);
+}
+
+int main()
+{
+    demo_all_types( 2*3*4*5*7 );
+    demo_all_types( 2*9*16*25 );
+    demo_all_types( 1024 );
+    return 0;
+}

From e577c70e49dddd88c83175921431d2725128b0f3 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 16 Jun 2009 23:54:58 -0400
Subject: [PATCH 16/34] candidate header for Eigen/Complex

---
 unsupported/Eigen/Complex | 142 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 unsupported/Eigen/Complex

diff --git a/unsupported/Eigen/Complex b/unsupported/Eigen/Complex
new file mode 100644
index 000000000..f8d9af25c
--- /dev/null
+++ b/unsupported/Eigen/Complex
@@ -0,0 +1,142 @@
+#ifndef EIGEN_COMPLEX_H
+#define EIGEN_COMPLEX_H
+
+#include <complex>
+
+namespace Eigen {
+
+template <typename _NativePtr,typename _PunnedPtr>
+struct castable_pointer
+{
+    castable_pointer(_NativePtr ptr) : _ptr(ptr) {}
+    operator _NativePtr ()  {return _ptr;}
+    operator _PunnedPtr ()  {return reinterpret_cast<_PunnedPtr>(_ptr);}
+    private: 
+    _NativePtr _ptr;
+};
+
+template <typename T>
+struct Complex
+{
+    typedef typename std::complex<T> StandardComplex;
+    typedef T value_type;
+
+    Complex(const T& re = T(), const T& im = T()) : _re(re),_im(im) { }
+    Complex(const Complex&other ): _re(other.real()) ,_im(other.imag()) {}
+
+    template<class X> 
+    Complex(const Complex<X>&other): _re(other.real()) ,_im(other.imag()) {}
+    template<class X> 
+    Complex(const std::complex<X>&other): _re(other.real()) ,_im(other.imag()) {}
+
+    typedef castable_pointer< Complex<T>*, StandardComplex* > pointer_type;
+    typedef castable_pointer< const Complex<T>*, const StandardComplex* > const_pointer_type;
+
+    pointer_type operator & () {return pointer_type(this);}
+    const_pointer_type operator & () const {return const_pointer_type(this);}
+
+    StandardComplex std_type() const {return StandardComplex(real(),imag());}
+    StandardComplex & std_type() {return *(StandardComplex*)(&(*this));}
+
+    operator StandardComplex () const {return std_type();}
+    operator StandardComplex & () {return std_type();}
+
+    T & real() {return _re;}
+    T & imag() {return _im;}
+    const T & real() const {return _re;}
+    const T & imag() const {return _im;}
+
+    // *** complex member functions: ***
+    Complex<T>& operator= (const T& val) { _re=val;_im=0;return *this;  }
+    Complex<T>& operator+= (const T& val) {_re+=val;return *this;}
+    Complex<T>& operator-= (const T& val) {_re-=val;return *this;}
+    Complex<T>& operator*= (const T& val) {_re*=val;_im*=val;return *this;  }
+    Complex<T>& operator/= (const T& val) {_re/=val;_im/=val;return *this;  }
+
+    Complex& operator= (const Complex& rhs) {_re=rhs._re;_im=rhs._im;return *this;}
+    Complex& operator= (const StandardComplex& rhs) {_re=rhs.real();_im=rhs.imag();return *this;}
+
+    template<class X> Complex<T>& operator= (const Complex<X>& rhs) { _re=rhs._re;_im=rhs._im;return *this;}
+    template<class X> Complex<T>& operator+= (const Complex<X>& rhs) { _re+=rhs._re;_im+=rhs._im;return *this;}
+    template<class X> Complex<T>& operator-= (const Complex<X>& rhs) { _re-=rhs._re;_im-=rhs._im;return *this;}
+    template<class X> Complex<T>& operator*= (const Complex<X>& rhs) { this->std_type() *= rhs.std_type(); return *this; }
+    template<class X> Complex<T>& operator/= (const Complex<X>& rhs) { this->std_type() /= rhs.std_type(); return *this; }
+
+    private:
+    T _re;
+    T _im;
+};
+
+template <typename T>
+T ei_to_std( const T & x) {return x;}
+
+template <typename T>
+std::complex<T> ei_to_std( const Complex<T> & x) {return x.std_type();}
+
+// 26.2.6 operators
+template<class T> Complex<T> operator+(const Complex<T>& rhs) {return rhs;}
+template<class T> Complex<T> operator-(const Complex<T>& rhs) {return -ei_to_std(rhs);}
+
+template<class T> Complex<T> operator+(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) + ei_to_std(rhs);}
+template<class T> Complex<T> operator-(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) - ei_to_std(rhs);}
+template<class T> Complex<T> operator*(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) * ei_to_std(rhs);}
+template<class T> Complex<T> operator/(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) / ei_to_std(rhs);}
+template<class T> bool operator==(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) == ei_to_std(rhs);}
+template<class T> bool operator!=(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) != ei_to_std(rhs);}
+
+template<class T> Complex<T> operator+(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) + ei_to_std(rhs); }
+template<class T> Complex<T> operator-(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) - ei_to_std(rhs); }
+template<class T> Complex<T> operator*(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) * ei_to_std(rhs); }
+template<class T> Complex<T> operator/(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) / ei_to_std(rhs); }
+template<class T> bool operator==(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) == ei_to_std(rhs); }
+template<class T> bool operator!=(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) != ei_to_std(rhs); }
+
+template<class T> Complex<T> operator+(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) + ei_to_std(rhs); }
+template<class T> Complex<T> operator-(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) - ei_to_std(rhs); }
+template<class T> Complex<T> operator*(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) * ei_to_std(rhs); }
+template<class T> Complex<T> operator/(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) / ei_to_std(rhs); }
+template<class T> bool operator==(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) == ei_to_std(rhs); }
+template<class T> bool operator!=(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) != ei_to_std(rhs); }
+
+template<class T, class charT, class traits>
+std::basic_istream<charT,traits>&
+  operator>> (std::basic_istream<charT,traits>& istr, Complex<T>& rhs)
+{
+    return istr >> rhs.std_type();
+}
+
+template<class T, class charT, class traits>
+std::basic_ostream<charT,traits>&
+operator<< (std::basic_ostream<charT,traits>& ostr, const Complex<T>& rhs)
+{
+    return ostr << rhs.std_type();
+}
+
+ // 26.2.7 values:
+ template<class T> T real(const Complex<T>&x) {return real(ei_to_std(x));}
+ template<class T> T abs(const Complex<T>&x) {return abs(ei_to_std(x));}
+ template<class T> T arg(const Complex<T>&x) {return arg(ei_to_std(x));}
+ template<class T> T norm(const Complex<T>&x) {return norm(ei_to_std(x));}
+
+ template<class T> Complex<T> conj(const Complex<T>&x) { return conj(ei_to_std(x));}
+ template<class T> Complex<T> polar(const T& x, const T&y) {return polar(ei_to_std(x),ei_to_std(y));} 
+ // 26.2.8 transcendentals:
+ template<class T> Complex<T> cos (const  Complex<T>&x){return cos(ei_to_std(x));}
+ template<class T> Complex<T> cosh (const  Complex<T>&x){return cosh(ei_to_std(x));}
+ template<class T> Complex<T> exp (const  Complex<T>&x){return exp(ei_to_std(x));}
+ template<class T> Complex<T> log (const  Complex<T>&x){return log(ei_to_std(x));}
+ template<class T> Complex<T> log10 (const  Complex<T>&x){return log10(ei_to_std(x));}
+
+ template<class T> Complex<T> pow(const Complex<T>&x, int p) {return pow(ei_to_std(x),ei_to_std(p));}
+ template<class T> Complex<T> pow(const Complex<T>&x, const T&p) {return pow(ei_to_std(x),ei_to_std(p));}
+ template<class T> Complex<T> pow(const Complex<T>&x, const Complex<T>&p) {return pow(ei_to_std(x),ei_to_std(p));}
+ template<class T> Complex<T> pow(const T&x, const Complex<T>&p) {return pow(ei_to_std(x),ei_to_std(p));}
+
+ template<class T> Complex<T> sin (const  Complex<T>&x){return sin(ei_to_std(x));}
+ template<class T> Complex<T> sinh (const  Complex<T>&x){return sinh(ei_to_std(x));}
+ template<class T> Complex<T> sqrt (const  Complex<T>&x){return sqrt(ei_to_std(x));}
+ template<class T> Complex<T> tan (const  Complex<T>&x){return tan(ei_to_std(x));}
+ template<class T> Complex<T> tanh (const  Complex<T>&x){return tanh(ei_to_std(x));}
+}
+
+#endif

From fb9a15e451371f4e600dbb60b8e5c9d098889844 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 17 Jun 2009 00:09:18 -0400
Subject: [PATCH 17/34] added copyright notice

---
 unsupported/Eigen/Complex | 50 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/unsupported/Eigen/Complex b/unsupported/Eigen/Complex
index f8d9af25c..a0483abdf 100644
--- a/unsupported/Eigen/Complex
+++ b/unsupported/Eigen/Complex
@@ -1,6 +1,34 @@
 #ifndef EIGEN_COMPLEX_H
 #define EIGEN_COMPLEX_H
 
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+// Eigen::Complex reuses as much as possible from std::complex
+// and allows easy conversion to and from, even at the pointer level.
+
+
 #include <complex>
 
 namespace Eigen {
@@ -21,6 +49,7 @@ struct Complex
     typedef typename std::complex<T> StandardComplex;
     typedef T value_type;
 
+    // constructors
     Complex(const T& re = T(), const T& im = T()) : _re(re),_im(im) { }
     Complex(const Complex&other ): _re(other.real()) ,_im(other.imag()) {}
 
@@ -29,22 +58,31 @@ struct Complex
     template<class X> 
     Complex(const std::complex<X>&other): _re(other.real()) ,_im(other.imag()) {}
 
+
+    // allow binary access to the object as a std::complex
     typedef castable_pointer< Complex<T>*, StandardComplex* > pointer_type;
     typedef castable_pointer< const Complex<T>*, const StandardComplex* > const_pointer_type;
-
     pointer_type operator & () {return pointer_type(this);}
     const_pointer_type operator & () const {return const_pointer_type(this);}
 
-    StandardComplex std_type() const {return StandardComplex(real(),imag());}
-    StandardComplex & std_type() {return *(StandardComplex*)(&(*this));}
-
     operator StandardComplex () const {return std_type();}
     operator StandardComplex & () {return std_type();}
 
-    T & real() {return _re;}
-    T & imag() {return _im;}
+    StandardComplex std_type() const {return StandardComplex(real(),imag());}
+    StandardComplex & std_type() {return *reinterpret_cast<StandardComplex*>(this);}
+
+
+    // every sort of accessor and mutator that has ever been in fashion.
+    // For a brief history, search for "std::complex over-encapsulated"
+    // http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#387
     const T & real() const {return _re;}
     const T & imag() const {return _im;}
+    T & real() {return _re;}
+    T & imag() {return _im;}
+    T & real(const T & x) {return _re=x;}
+    T & imag(const T & x) {return _im=x;}
+    void set_real(const T & x) {_re = x;}
+    void set_imag(const T & x) {_im = x;}
 
     // *** complex member functions: ***
     Complex<T>& operator= (const T& val) { _re=val;_im=0;return *this;  }

From a39de276a95234ff85f58fa93364b047f3d72c6b Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Mon, 14 Sep 2009 01:52:26 -0400
Subject: [PATCH 18/34] added the test case for FFTW

---
 unsupported/test/CMakeLists.txt |   2 +-
 unsupported/test/FFTW.cpp       | 136 ++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 unsupported/test/FFTW.cpp

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 23217811f..f42077bdc 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -23,6 +23,6 @@ ei_add_test(FFT)
 
 find_package(FFTW)
 if(FFTW_FOUND)
-  ei_add_test(FFTW  " " "${FFTW_LIBRARIES}" )
+  ei_add_test(FFTW  " " "-lfftw3 -lfftw3f -lfftw3l" )
 endif(FFTW_FOUND)
 
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
new file mode 100644
index 000000000..cf7be75aa
--- /dev/null
+++ b/unsupported/test/FFTW.cpp
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+#include <fftw3.h>
+#include <unsupported/Eigen/FFT>
+
+using namespace std;
+
+float norm(float x) {return x*x;}
+double norm(double x) {return x*x;}
+long double norm(long double x) {return x*x;}
+
+template < typename T>
+complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
+
+complex<long double>  promote(float x) { return complex<long double>( x); }
+complex<long double>  promote(double x) { return complex<long double>( x); }
+complex<long double>  promote(long double x) { return complex<long double>( x); }
+    
+
+    template <typename T1,typename T2>
+    long double fft_rmse( const vector<T1> & fftbuf,const vector<T2> & timebuf)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        cerr <<"idx\ttruth\t\tvalue\t|dif|=\n";
+        for (size_t k0=0;k0<fftbuf.size();++k0) {
+            complex<long double> acc = 0;
+            long double phinc = -2.*k0* M_PIl / timebuf.size();
+            for (size_t k1=0;k1<timebuf.size();++k1) {
+                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
+            }
+            totalpower += norm(acc);
+            complex<long double> x = promote(fftbuf[k0]); 
+            complex<long double> dif = acc - x;
+            difpower += norm(dif);
+            cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(norm(dif)) << endl;
+        }
+        cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
+        return sqrt(difpower/totalpower);
+    }
+
+    template <typename T1,typename T2>
+    long double dif_rmse( const vector<T1> buf1,const vector<T2> buf2)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        size_t n = min( buf1.size(),buf2.size() );
+        for (size_t k=0;k<n;++k) {
+            totalpower += (norm( buf1[k] ) + norm(buf2[k]) )/2.;
+            difpower += norm(buf1[k] - buf2[k]);
+        }
+        return sqrt(difpower/totalpower);
+    }
+
+template <class T>
+void test_scalar(int nfft)
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+    typedef typename Eigen::FFT<T>::Scalar Scalar;
+
+    FFT<T> fft;
+    vector<Scalar> inbuf(nfft);
+    vector<Complex> outbuf;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= (T)(rand()/(double)RAND_MAX - .5);
+    fft.fwd( outbuf,inbuf);
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+
+    vector<Scalar> buf3;
+    fft.inv( buf3 , outbuf);
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
+}
+
+template <class T>
+void test_complex(int nfft)
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+
+    FFT<T> fft;
+
+    vector<Complex> inbuf(nfft);
+    vector<Complex> outbuf;
+    vector<Complex> buf3;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( outbuf , inbuf);
+
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+
+    fft.inv( buf3 , outbuf);
+
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
+}
+
+void test_FFTW()
+{
+
+  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
+  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); CALL_SUBTEST( test_complex<long double>(256) );
+  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
+  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
+  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+
+
+
+  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
+  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); CALL_SUBTEST( test_scalar<long double>(50) );
+  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
+}

From 44ba4b1d6d5cd39d824bb83876175d0dc39a9cc3 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 16 Oct 2009 11:27:04 +0200
Subject: [PATCH 19/34] add operator+ scalar to AutoDiffScalar

---
 unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 888aa5c8c..fc5e237ab 100644
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -108,6 +108,22 @@ class AutoDiffScalar
     inline const DerType& derivatives() const { return m_derivatives; }
     inline DerType& derivatives() { return m_derivatives; }
 
+    inline const AutoDiffScalar<DerType> operator+(const Scalar& other) const
+    {
+      return AutoDiffScalar<DerType>(m_value + other, m_derivatives);
+    }
+
+    friend inline const AutoDiffScalar<DerType> operator+(const Scalar& a, const AutoDiffScalar& b)
+    {
+      return AutoDiffScalar<DerType>(a + b.value(), b.derivatives());
+    }
+
+    inline AutoDiffScalar& operator+=(const Scalar& other)
+    {
+      value() += other;
+      return *this;
+    }
+
     template<typename OtherDerType>
     inline const AutoDiffScalar<CwiseBinaryOp<ei_scalar_sum_op<Scalar>,DerType,OtherDerType> >
     operator+(const AutoDiffScalar<OtherDerType>& other) const

From 7b0c4102facc9b5f6ca99ef76febb05a9499b8b0 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 16 Oct 2009 13:22:38 +0200
Subject: [PATCH 20/34] * add a Make* expression type builder to allow the  
 construction of generic expressions working   for both dense and sparse
 matrix. A nicer solution   would be to use CwiseBinaryOp for any kind of
 matrix.   To this end we either need to change the overall design   so that
 the base class(es) depends on the kind of matrix,   or we could add a
 template parameter to each expression   type (e.g., int Kind =
 ei_traits<MatrixType>::Kind)   allowing to specialize each expression for
 each kind of matrix. * Extend AutoDiffScalar to work with sparse vector
 expression   for the derivatives.

---
 Eigen/Core                                    |  1 +
 Eigen/Sparse                                  |  1 +
 Eigen/src/Core/ExpressionMaker.h              | 61 +++++++++++++
 Eigen/src/Sparse/SparseExpressionMaker.h      | 48 +++++++++++
 .../Eigen/src/AutoDiff/AutoDiffJacobian.h     |  4 +-
 .../Eigen/src/AutoDiff/AutoDiffScalar.h       | 85 ++++++++++---------
 6 files changed, 161 insertions(+), 39 deletions(-)
 create mode 100644 Eigen/src/Core/ExpressionMaker.h
 create mode 100644 Eigen/src/Sparse/SparseExpressionMaker.h

diff --git a/Eigen/Core b/Eigen/Core
index c8fcb1c09..3dce6422f 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -200,6 +200,7 @@ namespace Eigen {
 #include "src/Core/products/TriangularMatrixMatrix.h"
 #include "src/Core/products/TriangularSolverMatrix.h"
 #include "src/Core/BandMatrix.h"
+#include "src/Core/ExpressionMaker.h"
 
 } // namespace Eigen
 
diff --git a/Eigen/Sparse b/Eigen/Sparse
index a8888daa3..96bd61419 100644
--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@@ -110,6 +110,7 @@ namespace Eigen {
 #include "src/Sparse/SparseLLT.h"
 #include "src/Sparse/SparseLDLT.h"
 #include "src/Sparse/SparseLU.h"
+#include "src/Sparse/SparseExpressionMaker.h"
 
 #ifdef EIGEN_CHOLMOD_SUPPORT
 # include "src/Sparse/CholmodSupport.h"
diff --git a/Eigen/src/Core/ExpressionMaker.h b/Eigen/src/Core/ExpressionMaker.h
new file mode 100644
index 000000000..1d265b63c
--- /dev/null
+++ b/Eigen/src/Core/ExpressionMaker.h
@@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_EXPRESSIONMAKER_H
+#define EIGEN_EXPRESSIONMAKER_H
+
+// computes the shape of a matrix from its traits flag
+template<typename XprType> struct ei_shape_of
+{
+  enum { ret = ei_traits<XprType>::Flags&SparseBit ? IsSparse : IsDense };
+};
+
+
+// Since the Sparse module is completely separated from the Core module, there is
+// no way to write the type of a generic expression working for both dense and sparse
+// matrix. Unless we change the overall design, here is a workaround.
+// There is an example in unsuported/Eigen/src/AutoDiff/AutoDiffScalar.
+
+template<typename XprType, int Shape = ei_shape_of<XprType>::ret>
+struct MakeNestByValue
+{
+  typedef NestByValue<XprType> Type;
+};
+
+template<typename Func, typename XprType, int Shape = ei_shape_of<XprType>::ret>
+struct MakeCwiseUnaryOp
+{
+  typedef CwiseUnaryOp<Func,XprType> Type;
+};
+
+template<typename Func, typename A, typename B, int Shape = ei_shape_of<A>::ret>
+struct MakeCwiseBinaryOp
+{
+  typedef CwiseBinaryOp<Func,A,B> Type;
+};
+
+// TODO complete the list
+
+
+#endif // EIGEN_EXPRESSIONMAKER_H
diff --git a/Eigen/src/Sparse/SparseExpressionMaker.h b/Eigen/src/Sparse/SparseExpressionMaker.h
new file mode 100644
index 000000000..1fdcbb1f2
--- /dev/null
+++ b/Eigen/src/Sparse/SparseExpressionMaker.h
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_SPARSE_EXPRESSIONMAKER_H
+#define EIGEN_SPARSE_EXPRESSIONMAKER_H
+
+template<typename XprType>
+struct MakeNestByValue<XprType,IsSparse>
+{
+  typedef SparseNestByValue<XprType> Type;
+};
+
+template<typename Func, typename XprType>
+struct MakeCwiseUnaryOp<Func,XprType,IsSparse>
+{
+  typedef SparseCwiseUnaryOp<Func,XprType> Type;
+};
+
+template<typename Func, typename A, typename B>
+struct MakeCwiseBinaryOp<Func,A,B,IsSparse>
+{
+  typedef SparseCwiseBinaryOp<Func,A,B> Type;
+};
+
+// TODO complete the list
+
+#endif // EIGEN_SPARSE_EXPRESSIONMAKER_H
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
index a5e881487..b3983f8a6 100644
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
@@ -50,10 +50,12 @@ public:
   typedef typename Functor::InputType InputType;
   typedef typename Functor::ValueType ValueType;
   typedef typename Functor::JacobianType JacobianType;
+  typedef typename JacobianType::Scalar Scalar;
 
-  typedef Matrix<double,InputsAtCompileTime,1> DerivativeType;
+  typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
   typedef AutoDiffScalar<DerivativeType> ActiveScalar;
 
+
   typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
   typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
 
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index fc5e237ab..2fb733a99 100644
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -42,9 +42,17 @@ void ei_make_coherent(const A& a, const B&b)
 /** \class AutoDiffScalar
   * \brief A scalar type replacement with automatic differentation capability
   *
-  * \param DerType the vector type used to store/represent the derivatives (e.g. Vector3f)
+  * \param _DerType the vector type used to store/represent the derivatives. The base scalar type
+  *                 as well as the number of derivatives to compute are determined from this type.
+  *                 Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf
+  *                 if the number of derivatives is not known at compile time, and/or, the number
+  *                 of derivatives is large.
+  *                 Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a
+  *                 existing vector into an AutoDiffScalar.
+  *                 Finally, _DerType can also be any Eigen compatible expression.
   *
-  * This class represents a scalar value while tracking its respective derivatives.
+  * This class represents a scalar value while tracking its respective derivatives using Eigen's expression
+  * template mechanism.
   *
   * It supports the following list of global math function:
   *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
@@ -56,10 +64,11 @@ void ei_make_coherent(const A& a, const B&b)
   * while derivatives are computed right away.
   *
   */
-template<typename DerType>
+template<typename _DerType>
 class AutoDiffScalar
 {
   public:
+    typedef typename ei_cleantype<_DerType>::type DerType;
     typedef typename ei_traits<DerType>::Scalar Scalar;
 
     inline AutoDiffScalar() {}
@@ -108,12 +117,12 @@ class AutoDiffScalar
     inline const DerType& derivatives() const { return m_derivatives; }
     inline DerType& derivatives() { return m_derivatives; }
 
-    inline const AutoDiffScalar<DerType> operator+(const Scalar& other) const
+    inline const AutoDiffScalar<DerType&> operator+(const Scalar& other) const
     {
       return AutoDiffScalar<DerType>(m_value + other, m_derivatives);
     }
 
-    friend inline const AutoDiffScalar<DerType> operator+(const Scalar& a, const AutoDiffScalar& b)
+    friend inline const AutoDiffScalar<DerType&> operator+(const Scalar& a, const AutoDiffScalar& b)
     {
       return AutoDiffScalar<DerType>(a + b.value(), b.derivatives());
     }
@@ -125,11 +134,11 @@ class AutoDiffScalar
     }
 
     template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseBinaryOp<ei_scalar_sum_op<Scalar>,DerType,OtherDerType> >
+    inline const AutoDiffScalar<typename MakeCwiseBinaryOp<ei_scalar_sum_op<Scalar>,DerType,typename ei_cleantype<OtherDerType>::type>::Type >
     operator+(const AutoDiffScalar<OtherDerType>& other) const
     {
       ei_make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseBinaryOp<ei_scalar_sum_op<Scalar>,DerType,OtherDerType> >(
+      return AutoDiffScalar<typename MakeCwiseBinaryOp<ei_scalar_sum_op<Scalar>,DerType,typename ei_cleantype<OtherDerType>::type>::Type >(
         m_value + other.value(),
         m_derivatives + other.derivatives());
     }
@@ -143,11 +152,11 @@ class AutoDiffScalar
     }
 
     template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseBinaryOp<ei_scalar_difference_op<Scalar>, DerType,OtherDerType> >
+    inline const AutoDiffScalar<typename MakeCwiseBinaryOp<ei_scalar_difference_op<Scalar>, DerType,typename ei_cleantype<OtherDerType>::type>::Type >
     operator-(const AutoDiffScalar<OtherDerType>& other) const
     {
       ei_make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseBinaryOp<ei_scalar_difference_op<Scalar>, DerType,OtherDerType> >(
+      return AutoDiffScalar<typename MakeCwiseBinaryOp<ei_scalar_difference_op<Scalar>, DerType,typename ei_cleantype<OtherDerType>::type>::Type >(
         m_value - other.value(),
         m_derivatives - other.derivatives());
     }
@@ -161,73 +170,73 @@ class AutoDiffScalar
     }
 
     template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseUnaryOp<ei_scalar_opposite_op<Scalar>, DerType> >
+    inline const AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_opposite_op<Scalar>, DerType>::Type >
     operator-() const
     {
-      return AutoDiffScalar<CwiseUnaryOp<ei_scalar_opposite_op<Scalar>, DerType> >(
+      return AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_opposite_op<Scalar>, DerType>::Type >(
         -m_value,
         -m_derivatives);
     }
 
-    inline const AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >
+    inline const AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >
     operator*(const Scalar& other) const
     {
-      return AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >(
+      return AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >(
         m_value * other,
         (m_derivatives * other));
     }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >
+    friend inline const AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >
     operator*(const Scalar& other, const AutoDiffScalar& a)
     {
-      return AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >(
+      return AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >(
         a.value() * other,
         a.derivatives() * other);
     }
 
-    inline const AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >
+    inline const AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >
     operator/(const Scalar& other) const
     {
-      return AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >(
+      return AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >(
         m_value / other,
         (m_derivatives * (Scalar(1)/other)));
     }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >
+    friend inline const AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >
     operator/(const Scalar& other, const AutoDiffScalar& a)
     {
-      return AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >(
+      return AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >(
         other / a.value(),
         a.derivatives() * (-Scalar(1)/other));
     }
 
     template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>,
-        NestByValue<CwiseBinaryOp<ei_scalar_difference_op<Scalar>,
-          NestByValue<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >,
-          NestByValue<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, OtherDerType> > > > > >
+    inline const AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>,
+        typename MakeNestByValue<typename MakeCwiseBinaryOp<ei_scalar_difference_op<Scalar>,
+          typename MakeNestByValue<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type>::Type,
+          typename MakeNestByValue<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, typename ei_cleantype<OtherDerType>::type>::Type>::Type >::Type >::Type >::Type >
     operator/(const AutoDiffScalar<OtherDerType>& other) const
     {
       ei_make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>,
-        NestByValue<CwiseBinaryOp<ei_scalar_difference_op<Scalar>,
-          NestByValue<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >,
-          NestByValue<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, OtherDerType> > > > > >(
+      return AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>,
+        typename MakeNestByValue<typename MakeCwiseBinaryOp<ei_scalar_difference_op<Scalar>,
+          typename MakeNestByValue<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type>::Type,
+          typename MakeNestByValue<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, typename ei_cleantype<OtherDerType>::type>::Type>::Type >::Type >::Type >::Type >(
         m_value / other.value(),
           ((m_derivatives * other.value()).nestByValue() - (m_value * other.derivatives()).nestByValue()).nestByValue()
         * (Scalar(1)/(other.value()*other.value())));
     }
 
     template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseBinaryOp<ei_scalar_sum_op<Scalar>,
-        NestByValue<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >,
-        NestByValue<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, OtherDerType> > > >
+    inline const AutoDiffScalar<typename MakeCwiseBinaryOp<ei_scalar_sum_op<Scalar>,
+        typename MakeNestByValue<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type>::Type,
+        typename MakeNestByValue<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, typename ei_cleantype<OtherDerType>::type>::Type>::Type >::Type >
     operator*(const AutoDiffScalar<OtherDerType>& other) const
     {
       ei_make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseBinaryOp<ei_scalar_sum_op<Scalar>,
-        NestByValue<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >,
-        NestByValue<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, OtherDerType> > > >(
+      return AutoDiffScalar<typename MakeCwiseBinaryOp<ei_scalar_sum_op<Scalar>,
+        typename MakeNestByValue<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type>::Type,
+        typename MakeNestByValue<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, typename ei_cleantype<OtherDerType>::type>::Type>::Type >::Type >(
         m_value * other.value(),
         (m_derivatives * other.value()).nestByValue() + (m_value * other.derivatives()).nestByValue());
     }
@@ -299,11 +308,11 @@ struct ei_make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRo
 
 #define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
   template<typename DerType> \
-  inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::ei_scalar_multiple_op<typename Eigen::ei_traits<DerType>::Scalar>, DerType> > \
+  inline const Eigen::AutoDiffScalar<typename Eigen::MakeCwiseUnaryOp<Eigen::ei_scalar_multiple_op<typename Eigen::ei_traits<DerType>::Scalar>, DerType>::Type > \
   FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
     using namespace Eigen; \
     typedef typename ei_traits<DerType>::Scalar Scalar; \
-    typedef AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> > ReturnType; \
+    typedef AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type > ReturnType; \
     CODE; \
   }
 
@@ -330,12 +339,12 @@ namespace std
     return ReturnType(std::log(x.value),x.derivatives() * (Scalar(1).x.value()));)
 
   template<typename DerType>
-  inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::ei_scalar_multiple_op<typename Eigen::ei_traits<DerType>::Scalar>, DerType> >
+  inline const Eigen::AutoDiffScalar<typename Eigen::MakeCwiseUnaryOp<Eigen::ei_scalar_multiple_op<typename Eigen::ei_traits<DerType>::Scalar>, DerType>::Type >
   pow(const Eigen::AutoDiffScalar<DerType>& x, typename Eigen::ei_traits<DerType>::Scalar y)
   {
     using namespace Eigen;
     typedef typename ei_traits<DerType>::Scalar Scalar;
-    return AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType> >(
+    return AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<Scalar>, DerType>::Type >(
       std::pow(x.value(),y),
       x.derivatives() * (y * std::pow(x.value(),y-1)));
   }
@@ -375,7 +384,7 @@ EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(ei_log,
   return ReturnType(ei_log(x.value),x.derivatives() * (Scalar(1).x.value()));)
 
 template<typename DerType>
-inline const AutoDiffScalar<CwiseUnaryOp<ei_scalar_multiple_op<typename ei_traits<DerType>::Scalar>, DerType> >
+inline const AutoDiffScalar<typename MakeCwiseUnaryOp<ei_scalar_multiple_op<typename ei_traits<DerType>::Scalar>, DerType>::Type >
 ei_pow(const AutoDiffScalar<DerType>& x, typename ei_traits<DerType>::Scalar y)
 { return std::pow(x,y);}
 

From 5e3e6ff71a314b0a0ca569d6a7726757a39e9434 Mon Sep 17 00:00:00 2001
From: Hauke Heibel <hauke.heibel@gmail.com>
Date: Tue, 20 Oct 2009 22:08:13 +0200
Subject: [PATCH 21/34] Added Windows support to the BenchTimer.

---
 bench/BenchTimer.h | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h
index bfc3a99b3..c1f473597 100644
--- a/bench/BenchTimer.h
+++ b/bench/BenchTimer.h
@@ -26,8 +26,14 @@
 #ifndef EIGEN_BENCH_TIMER_H
 #define EIGEN_BENCH_TIMER_H
 
+#ifndef WIN32
 #include <sys/time.h>
 #include <unistd.h>
+#else
+#define NOMINMAX
+#include <windows.h>
+#endif
+
 #include <cstdlib>
 #include <numeric>
 
@@ -40,7 +46,15 @@ class BenchTimer
 {
 public:
 
-  BenchTimer() { reset(); }
+  BenchTimer() 
+  { 
+#ifdef WIN32
+    LARGE_INTEGER freq;
+    QueryPerformanceFrequency(&freq);
+    m_frequency = (double)freq.QuadPart;
+#endif
+    reset(); 
+  }
 
   ~BenchTimer() {}
 
@@ -51,23 +65,35 @@ public:
     m_best = std::min(m_best, getTime() - m_start);
   }
 
-  /** Return the best elapsed time.
+  /** Return the best elapsed time in seconds.
     */
   inline double value(void)
   {
-      return m_best;
+    return m_best;
   }
 
+#ifdef WIN32
+  inline double getTime(void)
+#else
   static inline double getTime(void)
+#endif
   {
+#ifdef WIN32
+    LARGE_INTEGER query_ticks;
+    QueryPerformanceCounter(&query_ticks);
+    return query_ticks.QuadPart/m_frequency;
+#else
       struct timeval tv;
       struct timezone tz;
       gettimeofday(&tv, &tz);
       return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
+#endif
   }
 
 protected:
-
+#ifdef WIN32
+  double m_frequency;
+#endif
   double m_best, m_start;
 
 };

From 471b4d509234cbcbd4a1cd45d48fe10efcc2bcf1 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <jacob.benoit.1@gmail.com>
Date: Tue, 20 Oct 2009 21:53:54 -0400
Subject: [PATCH 22/34] handle mark's first commits before he configured his id

---
 scripts/eigen_gen_credits.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/scripts/eigen_gen_credits.cpp b/scripts/eigen_gen_credits.cpp
index 086548e26..d7a625d47 100644
--- a/scripts/eigen_gen_credits.cpp
+++ b/scripts/eigen_gen_credits.cpp
@@ -13,10 +13,24 @@ using namespace std;
 std::string contributor_name(const std::string& line)
 {
   string result;
+
+  // let's first take care of the case of isolated email addresses, like
+  // "user@localhost.localdomain" entries
+  if(line.find("markb@localhost.localdomain") != string::npos)
+  {
+    return "Mark Borgerding";
+  }
+
+  // from there on we assume that we have a entry of the form
+  // either:
+  //   Bla bli Blurp
+  // or:
+  //   Bla bli Blurp <bblurp@email.com>
+  
   size_t position_of_email_address = line.find_first_of('<');
   if(position_of_email_address != string::npos)
   {
-    // there is an e-mail address.
+    // there is an e-mail address in <...>.
     
     // Hauke once committed as "John Smith", fix that.
     if(line.find("hauke.heibel") != string::npos)
@@ -29,7 +43,7 @@ std::string contributor_name(const std::string& line)
   }
   else
   {
-    // there is no e-mail address.
+    // there is no e-mail address in <...>.
     
     if(line.find("convert-repo") != string::npos)
       result = "";

From c3180b7ffbc98d69764b3c1ab17b36e289f7cf7e Mon Sep 17 00:00:00 2001
From: Benoit Jacob <jacob.benoit.1@gmail.com>
Date: Tue, 20 Oct 2009 23:25:49 -0400
Subject: [PATCH 23/34] MatrixBase:  * support resize() to same size (nop). The
 case of FFT was another case where that make one's life far easier.    hope
 that's ok with you Gael. but indeed, i don't use it in the ReturnByValue
 stuff.

FFT:
 * Support MatrixBase (well, in the case with direct memory access such as Map)
 * adapt unit test
---
 Eigen/src/Core/MatrixBase.h        |  19 +++++
 Eigen/src/Core/util/StaticAssert.h |   3 +-
 unsupported/Eigen/FFT              |  32 +++++++-
 unsupported/test/FFT.cpp           | 127 ++++++++++++++++++++++-------
 4 files changed, 148 insertions(+), 33 deletions(-)

diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 387c11385..0e8b49f75 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -190,6 +190,25 @@ template<typename Derived> class MatrixBase
       * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
     int innerSize() const { return (int(Flags)&RowMajorBit) ? this->cols() : this->rows(); }
 
+    /** Only plain matrices, not expressions may be resized; therefore the only useful resize method is
+      * Matrix::resize(). The present method only asserts that the new size equals the old size, and does
+      * nothing else.
+      */
+    void resize(int size)
+    {
+      ei_assert(size == this->size()
+                && "MatrixBase::resize() does not actually allow to resize.");
+    }
+    /** Only plain matrices, not expressions may be resized; therefore the only useful resize method is
+      * Matrix::resize(). The present method only asserts that the new size equals the old size, and does
+      * nothing else.
+      */
+    void resize(int rows, int cols)
+    {
+      ei_assert(rows == this->rows() && cols == this->cols()
+                && "MatrixBase::resize() does not actually allow to resize.");
+    }
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily
       * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 883f2d95e..6210bc91c 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -78,7 +78,8 @@
         INVALID_MATRIX_TEMPLATE_PARAMETERS,
         BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER,
         THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX,
-        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE
+        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE,
+        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES
       };
     };
 
diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index dc7e85908..fafdb829b 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -36,7 +36,7 @@
 #define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
-// intel Math Kernel Library: fastest, commerical -- incompatible with Eigen in GPL form
+// intel Math Kernel Library: fastest, commercial -- incompatible with Eigen in GPL form
 #ifdef _MKL_DFTI_H_ // mkl_dfti.h has been included, we can use MKL FFT routines
 // TODO 
 // #include "src/FFT/ei_imkl_impl.h"
@@ -70,6 +70,20 @@ class FFT
         fwd( &dst[0],&src[0],src.size() );
     }
 
+    template<typename InputDerived, typename ComplexDerived>
+    void fwd( MatrixBase<ComplexDerived> & dst, const MatrixBase<InputDerived> & src)
+    {
+        EIGEN_STATIC_ASSERT_VECTOR_ONLY(InputDerived)
+        EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
+        EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,InputDerived) // size at compile-time
+        EIGEN_STATIC_ASSERT((ei_is_same_type<typename ComplexDerived::Scalar, Complex>::ret),
+                            YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+        EIGEN_STATIC_ASSERT(int(InputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
+                            THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
+        dst.derived().resize( src.size() );
+        fwd( &dst[0],&src[0],src.size() );
+    }
+
     template <typename _Output>
     void inv( _Output * dst, const Complex * src, int nfft)
     {
@@ -83,8 +97,24 @@ class FFT
         inv( &dst[0],&src[0],src.size() );
     }
 
+    template<typename OutputDerived, typename ComplexDerived>
+    void inv( MatrixBase<OutputDerived> & dst, const MatrixBase<ComplexDerived> & src)
+    {
+        EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
+        EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
+        EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,OutputDerived) // size at compile-time
+        EIGEN_STATIC_ASSERT((ei_is_same_type<typename ComplexDerived::Scalar, Complex>::ret),
+                            YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+        EIGEN_STATIC_ASSERT(int(OutputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
+                            THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
+        dst.derived().resize( src.size() );
+        inv( &dst[0],&src[0],src.size() );
+    }
+
     // TODO: multi-dimensional FFTs
+    
     // TODO: handle Eigen MatrixBase
+     // ---> i added fwd and inv specializations above + unit test, is this enough? (bjacob)
 
     traits_type & traits() {return m_traits;}
   private:
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index f0b9b68bf..cc68f3718 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -39,16 +39,16 @@ complex<long double>  promote(double x) { return complex<long double>( x); }
 complex<long double>  promote(long double x) { return complex<long double>( x); }
     
 
-    template <typename T1,typename T2>
-    long double fft_rmse( const vector<T1> & fftbuf,const vector<T2> & timebuf)
+    template <typename VectorType1,typename VectorType2>
+    long double fft_rmse( const VectorType1 & fftbuf,const VectorType2 & timebuf)
     {
         long double totalpower=0;
         long double difpower=0;
         cerr <<"idx\ttruth\t\tvalue\t|dif|=\n";
-        for (size_t k0=0;k0<fftbuf.size();++k0) {
+        for (size_t k0=0;k0<size_t(fftbuf.size());++k0) {
             complex<long double> acc = 0;
             long double phinc = -2.*k0* M_PIl / timebuf.size();
-            for (size_t k1=0;k1<timebuf.size();++k1) {
+            for (size_t k1=0;k1<size_t(timebuf.size());++k1) {
                 acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
             }
             totalpower += norm(acc);
@@ -61,8 +61,8 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
         return sqrt(difpower/totalpower);
     }
 
-    template <typename T1,typename T2>
-    long double dif_rmse( const vector<T1> buf1,const vector<T2> buf2)
+    template <typename VectorType1,typename VectorType2>
+    long double dif_rmse( const VectorType1& buf1,const VectorType2& buf2)
     {
         long double totalpower=0;
         long double difpower=0;
@@ -74,35 +74,59 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
         return sqrt(difpower/totalpower);
     }
 
-template <class T>
-void test_scalar(int nfft)
+enum { StdVectorContainer, EigenVectorContainer };
+
+template<int Container, typename Scalar> struct VectorType;
+
+template<typename Scalar> struct VectorType<StdVectorContainer,Scalar>
 {
-    typedef typename Eigen::FFT<T>::Complex Complex;
-    typedef typename Eigen::FFT<T>::Scalar Scalar;
+  typedef vector<Scalar> type;
+};
+
+template<typename Scalar> struct VectorType<EigenVectorContainer,Scalar>
+{
+  typedef Matrix<Scalar,Dynamic,1> type;
+};
+
+template <int Container, typename T>
+void test_scalar_generic(int nfft)
+{
+    typedef typename FFT<T>::Complex Complex;
+    typedef typename FFT<T>::Scalar Scalar;
+    typedef typename VectorType<Container,Scalar>::type ScalarVector;
+    typedef typename VectorType<Container,Complex>::type ComplexVector;
 
     FFT<T> fft;
-    vector<Scalar> inbuf(nfft);
-    vector<Complex> outbuf;
+    ScalarVector inbuf(nfft);
+    ComplexVector outbuf;
     for (int k=0;k<nfft;++k)
         inbuf[k]= (T)(rand()/(double)RAND_MAX - .5);
     fft.fwd( outbuf,inbuf);
     VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
 
-    vector<Scalar> buf3;
+    ScalarVector buf3;
     fft.inv( buf3 , outbuf);
     VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
 }
 
-template <class T>
-void test_complex(int nfft)
+template <typename T>
+void test_scalar(int nfft)
 {
-    typedef typename Eigen::FFT<T>::Complex Complex;
+  test_scalar_generic<StdVectorContainer,T>(nfft);
+  test_scalar_generic<EigenVectorContainer,T>(nfft);
+}
+
+template <int Container, typename T>
+void test_complex_generic(int nfft)
+{
+    typedef typename FFT<T>::Complex Complex;
+    typedef typename VectorType<Container,Complex>::type ComplexVector;
 
     FFT<T> fft;
 
-    vector<Complex> inbuf(nfft);
-    vector<Complex> outbuf;
-    vector<Complex> buf3;
+    ComplexVector inbuf(nfft);
+    ComplexVector outbuf;
+    ComplexVector buf3;
     for (int k=0;k<nfft;++k)
         inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
     fft.fwd( outbuf , inbuf);
@@ -114,22 +138,63 @@ void test_complex(int nfft)
     VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
 }
 
+template <typename T>
+void test_complex(int nfft)
+{
+  test_complex_generic<StdVectorContainer,T>(nfft);
+  test_complex_generic<EigenVectorContainer,T>(nfft);
+}
+
 void test_FFT()
 {
 
-  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
-  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); CALL_SUBTEST( test_complex<long double>(256) );
-  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
-  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
-  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
-  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
-  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+  CALL_SUBTEST( test_complex<float>(32) );
+  CALL_SUBTEST( test_complex<double>(32) );
+  CALL_SUBTEST( test_complex<long double>(32) );
+  
+  CALL_SUBTEST( test_complex<float>(256) );
+  CALL_SUBTEST( test_complex<double>(256) );
+  CALL_SUBTEST( test_complex<long double>(256) );
+  
+  CALL_SUBTEST( test_complex<float>(3*8) );
+  CALL_SUBTEST( test_complex<double>(3*8) );
+  CALL_SUBTEST( test_complex<long double>(3*8) );
+  
+  CALL_SUBTEST( test_complex<float>(5*32) );
+  CALL_SUBTEST( test_complex<double>(5*32) );
+  CALL_SUBTEST( test_complex<long double>(5*32) );
+  
+  CALL_SUBTEST( test_complex<float>(2*3*4) );
+  CALL_SUBTEST( test_complex<double>(2*3*4) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) );
+  CALL_SUBTEST( test_complex<double>(2*3*4*5*7) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
 
 
 
-  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
-  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
-  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); CALL_SUBTEST( test_scalar<long double>(50) );
-  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
-  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
+  CALL_SUBTEST( test_scalar<float>(32) );
+  CALL_SUBTEST( test_scalar<double>(32) );
+  CALL_SUBTEST( test_scalar<long double>(32) );
+  
+  CALL_SUBTEST( test_scalar<float>(45) );
+  CALL_SUBTEST( test_scalar<double>(45) );
+  CALL_SUBTEST( test_scalar<long double>(45) );
+  
+  CALL_SUBTEST( test_scalar<float>(50) );
+  CALL_SUBTEST( test_scalar<double>(50) );
+  CALL_SUBTEST( test_scalar<long double>(50) );
+  
+  CALL_SUBTEST( test_scalar<float>(256) );
+  CALL_SUBTEST( test_scalar<double>(256) );
+  CALL_SUBTEST( test_scalar<long double>(256) );
+  
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) );
+  CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) );
+  CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 }

From 85f8d1f0c65b3d0ce511b85a2847fec57637f4b6 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 21 Oct 2009 10:27:17 -0400
Subject: [PATCH 24/34] renamed 'Traits' to 'Impl', added vim modelines for
 syntax highlighting

---
 unsupported/Eigen/Complex |  2 ++
 unsupported/Eigen/FFT     | 19 ++++++++++---------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/unsupported/Eigen/Complex b/unsupported/Eigen/Complex
index a0483abdf..e1c41ab38 100644
--- a/unsupported/Eigen/Complex
+++ b/unsupported/Eigen/Complex
@@ -178,3 +178,5 @@ operator<< (std::basic_ostream<charT,traits>& ostr, const Complex<T>& rhs)
 }
 
 #endif
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+
diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index dc7e85908..1e96f4975 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -46,21 +46,21 @@
 namespace Eigen {
 
 template <typename _Scalar,
-         typename _Traits=DEFAULT_FFT_IMPL<_Scalar> 
+         typename _Impl=DEFAULT_FFT_IMPL<_Scalar> 
          >
 class FFT
 {
   public:
-    typedef _Traits traits_type;
-    typedef typename traits_type::Scalar Scalar;
-    typedef typename traits_type::Complex Complex;
+    typedef _Impl impl_type;
+    typedef typename impl_type::Scalar Scalar;
+    typedef typename impl_type::Complex Complex;
 
-    FFT(const traits_type & traits=traits_type() ) :m_traits(traits) { }
+    FFT(const impl_type & impl=impl_type() ) :m_impl(impl) { }
 
     template <typename _Input>
     void fwd( Complex * dst, const _Input * src, int nfft)
     {
-        m_traits.fwd(dst,src,nfft);
+        m_impl.fwd(dst,src,nfft);
     }
 
     template <typename _Input>
@@ -73,7 +73,7 @@ class FFT
     template <typename _Output>
     void inv( _Output * dst, const Complex * src, int nfft)
     {
-        m_traits.inv( dst,src,nfft );
+        m_impl.inv( dst,src,nfft );
     }
 
     template <typename _Output>
@@ -86,10 +86,11 @@ class FFT
     // TODO: multi-dimensional FFTs
     // TODO: handle Eigen MatrixBase
 
-    traits_type & traits() {return m_traits;}
+    impl_type & impl() {return m_impl;}
   private:
-    traits_type m_traits;
+    impl_type m_impl;
 };
 #undef DEFAULT_FFT_IMPL
 }
 #endif
+/* vim: set filetype=cpp et sw=2 ts=2 ai: */

From e3d08443dc272f740447de0147efc69cf7de1c93 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 21 Oct 2009 20:53:05 -0400
Subject: [PATCH 25/34] inlining,all namespace declaration moved to FFT,
 removed preprocessor definitions,

---
 unsupported/Eigen/FFT                       | 41 +++++++++++++--------
 unsupported/Eigen/src/FFT/ei_fftw_impl.h    | 38 ++++++++++++++++---
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 28 +++++++-------
 unsupported/test/CMakeLists.txt             |  2 +-
 4 files changed, 73 insertions(+), 36 deletions(-)

diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 97ec8e49b..36afdde8d 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -25,29 +25,39 @@
 #ifndef EIGEN_FFT_H
 #define EIGEN_FFT_H
 
-// ei_kissfft_impl:  small, free, reasonably efficient default, derived from kissfft
-#include "src/FFT/ei_kissfft_impl.h"
-#define DEFAULT_FFT_IMPL ei_kissfft_impl
+#include <complex>
+#include <vector>
+#include <map>
 
+#ifdef EIGEN_FFTW_DEFAULT
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
-#ifdef FFTW_ESTIMATE  // definition of FFTW_ESTIMATE indicates the caller has included fftw3.h, we can use FFTW routines
-#include "src/FFT/ei_fftw_impl.h"
-#undef DEFAULT_FFT_IMPL
-#define DEFAULT_FFT_IMPL ei_fftw_impl
-#endif
-
-// intel Math Kernel Library: fastest, commercial -- incompatible with Eigen in GPL form
-#ifdef _MKL_DFTI_H_ // mkl_dfti.h has been included, we can use MKL FFT routines
+#  include <fftw3.h>
+   namespace Eigen {
+#    include "src/FFT/ei_fftw_impl.h"
+     //template <typename T> typedef struct ei_fftw_impl  default_fft_impl; this does not work
+     template <typename T> struct default_fft_impl : public ei_fftw_impl<T> {};
+   }
+#elif defined EIGEN_MKL_DEFAULT
 // TODO 
-// #include "src/FFT/ei_imkl_impl.h"
-// #define DEFAULT_FFT_IMPL ei_imkl_impl
+// intel Math Kernel Library: fastest, commercial -- may be incompatible with Eigen in GPL form
+   namespace Eigen {
+#    include "src/FFT/ei_imklfft_impl.h"
+     template <typename T> struct default_fft_impl : public ei_imklfft_impl {};
+   }
+#else
+// ei_kissfft_impl:  small, free, reasonably efficient default, derived from kissfft
+//
+  namespace Eigen {
+#   include "src/FFT/ei_kissfft_impl.h"
+     template <typename T> 
+       struct default_fft_impl : public ei_kissfft_impl<T> {};
+  }
 #endif
 
 namespace Eigen {
 
 template <typename _Scalar,
-         typename _Impl=DEFAULT_FFT_IMPL<_Scalar> 
-         >
+         typename _Impl=default_fft_impl<_Scalar> >
 class FFT
 {
   public:
@@ -120,7 +130,6 @@ class FFT
   private:
     impl_type m_impl;
 };
-#undef DEFAULT_FFT_IMPL
 }
 #endif
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
index d592bbb20..e1f67f334 100644
--- a/unsupported/Eigen/src/FFT/ei_fftw_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
@@ -22,7 +22,8 @@
 // License and a copy of the GNU General Public License along with
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
-namespace Eigen {
+
+
   // FFTW uses non-const arguments
   // so we must use ugly const_cast calls for all the args it uses
   //
@@ -32,21 +33,25 @@ namespace Eigen {
   // 2. fftw_complex is compatible with std::complex
   //    This assumes std::complex<T> layout is array of size 2 with real,imag
   template <typename T> 
+  inline 
   T * ei_fftw_cast(const T* p) 
   { 
       return const_cast<T*>( p); 
   }
 
+  inline 
   fftw_complex * ei_fftw_cast( const std::complex<double> * p) 
-  { 
+  {
       return const_cast<fftw_complex*>( reinterpret_cast<const fftw_complex*>(p) ); 
   }
 
+  inline 
   fftwf_complex * ei_fftw_cast( const std::complex<float> * p) 
   { 
       return const_cast<fftwf_complex*>( reinterpret_cast<const fftwf_complex*>(p) ); 
   }
 
+  inline 
   fftwl_complex * ei_fftw_cast( const std::complex<long double> * p) 
   { 
       return const_cast<fftwl_complex*>( reinterpret_cast<const fftwl_complex*>(p) ); 
@@ -64,18 +69,22 @@ namespace Eigen {
       ei_fftw_plan() :m_plan(NULL) {}
       ~ei_fftw_plan() {if (m_plan) fftwf_destroy_plan(m_plan);}
 
+      inline
       void fwd(complex_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
           fftwf_execute_dft( m_plan, src,dst);
       }
+      inline
       void inv(complex_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
           fftwf_execute_dft( m_plan, src,dst);
       }
+      inline
       void fwd(complex_type * dst,scalar_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftwf_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
           fftwf_execute_dft_r2c( m_plan,src,dst);
       }
+      inline
       void inv(scalar_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL)
               m_plan = fftwf_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
@@ -91,18 +100,22 @@ namespace Eigen {
       ei_fftw_plan() :m_plan(NULL) {}
       ~ei_fftw_plan() {if (m_plan) fftw_destroy_plan(m_plan);}
 
+      inline
       void fwd(complex_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
           fftw_execute_dft( m_plan, src,dst);
       }
+      inline
       void inv(complex_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
           fftw_execute_dft( m_plan, src,dst);
       }
+      inline
       void fwd(complex_type * dst,scalar_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftw_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
           fftw_execute_dft_r2c( m_plan,src,dst);
       }
+      inline
       void inv(scalar_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL)
               m_plan = fftw_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
@@ -118,18 +131,22 @@ namespace Eigen {
       ei_fftw_plan() :m_plan(NULL) {}
       ~ei_fftw_plan() {if (m_plan) fftwl_destroy_plan(m_plan);}
 
+      inline
       void fwd(complex_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
           fftwl_execute_dft( m_plan, src,dst);
       }
+      inline
       void inv(complex_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
           fftwl_execute_dft( m_plan, src,dst);
       }
+      inline
       void fwd(complex_type * dst,scalar_type * src,int nfft) {
           if (m_plan==NULL) m_plan = fftwl_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
           fftwl_execute_dft_r2c( m_plan,src,dst);
       }
+      inline
       void inv(scalar_type * dst,complex_type * src,int nfft) {
           if (m_plan==NULL)
               m_plan = fftwl_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
@@ -143,17 +160,20 @@ namespace Eigen {
       typedef _Scalar Scalar;
       typedef std::complex<Scalar> Complex;
 
+      inline
       void clear() 
       {
         m_plans.clear();
       }
 
+      inline
       void fwd( Complex * dst,const Complex *src,int nfft)
       {
         get_plan(nfft,false,dst,src).fwd(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
       }
 
       // real-to-complex forward FFT
+      inline
       void fwd( Complex * dst,const Scalar * src,int nfft) 
       {
           get_plan(nfft,false,dst,src).fwd(ei_fftw_cast(dst), ei_fftw_cast(src) ,nfft);
@@ -163,30 +183,37 @@ namespace Eigen {
       }
 
       // inverse complex-to-complex
+      inline
       void inv(Complex * dst,const Complex  *src,int nfft)
       {
         get_plan(nfft,true,dst,src).inv(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
+
+        //TODO move scaling to Eigen::FFT
         // scaling
-        Scalar s = 1./nfft;
+        Scalar s = Scalar(1.)/nfft;
         for (int k=0;k<nfft;++k)
           dst[k] *= s;
       }
 
       // half-complex to scalar
+      inline
       void inv( Scalar * dst,const Complex * src,int nfft) 
       {
         get_plan(nfft,true,dst,src).inv(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
-        Scalar s = 1./nfft;
+
+        //TODO move scaling to Eigen::FFT
+        Scalar s = Scalar(1.)/nfft;
         for (int k=0;k<nfft;++k)
           dst[k] *= s;
       }
 
-  private:
+  protected:
       typedef ei_fftw_plan<Scalar> PlanData;
       typedef std::map<int,PlanData> PlanMap;
 
       PlanMap m_plans;
 
+      inline
       PlanData & get_plan(int nfft,bool inverse,void * dst,const void * src)
       {
           bool inplace = (dst==src);
@@ -195,4 +222,3 @@ namespace Eigen {
           return m_plans[key];
       }
   };
-}
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index a84ac68a0..c068d8765 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -22,11 +22,7 @@
 // License and a copy of the GNU General Public License along with
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
-#include <complex>
-#include <vector>
-#include <map>
 
-namespace Eigen {
 
   // This FFT implementation was derived from kissfft http:sourceforge.net/projects/kissfft
   // Copyright 2003-2009 Mark Borgerding
@@ -51,13 +47,6 @@ namespace Eigen {
           m_twiddles[i] = exp( Complex(0,i*phinc) );
       }
 
-      void conjugate()
-      {
-        m_inverse = !m_inverse;
-        for ( size_t i=0;i<m_twiddles.size() ;++i)
-          m_twiddles[i] = conj( m_twiddles[i] );
-      }
-
       void factorize(int nfft)
       {
         //start factoring out 4's, then 2's, then 3,5,7,9,...
@@ -116,6 +105,7 @@ namespace Eigen {
           }
         }
 
+      inline
       void bfly2( Complex * Fout, const size_t fstride, int m)
       {
         for (int k=0;k<m;++k) {
@@ -125,6 +115,7 @@ namespace Eigen {
         }
       }
 
+      inline
       void bfly4( Complex * Fout, const size_t fstride, const size_t m)
       {
         Complex scratch[6];
@@ -147,6 +138,7 @@ namespace Eigen {
         }
       }
 
+      inline
       void bfly3( Complex * Fout, const size_t fstride, const size_t m)
       {
         size_t k=m;
@@ -175,6 +167,7 @@ namespace Eigen {
         }while(--k);
       }
 
+      inline
       void bfly5( Complex * Fout, const size_t fstride, const size_t m)
       {
         Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
@@ -241,6 +234,7 @@ namespace Eigen {
       }
 
       /* perform the butterfly for one stage of a mixed radix FFT */
+      inline
       void bfly_generic(
           Complex * Fout,
           const size_t fstride,
@@ -290,6 +284,7 @@ namespace Eigen {
       }
 
       template <typename _Src>
+      inline
         void fwd( Complex * dst,const _Src *src,int nfft)
         {
           get_plan(nfft,false).work(0, dst, src, 1,1);
@@ -299,6 +294,7 @@ namespace Eigen {
       // perform two FFTs of src even and src odd
       // then twiddle to recombine them into the half-spectrum format
       // then fill in the conjugate symmetric half
+      inline
       void fwd( Complex * dst,const Scalar * src,int nfft) 
       {
         if ( nfft&3  ) {
@@ -334,6 +330,7 @@ namespace Eigen {
       }
 
       // inverse complex-to-complex
+      inline
       void inv(Complex * dst,const Complex  *src,int nfft)
       {
         get_plan(nfft,true).work(0, dst, src, 1,1);
@@ -341,6 +338,7 @@ namespace Eigen {
       }
 
       // half-complex to scalar
+      inline
       void inv( Scalar * dst,const Complex * src,int nfft) 
       {
         if (nfft&3) {
@@ -369,7 +367,7 @@ namespace Eigen {
         }
       }
 
-      private:
+      protected:
       typedef ei_kiss_cpx_fft<Scalar> PlanData;
       typedef std::map<int,PlanData> PlanMap;
 
@@ -377,8 +375,10 @@ namespace Eigen {
       std::map<int, std::vector<Complex> > m_realTwiddles;
       std::vector<Complex> m_tmpBuf;
 
+      inline
       int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
 
+      inline
       PlanData & get_plan(int nfft,bool inverse)
       {
         // TODO look for PlanKey(nfft, ! inverse) and conjugate the twiddles
@@ -390,6 +390,7 @@ namespace Eigen {
         return pd;
       }
 
+      inline
       Complex * real_twiddles(int ncfft2)
       {
         std::vector<Complex> & twidref = m_realTwiddles[ncfft2];// creates new if not there
@@ -403,10 +404,11 @@ namespace Eigen {
         return &twidref[0];
       }
 
+      // TODO move scaling up into Eigen::FFT
+      inline
       void scale(Complex *dst,int n,Scalar s) 
       {
         for (int k=0;k<n;++k)
           dst[k] *= s;
       }
     };
-}
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index f42077bdc..d182c9abf 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -23,6 +23,6 @@ ei_add_test(FFT)
 
 find_package(FFTW)
 if(FFTW_FOUND)
-  ei_add_test(FFTW  " " "-lfftw3 -lfftw3f -lfftw3l" )
+  ei_add_test(FFTW  "-DEIGEN_FFTW_DEFAULT " "-lfftw3 -lfftw3f -lfftw3l" )
 endif(FFTW_FOUND)
 

From 99bb29abcfc4a62b8d099da6fb045f8d9a22cdf3 Mon Sep 17 00:00:00 2001
From: Hauke Heibel <hauke.heibel@gmail.com>
Date: Thu, 22 Oct 2009 10:11:26 +0200
Subject: [PATCH 26/34] demeaning with colwise expression

---
 Eigen/src/Geometry/Umeyama.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h
index 7652aa92e..d0f36d2c9 100644
--- a/Eigen/src/Geometry/Umeyama.h
+++ b/Eigen/src/Geometry/Umeyama.h
@@ -131,17 +131,11 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   const VectorType dst_mean = dst.rowwise().sum() * one_over_n;
 
   // demeaning of src and dst points
-  RowMajorMatrixType src_demean(m,n);
-  RowMajorMatrixType dst_demean(m,n);
-  for (int i=0; i<n; ++i)
-  {
-    src_demean.col(i) = src.col(i) - src_mean;
-    dst_demean.col(i) = dst.col(i) - dst_mean;
-  }
+  const RowMajorMatrixType src_demean = src.colwise() - src_mean;
+  const RowMajorMatrixType dst_demean = dst.colwise() - dst_mean;
 
   // Eq. (36)-(37)
   const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n;
-  // const Scalar dst_var = dst_demean.rowwise().squaredNorm().sum() * one_over_n;
 
   // Eq. (38)
   const MatrixType sigma = one_over_n * dst_demean * src_demean.transpose();

From 83a7b7c44c4842e1cfa89179624f4448d39a9e5b Mon Sep 17 00:00:00 2001
From: Benoit Jacob <jacob.benoit.1@gmail.com>
Date: Thu, 22 Oct 2009 15:56:10 -0400
Subject: [PATCH 27/34] support gcc 3.3

---
 test/geo_hyperplane.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp
index f8281a16b..010989feb 100644
--- a/test/geo_hyperplane.cpp
+++ b/test/geo_hyperplane.cpp
@@ -121,7 +121,8 @@ template<typename Scalar> void lines()
     VERIFY_IS_APPROX(result, center);
 
     // check conversions between two types of lines
-    CoeffsType converted_coeffs = HLine(PLine(line_u)).coeffs();
+    PLine pl(line_u); // gcc 3.3 will commit suicide if we don't name this variable
+    CoeffsType converted_coeffs = HLine(pl).coeffs();
     converted_coeffs *= (line_u.coeffs()[0])/(converted_coeffs[0]);
     VERIFY(line_u.coeffs().isApprox(converted_coeffs));
   }

From a382963b04d4b847c6b0ef0611511fb6a8e11718 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 23 Oct 2009 14:26:14 +0200
Subject: [PATCH 28/34] * extend Map to allow the user to specify whether the
 mapped data   is aligned or not. This is done using the Aligned constant:  
 Map<MatrixType,Aligned>::Map(data); * rename ForceAligned to
 EnforceAlignedAccess, and update its doc,   and emphasize this is mainly an
 internal stuff.

---
 Eigen/src/Core/Block.h          | 18 +++++++--------
 Eigen/src/Core/Map.h            | 35 ++++++++++++++++-------------
 Eigen/src/Core/MapBase.h        | 40 +++++++++++++++++++++++----------
 Eigen/src/Core/StableNorm.h     |  2 +-
 Eigen/src/Core/util/Constants.h |  4 ++--
 test/map.cpp                    |  7 +++---
 6 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index cebfeaf75..5fffdcb01 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -33,10 +33,10 @@
   * \param MatrixType the type of the object in which we are taking a block
   * \param BlockRows the number of rows of the block we are taking at compile time (optional)
   * \param BlockCols the number of columns of the block we are taking at compile time (optional)
-  * \param _PacketAccess allows to enforce aligned loads and stores if set to \b ForceAligned.
-  *                      The default is \b AsRequested. This parameter is internaly used by Eigen
-  *                      in expressions such as \code mat.block() += other; \endcode and most of
-  *                      the time this is the only way it is used.
+  * \param _PacketAccess \internal used to enforce aligned loads in expressions such as
+  *                      \code mat.block() += other; \endcode. Possible values are
+  *                      \c AsRequested (default) and \c EnforceAlignedAccess.
+  *                      See class MapBase for more details.
   * \param _DirectAccessStatus \internal used for partial specialization
   *
   * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
@@ -84,9 +84,9 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectA
     CoeffReadCost = ei_traits<MatrixType>::CoeffReadCost,
     PacketAccess = _PacketAccess
   };
-  typedef typename ei_meta_if<int(PacketAccess)==ForceAligned,
+  typedef typename ei_meta_if<int(PacketAccess)==EnforceAlignedAccess,
                  Block<MatrixType, BlockRows, BlockCols, _PacketAccess, _DirectAccessStatus>&,
-                 Block<MatrixType, BlockRows, BlockCols, ForceAligned, _DirectAccessStatus> >::ret AlignedDerivedType;
+                 Block<MatrixType, BlockRows, BlockCols, EnforceAlignedAccess, _DirectAccessStatus> >::ret AlignedDerivedType;
 };
 
 template<typename MatrixType, int BlockRows, int BlockCols, int PacketAccess, int _DirectAccessStatus> class Block
@@ -228,13 +228,13 @@ class Block<MatrixType,BlockRows,BlockCols,PacketAccess,HasDirectAccess>
 
     class InnerIterator;
     typedef typename ei_traits<Block>::AlignedDerivedType AlignedDerivedType;
-    friend class Block<MatrixType,BlockRows,BlockCols,PacketAccess==AsRequested?ForceAligned:AsRequested,HasDirectAccess>;
+    friend class Block<MatrixType,BlockRows,BlockCols,PacketAccess==EnforceAlignedAccess?AsRequested:EnforceAlignedAccess,HasDirectAccess>;
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
 
-    AlignedDerivedType _convertToForceAligned()
+    AlignedDerivedType _convertToEnforceAlignedAccess()
     {
-      return Block<MatrixType,BlockRows,BlockCols,ForceAligned,HasDirectAccess>
+      return Block<MatrixType,BlockRows,BlockCols,EnforceAlignedAccess,HasDirectAccess>
                     (m_matrix, Base::m_data, Base::m_rows.value(), Base::m_cols.value());
     }
 
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index f6bc814e2..dba7e20e4 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -31,16 +31,14 @@
   * \brief A matrix or vector expression mapping an existing array of data.
   *
   * \param MatrixType the equivalent matrix type of the mapped data
-  * \param _PacketAccess allows to enforce aligned loads and stores if set to ForceAligned.
-  *                      The default is AsRequested. This parameter is internaly used by Eigen
-  *                      in expressions such as \code Map<...>(...) += other; \endcode and most
-  *                      of the time this is the only way it is used.
+  * \param PointerAlignment specifies whether the pointer is \c Aligned, or \c Unaligned.
+  *                         The default is \c Unaligned.
   *
   * This class represents a matrix or vector expression mapping an existing array of data.
   * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
   * such as plain C arrays or structures from other libraries.
   *
-  * \b Tips: to change the array of data mapped by a Map object, you can use the C++
+  * \b Tip: to change the array of data mapped by a Map object, you can use the C++
   * placement new syntax:
   *
   * Example: \include Map_placement_new.cpp
@@ -48,22 +46,27 @@
   *
   * This class is the return type of Matrix::Map() but can also be used directly.
   *
+  * \b Note \b to \b Eigen \b developers: The template parameter \c PointerAlignment
+  * can also be or-ed with \c EnforceAlignedAccess in order to enforce aligned read 
+  * in expressions such as \code A += B; \endcode. See class MapBase for further details.
+  *
   * \sa Matrix::Map()
   */
-template<typename MatrixType, int _PacketAccess>
-struct ei_traits<Map<MatrixType, _PacketAccess> > : public ei_traits<MatrixType>
+template<typename MatrixType, int Options>
+struct ei_traits<Map<MatrixType, Options> > : public ei_traits<MatrixType>
 {
   enum {
-    PacketAccess = _PacketAccess,
-    Flags = ei_traits<MatrixType>::Flags & ~AlignedBit
+    PacketAccess = Options & EnforceAlignedAccess,
+    Flags = (Options&Aligned)==Aligned ? ei_traits<MatrixType>::Flags |  AlignedBit
+                                       : ei_traits<MatrixType>::Flags & ~AlignedBit
   };
-  typedef typename ei_meta_if<int(PacketAccess)==ForceAligned,
-                              Map<MatrixType, _PacketAccess>&,
-                              Map<MatrixType, ForceAligned> >::ret AlignedDerivedType;
+  typedef typename ei_meta_if<int(PacketAccess)==EnforceAlignedAccess,
+                              Map<MatrixType, Options>&,
+                              Map<MatrixType, Options|EnforceAlignedAccess> >::ret AlignedDerivedType;
 };
 
-template<typename MatrixType, int PacketAccess> class Map
-  : public MapBase<Map<MatrixType, PacketAccess> >
+template<typename MatrixType, int Options> class Map
+  : public MapBase<Map<MatrixType, Options> >
 {
   public:
 
@@ -72,9 +75,9 @@ template<typename MatrixType, int PacketAccess> class Map
 
     inline int stride() const { return this->innerSize(); }
 
-    AlignedDerivedType _convertToForceAligned()
+    AlignedDerivedType _convertToEnforceAlignedAccess()
     {
-      return Map<MatrixType,ForceAligned>(Base::m_data, Base::m_rows.value(), Base::m_cols.value());
+      return AlignedDerivedType(Base::m_data, Base::m_rows.value(), Base::m_cols.value());
     }
 
     inline Map(const Scalar* data) : Base(data) {}
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 88a3fac1e..8770732de 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -32,11 +32,17 @@
   *
   * Expression classes inheriting MapBase must define the constant \c PacketAccess,
   * and type \c AlignedDerivedType in their respective ei_traits<> specialization structure.
-  * The value of \c PacketAccess can be either:
-  *  - \b ForceAligned which enforces both aligned loads and stores
-  *  - \b AsRequested which is the default behavior
+  * The value of \c PacketAccess can be either \b AsRequested, or set to \b EnforceAlignedAccess which
+  * enforces both aligned loads and stores.
+  * 
+  * \c EnforceAlignedAccess is automatically set in expressions such as 
+  * \code A += B; \endcode where A is either a Block or a Map. Here,
+  * this expression is transfomed into \code A = A_with_EnforceAlignedAccess + B; \endcode
+  * avoiding unaligned loads from A. Indeed, since Eigen's packet evaluation mechanism
+  * automatically align to the destination matrix, we know that loads to A will be aligned too.
+  * 
   * The type \c AlignedDerivedType should correspond to the equivalent expression type
-  * with \c PacketAccess being \c ForceAligned.
+  * with \c PacketAccess set to \c EnforceAlignedAccess.
   *
   * \sa class Map, class Block
   */
@@ -79,19 +85,19 @@ template<typename Derived> class MapBase
       * \sa MapBase::stride() */
     inline const Scalar* data() const { return m_data; }
 
-    template<bool IsForceAligned,typename Dummy> struct force_aligned_impl {
+    template<bool IsEnforceAlignedAccess,typename Dummy> struct force_aligned_impl {
       static AlignedDerivedType run(MapBase& a) { return a.derived(); }
     };
 
     template<typename Dummy> struct force_aligned_impl<false,Dummy> {
-      static AlignedDerivedType run(MapBase& a) { return a.derived()._convertToForceAligned(); }
+      static AlignedDerivedType run(MapBase& a) { return a.derived()._convertToEnforceAlignedAccess(); }
     };
 
     /** \returns an expression equivalent to \c *this but having the \c PacketAccess constant
-      * set to \c ForceAligned. Must be reimplemented by the derived class. */
+      * set to \c EnforceAlignedAccess. Must be reimplemented by the derived class. */
     AlignedDerivedType forceAligned()
     {
-      return force_aligned_impl<int(PacketAccess)==int(ForceAligned),Derived>::run(*this);
+      return force_aligned_impl<int(PacketAccess)==int(EnforceAlignedAccess),Derived>::run(*this);
     }
 
     inline const Scalar& coeff(int row, int col) const
@@ -131,7 +137,7 @@ template<typename Derived> class MapBase
     template<int LoadMode>
     inline PacketScalar packet(int row, int col) const
     {
-      return ei_ploadt<Scalar, int(PacketAccess) == ForceAligned ? Aligned : LoadMode>
+      return ei_ploadt<Scalar, int(PacketAccess) == EnforceAlignedAccess ? Aligned : LoadMode>
                (m_data + (IsRowMajor ? col + row * stride()
                                      : row + col * stride()));
     }
@@ -139,13 +145,13 @@ template<typename Derived> class MapBase
     template<int LoadMode>
     inline PacketScalar packet(int index) const
     {
-      return ei_ploadt<Scalar, int(PacketAccess) == ForceAligned ? Aligned : LoadMode>(m_data + index);
+      return ei_ploadt<Scalar, int(PacketAccess) == EnforceAlignedAccess ? Aligned : LoadMode>(m_data + index);
     }
 
     template<int StoreMode>
     inline void writePacket(int row, int col, const PacketScalar& x)
     {
-      ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == ForceAligned ? Aligned : StoreMode>
+      ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == EnforceAlignedAccess ? Aligned : StoreMode>
                (const_cast<Scalar*>(m_data) + (IsRowMajor ? col + row * stride()
                                                           : row + col * stride()), x);
     }
@@ -153,13 +159,14 @@ template<typename Derived> class MapBase
     template<int StoreMode>
     inline void writePacket(int index, const PacketScalar& x)
     {
-      ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == ForceAligned ? Aligned : StoreMode>
+      ei_pstoret<Scalar, PacketScalar, int(PacketAccess) == EnforceAlignedAccess ? Aligned : StoreMode>
         (const_cast<Scalar*>(m_data) + index, x);
     }
 
     inline MapBase(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
     {
       EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+      checkDataAlignment();
     }
 
     inline MapBase(const Scalar* data, int size)
@@ -170,6 +177,7 @@ template<typename Derived> class MapBase
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
       ei_assert(size >= 0);
       ei_assert(data == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == size);
+      checkDataAlignment();
     }
 
     inline MapBase(const Scalar* data, int rows, int cols)
@@ -178,6 +186,7 @@ template<typename Derived> class MapBase
       ei_assert( (data == 0)
               || (   rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
                   && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
+      checkDataAlignment();
     }
 
     Derived& operator=(const MapBase& other)
@@ -215,6 +224,13 @@ template<typename Derived> class MapBase
     { return derived() = forceAligned() / other; }
 
   protected:
+
+    void checkDataAlignment() const
+    {
+      ei_assert( ((!(ei_traits<Derived>::Flags&AlignedBit))
+                  || ((std::size_t(m_data)&0xf)==0)) && "data is not aligned");
+    }
+    
     const Scalar* EIGEN_RESTRICT m_data;
     const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
     const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index 06e69c448..f2d1e7240 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -59,7 +59,7 @@ MatrixBase<Derived>::stableNorm() const
   RealScalar invScale = 1;
   RealScalar ssq = 0; // sum of square
   enum {
-    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? ForceAligned : AsRequested
+    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? EnforceAlignedAccess : AsRequested
   };
   int n = size();
   int bi=0;
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index affc1d478..169fb5aec 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -196,8 +196,8 @@ const unsigned int UnitLowerTriangular = LowerTriangularBit | UnitDiagBit;
 
 enum { DiagonalOnTheLeft, DiagonalOnTheRight };
 
-enum { Aligned, Unaligned };
-enum { ForceAligned, AsRequested };
+enum { Unaligned=0, Aligned=1 };
+enum { AsRequested=0, EnforceAlignedAccess=2 };
 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal, BothDirections };
diff --git a/test/map.cpp b/test/map.cpp
index 62e727304..fbff647f6 100644
--- a/test/map.cpp
+++ b/test/map.cpp
@@ -37,14 +37,15 @@ template<typename VectorType> void map_class(const VectorType& m)
   Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
   
   Map<VectorType, Aligned>(array1, size) = VectorType::Random(size);
-  Map<VectorType>(array2, size) = Map<VectorType>(array1, size);
+  Map<VectorType, Aligned>(array2, size) = Map<VectorType,Aligned>(array1, size);
   Map<VectorType>(array3unaligned, size) = Map<VectorType>(array1, size);
-  VectorType ma1 = Map<VectorType>(array1, size);
+  VectorType ma1 = Map<VectorType, Aligned>(array1, size);
   VectorType ma2 = Map<VectorType, Aligned>(array2, size);
   VectorType ma3 = Map<VectorType>(array3unaligned, size);
   VERIFY_IS_APPROX(ma1, ma2);
   VERIFY_IS_APPROX(ma1, ma3);
-  
+  VERIFY_RAISES_ASSERT((Map<VectorType,Aligned>(array3unaligned, size)));
+
   ei_aligned_delete(array1, size);
   ei_aligned_delete(array2, size);
   delete[] array3;

From 66fe5a5f36c80c64fb6078c7e7cb089057bff96f Mon Sep 17 00:00:00 2001
From: Hauke Heibel <hauke.heibel@gmail.com>
Date: Sat, 24 Oct 2009 14:48:34 +0200
Subject: [PATCH 29/34] It is just not that easy and requires more work to get
 it done right.

---
 Eigen/src/Core/util/Macros.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 706b30174..5ee17e91e 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -256,7 +256,7 @@ using Eigen::ei_cos;
 
 // C++0x features
 #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (defined(_MSC_VER) && (_MSC_VER >= 1600))
-  #define EIGEN_REF_TO_TEMPORARY &&
+  #define EIGEN_REF_TO_TEMPORARY const &
 #else
   #define EIGEN_REF_TO_TEMPORARY const &
 #endif

From 7cc9fb5d0adc1a4a2d515d6c42c92c93f80a8966 Mon Sep 17 00:00:00 2001
From: Hauke Heibel <hauke.heibel@gmail.com>
Date: Tue, 27 Oct 2009 15:29:12 +0100
Subject: [PATCH 30/34] Umeyama is now working with fixed size src and dst
 points.

---
 Eigen/src/Geometry/Umeyama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h
index d0f36d2c9..551a69e5b 100644
--- a/Eigen/src/Geometry/Umeyama.h
+++ b/Eigen/src/Geometry/Umeyama.h
@@ -117,7 +117,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   enum { Dimension = EIGEN_ENUM_MIN(Derived::RowsAtCompileTime, OtherDerived::RowsAtCompileTime) };
 
   typedef Matrix<Scalar, Dimension, 1> VectorType;
-  typedef typename ei_plain_matrix_type<Derived>::type MatrixType;
+  typedef Matrix<Scalar, Dimension, Dimension> MatrixType;
   typedef typename ei_plain_matrix_type_row_major<Derived>::type RowMajorMatrixType;
 
   const int m = src.rows(); // dimension

From dbaba9019ba2ca116406f286a71a745179e61531 Mon Sep 17 00:00:00 2001
From: Hauke Heibel <hauke.heibel@gmail.com>
Date: Tue, 27 Oct 2009 15:57:21 +0100
Subject: [PATCH 31/34] Added more common typedefs.

---
 Eigen/src/Core/Matrix.h        | 11 ++++++++++-
 Eigen/src/Geometry/Transform.h | 19 ++++++++++++++-----
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index 027e6bb70..cc2d71588 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -794,11 +794,20 @@ typedef Matrix<Type, Size, 1>    Vector##SizeSuffix##TypeSuffix;  \
 /** \ingroup matrixtypedefs */                                    \
 typedef Matrix<Type, 1, Size>    RowVector##SizeSuffix##TypeSuffix;
 
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size)         \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Size, Dynamic> Matrix##Size##X##TypeSuffix;  \
+/** \ingroup matrixtypedefs */                                    \
+typedef Matrix<Type, Dynamic, Size> Matrix##X##Size##TypeSuffix;
+
 #define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
 EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2) \
 EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3) \
 EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X)
+EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \
+EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
 
 EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int,                  i)
 EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float,                f)
diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index d03fd52fd..c49356361 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h
@@ -480,6 +480,15 @@ typedef Transform<double,2> Transform2d;
 /** \ingroup Geometry_Module */
 typedef Transform<double,3> Transform3d;
 
+/** \ingroup Geometry_Module */
+typedef Transform<float,2,Isometry> Isometry2f;
+/** \ingroup Geometry_Module */
+typedef Transform<float,3,Isometry> Isometry3f;
+/** \ingroup Geometry_Module */
+typedef Transform<double,2,Isometry> Isometry2d;
+/** \ingroup Geometry_Module */
+typedef Transform<double,3,Isometry> Isometry3d;
+
 /** \ingroup Geometry_Module */
 typedef Transform<float,2> Affine2f;
 /** \ingroup Geometry_Module */
@@ -512,7 +521,7 @@ typedef Transform<double,3,Projective> Projective3d;
 **************************/
 
 #ifdef EIGEN_QT_SUPPORT
-/** Initialises \c *this from a QMatrix assuming the dimension is 2.
+/** Initializes \c *this from a QMatrix assuming the dimension is 2.
   *
   * This function is available only if the token EIGEN_QT_SUPPORT is defined.
   */
@@ -538,7 +547,7 @@ Transform<Scalar,Dim,Mode>& Transform<Scalar,Dim,Mode>::operator=(const QMatrix&
 
 /** \returns a QMatrix from \c *this assuming the dimension is 2.
   *
-  * \warning this convertion might loss data if \c *this is not affine
+  * \warning this conversion might loss data if \c *this is not affine
   *
   * This function is available only if the token EIGEN_QT_SUPPORT is defined.
   */
@@ -551,7 +560,7 @@ QMatrix Transform<Scalar,Dim,Mode>::toQMatrix(void) const
                  matrix.coeff(0,2), matrix.coeff(1,2));
 }
 
-/** Initialises \c *this from a QTransform assuming the dimension is 2.
+/** Initializes \c *this from a QTransform assuming the dimension is 2.
   *
   * This function is available only if the token EIGEN_QT_SUPPORT is defined.
   */
@@ -881,7 +890,7 @@ Transform<Scalar,Dim,Mode>::fromPositionOrientationScale(const MatrixBase<Positi
   * \returns the inverse transformation according to some given knowledge
   * on \c *this.
   *
-  * \param traits allows to optimize the inversion process when the transformion
+  * \param traits allows to optimize the inversion process when the transformation
   * is known to be not a general transformation. The possible values are:
   *  - Projective if the transformation is not necessarily affine, i.e., if the
   *    last row is not guaranteed to be [0 ... 0 1]
@@ -950,7 +959,7 @@ struct ei_transform_take_affine_part<Transform<Scalar,Dim,AffineCompact> > {
 };
 
 /*****************************************************
-*** Specializations of construct from matix        ***
+*** Specializations of construct from matrix       ***
 *****************************************************/
 
 template<typename Other, int Mode, int Dim, int HDim>

From 427f8a87d1c184e9fb0c5fdc62197be2a0abb910 Mon Sep 17 00:00:00 2001
From: Hauke Heibel <hauke.heibel@gmail.com>
Date: Tue, 27 Oct 2009 16:02:36 +0100
Subject: [PATCH 32/34] Added dox for the new typedefs.

---
 Eigen/src/Core/Matrix.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h
index cc2d71588..17d2f2836 100644
--- a/Eigen/src/Core/Matrix.h
+++ b/Eigen/src/Core/Matrix.h
@@ -58,6 +58,9 @@ template <typename Derived, typename OtherDerived, bool IsVector = static_cast<b
   * \li \c MatrixXf is a dynamic-size matrix of floats (\c Matrix<float, Dynamic, Dynamic>)
   * \li \c VectorXf is a dynamic-size vector of floats (\c Matrix<float, Dynamic, 1>)
   *
+  * \li \c Matrix2Xf is a partially fixed-size (dynamic-size) matrix of floats (\c Matrix<float, 2, Dynamic>)
+  * \li \c MatrixX3d is a partially dynamic-size (fixed-size) matrix of double (\c Matrix<double, Dynamic, 3>)
+  *
   * See \link matrixtypedefs this page \endlink for a complete list of predefined \em %Matrix and \em Vector typedefs.
   *
   * You can access elements of vectors and matrices using normal subscripting:

From 611d2b0b1d64880ff207368a901c7faba73a78c5 Mon Sep 17 00:00:00 2001
From: Mathieu Gautier <mathieu.gautier@cea.fr>
Date: Tue, 27 Oct 2009 13:19:16 +0000
Subject: [PATCH 33/34] Quaternion could now map an array of 4 scalars :

new classes :
* QuaternionBase
* Map<Quaternion>
---
 Eigen/src/Core/util/ForwardDeclarations.h |   1 +
 Eigen/src/Geometry/Quaternion.h           | 545 ++++++++++++++++++++++
 Eigen/src/Geometry/arch/Geometry_SSE.h    |  21 +
 3 files changed, 567 insertions(+)

diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 65e5ce687..025904734 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -129,6 +129,7 @@ template<typename Scalar>     class PlanarRotation;
 // Geometry module:
 template<typename Derived, int _Dim> class RotationBase;
 template<typename Lhs, typename Rhs> class Cross;
+template<typename Derived> class QuaternionBase;
 template<typename Scalar> class Quaternion;
 template<typename Scalar> class Rotation2D;
 template<typename Scalar> class AngleAxis;
diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index 2f9f97807..f3f60a08a 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -507,4 +507,549 @@ struct ei_quaternion_assign_impl<Other,4,1>
   }
 };
 
+/*###################################################################
+      QuaternionBase and Map<Quaternion> and Quat
+  ###################################################################*/
+
+template<typename Other,
+         int OtherRows=Other::RowsAtCompileTime,
+         int OtherCols=Other::ColsAtCompileTime>
+struct ei_quaternionbase_assign_impl;
+
+template<typename Scalar> class Quat; // [XXX] => remove when Quat becomes Quaternion
+
+template<typename Derived>
+struct ei_traits<QuaternionBase<Derived> >
+{
+  typedef typename ei_traits<Derived>::Scalar Scalar;
+  enum {
+    PacketAccess = ei_traits<Derived>::PacketAccess
+  };
+};
+
+template<class Derived>
+class QuaternionBase : public RotationBase<Derived, 3> {
+  typedef RotationBase<Derived, 3> Base;
+public:
+  using Base::operator*;
+
+  typedef typename ei_traits<QuaternionBase<Derived> >::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ // typedef typename Matrix<Scalar,4,1> Coefficients;
+  /** the type of a 3D vector */
+  typedef Matrix<Scalar,3,1> Vector3;
+  /** the equivalent rotation matrix type */
+  typedef Matrix<Scalar,3,3> Matrix3;
+  /** the equivalent angle-axis type */
+  typedef AngleAxis<Scalar> AngleAxisType;
+
+  /** \returns the \c x coefficient */
+  inline Scalar x() const { return this->derived().coeffs().coeff(0); }
+  /** \returns the \c y coefficient */
+  inline Scalar y() const { return this->derived().coeffs().coeff(1); }
+  /** \returns the \c z coefficient */
+  inline Scalar z() const { return this->derived().coeffs().coeff(2); }
+  /** \returns the \c w coefficient */
+  inline Scalar w() const { return this->derived().coeffs().coeff(3); }
+
+  /** \returns a reference to the \c x coefficient */
+  inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
+  /** \returns a reference to the \c y coefficient */
+  inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
+  /** \returns a reference to the \c z coefficient */
+  inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
+  /** \returns a reference to the \c w coefficient */
+  inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
+
+  /** \returns a read-only vector expression of the imaginary part (x,y,z) */
+  inline const VectorBlock<typename ei_traits<Derived>::Coefficients,3,1> vec() const { return this->derived().coeffs().template start<3>(); }
+
+  /** \returns a vector expression of the imaginary part (x,y,z) */
+  inline VectorBlock<typename ei_traits<Derived>::Coefficients,3,1> vec() { return this->derived().coeffs().template start<3>(); }
+
+  /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
+  inline const typename ei_traits<Derived>::Coefficients& coeffs() const { return this->derived().coeffs(); }
+
+  /** \returns a vector expression of the coefficients (x,y,z,w) */
+  inline typename ei_traits<Derived>::Coefficients& coeffs() { return this->derived().coeffs(); }
+
+  template<class OtherDerived> QuaternionBase& operator=(const QuaternionBase<OtherDerived>& other);
+  QuaternionBase& operator=(const AngleAxisType& aa);
+  template<class OtherDerived>
+  QuaternionBase& operator=(const MatrixBase<OtherDerived>& m);
+
+  /** \returns a quaternion representing an identity rotation
+    * \sa MatrixBase::Identity()
+    */
+  inline static Quat<Scalar> Identity() { return Quat<Scalar>(1, 0, 0, 0); }
+
+  /** \sa Quaternion2::Identity(), MatrixBase::setIdentity()
+    */
+  inline QuaternionBase& setIdentity() { coeffs() << 0, 0, 0, 1; return *this; }
+
+  /** \returns the squared norm of the quaternion's coefficients
+    * \sa Quaternion2::norm(), MatrixBase::squaredNorm()
+    */
+  inline Scalar squaredNorm() const { return coeffs().squaredNorm(); }
+
+  /** \returns the norm of the quaternion's coefficients
+    * \sa Quaternion2::squaredNorm(), MatrixBase::norm()
+    */
+  inline Scalar norm() const { return coeffs().norm(); }
+
+  /** Normalizes the quaternion \c *this
+    * \sa normalized(), MatrixBase::normalize() */
+  inline void normalize() { coeffs().normalize(); }
+  /** \returns a normalized version of \c *this
+    * \sa normalize(), MatrixBase::normalized() */
+  inline Quat<Scalar> normalized() const { return Quat<Scalar>(coeffs().normalized()); }
+
+    /** \returns the dot product of \c *this and \a other
+    * Geometrically speaking, the dot product of two unit quaternions
+    * corresponds to the cosine of half the angle between the two rotations.
+    * \sa angularDistance()
+    */
+  template<class OtherDerived> inline Scalar dot(const QuaternionBase<OtherDerived>& other) const { return coeffs().dot(other.coeffs()); }
+
+  template<class OtherDerived> inline Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
+
+  Matrix3 toRotationMatrix(void) const;
+
+  template<typename Derived1, typename Derived2>
+  QuaternionBase& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+
+  template<class OtherDerived> inline Quat<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
+  template<class OtherDerived> inline QuaternionBase& operator*= (const QuaternionBase<OtherDerived>& q);
+
+  Quat<Scalar> inverse(void) const;
+  Quat<Scalar> conjugate(void) const;
+
+  template<class OtherDerived> Quat<Scalar> slerp(Scalar t, const QuaternionBase<OtherDerived>& other) const;
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+    * determined by \a prec.
+    *
+    * \sa MatrixBase::isApprox() */
+  bool isApprox(const QuaternionBase& other, typename RealScalar prec = precision<Scalar>()) const
+  { return coeffs().isApprox(other.coeffs(), prec); }
+
+  Vector3 _transformVector(Vector3 v) const;
+
+};
+
+/* ########### Quat -> Quaternion */
+
+template<typename _Scalar>
+struct ei_traits<Quat<_Scalar> >
+{
+  typedef _Scalar Scalar;
+  typedef Matrix<_Scalar,4,1> Coefficients;
+  enum{
+    PacketAccess = Aligned 
+  };
+};
+
+template<typename _Scalar>
+class Quat : public QuaternionBase<Quat<_Scalar> >{
+  typedef QuaternionBase<Quat<_Scalar> > Base;
+public:
+  using Base::operator=;
+
+  typedef _Scalar Scalar;
+
+  typedef typename ei_traits<Quat<Scalar> >::Coefficients Coefficients;
+  typedef typename Base::AngleAxisType AngleAxisType;
+
+  /** Default constructor leaving the quaternion uninitialized. */
+  inline Quat() {}
+
+  /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
+    * its four coefficients \a w, \a x, \a y and \a z.
+    *
+    * \warning Note the order of the arguments: the real \a w coefficient first,
+    * while internally the coefficients are stored in the following order:
+    * [\c x, \c y, \c z, \c w]
+    */
+  inline Quat(Scalar w, Scalar x, Scalar y, Scalar z)
+  { coeffs() << x, y, z, w; }
+
+  /** Constructs and initialize a quaternion from the array data
+    * This constructor is also used to map an array */
+  inline Quat(const Scalar* data) : m_coeffs(data) {}
+
+  /** Copy constructor */
+//  template<class Derived> inline Quat(const QuaternionBase<Derived>& other) { m_coeffs = other.coeffs(); } [XXX] redundant with 703
+
+  /** Constructs and initializes a quaternion from the angle-axis \a aa */
+  explicit inline Quat(const AngleAxisType& aa) { *this = aa; }
+
+  /** Constructs and initializes a quaternion from either:
+    *  - a rotation matrix expression,
+    *  - a 4D vector expression representing quaternion coefficients.
+    */
+  template<typename Derived>
+  explicit inline Quat(const MatrixBase<Derived>& other) { *this = other; } 
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<class Derived>
+  inline typename ei_cast_return_type<Quat, QuaternionBase<Derived> >::type cast() const
+  { return typename ei_cast_return_type<Quat, QuaternionBase<Derived> >::type(*this); }
+
+  /** Copy constructor with scalar type conversion */
+  template<class Derived>
+  inline explicit Quat(const QuaternionBase<Derived>& other)
+  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+
+  inline Coefficients& coeffs() { return m_coeffs;}
+  inline const Coefficients& coeffs() const { return m_coeffs;}
+
+protected:
+  Coefficients m_coeffs;
+};
+
+/* ########### Map<Quat> */
+
+/** \class Map<Quat>
+  * \nonstableyet
+  *
+  * \brief Expression of a quaternion
+  *
+  * \param Scalar the type of the vector of diagonal coefficients
+  *
+  * \sa class Quaternion, class QuaternionBase
+  */
+template<typename _Scalar, int _PacketAccess>
+struct ei_traits<Map<Quat<_Scalar>, _PacketAccess> >:
+ei_traits<Quat<_Scalar> >
+{
+  typedef _Scalar Scalar;
+  typedef Map<Matrix<_Scalar,4,1> > Coefficients;
+  enum {
+    PacketAccess = _PacketAccess
+  };
+};
+
+template<typename _Scalar, int PacketAccess>
+class Map<Quat<_Scalar>, PacketAccess > : public QuaternionBase<Map<Quat<_Scalar>, PacketAccess> >, ei_no_assignment_operator {
+  public:
+    
+    typedef _Scalar Scalar;
+
+    typedef typename ei_traits<Map<Quat<Scalar>, PacketAccess> >::Coefficients Coefficients;
+
+    inline Map<Quat<Scalar>, PacketAccess >(const Scalar* coeffs) : m_coeffs(coeffs) {}
+    
+    inline Coefficients& coeffs() { return m_coeffs;}
+    inline const Coefficients& coeffs() const { return m_coeffs;}
+
+  protected:
+    Coefficients m_coeffs;
+};
+
+typedef Map<Quat<double> > QuaternionMapd;
+typedef Map<Quat<float> > QuaternionMapf;
+typedef Map<Quat<double>, Aligned> QuaternionMapAlignedd;
+typedef Map<Quat<float>, Aligned> QuaternionMapAlignedf;
+
+// Generic Quaternion * Quaternion product
+template<int Arch, class Derived, class OtherDerived, typename Scalar, int PacketAccess> struct ei_quat_product
+{
+  inline static Quat<Scalar> run(const QuaternionBase<Derived>& a, const QuaternionBase<OtherDerived>& b){
+    return Quat<Scalar>
+    (
+      a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
+      a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
+      a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
+      a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
+    );
+  }
+};
+
+/** \returns the concatenation of two rotations as a quaternion-quaternion product */
+template <class Derived>
+template <class OtherDerived>
+inline Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) const
+{
+  EIGEN_STATIC_ASSERT((ei_is_same_type<typename Derived::Scalar, typename OtherDerived::Scalar>::ret),
+   YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+   return ei_quat_product<EiArch, Derived, OtherDerived, 
+                          ei_traits<Derived>::Scalar, 
+                          ei_traits<Derived>::PacketAccess && ei_traits<OtherDerived>::PacketAccess>::run(*this, other);
+}
+
+/** \sa operator*(Quaternion) */
+template <class Derived>
+template <class OtherDerived>
+inline QuaternionBase<Derived>& QuaternionBase<Derived>::operator*= (const QuaternionBase<OtherDerived>& other)
+{
+  return (*this = *this * other);
+}
+
+/** Rotation of a vector by a quaternion.
+  * \remarks If the quaternion is used to rotate several points (>1)
+  * then it is much more efficient to first convert it to a 3x3 Matrix.
+  * Comparison of the operation cost for n transformations:
+  *   - Quaternion2:    30n
+  *   - Via a Matrix3: 24 + 15n
+  */
+template <class Derived>
+inline typename QuaternionBase<Derived>::Vector3
+QuaternionBase<Derived>::_transformVector(Vector3 v) const
+{
+    // Note that this algorithm comes from the optimization by hand
+    // of the conversion to a Matrix followed by a Matrix/Vector product.
+    // It appears to be much faster than the common algorithm found
+    // in the litterature (30 versus 39 flops). It also requires two
+    // Vector3 as temporaries.
+    Vector3 uv = Scalar(2) * this->vec().cross(v);
+    return v + this->w() * uv + this->vec().cross(uv);
+}
+
+template<class Derived>
+template<class OtherDerived>
+inline QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const QuaternionBase<OtherDerived>& other)
+{
+  coeffs() = other.coeffs();
+  return *this;
+}
+
+/** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
+  */
+template<class Derived>
+inline QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const AngleAxisType& aa)
+{
+  Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
+  this->w() = ei_cos(ha);
+  this->vec() = ei_sin(ha) * aa.axis();
+  return *this;
+}
+
+/** Set \c *this from the expression \a xpr:
+  *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
+  *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
+  *     and \a xpr is converted to a quaternion
+  */
+
+template<class Derived>
+template<class MatrixDerived>
+inline QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerived>& xpr)
+{
+  EIGEN_STATIC_ASSERT((ei_is_same_type<typename Derived::Scalar, typename MatrixDerived::Scalar>::ret),
+   YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  ei_quaternionbase_assign_impl<MatrixDerived>::run(*this, xpr.derived());
+  return *this;
+}
+
+/** Convert the quaternion to a 3x3 rotation matrix. The quaternion is required to
+  * be normalized, otherwise the result is undefined.
+  */
+template<class Derived>
+inline typename QuaternionBase<Derived>::Matrix3
+QuaternionBase<Derived>::toRotationMatrix(void) const
+{
+  // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
+  // if not inlined then the cost of the return by value is huge ~ +35%,
+  // however, not inlining this function is an order of magnitude slower, so
+  // it has to be inlined, and so the return by value is not an issue
+  Matrix3 res;
+
+  const Scalar tx  = 2*this->x();
+  const Scalar ty  = 2*this->y();
+  const Scalar tz  = 2*this->z();
+  const Scalar twx = tx*this->w();
+  const Scalar twy = ty*this->w();
+  const Scalar twz = tz*this->w();
+  const Scalar txx = tx*this->x();
+  const Scalar txy = ty*this->x();
+  const Scalar txz = tz*this->x();
+  const Scalar tyy = ty*this->y();
+  const Scalar tyz = tz*this->y();
+  const Scalar tzz = tz*this->z();
+
+  res.coeffRef(0,0) = 1-(tyy+tzz);
+  res.coeffRef(0,1) = txy-twz;
+  res.coeffRef(0,2) = txz+twy;
+  res.coeffRef(1,0) = txy+twz;
+  res.coeffRef(1,1) = 1-(txx+tzz);
+  res.coeffRef(1,2) = tyz-twx;
+  res.coeffRef(2,0) = txz-twy;
+  res.coeffRef(2,1) = tyz+twx;
+  res.coeffRef(2,2) = 1-(txx+tyy);
+
+  return res;
+}
+
+/** Sets \c *this to be a quaternion representing a rotation between
+  * the two arbitrary vectors \a a and \a b. In other words, the built
+  * rotation represent a rotation sending the line of direction \a a
+  * to the line of direction \a b, both lines passing through the origin.
+  *
+  * \returns a reference to \c *this.
+  *
+  * Note that the two input vectors do \b not have to be normalized, and
+  * do not need to have the same norm.
+  */
+template<class Derived>
+template<typename Derived1, typename Derived2>
+inline QuaternionBase<Derived>& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
+{
+  Vector3 v0 = a.normalized();
+  Vector3 v1 = b.normalized();
+  Scalar c = v1.dot(v0);
+
+  // if dot == -1, vectors are nearly opposites
+  // => accuraletly compute the rotation axis by computing the
+  //    intersection of the two planes. This is done by solving:
+  //       x^T v0 = 0
+  //       x^T v1 = 0
+  //    under the constraint:
+  //       ||x|| = 1
+  //    which yields a singular value problem
+  if (c < Scalar(-1)+precision<Scalar>())
+  {
+    c = std::max<Scalar>(c,-1);
+    Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
+    SVD<Matrix<Scalar,2,3> > svd(m);
+    Vector3 axis = svd.matrixV().col(2);
+
+    Scalar w2 = (Scalar(1)+c)*Scalar(0.5);
+    this->w() = ei_sqrt(w2);
+    this->vec() = axis * ei_sqrt(Scalar(1) - w2);
+    return *this;
+  }
+  Vector3 axis = v0.cross(v1);
+  Scalar s = ei_sqrt((Scalar(1)+c)*Scalar(2));
+  Scalar invs = Scalar(1)/s;
+  this->vec() = axis * invs;
+  this->w() = s * Scalar(0.5);
+
+  return *this;
+}
+
+/** \returns the multiplicative inverse of \c *this
+  * Note that in most cases, i.e., if you simply want the opposite rotation,
+  * and/or the quaternion is normalized, then it is enough to use the conjugate.
+  *
+  * \sa Quaternion2::conjugate()
+  */
+template <class Derived>
+inline Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::inverse() const
+{
+  // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
+  Scalar n2 = this->squaredNorm();
+  if (n2 > 0)
+    return Quat<Scalar>(conjugate().coeffs() / n2);
+  else
+  {
+    // return an invalid result to flag the error
+    return Quat<Scalar>(ei_traits<Derived>::Coefficients::Zero());
+  }
+}
+
+/** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
+  * if the quaternion is normalized.
+  * The conjugate of a quaternion represents the opposite rotation.
+  *
+  * \sa Quaternion2::inverse()
+  */
+template <class Derived>
+inline Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::conjugate() const
+{
+  return Quat<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
+}
+
+/** \returns the angle (in radian) between two rotations
+  * \sa dot()
+  */
+template <class Derived>
+template <class OtherDerived>
+inline typename ei_traits<QuaternionBase<Derived> >::Scalar QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& other) const
+{
+  double d = ei_abs(this->dot(other));
+  if (d>=1.0)
+    return 0;
+  return Scalar(2) * std::acos(d);
+}
+
+/** \returns the spherical linear interpolation between the two quaternions
+  * \c *this and \a other at the parameter \a t
+  */
+template <class Derived>
+template <class OtherDerived>
+Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::slerp(Scalar t, const QuaternionBase<OtherDerived>& other) const
+{
+  static const Scalar one = Scalar(1) - precision<Scalar>();
+  Scalar d = this->dot(other);
+  Scalar absD = ei_abs(d);
+  if (absD>=one)
+    return Quat<Scalar>(*this);
+
+  // theta is the angle between the 2 quaternions
+  Scalar theta = std::acos(absD);
+  Scalar sinTheta = ei_sin(theta);
+
+  Scalar scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta;
+  Scalar scale1 = ei_sin( ( t * theta) ) / sinTheta;
+  if (d<0)
+    scale1 = -scale1;
+
+  return Quat<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
+}
+
+// set from a rotation matrix
+template<typename Other>
+struct ei_quaternionbase_assign_impl<Other,3,3>
+{
+  typedef typename Other::Scalar Scalar;
+  template<class Derived> inline static void run(QuaternionBase<Derived>& q, const Other& mat)
+  {
+    // This algorithm comes from  "Quaternion Calculus and Fast Animation",
+    // Ken Shoemake, 1987 SIGGRAPH course notes
+    Scalar t = mat.trace();
+    if (t > 0)
+    {
+      t = ei_sqrt(t + Scalar(1.0));
+      q.w() = Scalar(0.5)*t;
+      t = Scalar(0.5)/t;
+      q.x() = (mat.coeff(2,1) - mat.coeff(1,2)) * t;
+      q.y() = (mat.coeff(0,2) - mat.coeff(2,0)) * t;
+      q.z() = (mat.coeff(1,0) - mat.coeff(0,1)) * t;
+    }
+    else
+    {
+      int i = 0;
+      if (mat.coeff(1,1) > mat.coeff(0,0))
+        i = 1;
+      if (mat.coeff(2,2) > mat.coeff(i,i))
+        i = 2;
+      int j = (i+1)%3;
+      int k = (j+1)%3;
+
+      t = ei_sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
+      q.coeffs().coeffRef(i) = Scalar(0.5) * t;
+      t = Scalar(0.5)/t;
+      q.w() = (mat.coeff(k,j)-mat.coeff(j,k))*t;
+      q.coeffs().coeffRef(j) = (mat.coeff(j,i)+mat.coeff(i,j))*t;
+      q.coeffs().coeffRef(k) = (mat.coeff(k,i)+mat.coeff(i,k))*t;
+    }
+  }
+};
+
+// set from a vector of coefficients assumed to be a quaternion
+template<typename Other>
+struct ei_quaternionbase_assign_impl<Other,4,1>
+{
+  typedef typename Other::Scalar Scalar;
+  template<class Derived> inline static void run(QuaternionBase<Derived>& q, const Other& vec)
+  {
+    q.coeffs() = vec;
+  }
+};
+
+
 #endif // EIGEN_QUATERNION_H
diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index d0342febc..c608e4843 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -45,6 +45,27 @@ ei_quaternion_product<EiArch_SSE,float>(const Quaternion<float>& _a, const Quate
   return res;
 }
 
+template<class Derived, class OtherDerived> struct ei_quat_product<EiArch_SSE, Derived, OtherDerived, float, Aligned>
+{
+  inline static Quat<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
+  {
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
+    Quat<float> res;
+    __m128 a = _a.coeffs().packet<Aligned>(0);
+    __m128 b = _b.coeffs().packet<Aligned>(0);
+    __m128 flip1 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,1,2,0,2),
+                                         ei_vec4f_swizzle1(b,2,0,1,2)),mask);
+    __m128 flip2 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,3,3,3,1),
+                                         ei_vec4f_swizzle1(b,0,1,2,1)),mask);
+    ei_pstore(&res.x(),
+              _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,ei_vec4f_swizzle1(b,3,3,3,3)),
+                                    _mm_mul_ps(ei_vec4f_swizzle1(a,2,0,1,0),
+                                               ei_vec4f_swizzle1(b,1,2,0,0))),
+                         _mm_add_ps(flip1,flip2)));
+    return res;
+  }
+};
+
 template<typename VectorLhs,typename VectorRhs>
 struct ei_cross3_impl<EiArch_SSE,VectorLhs,VectorRhs,float,true> {
   inline static typename ei_plain_matrix_type<VectorLhs>::type

From 6219f9acfa61e54baf266f816b7eaf9ffbd9841e Mon Sep 17 00:00:00 2001
From: Benoit Jacob <jacob.benoit.1@gmail.com>
Date: Tue, 27 Oct 2009 15:45:24 -0400
Subject: [PATCH 34/34] * rename new Quat class to Quaternion, remove existing
 Quaternion * add Copyright line for Mathieu * cast() was broken (compile
 errors) and needed anyway to be in QuaternionBase * it's VectorBlock<T,3>,
 don't pass additional parameter 1, it has different meaning!! * make it
 compile with GCC (put 'typename' at the right location)

---
 Eigen/src/Geometry/Quaternion.h        | 578 +++----------------------
 Eigen/src/Geometry/arch/Geometry_SSE.h |  23 +-
 2 files changed, 60 insertions(+), 541 deletions(-)

diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h
index f3f60a08a..67b040165 100644
--- a/Eigen/src/Geometry/Quaternion.h
+++ b/Eigen/src/Geometry/Quaternion.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2009 Mathieu Gautier <mathieu.gautier@cea.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -25,11 +26,6 @@
 #ifndef EIGEN_QUATERNION_H
 #define EIGEN_QUATERNION_H
 
-template<typename Other,
-         int OtherRows=Other::RowsAtCompileTime,
-         int OtherCols=Other::ColsAtCompileTime>
-struct ei_quaternion_assign_impl;
-
 /** \geometry_module \ingroup Geometry_Module
   *
   * \class Quaternion
@@ -52,471 +48,12 @@ struct ei_quaternion_assign_impl;
   * \sa  class AngleAxis, class Transform
   */
 
-template<typename _Scalar> struct ei_traits<Quaternion<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class Quaternion : public RotationBase<Quaternion<_Scalar>,3>
-{
-  typedef RotationBase<Quaternion<_Scalar>,3> Base;
-
-
-
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,4)
-
-  using Base::operator*;
-
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-
-  /** the type of the Coefficients 4-vector */
-  typedef Matrix<Scalar, 4, 1> Coefficients;
-  /** the type of a 3D vector */
-  typedef Matrix<Scalar,3,1> Vector3;
-  /** the equivalent rotation matrix type */
-  typedef Matrix<Scalar,3,3> Matrix3;
-  /** the equivalent angle-axis type */
-  typedef AngleAxis<Scalar> AngleAxisType;
-
-  /** \returns the \c x coefficient */
-  inline Scalar x() const { return m_coeffs.coeff(0); }
-  /** \returns the \c y coefficient */
-  inline Scalar y() const { return m_coeffs.coeff(1); }
-  /** \returns the \c z coefficient */
-  inline Scalar z() const { return m_coeffs.coeff(2); }
-  /** \returns the \c w coefficient */
-  inline Scalar w() const { return m_coeffs.coeff(3); }
-
-  /** \returns a reference to the \c x coefficient */
-  inline Scalar& x() { return m_coeffs.coeffRef(0); }
-  /** \returns a reference to the \c y coefficient */
-  inline Scalar& y() { return m_coeffs.coeffRef(1); }
-  /** \returns a reference to the \c z coefficient */
-  inline Scalar& z() { return m_coeffs.coeffRef(2); }
-  /** \returns a reference to the \c w coefficient */
-  inline Scalar& w() { return m_coeffs.coeffRef(3); }
-
-  /** \returns a read-only vector expression of the imaginary part (x,y,z) */
-  inline const Block<Coefficients,3,1> vec() const { return m_coeffs.template start<3>(); }
-
-  /** \returns a vector expression of the imaginary part (x,y,z) */
-  inline Block<Coefficients,3,1> vec() { return m_coeffs.template start<3>(); }
-
-  /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
-
-  /** \returns a vector expression of the coefficients (x,y,z,w) */
-  inline Coefficients& coeffs() { return m_coeffs; }
-
-  /** Default constructor leaving the quaternion uninitialized. */
-  inline Quaternion() {}
-
-  /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
-    * its four coefficients \a w, \a x, \a y and \a z.
-    *
-    * \warning Note the order of the arguments: the real \a w coefficient first,
-    * while internally the coefficients are stored in the following order:
-    * [\c x, \c y, \c z, \c w]
-    */
-  inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z)
-  { m_coeffs << x, y, z, w; }
-
-  /** Copy constructor */
-  inline Quaternion(const Quaternion& other) { m_coeffs = other.m_coeffs; }
-
-  /** Constructs and initializes a quaternion from the angle-axis \a aa */
-  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
-
-  /** Constructs and initializes a quaternion from either:
-    *  - a rotation matrix expression,
-    *  - a 4D vector expression representing quaternion coefficients.
-    * \sa operator=(MatrixBase<Derived>)
-    */
-  template<typename Derived>
-  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
-
-  Quaternion& operator=(const Quaternion& other);
-  Quaternion& operator=(const AngleAxisType& aa);
-  template<typename Derived>
-  Quaternion& operator=(const MatrixBase<Derived>& m);
-
-  /** \returns a quaternion representing an identity rotation
-    * \sa MatrixBase::Identity()
-    */
-  inline static Quaternion Identity() { return Quaternion(1, 0, 0, 0); }
-
-  /** \sa Quaternion::Identity(), MatrixBase::setIdentity()
-    */
-  inline Quaternion& setIdentity() { m_coeffs << 0, 0, 0, 1; return *this; }
-
-  /** \returns the squared norm of the quaternion's coefficients
-    * \sa Quaternion::norm(), MatrixBase::squaredNorm()
-    */
-  inline Scalar squaredNorm() const { return m_coeffs.squaredNorm(); }
-
-  /** \returns the norm of the quaternion's coefficients
-    * \sa Quaternion::squaredNorm(), MatrixBase::norm()
-    */
-  inline Scalar norm() const { return m_coeffs.norm(); }
-
-  /** Normalizes the quaternion \c *this
-    * \sa normalized(), MatrixBase::normalize() */
-  inline void normalize() { m_coeffs.normalize(); }
-  /** \returns a normalized version of \c *this
-    * \sa normalize(), MatrixBase::normalized() */
-  inline Quaternion normalized() const { return Quaternion(m_coeffs.normalized()); }
-
-  /** \returns the dot product of \c *this and \a other
-    * Geometrically speaking, the dot product of two unit quaternions
-    * corresponds to the cosine of half the angle between the two rotations.
-    * \sa angularDistance()
-    */
-  inline Scalar dot(const Quaternion& other) const { return m_coeffs.dot(other.m_coeffs); }
-
-  inline Scalar angularDistance(const Quaternion& other) const;
-
-  Matrix3 toRotationMatrix(void) const;
-
-  template<typename Derived1, typename Derived2>
-  Quaternion& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
-
-  inline Quaternion operator* (const Quaternion& q) const;
-  inline Quaternion& operator*= (const Quaternion& q);
-
-  Quaternion inverse(void) const;
-  Quaternion conjugate(void) const;
-
-  Quaternion slerp(Scalar t, const Quaternion& other) const;
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename ei_cast_return_type<Quaternion,Quaternion<NewScalarType> >::type cast() const
-  { return typename ei_cast_return_type<Quaternion,Quaternion<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Quaternion(const Quaternion<OtherScalarType>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Quaternion& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-  Vector3 _transformVector(Vector3 v) const;
-
-protected:
-  Coefficients m_coeffs;
-};
-
-/** \ingroup Geometry_Module
-  * single precision quaternion type */
-typedef Quaternion<float> Quaternionf;
-/** \ingroup Geometry_Module
-  * double precision quaternion type */
-typedef Quaternion<double> Quaterniond;
-
-// Generic Quaternion * Quaternion product
-template<int Arch,typename Scalar> inline Quaternion<Scalar>
-ei_quaternion_product(const Quaternion<Scalar>& a, const Quaternion<Scalar>& b)
-{
-  return Quaternion<Scalar>
-  (
-    a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
-    a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
-    a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
-    a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
-  );
-}
-
-/** \returns the concatenation of two rotations as a quaternion-quaternion product */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::operator* (const Quaternion& other) const
-{
-  return ei_quaternion_product<EiArch>(*this,other);
-}
-
-/** \sa operator*(Quaternion) */
-template <typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator*= (const Quaternion& other)
-{
-  return (*this = *this * other);
-}
-
-/** Rotation of a vector by a quaternion.
-  * \remarks If the quaternion is used to rotate several points (>1)
-  * then it is much more efficient to first convert it to a 3x3 Matrix.
-  * Comparison of the operation cost for n transformations:
-  *   - Quaternion:    30n
-  *   - Via a Matrix3: 24 + 15n
-  */
-template <typename Scalar>
-inline typename Quaternion<Scalar>::Vector3
-Quaternion<Scalar>::_transformVector(Vector3 v) const
-{
-    // Note that this algorithm comes from the optimization by hand
-    // of the conversion to a Matrix followed by a Matrix/Vector product.
-    // It appears to be much faster than the common algorithm found
-    // in the litterature (30 versus 39 flops). It also requires two
-    // Vector3 as temporaries.
-    Vector3 uv = Scalar(2) * this->vec().cross(v);
-    return v + this->w() * uv + this->vec().cross(uv);
-}
-
-template<typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const Quaternion& other)
-{
-  m_coeffs = other.m_coeffs;
-  return *this;
-}
-
-/** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
-  */
-template<typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const AngleAxisType& aa)
-{
-  Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
-  this->w() = ei_cos(ha);
-  this->vec() = ei_sin(ha) * aa.axis();
-  return *this;
-}
-
-/** Set \c *this from the expression \a xpr:
-  *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
-  *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
-  *     and \a xpr is converted to a quaternion
-  */
-template<typename Scalar>
-template<typename Derived>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const MatrixBase<Derived>& xpr)
-{
-  ei_quaternion_assign_impl<Derived>::run(*this, xpr.derived());
-  return *this;
-}
-
-/** Convert the quaternion to a 3x3 rotation matrix. The quaternion is required to
-  * be normalized, otherwise the result is undefined.
-  */
-template<typename Scalar>
-inline typename Quaternion<Scalar>::Matrix3
-Quaternion<Scalar>::toRotationMatrix(void) const
-{
-  // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
-  // if not inlined then the cost of the return by value is huge ~ +35%,
-  // however, not inlining this function is an order of magnitude slower, so
-  // it has to be inlined, and so the return by value is not an issue
-  Matrix3 res;
-
-  const Scalar tx  = 2*this->x();
-  const Scalar ty  = 2*this->y();
-  const Scalar tz  = 2*this->z();
-  const Scalar twx = tx*this->w();
-  const Scalar twy = ty*this->w();
-  const Scalar twz = tz*this->w();
-  const Scalar txx = tx*this->x();
-  const Scalar txy = ty*this->x();
-  const Scalar txz = tz*this->x();
-  const Scalar tyy = ty*this->y();
-  const Scalar tyz = tz*this->y();
-  const Scalar tzz = tz*this->z();
-
-  res.coeffRef(0,0) = 1-(tyy+tzz);
-  res.coeffRef(0,1) = txy-twz;
-  res.coeffRef(0,2) = txz+twy;
-  res.coeffRef(1,0) = txy+twz;
-  res.coeffRef(1,1) = 1-(txx+tzz);
-  res.coeffRef(1,2) = tyz-twx;
-  res.coeffRef(2,0) = txz-twy;
-  res.coeffRef(2,1) = tyz+twx;
-  res.coeffRef(2,2) = 1-(txx+tyy);
-
-  return res;
-}
-
-/** Sets \c *this to be a quaternion representing a rotation between
-  * the two arbitrary vectors \a a and \a b. In other words, the built
-  * rotation represent a rotation sending the line of direction \a a
-  * to the line of direction \a b, both lines passing through the origin.
-  *
-  * \returns a reference to \c *this.
-  *
-  * Note that the two input vectors do \b not have to be normalized, and
-  * do not need to have the same norm.
-  */
-template<typename Scalar>
-template<typename Derived1, typename Derived2>
-inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
-{
-  Vector3 v0 = a.normalized();
-  Vector3 v1 = b.normalized();
-  Scalar c = v1.dot(v0);
-
-  // if dot == -1, vectors are nearly opposites
-  // => accuraletly compute the rotation axis by computing the
-  //    intersection of the two planes. This is done by solving:
-  //       x^T v0 = 0
-  //       x^T v1 = 0
-  //    under the constraint:
-  //       ||x|| = 1
-  //    which yields a singular value problem
-  if (c < Scalar(-1)+precision<Scalar>())
-  {
-    c = std::max<Scalar>(c,-1);
-    Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
-    SVD<Matrix<Scalar,2,3> > svd(m);
-    Vector3 axis = svd.matrixV().col(2);
-
-    Scalar w2 = (Scalar(1)+c)*Scalar(0.5);
-    this->w() = ei_sqrt(w2);
-    this->vec() = axis * ei_sqrt(Scalar(1) - w2);
-    return *this;
-  }
-  Vector3 axis = v0.cross(v1);
-  Scalar s = ei_sqrt((Scalar(1)+c)*Scalar(2));
-  Scalar invs = Scalar(1)/s;
-  this->vec() = axis * invs;
-  this->w() = s * Scalar(0.5);
-
-  return *this;
-}
-
-/** \returns the multiplicative inverse of \c *this
-  * Note that in most cases, i.e., if you simply want the opposite rotation,
-  * and/or the quaternion is normalized, then it is enough to use the conjugate.
-  *
-  * \sa Quaternion::conjugate()
-  */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::inverse() const
-{
-  // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
-  Scalar n2 = this->squaredNorm();
-  if (n2 > 0)
-    return Quaternion(conjugate().coeffs() / n2);
-  else
-  {
-    // return an invalid result to flag the error
-    return Quaternion(Coefficients::Zero());
-  }
-}
-
-/** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
-  * if the quaternion is normalized.
-  * The conjugate of a quaternion represents the opposite rotation.
-  *
-  * \sa Quaternion::inverse()
-  */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::conjugate() const
-{
-  return Quaternion(this->w(),-this->x(),-this->y(),-this->z());
-}
-
-/** \returns the angle (in radian) between two rotations
-  * \sa dot()
-  */
-template <typename Scalar>
-inline Scalar Quaternion<Scalar>::angularDistance(const Quaternion& other) const
-{
-  double d = ei_abs(this->dot(other));
-  if (d>=1.0)
-    return 0;
-  return Scalar(2) * std::acos(d);
-}
-
-/** \returns the spherical linear interpolation between the two quaternions
-  * \c *this and \a other at the parameter \a t
-  */
-template <typename Scalar>
-Quaternion<Scalar> Quaternion<Scalar>::slerp(Scalar t, const Quaternion& other) const
-{
-  static const Scalar one = Scalar(1) - precision<Scalar>();
-  Scalar d = this->dot(other);
-  Scalar absD = ei_abs(d);
-  if (absD>=one)
-    return *this;
-
-  // theta is the angle between the 2 quaternions
-  Scalar theta = std::acos(absD);
-  Scalar sinTheta = ei_sin(theta);
-
-  Scalar scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta;
-  Scalar scale1 = ei_sin( ( t * theta) ) / sinTheta;
-  if (d<0)
-    scale1 = -scale1;
-
-  return Quaternion(scale0 * m_coeffs + scale1 * other.m_coeffs);
-}
-
-// set from a rotation matrix
-template<typename Other>
-struct ei_quaternion_assign_impl<Other,3,3>
-{
-  typedef typename Other::Scalar Scalar;
-  inline static void run(Quaternion<Scalar>& q, const Other& mat)
-  {
-    // This algorithm comes from  "Quaternion Calculus and Fast Animation",
-    // Ken Shoemake, 1987 SIGGRAPH course notes
-    Scalar t = mat.trace();
-    if (t > 0)
-    {
-      t = ei_sqrt(t + Scalar(1.0));
-      q.w() = Scalar(0.5)*t;
-      t = Scalar(0.5)/t;
-      q.x() = (mat.coeff(2,1) - mat.coeff(1,2)) * t;
-      q.y() = (mat.coeff(0,2) - mat.coeff(2,0)) * t;
-      q.z() = (mat.coeff(1,0) - mat.coeff(0,1)) * t;
-    }
-    else
-    {
-      int i = 0;
-      if (mat.coeff(1,1) > mat.coeff(0,0))
-        i = 1;
-      if (mat.coeff(2,2) > mat.coeff(i,i))
-        i = 2;
-      int j = (i+1)%3;
-      int k = (j+1)%3;
-
-      t = ei_sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
-      q.coeffs().coeffRef(i) = Scalar(0.5) * t;
-      t = Scalar(0.5)/t;
-      q.w() = (mat.coeff(k,j)-mat.coeff(j,k))*t;
-      q.coeffs().coeffRef(j) = (mat.coeff(j,i)+mat.coeff(i,j))*t;
-      q.coeffs().coeffRef(k) = (mat.coeff(k,i)+mat.coeff(i,k))*t;
-    }
-  }
-};
-
-// set from a vector of coefficients assumed to be a quaternion
-template<typename Other>
-struct ei_quaternion_assign_impl<Other,4,1>
-{
-  typedef typename Other::Scalar Scalar;
-  inline static void run(Quaternion<Scalar>& q, const Other& vec)
-  {
-    q.coeffs() = vec;
-  }
-};
-
-/*###################################################################
-      QuaternionBase and Map<Quaternion> and Quat
-  ###################################################################*/
-
 template<typename Other,
          int OtherRows=Other::RowsAtCompileTime,
          int OtherCols=Other::ColsAtCompileTime>
 struct ei_quaternionbase_assign_impl;
 
-template<typename Scalar> class Quat; // [XXX] => remove when Quat becomes Quaternion
+template<typename Scalar> class Quaternion; // [XXX] => remove when Quaternion becomes Quaternion
 
 template<typename Derived>
 struct ei_traits<QuaternionBase<Derived> >
@@ -528,7 +65,8 @@ struct ei_traits<QuaternionBase<Derived> >
 };
 
 template<class Derived>
-class QuaternionBase : public RotationBase<Derived, 3> {
+class QuaternionBase : public RotationBase<Derived, 3>
+{
   typedef RotationBase<Derived, 3> Base;
 public:
   using Base::operator*;
@@ -563,10 +101,10 @@ public:
   inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
 
   /** \returns a read-only vector expression of the imaginary part (x,y,z) */
-  inline const VectorBlock<typename ei_traits<Derived>::Coefficients,3,1> vec() const { return this->derived().coeffs().template start<3>(); }
+  inline const VectorBlock<typename ei_traits<Derived>::Coefficients,3> vec() const { return this->derived().coeffs().template start<3>(); }
 
   /** \returns a vector expression of the imaginary part (x,y,z) */
-  inline VectorBlock<typename ei_traits<Derived>::Coefficients,3,1> vec() { return this->derived().coeffs().template start<3>(); }
+  inline VectorBlock<typename ei_traits<Derived>::Coefficients,3> vec() { return this->derived().coeffs().template start<3>(); }
 
   /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
   inline const typename ei_traits<Derived>::Coefficients& coeffs() const { return this->derived().coeffs(); }
@@ -582,7 +120,7 @@ public:
   /** \returns a quaternion representing an identity rotation
     * \sa MatrixBase::Identity()
     */
-  inline static Quat<Scalar> Identity() { return Quat<Scalar>(1, 0, 0, 0); }
+  inline static Quaternion<Scalar> Identity() { return Quaternion<Scalar>(1, 0, 0, 0); }
 
   /** \sa Quaternion2::Identity(), MatrixBase::setIdentity()
     */
@@ -603,7 +141,7 @@ public:
   inline void normalize() { coeffs().normalize(); }
   /** \returns a normalized version of \c *this
     * \sa normalize(), MatrixBase::normalized() */
-  inline Quat<Scalar> normalized() const { return Quat<Scalar>(coeffs().normalized()); }
+  inline Quaternion<Scalar> normalized() const { return Quaternion<Scalar>(coeffs().normalized()); }
 
     /** \returns the dot product of \c *this and \a other
     * Geometrically speaking, the dot product of two unit quaternions
@@ -619,29 +157,38 @@ public:
   template<typename Derived1, typename Derived2>
   QuaternionBase& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
-  template<class OtherDerived> inline Quat<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
+  template<class OtherDerived> inline Quaternion<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
   template<class OtherDerived> inline QuaternionBase& operator*= (const QuaternionBase<OtherDerived>& q);
 
-  Quat<Scalar> inverse(void) const;
-  Quat<Scalar> conjugate(void) const;
+  Quaternion<Scalar> inverse(void) const;
+  Quaternion<Scalar> conjugate(void) const;
 
-  template<class OtherDerived> Quat<Scalar> slerp(Scalar t, const QuaternionBase<OtherDerived>& other) const;
+  template<class OtherDerived> Quaternion<Scalar> slerp(Scalar t, const QuaternionBase<OtherDerived>& other) const;
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
     *
     * \sa MatrixBase::isApprox() */
-  bool isApprox(const QuaternionBase& other, typename RealScalar prec = precision<Scalar>()) const
+  bool isApprox(const QuaternionBase& other, RealScalar prec = precision<Scalar>()) const
   { return coeffs().isApprox(other.coeffs(), prec); }
 
   Vector3 _transformVector(Vector3 v) const;
 
+  /** \returns \c *this with scalar type casted to \a NewScalarType
+    *
+    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+    * then this function smartly returns a const reference to \c *this.
+    */
+  template<typename NewScalarType>
+  inline typename ei_cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const
+  {
+    return typename ei_cast_return_type<Derived,Quaternion<NewScalarType> >::type(
+      coeffs().template cast<NewScalarType>());
+  }
 };
 
-/* ########### Quat -> Quaternion */
-
 template<typename _Scalar>
-struct ei_traits<Quat<_Scalar> >
+struct ei_traits<Quaternion<_Scalar> >
 {
   typedef _Scalar Scalar;
   typedef Matrix<_Scalar,4,1> Coefficients;
@@ -651,18 +198,18 @@ struct ei_traits<Quat<_Scalar> >
 };
 
 template<typename _Scalar>
-class Quat : public QuaternionBase<Quat<_Scalar> >{
-  typedef QuaternionBase<Quat<_Scalar> > Base;
+class Quaternion : public QuaternionBase<Quaternion<_Scalar> >{
+  typedef QuaternionBase<Quaternion<_Scalar> > Base;
 public:
   using Base::operator=;
 
   typedef _Scalar Scalar;
 
-  typedef typename ei_traits<Quat<Scalar> >::Coefficients Coefficients;
+  typedef typename ei_traits<Quaternion<Scalar> >::Coefficients Coefficients;
   typedef typename Base::AngleAxisType AngleAxisType;
 
   /** Default constructor leaving the quaternion uninitialized. */
-  inline Quat() {}
+  inline Quaternion() {}
 
   /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
     * its four coefficients \a w, \a x, \a y and \a z.
@@ -671,38 +218,29 @@ public:
     * while internally the coefficients are stored in the following order:
     * [\c x, \c y, \c z, \c w]
     */
-  inline Quat(Scalar w, Scalar x, Scalar y, Scalar z)
+  inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z)
   { coeffs() << x, y, z, w; }
 
   /** Constructs and initialize a quaternion from the array data
     * This constructor is also used to map an array */
-  inline Quat(const Scalar* data) : m_coeffs(data) {}
+  inline Quaternion(const Scalar* data) : m_coeffs(data) {}
 
   /** Copy constructor */
-//  template<class Derived> inline Quat(const QuaternionBase<Derived>& other) { m_coeffs = other.coeffs(); } [XXX] redundant with 703
+//  template<class Derived> inline Quaternion(const QuaternionBase<Derived>& other) { m_coeffs = other.coeffs(); } [XXX] redundant with 703
 
   /** Constructs and initializes a quaternion from the angle-axis \a aa */
-  explicit inline Quat(const AngleAxisType& aa) { *this = aa; }
+  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
 
   /** Constructs and initializes a quaternion from either:
     *  - a rotation matrix expression,
     *  - a 4D vector expression representing quaternion coefficients.
     */
   template<typename Derived>
-  explicit inline Quat(const MatrixBase<Derived>& other) { *this = other; } 
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<class Derived>
-  inline typename ei_cast_return_type<Quat, QuaternionBase<Derived> >::type cast() const
-  { return typename ei_cast_return_type<Quat, QuaternionBase<Derived> >::type(*this); }
+  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
 
   /** Copy constructor with scalar type conversion */
   template<class Derived>
-  inline explicit Quat(const QuaternionBase<Derived>& other)
+  inline explicit Quaternion(const QuaternionBase<Derived>& other)
   { m_coeffs = other.coeffs().template cast<Scalar>(); }
 
   inline Coefficients& coeffs() { return m_coeffs;}
@@ -712,9 +250,9 @@ protected:
   Coefficients m_coeffs;
 };
 
-/* ########### Map<Quat> */
+/* ########### Map<Quaternion> */
 
-/** \class Map<Quat>
+/** \class Map<Quaternion>
   * \nonstableyet
   *
   * \brief Expression of a quaternion
@@ -724,8 +262,8 @@ protected:
   * \sa class Quaternion, class QuaternionBase
   */
 template<typename _Scalar, int _PacketAccess>
-struct ei_traits<Map<Quat<_Scalar>, _PacketAccess> >:
-ei_traits<Quat<_Scalar> >
+struct ei_traits<Map<Quaternion<_Scalar>, _PacketAccess> >:
+ei_traits<Quaternion<_Scalar> >
 {
   typedef _Scalar Scalar;
   typedef Map<Matrix<_Scalar,4,1> > Coefficients;
@@ -735,14 +273,14 @@ ei_traits<Quat<_Scalar> >
 };
 
 template<typename _Scalar, int PacketAccess>
-class Map<Quat<_Scalar>, PacketAccess > : public QuaternionBase<Map<Quat<_Scalar>, PacketAccess> >, ei_no_assignment_operator {
+class Map<Quaternion<_Scalar>, PacketAccess > : public QuaternionBase<Map<Quaternion<_Scalar>, PacketAccess> >, ei_no_assignment_operator {
   public:
     
     typedef _Scalar Scalar;
 
-    typedef typename ei_traits<Map<Quat<Scalar>, PacketAccess> >::Coefficients Coefficients;
+    typedef typename ei_traits<Map<Quaternion<Scalar>, PacketAccess> >::Coefficients Coefficients;
 
-    inline Map<Quat<Scalar>, PacketAccess >(const Scalar* coeffs) : m_coeffs(coeffs) {}
+    inline Map<Quaternion<Scalar>, PacketAccess >(const Scalar* coeffs) : m_coeffs(coeffs) {}
     
     inline Coefficients& coeffs() { return m_coeffs;}
     inline const Coefficients& coeffs() const { return m_coeffs;}
@@ -751,16 +289,16 @@ class Map<Quat<_Scalar>, PacketAccess > : public QuaternionBase<Map<Quat<_Scalar
     Coefficients m_coeffs;
 };
 
-typedef Map<Quat<double> > QuaternionMapd;
-typedef Map<Quat<float> > QuaternionMapf;
-typedef Map<Quat<double>, Aligned> QuaternionMapAlignedd;
-typedef Map<Quat<float>, Aligned> QuaternionMapAlignedf;
+typedef Map<Quaternion<double> > QuaternionMapd;
+typedef Map<Quaternion<float> > QuaternionMapf;
+typedef Map<Quaternion<double>, Aligned> QuaternionMapAlignedd;
+typedef Map<Quaternion<float>, Aligned> QuaternionMapAlignedf;
 
 // Generic Quaternion * Quaternion product
 template<int Arch, class Derived, class OtherDerived, typename Scalar, int PacketAccess> struct ei_quat_product
 {
-  inline static Quat<Scalar> run(const QuaternionBase<Derived>& a, const QuaternionBase<OtherDerived>& b){
-    return Quat<Scalar>
+  inline static Quaternion<Scalar> run(const QuaternionBase<Derived>& a, const QuaternionBase<OtherDerived>& b){
+    return Quaternion<Scalar>
     (
       a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
       a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
@@ -773,12 +311,12 @@ template<int Arch, class Derived, class OtherDerived, typename Scalar, int Packe
 /** \returns the concatenation of two rotations as a quaternion-quaternion product */
 template <class Derived>
 template <class OtherDerived>
-inline Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) const
+inline Quaternion<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) const
 {
   EIGEN_STATIC_ASSERT((ei_is_same_type<typename Derived::Scalar, typename OtherDerived::Scalar>::ret),
    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
    return ei_quat_product<EiArch, Derived, OtherDerived, 
-                          ei_traits<Derived>::Scalar, 
+                          typename ei_traits<Derived>::Scalar,
                           ei_traits<Derived>::PacketAccess && ei_traits<OtherDerived>::PacketAccess>::run(*this, other);
 }
 
@@ -938,16 +476,16 @@ inline QuaternionBase<Derived>& QuaternionBase<Derived>::setFromTwoVectors(const
   * \sa Quaternion2::conjugate()
   */
 template <class Derived>
-inline Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::inverse() const
+inline Quaternion<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::inverse() const
 {
   // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
   Scalar n2 = this->squaredNorm();
   if (n2 > 0)
-    return Quat<Scalar>(conjugate().coeffs() / n2);
+    return Quaternion<Scalar>(conjugate().coeffs() / n2);
   else
   {
     // return an invalid result to flag the error
-    return Quat<Scalar>(ei_traits<Derived>::Coefficients::Zero());
+    return Quaternion<Scalar>(ei_traits<Derived>::Coefficients::Zero());
   }
 }
 
@@ -958,9 +496,9 @@ inline Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase
   * \sa Quaternion2::inverse()
   */
 template <class Derived>
-inline Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::conjugate() const
+inline Quaternion<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::conjugate() const
 {
-  return Quat<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
+  return Quaternion<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
 }
 
 /** \returns the angle (in radian) between two rotations
@@ -981,13 +519,13 @@ inline typename ei_traits<QuaternionBase<Derived> >::Scalar QuaternionBase<Deriv
   */
 template <class Derived>
 template <class OtherDerived>
-Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::slerp(Scalar t, const QuaternionBase<OtherDerived>& other) const
+Quaternion<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derived>::slerp(Scalar t, const QuaternionBase<OtherDerived>& other) const
 {
   static const Scalar one = Scalar(1) - precision<Scalar>();
   Scalar d = this->dot(other);
   Scalar absD = ei_abs(d);
   if (absD>=one)
-    return Quat<Scalar>(*this);
+    return Quaternion<Scalar>(*this);
 
   // theta is the angle between the 2 quaternions
   Scalar theta = std::acos(absD);
@@ -998,7 +536,7 @@ Quat<typename ei_traits<QuaternionBase<Derived> >::Scalar> QuaternionBase<Derive
   if (d<0)
     scale1 = -scale1;
 
-  return Quat<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
+  return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
 }
 
 // set from a rotation matrix
diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h
index c608e4843..1b8f6aead 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -26,31 +26,12 @@
 #ifndef EIGEN_GEOMETRY_SSE_H
 #define EIGEN_GEOMETRY_SSE_H
 
-template<> inline Quaternion<float>
-ei_quaternion_product<EiArch_SSE,float>(const Quaternion<float>& _a, const Quaternion<float>& _b)
-{
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
-  Quaternion<float> res;
-  __m128 a = _a.coeffs().packet<Aligned>(0);
-  __m128 b = _b.coeffs().packet<Aligned>(0);
-  __m128 flip1 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,1,2,0,2),
-                                       ei_vec4f_swizzle1(b,2,0,1,2)),mask);
-  __m128 flip2 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,3,3,3,1),
-                                       ei_vec4f_swizzle1(b,0,1,2,1)),mask);
-  ei_pstore(&res.x(),
-            _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,ei_vec4f_swizzle1(b,3,3,3,3)),
-                                  _mm_mul_ps(ei_vec4f_swizzle1(a,2,0,1,0),
-                                             ei_vec4f_swizzle1(b,1,2,0,0))),
-                       _mm_add_ps(flip1,flip2)));
-  return res;
-}
-
 template<class Derived, class OtherDerived> struct ei_quat_product<EiArch_SSE, Derived, OtherDerived, float, Aligned>
 {
-  inline static Quat<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
+  inline static Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
     const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
-    Quat<float> res;
+    Quaternion<float> res;
     __m128 a = _a.coeffs().packet<Aligned>(0);
     __m128 b = _b.coeffs().packet<Aligned>(0);
     __m128 flip1 = _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a,1,2,0,2),