高度にテンプレート化されたC++のコードがどれだけ最適化されるかを確かめてみる。

あるベクトルと行列の積を計算するような関数 product() を作る。引数に、ベクトルや行列を表す様々なものを受け取れるよう、ジェネリックに作ることにする。

そのような場合に、コードが実際どのように最適化されるのかを見てみることにした。

例として、あるベクトルを要素とする対角行列のビューを作るアダプタをこの関数に渡してみると、どうなるか。

#include <cstddef>
#include <cstring>
#include <boost/array.hpp>
#include <boost/mpl/bool.hpp>
#include <boost/mpl/and.hpp>
#include <boost/multi_array.hpp>
#include <boost/type_traits/is_class.hpp>
#include <boost/utility/enable_if.hpp>
#include <boost/assert.hpp>
#include <iostream>

// metafunction to retrieve the element type of the given container
template<typename T_>
struct element_type_of
{
    template<typename TT_>
    struct element_type_cannot_be_retrieved_from_;

    // compile-time assertion
    enum { _ = sizeof(element_type_cannot_be_retrieved_from_<T_>) };
};

template<typename T_, std::size_t N_>
struct element_type_of<boost::array<T_, N_> >
{
    typedef T_ type;
};

template<typename T_, std::size_t N_>
struct element_type_of<T_[N_]>
{
    typedef T_ type;
};

// metafunction to determine if the given type can represent a vector.
template<typename T_>
struct is_vector: boost::mpl::false_ {};

template<typename T_, std::size_t N_>
struct is_vector<boost::array<T_, N_> >: boost::mpl::true_ {};

// metafunction to determine if the given type can represent a square matrix.
template<typename T_>
struct is_matrix: boost::mpl::false_ {};

template<typename T_, std::size_t N_>
struct is_matrix<T_[N_][N_]>: boost::mpl::true_ {};

template<typename T_, std::size_t N_>
struct is_matrix<boost::multi_array<T_, N_> >: boost::mpl::true_ {};

template<typename T_, std::size_t N_>
inline std::size_t size(const T_(&)[N_])
{
    return N_;
}

template<typename T_>
inline typename boost::enable_if<boost::is_class<T_>, std::size_t>::type size(T_ const& x)
{
    return x.size();
}

// an adapter to convert a vector to a diagonal matrix
template<typename Tvec_>
struct const_diagonal_matrix_adapter
{
    typedef typename element_type_of<Tvec_>::type element_type;
    typedef std::size_t size_type;

    struct view
    {

        size_type size() const
        {
            return ::size(vec_);
        }

        element_type const& operator[](size_type idx)
        {
            return idx_ != idx ? zero_: vec_[idx];
        }

        view(Tvec_ const& vec, size_type idx): vec_(vec), idx_(idx) {}

    private:
        static const element_type zero_;
        Tvec_ const& vec_;
        size_type idx_;
    };

    typedef view value_type;

    size_type size() const
    {
        return ::size(vec_);
    }

    value_type operator[](size_type idx) const
    {
        return view(vec_, idx);
    }

    const_diagonal_matrix_adapter(Tvec_ const& vec): vec_(vec) {}

private:
    Tvec_ const& vec_;
};

template<typename Tvec_>
const typename const_diagonal_matrix_adapter<Tvec_>::element_type
const_diagonal_matrix_adapter<Tvec_>::view::zero_ = 0.;


template<typename Tvec_>
inline const_diagonal_matrix_adapter<Tvec_> make_diagonal_matrix(Tvec_ const& vec)
{
    return const_diagonal_matrix_adapter<Tvec_>(vec);
}

// mark const_diagonal_matrix_adapter<> as a square_matrix
template<typename Tvec_>
struct is_matrix<const_diagonal_matrix_adapter<Tvec_> >: boost::mpl::true_ {};

// helper function to create a boost::const_multi_array_ref<> from an array
template<typename T_, std::size_t N1_, std::size_t N2_>
inline boost::const_multi_array_ref<T_, 2>
make_const_multi_array_ref(const T_(&v)[N1_][N2_])
{
    return boost::const_multi_array_ref<T_, 2>(
        reinterpret_cast<T_ const*>(v),
        boost::extents[N1_][N2_]);
}

// calculate the product of a vector and a matrix
template<typename T1, typename T2>
inline T1 product(T1 const& v, T2 const& m, typename boost::enable_if<boost::mpl::and_<is_vector<T1>, is_matrix<T2> > >::type* = 0)
{
    T1 retval;
    BOOST_ASSERT(size(v) == size(m) && size(m) == size(m[0]));
    for (std::size_t i = 0; i < size(v); ++i)
    {
        typename element_type_of<T1>::type x = 0;
        for (std::size_t j = 0; j < size(v); ++j)
        {
            x += v[j] * m[j][i];
        }
        retval[i] = x;
    }
    return retval;
}

// stream helper
template<typename Telem, typename Ttraits, typename Tvec_>
inline typename
boost::enable_if<is_vector<Tvec_>, std::basic_ostream<Telem, Ttraits> >::type&
operator<<(std::basic_ostream<Telem, Ttraits>& strm, Tvec_ const& vec)
{
    strm << "(" << vec[0] << ", " << vec[1] << ", " << vec[2] << ")";
    return strm;
}

int main(int, char**)
{
    boost::array<double, 3> const v1 = { { 1., 2. ,3. } };
    double const v2[3] = { 4., 5., 6. };
    double const m1[3][3] = { { 0., 1., 0. }, { 1., 0., 0. }, { 0., 0., 1. } };
    boost::multi_array<double, 2> const m2(make_const_multi_array_ref(m1));
    std::cout << v1 << "->" << product(v1, m1) << std::endl;;
    std::cout << v1 << "->" << product(v1, m2) << std::endl;;
    std::cout << v1 << "->" << product(v1, make_diagonal_matrix(v2)) << std::endl;
    return 0;
}

これを

g++ -S -O3 -o - templatized_math_operations.cpp | c++filt

としてみたときの結果から抜粋してみる。

まず product, double[3][3]>(boost::array const&, double const(&)[3][3], ...) の方。

	.section	.text._Z8productIN5boost5arrayIdLm3EEEA3_A3_dET_RKS5_RKT0_PNS0_9enable_ifINS0_3mpl4and_I9is_vectorIS5_E9is_matrixIS8_EN4mpl_5bool_ILb1EEESK_SK_EEvE4typeE,"axG",@progbits,boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*),comdat
	.p2align 4,,15
	.weak	boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)
	.type	boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), @function
boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*):
.LFB2667:
	.cfi_startproc
	.cfi_personality 0x3,__gxx_personality_v0
	movsd	(%rsi), %xmm0
	movq	%rdi, %rax
	movsd	8(%rsi), %xmm2
	mulsd	(%rdx), %xmm0
	xorpd	%xmm1, %xmm1
	mulsd	24(%rdx), %xmm2
	addsd	%xmm1, %xmm0
	addsd	%xmm2, %xmm0
	movsd	16(%rsi), %xmm2
	mulsd	48(%rdx), %xmm2
	addsd	%xmm2, %xmm0
	movsd	%xmm0, (%rdi)
	movsd	(%rsi), %xmm0
	movsd	8(%rsi), %xmm2
	mulsd	8(%rdx), %xmm0
	mulsd	32(%rdx), %xmm2
	addsd	%xmm1, %xmm0
	addsd	%xmm2, %xmm0
	movsd	16(%rsi), %xmm2
	mulsd	56(%rdx), %xmm2
	addsd	%xmm2, %xmm0
	movsd	%xmm0, 8(%rdi)
	movsd	(%rsi), %xmm0
	mulsd	16(%rdx), %xmm0
	addsd	%xmm1, %xmm0
        # この辺で xmm1 の値を頑張って保持しなくてもいいや、と思ったらしく
        # xmm2 を使うのを止めている。なぜ。
	movsd	8(%rsi), %xmm1
	mulsd	40(%rdx), %xmm1
	addsd	%xmm1, %xmm0
	movsd	16(%rsi), %xmm1
	mulsd	64(%rdx), %xmm1
	addsd	%xmm1, %xmm0
	movsd	%xmm0, 16(%rdi)
	ret
	.cfi_endproc
.LFE2667:
	.size	boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), .-boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)

不要な xorpd % と addsd %xmm1, %xmm0 が一部できてしまっているけれども、基本的に真っ直ぐなコードを生成してくれた。

次は、product, boost::multi_array >(boost::array const&, boost::multi_array const&, ...) 。行列の大きさは、コンパイル時には決定できないので、ループの展開に失敗し、済し崩しで分岐の多いコードになっている。

	.section	.text._Z8productIN5boost5arrayIdLm3EEENS0_11multi_arrayIdLm2ESaIdEEEET_RKS6_RKT0_PNS0_9enable_ifINS0_3mpl4and_I9is_vectorIS6_E9is_matrixIS9_EN4mpl_5bool_ILb1EEESL_SL_EEvE4typeE,"axG",@progbits,boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*),comdat
	.p2align 4,,15
	.weak	boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)
	.type	boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), @function
boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*):
.LFB2670:
	.cfi_startproc
	.cfi_personality 0x3,__gxx_personality_v0
	subq	$8, %rsp
	.cfi_def_cfa_offset 16
	cmpq	$3, 32(%rdx)
	movq	%rdi, %rax
	jne	.L80
	movq	64(%rdx), %r9
	movq	80(%rdx), %rcx
	movq	%r9, %rdi
	salq	$3, %rcx
	addq	(%rdx), %rcx
	negq	%rdi
	testq	%rdi, %rdi
	js	.L88
	cmpq	$2, %rdi
	ja	.L89
	cmpq	$3, 40(%rdx)
	jne	.L80
	movl	$1, %r10d
	movl	$2, %r8d
	movq	72(%rdx), %rdi
	movq	%r10, %r11
	subq	%r9, %r8
	subq	%r9, %r11
	cmpq	$2, %r8
	ja	.L84
	cmpq	$2, %r11
	ja	.L85
	movq	%rdi, %r8
	movsd	(%rsi), %xmm0
	negq	%r8
	testq	%r8, %r8
	js	.L86
	cmpq	$2, %r8
	ja	.L87
	movq	48(%rdx), %r8
	mulsd	(%rcx), %xmm0
	movsd	8(%rsi), %xmm2
	xorpd	%xmm1, %xmm1
	leaq	(%rcx,%r8,8), %r9
	salq	$4, %r8
	subq	%rdi, %r10
	leaq	(%rcx,%r8), %r8
	mulsd	(%r9), %xmm2
	addsd	%xmm1, %xmm0
	addsd	%xmm2, %xmm0
	movsd	16(%rsi), %xmm2
	mulsd	(%r8), %xmm2
	addsd	%xmm2, %xmm0
	movsd	%xmm0, (%rax)
	movsd	(%rsi), %xmm0
	js	.L86
	cmpq	$2, %r10
	ja	.L87
	movq	56(%rdx), %rdx
	movsd	8(%rsi), %xmm2
	movl	$2, %r10d
	subq	%rdi, %r10
	mulsd	(%rcx,%rdx,8), %xmm0
	mulsd	(%r9,%rdx,8), %xmm2
	addsd	%xmm1, %xmm0
	addsd	%xmm2, %xmm0
	movsd	16(%rsi), %xmm2
	mulsd	(%r8,%rdx,8), %xmm2
	addsd	%xmm2, %xmm0
	movsd	%xmm0, 8(%rax)
	movsd	(%rsi), %xmm0
	js	.L86
	cmpq	$2, %r10
	ja	.L87
	salq	$4, %rdx
	mulsd	(%rcx,%rdx), %xmm0
	addsd	%xmm1, %xmm0
	movsd	8(%rsi), %xmm1
	mulsd	(%r9,%rdx), %xmm1
	addsd	%xmm1, %xmm0
	movsd	16(%rsi), %xmm1
	mulsd	(%r8,%rdx), %xmm1
	addsd	%xmm1, %xmm0
	movsd	%xmm0, 16(%rax)
	addq	$8, %rsp
	ret
.L84:
	negq	%rdi
	testq	%rdi, %rdi
	js	.L86
	cmpq	$2, %rdi
	ja	.L87
	testq	%r11, %r11
	.p2align 4,,3
	js	.L88
	cmpq	$2, %r11
	.p2align 4,,5
	ja	.L89
	testq	%r8, %r8
	.p2align 4,,5
	jns	.L89
.L88:
	movl	boost::detail::multi_array::const_sub_array<double, 1ul, double const*> boost::detail::multi_array::value_accessor_n<double, 2ul>::access<boost::detail::multi_array::const_sub_array<double, 1ul, double const*>, double const*>(boost::type<boost::detail::multi_array::const_sub_array<double, 1ul, double const*> >, long, double const*, unsigned long const*, long const*, long const*) const::__PRETTY_FUNCTION__, %ecx
	movl	$135, %edx
	movl	$.LC2, %esi
	movl	$.LC3, %edi
	call	__assert_fail
.L87:
	movl	double const& boost::detail::multi_array::value_accessor_one<double>::access<double const&, double const*>(boost::type<double const&>, long, double const*, unsigned long const*, long const*, long const*) const::__PRETTY_FUNCTION__, %ecx
	movl	$178, %edx
	movl	$.LC2, %esi
	movl	$.LC4, %edi
	call	__assert_fail
.L86:
	movl	double const& boost::detail::multi_array::value_accessor_one<double>::access<double const&, double const*>(boost::type<double const&>, long, double const*, unsigned long const*, long const*, long const*) const::__PRETTY_FUNCTION__, %ecx
	movl	$177, %edx
	movl	$.LC2, %esi
	movl	$.LC3, %edi
	call	__assert_fail
.L85:
	negq	%rdi
	testq	%rdi, %rdi
	js	.L86
	cmpq	$2, %rdi
	ja	.L87
	testq	%r11, %r11
	.p2align 4,,3
	js	.L88
.L89:
	movl	boost::detail::multi_array::const_sub_array<double, 1ul, double const*> boost::detail::multi_array::value_accessor_n<double, 2ul>::access<boost::detail::multi_array::const_sub_array<double, 1ul, double const*>, double const*>(boost::type<boost::detail::multi_array::const_sub_array<double, 1ul, double const*> >, long, double const*, unsigned long const*, long const*, long const*) const::__PRETTY_FUNCTION__, %ecx
	movl	$136, %edx
	movl	$.LC2, %esi
	movl	$.LC4, %edi
	call	__assert_fail
.L80:
	movl	boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)::__PRETTY_FUNCTION__, %ecx
	movl	$140, %edx
	movl	$.LC8, %esi
	movl	$.LC9, %edi
	call	__assert_fail
	.cfi_endproc
.LFE2670:
	.size	boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), .-boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)

そして、最後に、product, const_diagonal_matrix_adapter >(boost::array const&, const_diagonal_matrix_adapter const&, ...) の方。

	.section	.text._Z8productIN5boost5arrayIdLm3EEE29const_diagonal_matrix_adapterIA3_dEET_RKS6_RKT0_PNS0_9enable_ifINS0_3mpl4and_I9is_vectorIS6_E9is_matrixIS9_EN4mpl_5bool_ILb1EEESL_SL_EEvE4typeE,"axG",@progbits,boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*),comdat
	.p2align 4,,15
	.weak	boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)
	.type	boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), @function
boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*):
.LFB2675:
	.cfi_startproc
	.cfi_personality 0x3,__gxx_personality_v0
	movq	(%rdx), %rdx
	xorpd	%xmm0, %xmm0
	movsd	8(%rsi), %xmm2
	movq	%rdi, %rax
	movsd	(%rdx), %xmm1
	mulsd	%xmm0, %xmm2
	mulsd	(%rsi), %xmm1
	addsd	%xmm0, %xmm1
	addsd	%xmm2, %xmm1
	movsd	16(%rsi), %xmm2
	mulsd	%xmm0, %xmm2
	addsd	%xmm2, %xmm1
	movsd	%xmm1, (%rdi)
	movsd	(%rsi), %xmm1
	movsd	8(%rdx), %xmm2
	mulsd	%xmm0, %xmm1
	mulsd	8(%rsi), %xmm2
	addsd	%xmm0, %xmm1
	addsd	%xmm2, %xmm1
	movsd	16(%rsi), %xmm2
	mulsd	%xmm0, %xmm2
	addsd	%xmm2, %xmm1
	movsd	%xmm1, 8(%rdi)
	movsd	(%rsi), %xmm1
	mulsd	%xmm0, %xmm1
	addsd	%xmm0, %xmm1
	mulsd	8(%rsi), %xmm0
	addsd	%xmm0, %xmm1
	movsd	16(%rdx), %xmm0
	mulsd	16(%rsi), %xmm0
	addsd	%xmm1, %xmm0
	movsd	%xmm0, 16(%rdi)
	ret
	.cfi_endproc
.LFE2675:
	.size	boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), .-boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)

すごい。ループの unroll から始まって、何もかもが展開されている。
正直ここまでやってくれるとは思わなかった。

ただ、ここまでやってくれているのだから、mulsd %xmm0, XXX は削減してほしかったなあ。

コンパイラのバージョン:

gcc version 4.4.1 (Ubuntu 4.4.1-4ubuntu9)