高度にテンプレート化されたC++のコードがどれだけ最適化されるかを確かめてみる。
あるベクトルと行列の積を計算するような関数 product() を作る。引数に、ベクトルや行列を表す様々なものを受け取れるよう、ジェネリックに作ることにする。
そのような場合に、コードが実際どのように最適化されるのかを見てみることにした。
例として、あるベクトルを要素とする対角行列のビューを作るアダプタをこの関数に渡してみると、どうなるか。
#include <cstddef> #include <cstring> #include <boost/array.hpp> #include <boost/mpl/bool.hpp> #include <boost/mpl/and.hpp> #include <boost/multi_array.hpp> #include <boost/type_traits/is_class.hpp> #include <boost/utility/enable_if.hpp> #include <boost/assert.hpp> #include <iostream> // metafunction to retrieve the element type of the given container template<typename T_> struct element_type_of { template<typename TT_> struct element_type_cannot_be_retrieved_from_; // compile-time assertion enum { _ = sizeof(element_type_cannot_be_retrieved_from_<T_>) }; }; template<typename T_, std::size_t N_> struct element_type_of<boost::array<T_, N_> > { typedef T_ type; }; template<typename T_, std::size_t N_> struct element_type_of<T_[N_]> { typedef T_ type; }; // metafunction to determine if the given type can represent a vector. template<typename T_> struct is_vector: boost::mpl::false_ {}; template<typename T_, std::size_t N_> struct is_vector<boost::array<T_, N_> >: boost::mpl::true_ {}; // metafunction to determine if the given type can represent a square matrix. template<typename T_> struct is_matrix: boost::mpl::false_ {}; template<typename T_, std::size_t N_> struct is_matrix<T_[N_][N_]>: boost::mpl::true_ {}; template<typename T_, std::size_t N_> struct is_matrix<boost::multi_array<T_, N_> >: boost::mpl::true_ {}; template<typename T_, std::size_t N_> inline std::size_t size(const T_(&)[N_]) { return N_; } template<typename T_> inline typename boost::enable_if<boost::is_class<T_>, std::size_t>::type size(T_ const& x) { return x.size(); } // an adapter to convert a vector to a diagonal matrix template<typename Tvec_> struct const_diagonal_matrix_adapter { typedef typename element_type_of<Tvec_>::type element_type; typedef std::size_t size_type; struct view { size_type size() const { return ::size(vec_); } element_type const& operator[](size_type idx) { return idx_ != idx ? zero_: vec_[idx]; } view(Tvec_ const& vec, size_type idx): vec_(vec), idx_(idx) {} private: static const element_type zero_; Tvec_ const& vec_; size_type idx_; }; typedef view value_type; size_type size() const { return ::size(vec_); } value_type operator[](size_type idx) const { return view(vec_, idx); } const_diagonal_matrix_adapter(Tvec_ const& vec): vec_(vec) {} private: Tvec_ const& vec_; }; template<typename Tvec_> const typename const_diagonal_matrix_adapter<Tvec_>::element_type const_diagonal_matrix_adapter<Tvec_>::view::zero_ = 0.; template<typename Tvec_> inline const_diagonal_matrix_adapter<Tvec_> make_diagonal_matrix(Tvec_ const& vec) { return const_diagonal_matrix_adapter<Tvec_>(vec); } // mark const_diagonal_matrix_adapter<> as a square_matrix template<typename Tvec_> struct is_matrix<const_diagonal_matrix_adapter<Tvec_> >: boost::mpl::true_ {}; // helper function to create a boost::const_multi_array_ref<> from an array template<typename T_, std::size_t N1_, std::size_t N2_> inline boost::const_multi_array_ref<T_, 2> make_const_multi_array_ref(const T_(&v)[N1_][N2_]) { return boost::const_multi_array_ref<T_, 2>( reinterpret_cast<T_ const*>(v), boost::extents[N1_][N2_]); } // calculate the product of a vector and a matrix template<typename T1, typename T2> inline T1 product(T1 const& v, T2 const& m, typename boost::enable_if<boost::mpl::and_<is_vector<T1>, is_matrix<T2> > >::type* = 0) { T1 retval; BOOST_ASSERT(size(v) == size(m) && size(m) == size(m[0])); for (std::size_t i = 0; i < size(v); ++i) { typename element_type_of<T1>::type x = 0; for (std::size_t j = 0; j < size(v); ++j) { x += v[j] * m[j][i]; } retval[i] = x; } return retval; } // stream helper template<typename Telem, typename Ttraits, typename Tvec_> inline typename boost::enable_if<is_vector<Tvec_>, std::basic_ostream<Telem, Ttraits> >::type& operator<<(std::basic_ostream<Telem, Ttraits>& strm, Tvec_ const& vec) { strm << "(" << vec[0] << ", " << vec[1] << ", " << vec[2] << ")"; return strm; } int main(int, char**) { boost::array<double, 3> const v1 = { { 1., 2. ,3. } }; double const v2[3] = { 4., 5., 6. }; double const m1[3][3] = { { 0., 1., 0. }, { 1., 0., 0. }, { 0., 0., 1. } }; boost::multi_array<double, 2> const m2(make_const_multi_array_ref(m1)); std::cout << v1 << "->" << product(v1, m1) << std::endl;; std::cout << v1 << "->" << product(v1, m2) << std::endl;; std::cout << v1 << "->" << product(v1, make_diagonal_matrix(v2)) << std::endl; return 0; }
これを
g++ -S -O3 -o - templatized_math_operations.cpp | c++filt
としてみたときの結果から抜粋してみる。
まず product
.section .text._Z8productIN5boost5arrayIdLm3EEEA3_A3_dET_RKS5_RKT0_PNS0_9enable_ifINS0_3mpl4and_I9is_vectorIS5_E9is_matrixIS8_EN4mpl_5bool_ILb1EEESK_SK_EEvE4typeE,"axG",@progbits,boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*),comdat .p2align 4,,15 .weak boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*) .type boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), @function boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*): .LFB2667: .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 movsd (%rsi), %xmm0 movq %rdi, %rax movsd 8(%rsi), %xmm2 mulsd (%rdx), %xmm0 xorpd %xmm1, %xmm1 mulsd 24(%rdx), %xmm2 addsd %xmm1, %xmm0 addsd %xmm2, %xmm0 movsd 16(%rsi), %xmm2 mulsd 48(%rdx), %xmm2 addsd %xmm2, %xmm0 movsd %xmm0, (%rdi) movsd (%rsi), %xmm0 movsd 8(%rsi), %xmm2 mulsd 8(%rdx), %xmm0 mulsd 32(%rdx), %xmm2 addsd %xmm1, %xmm0 addsd %xmm2, %xmm0 movsd 16(%rsi), %xmm2 mulsd 56(%rdx), %xmm2 addsd %xmm2, %xmm0 movsd %xmm0, 8(%rdi) movsd (%rsi), %xmm0 mulsd 16(%rdx), %xmm0 addsd %xmm1, %xmm0 # この辺で xmm1 の値を頑張って保持しなくてもいいや、と思ったらしく # xmm2 を使うのを止めている。なぜ。 movsd 8(%rsi), %xmm1 mulsd 40(%rdx), %xmm1 addsd %xmm1, %xmm0 movsd 16(%rsi), %xmm1 mulsd 64(%rdx), %xmm1 addsd %xmm1, %xmm0 movsd %xmm0, 16(%rdi) ret .cfi_endproc .LFE2667: .size boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), .-boost::array<double, 3ul> product<boost::array<double, 3ul>, double [3][3]>(boost::array<double, 3ul> const&, double const (&) [3][3], boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<double [3][3]>, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)
不要な xorpd % と addsd %xmm1, %xmm0 が一部できてしまっているけれども、基本的に真っ直ぐなコードを生成してくれた。
次は、product
.section .text._Z8productIN5boost5arrayIdLm3EEENS0_11multi_arrayIdLm2ESaIdEEEET_RKS6_RKT0_PNS0_9enable_ifINS0_3mpl4and_I9is_vectorIS6_E9is_matrixIS9_EN4mpl_5bool_ILb1EEESL_SL_EEvE4typeE,"axG",@progbits,boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*),comdat .p2align 4,,15 .weak boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*) .type boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), @function boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*): .LFB2670: .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 subq $8, %rsp .cfi_def_cfa_offset 16 cmpq $3, 32(%rdx) movq %rdi, %rax jne .L80 movq 64(%rdx), %r9 movq 80(%rdx), %rcx movq %r9, %rdi salq $3, %rcx addq (%rdx), %rcx negq %rdi testq %rdi, %rdi js .L88 cmpq $2, %rdi ja .L89 cmpq $3, 40(%rdx) jne .L80 movl $1, %r10d movl $2, %r8d movq 72(%rdx), %rdi movq %r10, %r11 subq %r9, %r8 subq %r9, %r11 cmpq $2, %r8 ja .L84 cmpq $2, %r11 ja .L85 movq %rdi, %r8 movsd (%rsi), %xmm0 negq %r8 testq %r8, %r8 js .L86 cmpq $2, %r8 ja .L87 movq 48(%rdx), %r8 mulsd (%rcx), %xmm0 movsd 8(%rsi), %xmm2 xorpd %xmm1, %xmm1 leaq (%rcx,%r8,8), %r9 salq $4, %r8 subq %rdi, %r10 leaq (%rcx,%r8), %r8 mulsd (%r9), %xmm2 addsd %xmm1, %xmm0 addsd %xmm2, %xmm0 movsd 16(%rsi), %xmm2 mulsd (%r8), %xmm2 addsd %xmm2, %xmm0 movsd %xmm0, (%rax) movsd (%rsi), %xmm0 js .L86 cmpq $2, %r10 ja .L87 movq 56(%rdx), %rdx movsd 8(%rsi), %xmm2 movl $2, %r10d subq %rdi, %r10 mulsd (%rcx,%rdx,8), %xmm0 mulsd (%r9,%rdx,8), %xmm2 addsd %xmm1, %xmm0 addsd %xmm2, %xmm0 movsd 16(%rsi), %xmm2 mulsd (%r8,%rdx,8), %xmm2 addsd %xmm2, %xmm0 movsd %xmm0, 8(%rax) movsd (%rsi), %xmm0 js .L86 cmpq $2, %r10 ja .L87 salq $4, %rdx mulsd (%rcx,%rdx), %xmm0 addsd %xmm1, %xmm0 movsd 8(%rsi), %xmm1 mulsd (%r9,%rdx), %xmm1 addsd %xmm1, %xmm0 movsd 16(%rsi), %xmm1 mulsd (%r8,%rdx), %xmm1 addsd %xmm1, %xmm0 movsd %xmm0, 16(%rax) addq $8, %rsp ret .L84: negq %rdi testq %rdi, %rdi js .L86 cmpq $2, %rdi ja .L87 testq %r11, %r11 .p2align 4,,3 js .L88 cmpq $2, %r11 .p2align 4,,5 ja .L89 testq %r8, %r8 .p2align 4,,5 jns .L89 .L88: movl boost::detail::multi_array::const_sub_array<double, 1ul, double const*> boost::detail::multi_array::value_accessor_n<double, 2ul>::access<boost::detail::multi_array::const_sub_array<double, 1ul, double const*>, double const*>(boost::type<boost::detail::multi_array::const_sub_array<double, 1ul, double const*> >, long, double const*, unsigned long const*, long const*, long const*) const::__PRETTY_FUNCTION__, %ecx movl $135, %edx movl $.LC2, %esi movl $.LC3, %edi call __assert_fail .L87: movl double const& boost::detail::multi_array::value_accessor_one<double>::access<double const&, double const*>(boost::type<double const&>, long, double const*, unsigned long const*, long const*, long const*) const::__PRETTY_FUNCTION__, %ecx movl $178, %edx movl $.LC2, %esi movl $.LC4, %edi call __assert_fail .L86: movl double const& boost::detail::multi_array::value_accessor_one<double>::access<double const&, double const*>(boost::type<double const&>, long, double const*, unsigned long const*, long const*, long const*) const::__PRETTY_FUNCTION__, %ecx movl $177, %edx movl $.LC2, %esi movl $.LC3, %edi call __assert_fail .L85: negq %rdi testq %rdi, %rdi js .L86 cmpq $2, %rdi ja .L87 testq %r11, %r11 .p2align 4,,3 js .L88 .L89: movl boost::detail::multi_array::const_sub_array<double, 1ul, double const*> boost::detail::multi_array::value_accessor_n<double, 2ul>::access<boost::detail::multi_array::const_sub_array<double, 1ul, double const*>, double const*>(boost::type<boost::detail::multi_array::const_sub_array<double, 1ul, double const*> >, long, double const*, unsigned long const*, long const*, long const*) const::__PRETTY_FUNCTION__, %ecx movl $136, %edx movl $.LC2, %esi movl $.LC4, %edi call __assert_fail .L80: movl boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)::__PRETTY_FUNCTION__, %ecx movl $140, %edx movl $.LC8, %esi movl $.LC9, %edi call __assert_fail .cfi_endproc .LFE2670: .size boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), .-boost::array<double, 3ul> product<boost::array<double, 3ul>, boost::multi_array<double, 2ul, std::allocator<double> > >(boost::array<double, 3ul> const&, boost::multi_array<double, 2ul, std::allocator<double> > const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<boost::multi_array<double, 2ul, std::allocator<double> > >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)
そして、最後に、product
.section .text._Z8productIN5boost5arrayIdLm3EEE29const_diagonal_matrix_adapterIA3_dEET_RKS6_RKT0_PNS0_9enable_ifINS0_3mpl4and_I9is_vectorIS6_E9is_matrixIS9_EN4mpl_5bool_ILb1EEESL_SL_EEvE4typeE,"axG",@progbits,boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*),comdat .p2align 4,,15 .weak boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*) .type boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), @function boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*): .LFB2675: .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 movq (%rdx), %rdx xorpd %xmm0, %xmm0 movsd 8(%rsi), %xmm2 movq %rdi, %rax movsd (%rdx), %xmm1 mulsd %xmm0, %xmm2 mulsd (%rsi), %xmm1 addsd %xmm0, %xmm1 addsd %xmm2, %xmm1 movsd 16(%rsi), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm1 movsd %xmm1, (%rdi) movsd (%rsi), %xmm1 movsd 8(%rdx), %xmm2 mulsd %xmm0, %xmm1 mulsd 8(%rsi), %xmm2 addsd %xmm0, %xmm1 addsd %xmm2, %xmm1 movsd 16(%rsi), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm1 movsd %xmm1, 8(%rdi) movsd (%rsi), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm0, %xmm1 mulsd 8(%rsi), %xmm0 addsd %xmm0, %xmm1 movsd 16(%rdx), %xmm0 mulsd 16(%rsi), %xmm0 addsd %xmm1, %xmm0 movsd %xmm0, 16(%rdi) ret .cfi_endproc .LFE2675: .size boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*), .-boost::array<double, 3ul> product<boost::array<double, 3ul>, const_diagonal_matrix_adapter<double [3]> >(boost::array<double, 3ul> const&, const_diagonal_matrix_adapter<double [3]> const&, boost::enable_if<boost::mpl::and_<is_vector<boost::array<double, 3ul> >, is_matrix<const_diagonal_matrix_adapter<double [3]> >, mpl_::bool_<true>, mpl_::bool_<true>, mpl_::bool_<true> >, void>::type*)
すごい。ループの unroll から始まって、何もかもが展開されている。
正直ここまでやってくれるとは思わなかった。
ただ、ここまでやってくれているのだから、mulsd %xmm0, XXX は削減してほしかったなあ。
コンパイラのバージョン:
gcc version 4.4.1 (Ubuntu 4.4.1-4ubuntu9)