// SPDX-License-Identifier: Apache-2.0
//
// Copyright 2008-2016 Conrad Sanderson (http://conradsanderson.id.au)
// Copyright 2008-2016 National ICT Australia (NICTA)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ------------------------------------------------------------------------


//! \addtogroup fn_accu
//! @{



template<typename T1>
arma_hot
inline
typename T1::elem_type
accu_proxy_linear(const Proxy<T1>& P)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  eT val = eT(0);
  
  typename Proxy<T1>::ea_type Pea = P.get_ea();
  
  const uword n_elem = P.get_n_elem();
  
  if( arma_config::openmp && Proxy<T1>::use_mp && mp_gate<eT>::eval(n_elem) )
    {
    #if defined(ARMA_USE_OPENMP)
      {
      // NOTE: using parallelisation with manual reduction workaround to take into account complex numbers;
      // NOTE: OpenMP versions lower than 4.0 do not support user-defined reduction
      
      const int   n_threads_max = mp_thread_limit::get();
      const uword n_threads_use = (std::min)(uword(podarray_prealloc_n_elem::val), uword(n_threads_max));
      const uword chunk_size    = n_elem / n_threads_use;
      
      podarray<eT> partial_accs(n_threads_use);
      
      #pragma omp parallel for schedule(static) num_threads(int(n_threads_use))
      for(uword thread_id=0; thread_id < n_threads_use; ++thread_id)
        {
        const uword start = (thread_id+0) * chunk_size;
        const uword endp1 = (thread_id+1) * chunk_size;
        
        eT acc = eT(0);
        for(uword i=start; i < endp1; ++i)  { acc += Pea[i]; }
        
        partial_accs[thread_id] = acc;
        }
      
      for(uword thread_id=0; thread_id < n_threads_use; ++thread_id)  { val += partial_accs[thread_id]; }
      
      for(uword i=(n_threads_use*chunk_size); i < n_elem; ++i)  { val += Pea[i]; }
      }
    #endif
    }
  else
    {
    #if defined(__FAST_MATH__)
      {
      if(P.is_aligned())
        {
        typename Proxy<T1>::aligned_ea_type Pea_aligned = P.get_aligned_ea();
        
        for(uword i=0; i < n_elem; ++i)  { val += Pea_aligned.at_alt(i); }
        }
      else
        {
        for(uword i=0; i < n_elem; ++i)  { val += Pea[i]; }
        }
      }
    #else
      {
      eT val1 = eT(0);
      eT val2 = eT(0);
      
      uword i,j;
      for(i=0, j=1; j < n_elem; i+=2, j+=2)  { val1 += Pea[i]; val2 += Pea[j]; }
      
      if(i < n_elem)  { val1 += Pea[i]; }
      
      val = val1 + val2;
      }
    #endif
    }
  
  return val;
  }
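// NOTE: a minimal standalone sketch of the manual-reduction pattern used above
// NOTE: (per-thread partial sums combined serially, so that std::complex works
// NOTE: without OpenMP 4.0 user-defined reductions); the function name and its
// NOTE: arguments are illustrative only and are not part of this file:
//
//   #include <complex>
//   #include <vector>
//
//   std::complex<double>
//   partial_sum_complex(const std::complex<double>* data, const int n, const int n_threads)
//     {
//     std::vector< std::complex<double> > partial(n_threads, std::complex<double>(0));
//     const int chunk = n / n_threads;
//
//     #pragma omp parallel for schedule(static) num_threads(n_threads)
//     for(int t=0; t < n_threads; ++t)
//       {
//       std::complex<double> acc(0);
//       for(int i=t*chunk; i < (t+1)*chunk; ++i)  { acc += data[i]; }
//       partial[t] = acc;
//       }
//
//     std::complex<double> total(0);
//     for(int t=0; t < n_threads; ++t)        { total += partial[t]; }  // serial reduction
//     for(int i=n_threads*chunk; i < n; ++i)  { total += data[i];    }  // leftover elements
//
//     return total;
//     }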
template<typename T1>
arma_hot
inline
typename T1::elem_type
accu_proxy_at_mp(const Proxy<T1>& P)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  eT val = eT(0);
  
  #if defined(ARMA_USE_OPENMP)
    {
    const uword n_rows = P.get_n_rows();
    const uword n_cols = P.get_n_cols();
    
    if(n_cols == 1)
      {
      const int   n_threads_max = mp_thread_limit::get();
      const uword n_threads_use = (std::min)(uword(podarray_prealloc_n_elem::val), uword(n_threads_max));
      const uword chunk_size    = n_rows / n_threads_use;
      
      podarray<eT> partial_accs(n_threads_use);
      
      #pragma omp parallel for schedule(static) num_threads(int(n_threads_use))
      for(uword thread_id=0; thread_id < n_threads_use; ++thread_id)
        {
        const uword start = (thread_id+0) * chunk_size;
        const uword endp1 = (thread_id+1) * chunk_size;
        
        eT acc = eT(0);
        for(uword i=start; i < endp1; ++i)  { acc += P.at(i,0); }
        
        partial_accs[thread_id] = acc;
        }
      
      for(uword thread_id=0; thread_id < n_threads_use; ++thread_id)  { val += partial_accs[thread_id]; }
      
      for(uword i=(n_threads_use*chunk_size); i < n_rows; ++i)  { val += P.at(i,0); }
      }
    else
    if(n_rows == 1)
      {
      const int   n_threads_max = mp_thread_limit::get();
      const uword n_threads_use = (std::min)(uword(podarray_prealloc_n_elem::val), uword(n_threads_max));
      const uword chunk_size    = n_cols / n_threads_use;
      
      podarray<eT> partial_accs(n_threads_use);
      
      #pragma omp parallel for schedule(static) num_threads(int(n_threads_use))
      for(uword thread_id=0; thread_id < n_threads_use; ++thread_id)
        {
        const uword start = (thread_id+0) * chunk_size;
        const uword endp1 = (thread_id+1) * chunk_size;
        
        eT acc = eT(0);
        for(uword i=start; i < endp1; ++i)  { acc += P.at(0,i); }
        
        partial_accs[thread_id] = acc;
        }
      
      for(uword thread_id=0; thread_id < n_threads_use; ++thread_id)  { val += partial_accs[thread_id]; }
      
      for(uword i=(n_threads_use*chunk_size); i < n_cols; ++i)  { val += P.at(0,i); }
      }
    else
      {
      podarray<eT> col_accs(n_cols);
      
      const int n_threads = mp_thread_limit::get();
      
      #pragma omp parallel for schedule(static) num_threads(n_threads)
      for(uword col=0; col < n_cols; ++col)
        {
        eT val1 = eT(0);
        eT val2 = eT(0);
        
        uword i,j;
        for(i=0, j=1; j < n_rows; i+=2, j+=2)  { val1 += P.at(i,col); val2 += P.at(j,col); }
        
        if(i < n_rows)  { val1 += P.at(i,col); }
        
        col_accs[col] = val1 + val2;
        }
      
      val = arrayops::accumulate(col_accs.memptr(), n_cols);
      }
    }
  #else
    {
    arma_ignore(P);
    }
  #endif
  
  return val;
  }



template<typename T1>
arma_hot
inline
typename T1::elem_type
accu_proxy_at(const Proxy<T1>& P)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  if(arma_config::openmp && Proxy<T1>::use_mp && mp_gate<eT>::eval(P.get_n_elem()))
    {
    return accu_proxy_at_mp(P);
    }
  
  const uword n_rows = P.get_n_rows();
  const uword n_cols = P.get_n_cols();
  
  eT val = eT(0);
  
  if(n_rows != 1)
    {
    eT val1 = eT(0);
    eT val2 = eT(0);
    
    for(uword col=0; col < n_cols; ++col)
      {
      uword i,j;
      for(i=0, j=1; j < n_rows; i+=2, j+=2)  { val1 += P.at(i,col); val2 += P.at(j,col); }
      
      if(i < n_rows)  { val1 += P.at(i,col); }
      }
    
    val = val1 + val2;
    }
  else
    {
    for(uword col=0; col < n_cols; ++col)  { val += P.at(0,col); }
    }
  
  return val;
  }



//! accumulate the elements of a matrix
template<typename T1>
arma_warn_unused
arma_hot
inline
typename enable_if2< is_arma_type<T1>::value, typename T1::elem_type >::result
accu(const T1& X)
  {
  arma_extra_debug_sigprint();
  
  const Proxy<T1> P(X);
  
  if(is_Mat<typename Proxy<T1>::stored_type>::value || is_subview_col<typename Proxy<T1>::stored_type>::value)
    {
    const quasi_unwrap<typename Proxy<T1>::stored_type> tmp(P.Q);
    
    return arrayops::accumulate(tmp.M.memptr(), tmp.M.n_elem);
    }
  
  return (Proxy<T1>::use_at) ? accu_proxy_at(P) : accu_proxy_linear(P);
  }



//! explicit handling of multiply-and-accumulate
template<typename T1, typename T2>
arma_warn_unused
inline
typename T1::elem_type
accu(const eGlue<T1,T2,eglue_schur>& expr)
  {
  arma_extra_debug_sigprint();
  
  typedef eGlue<T1,T2,eglue_schur> expr_type;
  
  typedef typename expr_type::proxy1_type::stored_type P1_stored_type;
  typedef typename expr_type::proxy2_type::stored_type P2_stored_type;
  
  const bool have_direct_mem_1 = (is_Mat<P1_stored_type>::value) || (is_subview_col<P1_stored_type>::value);
  const bool have_direct_mem_2 = (is_Mat<P2_stored_type>::value) || (is_subview_col<P2_stored_type>::value);
  
  if(have_direct_mem_1 && have_direct_mem_2)
    {
    const quasi_unwrap<P1_stored_type> tmp1(expr.P1.Q);
    const quasi_unwrap<P2_stored_type> tmp2(expr.P2.Q);
    
    return op_dot::direct_dot(tmp1.M.n_elem, tmp1.M.memptr(), tmp2.M.memptr());
    }
  
  const Proxy<expr_type> P(expr);
  
  return (Proxy<expr_type>::use_at) ? accu_proxy_at(P) : accu_proxy_linear(P);
  }
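// NOTE: illustrative usage sketch, assuming the standard Armadillo user-facing API
// NOTE: (the matrices below are examples only, not part of the implementation above):
//
//   mat A(4, 5, fill::randu);
//   mat B(4, 5, fill::randu);
//
//   double s1 = accu(A);      // sum of all elements, via the general overload above
//   double s2 = accu(A % B);  // element-wise multiply-and-accumulate; the eGlue
//                             // overload above can evaluate this as a dot product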
//! explicit handling of Hamming norm (also known as zero norm)
template<typename T1>
arma_warn_unused
inline
uword
accu(const mtOp<uword,T1,op_rel_noteq>& X)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  const eT val = X.aux;
  
  const Proxy<T1> P(X.m);
  
  uword n_nonzero = 0;
  
  if(Proxy<T1>::use_at == false)
    {
    typedef typename Proxy<T1>::ea_type ea_type;
    
    ea_type A = P.get_ea();
    
    const uword n_elem = P.get_n_elem();
    
    for(uword i=0; i < n_elem; ++i)
      {
      n_nonzero += (A[i] != val) ? uword(1) : uword(0);
      }
    }
  else
    {
    const uword P_n_cols = P.get_n_cols();
    const uword P_n_rows = P.get_n_rows();
    
    if(P_n_rows == 1)
      {
      for(uword col=0; col < P_n_cols; ++col)
        {
        n_nonzero += (P.at(0,col) != val) ? uword(1) : uword(0);
        }
      }
    else
      {
      for(uword col=0; col < P_n_cols; ++col)
      for(uword row=0; row < P_n_rows; ++row)
        {
        n_nonzero += (P.at(row,col) != val) ? uword(1) : uword(0);
        }
      }
    }
  
  return n_nonzero;
  }



template<typename T1>
arma_warn_unused
inline
uword
accu(const mtOp<uword,T1,op_rel_eq>& X)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  const eT val = X.aux;
  
  const Proxy<T1> P(X.m);
  
  uword n_nonzero = 0;
  
  if(Proxy<T1>::use_at == false)
    {
    typedef typename Proxy<T1>::ea_type ea_type;
    
    ea_type A = P.get_ea();
    
    const uword n_elem = P.get_n_elem();
    
    for(uword i=0; i < n_elem; ++i)
      {
      n_nonzero += (A[i] == val) ? uword(1) : uword(0);
      }
    }
  else
    {
    const uword P_n_cols = P.get_n_cols();
    const uword P_n_rows = P.get_n_rows();
    
    if(P_n_rows == 1)
      {
      for(uword col=0; col < P_n_cols; ++col)
        {
        n_nonzero += (P.at(0,col) == val) ? uword(1) : uword(0);
        }
      }
    else
      {
      for(uword col=0; col < P_n_cols; ++col)
      for(uword row=0; row < P_n_rows; ++row)
        {
        n_nonzero += (P.at(row,col) == val) ? uword(1) : uword(0);
        }
      }
    }
  
  return n_nonzero;
  }



template<typename T1, typename T2>
arma_warn_unused
inline
uword
accu(const mtGlue<uword,T1,T2,glue_rel_noteq>& X)
  {
  arma_extra_debug_sigprint();
  
  const Proxy<T1> PA(X.A);
  const Proxy<T2> PB(X.B);
  
  arma_debug_assert_same_size(PA, PB, "operator!=");
  
  uword n_nonzero = 0;
  
  if( (Proxy<T1>::use_at == false) && (Proxy<T2>::use_at == false) )
    {
    typedef typename Proxy<T1>::ea_type PA_ea_type;
    typedef typename Proxy<T2>::ea_type PB_ea_type;
    
    PA_ea_type A = PA.get_ea();
    PB_ea_type B = PB.get_ea();
    
    const uword n_elem = PA.get_n_elem();
    
    for(uword i=0; i < n_elem; ++i)
      {
      n_nonzero += (A[i] != B[i]) ? uword(1) : uword(0);
      }
    }
  else
    {
    const uword PA_n_cols = PA.get_n_cols();
    const uword PA_n_rows = PA.get_n_rows();
    
    if(PA_n_rows == 1)
      {
      for(uword col=0; col < PA_n_cols; ++col)
        {
        n_nonzero += (PA.at(0,col) != PB.at(0,col)) ? uword(1) : uword(0);
        }
      }
    else
      {
      for(uword col=0; col < PA_n_cols; ++col)
      for(uword row=0; row < PA_n_rows; ++row)
        {
        n_nonzero += (PA.at(row,col) != PB.at(row,col)) ? uword(1) : uword(0);
        }
      }
    }
  
  return n_nonzero;
  }



template<typename T1, typename T2>
arma_warn_unused
inline
uword
accu(const mtGlue<uword,T1,T2,glue_rel_eq>& X)
  {
  arma_extra_debug_sigprint();
  
  const Proxy<T1> PA(X.A);
  const Proxy<T2> PB(X.B);
  
  arma_debug_assert_same_size(PA, PB, "operator==");
  
  uword n_nonzero = 0;
  
  if( (Proxy<T1>::use_at == false) && (Proxy<T2>::use_at == false) )
    {
    typedef typename Proxy<T1>::ea_type PA_ea_type;
    typedef typename Proxy<T2>::ea_type PB_ea_type;
    
    PA_ea_type A = PA.get_ea();
    PB_ea_type B = PB.get_ea();
    
    const uword n_elem = PA.get_n_elem();
    
    for(uword i=0; i < n_elem; ++i)
      {
      n_nonzero += (A[i] == B[i]) ? uword(1) : uword(0);
      }
    }
  else
    {
    const uword PA_n_cols = PA.get_n_cols();
    const uword PA_n_rows = PA.get_n_rows();
    
    if(PA_n_rows == 1)
      {
      for(uword col=0; col < PA_n_cols; ++col)
        {
        n_nonzero += (PA.at(0,col) == PB.at(0,col)) ? uword(1) : uword(0);
        }
      }
    else
      {
      for(uword col=0; col < PA_n_cols; ++col)
      for(uword row=0; row < PA_n_rows; ++row)
        {
        n_nonzero += (PA.at(row,col) == PB.at(row,col)) ? uword(1) : uword(0);
        }
      }
    }
  
  return n_nonzero;
  }
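// NOTE: illustrative usage of the relational overloads above (they return element
// NOTE: counts of type uword, not sums); standard Armadillo user-facing API assumed:
//
//   mat A(3, 3, fill::randu);
//   mat B(3, 3, fill::randu);
//
//   uword nz = accu(A != 0.0);  // number of non-zero elements (Hamming / zero norm)
//   uword eq = accu(A == B);    // number of elements equal between A and B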
//! accumulate the elements of a subview (submatrix)
template<typename eT>
arma_warn_unused
arma_hot
inline
eT
accu(const subview<eT>& X)
  {
  arma_extra_debug_sigprint();
  
  const uword X_n_rows = X.n_rows;
  const uword X_n_cols = X.n_cols;
  
  if(X_n_rows == 1)
    {
    const Mat<eT>& m = X.m;
    
    const uword col_offset = X.aux_col1;
    const uword row_offset = X.aux_row1;
    
    eT val1 = eT(0);
    eT val2 = eT(0);
    
    uword i,j;
    for(i=0, j=1; j < X_n_cols; i+=2, j+=2)
      {
      val1 += m.at(row_offset, col_offset + i);
      val2 += m.at(row_offset, col_offset + j);
      }
    
    if(i < X_n_cols)
      {
      val1 += m.at(row_offset, col_offset + i);
      }
    
    return val1 + val2;
    }
  
  if(X_n_cols == 1)
    {
    return arrayops::accumulate( X.colptr(0), X_n_rows );
    }
  
  eT val = eT(0);
  
  for(uword col=0; col < X_n_cols; ++col)
    {
    val += arrayops::accumulate( X.colptr(col), X_n_rows );
    }
  
  return val;
  }



template<typename eT>
arma_warn_unused
arma_hot
inline
eT
accu(const subview_col<eT>& X)
  {
  arma_extra_debug_sigprint();
  
  return arrayops::accumulate( X.colmem, X.n_rows );
  }



//



template<typename T1>
arma_hot
inline
typename T1::elem_type
accu_cube_proxy_linear(const ProxyCube<T1>& P)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  eT val = eT(0);
  
  typename ProxyCube<T1>::ea_type Pea = P.get_ea();
  
  const uword n_elem = P.get_n_elem();
  
  if( arma_config::openmp && ProxyCube<T1>::use_mp && mp_gate<eT>::eval(n_elem) )
    {
    #if defined(ARMA_USE_OPENMP)
      {
      // NOTE: using parallelisation with manual reduction workaround to take into account complex numbers;
      // NOTE: OpenMP versions lower than 4.0 do not support user-defined reduction
      
      const int   n_threads_max = mp_thread_limit::get();
      const uword n_threads_use = (std::min)(uword(podarray_prealloc_n_elem::val), uword(n_threads_max));
      const uword chunk_size    = n_elem / n_threads_use;
      
      podarray<eT> partial_accs(n_threads_use);
      
      #pragma omp parallel for schedule(static) num_threads(int(n_threads_use))
      for(uword thread_id=0; thread_id < n_threads_use; ++thread_id)
        {
        const uword start = (thread_id+0) * chunk_size;
        const uword endp1 = (thread_id+1) * chunk_size;
        
        eT acc = eT(0);
        for(uword i=start; i < endp1; ++i)  { acc += Pea[i]; }
        
        partial_accs[thread_id] = acc;
        }
      
      for(uword thread_id=0; thread_id < n_threads_use; ++thread_id)  { val += partial_accs[thread_id]; }
      
      for(uword i=(n_threads_use*chunk_size); i < n_elem; ++i)  { val += Pea[i]; }
      }
    #endif
    }
  else
    {
    #if defined(__FAST_MATH__)
      {
      if(P.is_aligned())
        {
        typename ProxyCube<T1>::aligned_ea_type Pea_aligned = P.get_aligned_ea();
        
        for(uword i=0; i < n_elem; ++i)  { val += Pea_aligned.at_alt(i); }
        }
      else
        {
        for(uword i=0; i < n_elem; ++i)  { val += Pea[i]; }
        }
      }
    #else
      {
      eT val1 = eT(0);
      eT val2 = eT(0);
      
      uword i,j;
      for(i=0, j=1; j < n_elem; i+=2, j+=2)  { val1 += Pea[i]; val2 += Pea[j]; }
      
      if(i < n_elem)  { val1 += Pea[i]; }
      
      val = val1 + val2;
      }
    #endif
    }
  
  return val;
  }



template<typename T1>
arma_hot
inline
typename T1::elem_type
accu_cube_proxy_at_mp(const ProxyCube<T1>& P)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  eT val = eT(0);
  
  #if defined(ARMA_USE_OPENMP)
    {
    const uword n_rows   = P.get_n_rows();
    const uword n_cols   = P.get_n_cols();
    const uword n_slices = P.get_n_slices();
    
    podarray<eT> slice_accs(n_slices);
    
    const int n_threads = mp_thread_limit::get();
    
    #pragma omp parallel for schedule(static) num_threads(n_threads)
    for(uword slice = 0; slice < n_slices; ++slice)
      {
      eT val1 = eT(0);
      eT val2 = eT(0);
      
      for(uword col = 0; col < n_cols; ++col)
        {
        uword i,j;
        for(i=0, j=1; j < n_rows; i+=2, j+=2)  { val1 += P.at(i,col,slice); val2 += P.at(j,col,slice); }
        
        if(i < n_rows)  { val1 += P.at(i,col,slice); }
        }
      
      slice_accs[slice] = val1 + val2;
      }
    
    val = arrayops::accumulate(slice_accs.memptr(), n_slices);
    }
  #else
    {
    arma_ignore(P);
    }
  #endif
  
  return val;
  }



template<typename T1>
arma_hot
inline
typename T1::elem_type
accu_cube_proxy_at(const ProxyCube<T1>& P)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  if(arma_config::openmp && ProxyCube<T1>::use_mp && mp_gate<eT>::eval(P.get_n_elem()))
    {
    return accu_cube_proxy_at_mp(P);
    }
  
  const uword n_rows   = P.get_n_rows();
  const uword n_cols   = P.get_n_cols();
  const uword n_slices = P.get_n_slices();
  
  eT val1 = eT(0);
  eT val2 = eT(0);
  
  for(uword slice = 0; slice < n_slices; ++slice)
  for(uword col   = 0; col   < n_cols;   ++col  )
    {
    uword i,j;
    for(i=0, j=1; j < n_rows; i+=2, j+=2)  { val1 += P.at(i,col,slice); val2 += P.at(j,col,slice); }
    
    if(i < n_rows)  { val1 += P.at(i,col,slice); }
    }
  
  return (val1 + val2);
  }
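// NOTE: illustrative usage with submatrices and cubes (standard Armadillo user-facing
// NOTE: API assumed; the objects below are examples only):
//
//   mat  A(10, 10, fill::randu);
//   cube C(4, 5, 6, fill::randu);
//
//   double s1 = accu( A.col(2) );            // handled by the subview_col overload above
//   double s2 = accu( A.submat(1,1, 5,5) );  // handled by the subview overload above
//   double s3 = accu(C);                     // handled by the cube overloads below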
//! accumulate the elements of a cube
template<typename T1>
arma_warn_unused
arma_hot
inline
typename T1::elem_type
accu(const BaseCube<typename T1::elem_type,T1>& X)
  {
  arma_extra_debug_sigprint();
  
  const ProxyCube<T1> P(X.get_ref());
  
  if(is_Cube<typename ProxyCube<T1>::stored_type>::value)
    {
    unwrap_cube<typename ProxyCube<T1>::stored_type> tmp(P.Q);
    
    return arrayops::accumulate(tmp.M.memptr(), tmp.M.n_elem);
    }
  
  return (ProxyCube<T1>::use_at) ? accu_cube_proxy_at(P) : accu_cube_proxy_linear(P);
  }



//! explicit handling of multiply-and-accumulate (cube version)
template<typename T1, typename T2>
arma_warn_unused
inline
typename T1::elem_type
accu(const eGlueCube<T1,T2,eglue_schur>& expr)
  {
  arma_extra_debug_sigprint();
  
  typedef eGlueCube<T1,T2,eglue_schur> expr_type;
  
  typedef typename ProxyCube<T1>::stored_type P1_stored_type;
  typedef typename ProxyCube<T2>::stored_type P2_stored_type;
  
  if(is_Cube<P1_stored_type>::value && is_Cube<P2_stored_type>::value)
    {
    const unwrap_cube<P1_stored_type> tmp1(expr.P1.Q);
    const unwrap_cube<P2_stored_type> tmp2(expr.P2.Q);
    
    return op_dot::direct_dot(tmp1.M.n_elem, tmp1.M.memptr(), tmp2.M.memptr());
    }
  
  const ProxyCube<expr_type> P(expr);
  
  return (ProxyCube<expr_type>::use_at) ? accu_cube_proxy_at(P) : accu_cube_proxy_linear(P);
  }



//



template<typename T>
arma_warn_unused
inline
typename arma_scalar_only<T>::result
accu(const T& x)
  {
  return x;
  }



//! accumulate values in a sparse object
template<typename T1>
arma_warn_unused
inline
typename T1::elem_type
accu(const SpBase<typename T1::elem_type,T1>& expr)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  const SpProxy<T1> P(expr.get_ref());
  
  const uword N = P.get_n_nonzero();
  
  if(N == 0)  { return eT(0); }
  
  if(SpProxy<T1>::use_iterator == false)
    {
    // direct counting
    return arrayops::accumulate(P.get_values(), N);
    }
  
  if(is_SpSubview<typename SpProxy<T1>::stored_type>::value)
    {
    const SpSubview<eT>& sv = reinterpret_cast< const SpSubview<eT>& >(P.Q);
    
    if(sv.n_rows == sv.m.n_rows)
      {
      const SpMat<eT>& m   = sv.m;
      const uword      col = sv.aux_col1;
      
      return arrayops::accumulate(&(m.values[ m.col_ptrs[col] ]), N);
      }
    }
  
  typename SpProxy<T1>::const_iterator_type it = P.begin();
  
  eT val = eT(0);
  
  for(uword i=0; i < N; ++i)  { val += (*it); ++it; }
  
  return val;
  }



//! explicit handling of accu(A + B), where A and B are sparse matrices
template<typename T1, typename T2>
arma_warn_unused
inline
typename T1::elem_type
accu(const SpGlue<T1,T2,spglue_plus>& expr)
  {
  arma_extra_debug_sigprint();
  
  const unwrap_spmat<T1> UA(expr.A);
  const unwrap_spmat<T2> UB(expr.B);
  
  arma_debug_assert_same_size(UA.M.n_rows, UA.M.n_cols, UB.M.n_rows, UB.M.n_cols, "addition");
  
  return (accu(UA.M) + accu(UB.M));
  }



//! explicit handling of accu(A - B), where A and B are sparse matrices
template<typename T1, typename T2>
arma_warn_unused
inline
typename T1::elem_type
accu(const SpGlue<T1,T2,spglue_minus>& expr)
  {
  arma_extra_debug_sigprint();
  
  const unwrap_spmat<T1> UA(expr.A);
  const unwrap_spmat<T2> UB(expr.B);
  
  arma_debug_assert_same_size(UA.M.n_rows, UA.M.n_cols, UB.M.n_rows, UB.M.n_cols, "subtraction");
  
  return (accu(UA.M) - accu(UB.M));
  }
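// NOTE: illustrative usage with sparse matrices (standard Armadillo user-facing API
// NOTE: assumed; the matrices below are examples only):
//
//   sp_mat S(1000, 1000);
//   S(3,4) = 1.5;
//   S(7,9) = 2.5;
//
//   double s1 = accu(S);      // sum of all elements; only stored non-zero values are visited
//   double s2 = accu(S + S);  // handled above as accu(S) + accu(S), so the
//                             // intermediate sparse matrix is never formed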
//! explicit handling of accu(A % B), where A and B are sparse matrices
template<typename T1, typename T2>
arma_warn_unused
inline
typename T1::elem_type
accu(const SpGlue<T1,T2,spglue_schur>& expr)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  const SpProxy<T1> px(expr.A);
  const SpProxy<T2> py(expr.B);
  
  typename SpProxy<T1>::const_iterator_type x_it     = px.begin();
  typename SpProxy<T1>::const_iterator_type x_it_end = px.end();
  
  typename SpProxy<T2>::const_iterator_type y_it     = py.begin();
  typename SpProxy<T2>::const_iterator_type y_it_end = py.end();
  
  eT acc = eT(0);
  
  while( (x_it != x_it_end) || (y_it != y_it_end) )
    {
    if(x_it == y_it)
      {
      acc += ((*x_it) * (*y_it));
      
      ++x_it;
      ++y_it;
      }
    else
      {
      const uword x_it_col = x_it.col();
      const uword x_it_row = x_it.row();
      
      const uword y_it_col = y_it.col();
      const uword y_it_row = y_it.row();
      
      if((x_it_col < y_it_col) || ((x_it_col == y_it_col) && (x_it_row < y_it_row))) // if y is closer to the end
        {
        ++x_it;
        }
      else // x is closer to the end
        {
        ++y_it;
        }
      }
    }
  
  return acc;
  }



template<typename T1, typename spop_type>
arma_warn_unused
inline
typename T1::elem_type
accu(const SpOp<T1, spop_type>& expr)
  {
  arma_extra_debug_sigprint();
  
  typedef typename T1::elem_type eT;
  
  const bool is_vectorise = \
       (is_same_type<spop_type, spop_vectorise_row>::yes)
    || (is_same_type<spop_type, spop_vectorise_col>::yes)
    || (is_same_type<spop_type, spop_vectorise_all>::yes);
  
  if(is_vectorise)
    {
    return accu(expr.m);
    }
  
  const SpMat<eT> tmp = expr;
  
  return accu(tmp);
  }



//! @}