// **************************************************************************
//
//    PARALUTION   www.paralution.com
//
//    Copyright (C) 2015  PARALUTION Labs UG (haftungsbeschränkt) & Co. KG
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRA 706051
//                        Vertreten durch:
//                        PARALUTION Labs Verwaltungs UG (haftungsbeschränkt)
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRB 721277
//                        Geschäftsführer: Dimitar Lukarski, Nico Trost
//
//    This program is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    This program is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
// **************************************************************************


// PARALUTION version 1.1.0


#include "../../utils/def.hpp"
#include "../../utils/log.hpp"
#include "../../utils/allocate_free.hpp"
#include "kernels_ocl.hpp"
#include "ocl_allocate_free.hpp"
#include "ocl_utils.hpp"
#include "ocl_matrix_csr.hpp"
#include "ocl_matrix_coo.hpp"
#include "ocl_matrix_ell.hpp"
#include "ocl_matrix_hyb.hpp"
#include "ocl_vector.hpp"
#include "../host/host_matrix_hyb.hpp"
#include "../backend_manager.hpp"

#include <algorithm>

namespace paralution {

template <typename ValueType>
OCLAcceleratorMatrixHYB<ValueType>::OCLAcceleratorMatrixHYB() {

  // no default constructor
  LOG_INFO("no default constructor");
  FATAL_ERROR(__FILE__, __LINE__);

}

template <typename ValueType>
OCLAcceleratorMatrixHYB<ValueType>::OCLAcceleratorMatrixHYB(const Paralution_Backend_Descriptor local_backend) {

  LOG_DEBUG(this, "OCLAcceleratorMatrixHYB::OCLAcceleratorMatrixHYB()",
            "constructor with local_backend");

  this->mat_.ELL.val = NULL;
  this->mat_.ELL.col = NULL;
  this->mat_.ELL.max_row = 0;

  this->mat_.COO.row = NULL;
  this->mat_.COO.col = NULL;
  this->mat_.COO.val = NULL;

  this->ell_nnz_ = 0;
  this->coo_nnz_ = 0;

  this->set_backend(local_backend);

}

template <typename ValueType>
OCLAcceleratorMatrixHYB<ValueType>::~OCLAcceleratorMatrixHYB() {

  LOG_DEBUG(this, "OCLAcceleratorMatrixHYB::~OCLAcceleratorMatrixHYB()",
            "destructor");

  this->Clear();

}

template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::info(void) const {

  LOG_INFO("OCLAcceleratorMatrixHYB<ValueType>");

}
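// --------------------------------------------------------------------------
// Note on the HYB (hybrid) layout handled by this class: the matrix is split
// into a regular ELL part of width max_row, stored as a padded nrow x max_row
// slab (hence ell_nnz == max_row * nrow), and a COO part holding the overflow
// entries of rows longer than max_row. Illustrative sketch (not backend code):
//
//       [ 1 2 0 ]                 row 0: 2 entries -> both fit in ELL
//   A = [ 3 0 0 ]   max_row = 2   row 1: 1 entry   -> ELL, one padded slot
//       [ 4 5 6 ]                 row 2: 3 entries -> 2 in ELL, 1 spills
//                                        into COO.{row,col,val}
// --------------------------------------------------------------------------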
template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::AllocateHYB(const int ell_nnz, const int coo_nnz, const int ell_max_row,
                                                     const int nrow, const int ncol) {

  assert (ell_nnz >= 0);
  assert (coo_nnz >= 0);
  assert (ell_max_row >= 0);

  assert (ncol >= 0);
  assert (nrow >= 0);

  if (this->nnz_ > 0)
    this->Clear();

  if (ell_nnz + coo_nnz > 0) {

    // ELL
    assert (ell_nnz == ell_max_row * nrow);

    allocate_ocl(ell_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.ELL.col);
    allocate_ocl(ell_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.ELL.val);

    ocl_set_to(ell_nnz, (int) 0, this->mat_.ELL.col, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);
    ocl_set_to(ell_nnz, (ValueType) 0, this->mat_.ELL.val, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    this->mat_.ELL.max_row = ell_max_row;
    this->ell_nnz_ = ell_nnz;

    // COO
    allocate_ocl(coo_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.COO.row);
    allocate_ocl(coo_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.COO.col);
    allocate_ocl(coo_nnz, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &this->mat_.COO.val);

    ocl_set_to(coo_nnz, (int) 0, this->mat_.COO.row, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);
    ocl_set_to(coo_nnz, (int) 0, this->mat_.COO.col, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);
    ocl_set_to(coo_nnz, (ValueType) 0, this->mat_.COO.val, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    this->nrow_ = nrow;
    this->ncol_ = ncol;
    this->coo_nnz_ = coo_nnz;
    this->nnz_ = ell_nnz + coo_nnz;

  }

}

template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::Clear(void) {

  if (this->nnz_ > 0) {

    free_ocl(&this->mat_.COO.row);
    free_ocl(&this->mat_.COO.col);
    free_ocl(&this->mat_.COO.val);

    free_ocl(&this->mat_.ELL.val);
    free_ocl(&this->mat_.ELL.col);

    this->ell_nnz_ = 0;
    this->coo_nnz_ = 0;
    this->mat_.ELL.max_row = 0;

    this->nrow_ = 0;
    this->ncol_ = 0;
    this->nnz_ = 0;

  }

}

template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::CopyFromHost(const HostMatrix<ValueType> &src) {

  assert (&src != NULL);

  const HostMatrixHYB<ValueType> *cast_mat;

  // copy only in the same format
  assert (this->get_mat_format() == src.get_mat_format());

  // CPU to OCL copy
  if ((cast_mat = dynamic_cast<const HostMatrixHYB<ValueType>*> (&src)) != NULL) {

    if (this->nnz_ == 0)
      this->AllocateHYB(cast_mat->ell_nnz_, cast_mat->coo_nnz_, cast_mat->mat_.ELL.max_row,
                        cast_mat->nrow_, cast_mat->ncol_);

    assert (this->nnz_ == cast_mat->nnz_);
    assert (this->nrow_ == cast_mat->nrow_);
    assert (this->ncol_ == cast_mat->ncol_);

    if (this->ell_nnz_ > 0) {

      // ELL
      ocl_host2dev(this->ell_nnz_,         // size
                   cast_mat->mat_.ELL.col, // src
                   this->mat_.ELL.col,     // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_host2dev(this->ell_nnz_,         // size
                   cast_mat->mat_.ELL.val, // src
                   this->mat_.ELL.val,     // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

    if (this->coo_nnz_ > 0) {

      // COO
      ocl_host2dev(this->coo_nnz_,         // size
                   cast_mat->mat_.COO.row, // src
                   this->mat_.COO.row,     // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_host2dev(this->coo_nnz_,         // size
                   cast_mat->mat_.COO.col, // src
                   this->mat_.COO.col,     // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_host2dev(this->coo_nnz_,         // size
                   cast_mat->mat_.COO.val, // src
                   this->mat_.COO.val,     // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    LOG_INFO("Error unsupported OCL matrix type");
    this->info();
    src.info();
    FATAL_ERROR(__FILE__, __LINE__);

  }

}
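// --------------------------------------------------------------------------
// Note: the ELL and COO segments are transferred separately, sized in
// elements by ell_nnz_ and coo_nnz_; assuming ocl_host2dev()/ocl_dev2host()
// wrap blocking clEnqueueWriteBuffer()/clEnqueueReadBuffer() calls, each
// copy has completed by the time the function returns.
// --------------------------------------------------------------------------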
template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::CopyToHost(HostMatrix<ValueType> *dst) const {

  assert (dst != NULL);

  HostMatrixHYB<ValueType> *cast_mat;

  // copy only in the same format
  assert (this->get_mat_format() == dst->get_mat_format());

  // OCL to CPU copy
  if ((cast_mat = dynamic_cast<HostMatrixHYB<ValueType>*> (dst)) != NULL) {

    cast_mat->set_backend(this->local_backend_);

    if (cast_mat->nnz_ == 0)
      cast_mat->AllocateHYB(this->ell_nnz_, this->coo_nnz_, this->mat_.ELL.max_row,
                            this->nrow_, this->ncol_);

    assert (this->nnz_ == cast_mat->nnz_);
    assert (this->nrow_ == cast_mat->nrow_);
    assert (this->ncol_ == cast_mat->ncol_);

    if (this->ell_nnz_ > 0) {

      // ELL
      ocl_dev2host(this->ell_nnz_,         // size
                   this->mat_.ELL.col,     // src
                   cast_mat->mat_.ELL.col, // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2host(this->ell_nnz_,         // size
                   this->mat_.ELL.val,     // src
                   cast_mat->mat_.ELL.val, // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

    if (this->coo_nnz_ > 0) {

      // COO
      ocl_dev2host(this->coo_nnz_,         // size
                   this->mat_.COO.row,     // src
                   cast_mat->mat_.COO.row, // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2host(this->coo_nnz_,         // size
                   this->mat_.COO.col,     // src
                   cast_mat->mat_.COO.col, // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2host(this->coo_nnz_,         // size
                   this->mat_.COO.val,     // src
                   cast_mat->mat_.COO.val, // dst
                   OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    LOG_INFO("Error unsupported OCL matrix type");
    this->info();
    dst->info();
    FATAL_ERROR(__FILE__, __LINE__);

  }

}

template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::CopyFrom(const BaseMatrix<ValueType> &src) {

  assert (&src != NULL);

  const OCLAcceleratorMatrixHYB<ValueType> *ocl_cast_mat;
  const HostMatrix<ValueType> *host_cast_mat;

  // copy only in the same format
  assert (this->get_mat_format() == src.get_mat_format());

  // OCL to OCL copy
  if ((ocl_cast_mat = dynamic_cast<const OCLAcceleratorMatrixHYB<ValueType>*> (&src)) != NULL) {

    if (this->nnz_ == 0)
      this->AllocateHYB(ocl_cast_mat->ell_nnz_, ocl_cast_mat->coo_nnz_, ocl_cast_mat->mat_.ELL.max_row,
                        ocl_cast_mat->nrow_, ocl_cast_mat->ncol_);

    assert (this->nnz_ == ocl_cast_mat->nnz_);
    assert (this->nrow_ == ocl_cast_mat->nrow_);
    assert (this->ncol_ == ocl_cast_mat->ncol_);

    if (this->ell_nnz_ > 0) {

      // ELL
      // must be within same opencl context
      ocl_dev2dev(this->ell_nnz_,             // size
                  ocl_cast_mat->mat_.ELL.col, // src
                  this->mat_.ELL.col,         // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev(this->ell_nnz_,             // size
                  ocl_cast_mat->mat_.ELL.val, // src
                  this->mat_.ELL.val,         // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

    if (this->coo_nnz_ > 0) {

      // COO
      // must be within same opencl context
      ocl_dev2dev(this->coo_nnz_,             // size
                  ocl_cast_mat->mat_.COO.row, // src
                  this->mat_.COO.row,         // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev(this->coo_nnz_,             // size
                  ocl_cast_mat->mat_.COO.col, // src
                  this->mat_.COO.col,         // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev(this->coo_nnz_,             // size
                  ocl_cast_mat->mat_.COO.val, // src
                  this->mat_.COO.val,         // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    // CPU to OCL
    if ((host_cast_mat = dynamic_cast<const HostMatrix<ValueType>*> (&src)) != NULL) {

      this->CopyFromHost(*host_cast_mat);

    } else {

      LOG_INFO("Error unsupported OCL matrix type");
      this->info();
      src.info();
      FATAL_ERROR(__FILE__, __LINE__);

    }

  }

}
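// --------------------------------------------------------------------------
// Note: ocl_dev2dev() in CopyFrom() above and CopyTo() below presumably wraps
// clEnqueueCopyBuffer(), which requires source and destination buffers to
// belong to the same cl_context (hence the "must be within same opencl
// context" comments); copies across contexts would need host staging.
// --------------------------------------------------------------------------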
template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::CopyTo(BaseMatrix<ValueType> *dst) const {

  assert (dst != NULL);

  OCLAcceleratorMatrixHYB<ValueType> *ocl_cast_mat;
  HostMatrix<ValueType> *host_cast_mat;

  // copy only in the same format
  assert (this->get_mat_format() == dst->get_mat_format());

  // OCL to OCL copy
  if ((ocl_cast_mat = dynamic_cast<OCLAcceleratorMatrixHYB<ValueType>*> (dst)) != NULL) {

    ocl_cast_mat->set_backend(this->local_backend_);

    if (ocl_cast_mat->nnz_ == 0)
      ocl_cast_mat->AllocateHYB(this->ell_nnz_, this->coo_nnz_, this->mat_.ELL.max_row,
                                this->nrow_, this->ncol_);

    assert (this->nnz_ == ocl_cast_mat->nnz_);
    assert (this->nrow_ == ocl_cast_mat->nrow_);
    assert (this->ncol_ == ocl_cast_mat->ncol_);

    if (this->ell_nnz_ > 0) {

      // ELL
      // must be within same opencl context
      ocl_dev2dev(this->ell_nnz_,             // size
                  this->mat_.ELL.col,         // src
                  ocl_cast_mat->mat_.ELL.col, // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev(this->ell_nnz_,             // size
                  this->mat_.ELL.val,         // src
                  ocl_cast_mat->mat_.ELL.val, // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

    if (this->coo_nnz_ > 0) {

      // COO
      // must be within same opencl context
      ocl_dev2dev(this->coo_nnz_,             // size
                  this->mat_.COO.row,         // src
                  ocl_cast_mat->mat_.COO.row, // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev(this->coo_nnz_,             // size
                  this->mat_.COO.col,         // src
                  ocl_cast_mat->mat_.COO.col, // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

      ocl_dev2dev(this->coo_nnz_,             // size
                  this->mat_.COO.val,         // src
                  ocl_cast_mat->mat_.COO.val, // dst
                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    }

  } else {

    // OCL to CPU
    if ((host_cast_mat = dynamic_cast<HostMatrix<ValueType>*> (dst)) != NULL) {

      this->CopyToHost(host_cast_mat);

    } else {

      LOG_INFO("Error unsupported OCL matrix type");
      this->info();
      dst->info();
      FATAL_ERROR(__FILE__, __LINE__);

    }

  }

}

template <typename ValueType>
bool OCLAcceleratorMatrixHYB<ValueType>::ConvertFrom(const BaseMatrix<ValueType> &mat) {

  assert (&mat != NULL);

  this->Clear();

  // empty matrix is empty matrix
  if (mat.get_nnz() == 0)
    return true;

  const OCLAcceleratorMatrixHYB<ValueType> *cast_mat_hyb;

  if ((cast_mat_hyb = dynamic_cast<const OCLAcceleratorMatrixHYB<ValueType>*> (&mat)) != NULL) {

    this->CopyFrom(*cast_mat_hyb);
    return true;

  }

  const OCLAcceleratorMatrixCSR<ValueType> *cast_mat_csr;

  if ((cast_mat_csr = dynamic_cast<const OCLAcceleratorMatrixCSR<ValueType>*> (&mat)) != NULL) {

    this->Clear();

    assert (cast_mat_csr->nrow_ > 0);
    assert (cast_mat_csr->ncol_ > 0);
    assert (cast_mat_csr->nnz_ > 0);

    int nrow = cast_mat_csr->nrow_;
    int ncol = cast_mat_csr->ncol_;
    int max_row = cast_mat_csr->nnz_ / nrow;
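    // Note: max_row = nnz / nrow caps the ELL width at the (truncated)
    // average number of nonzeros per row; this bounds ELL padding, at the
    // cost of spilling every above-average row's overflow into the COO part.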
    // get nnz per row for COO part
    int *nnz_coo = NULL;
    allocate_ocl(nrow, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &nnz_coo);

    size_t LocalSize = this->local_backend_.OCL_max_work_group_size;
    size_t GlobalSize = (nrow / LocalSize + 1) * LocalSize;

    cl_int err = ocl_kernel<ValueType>(CL_KERNEL_ELL_NNZ_COO,
                                       OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                       LocalSize, GlobalSize,
                                       nrow, max_row, cast_mat_csr->mat_.row_offset, nnz_coo);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // TODO fix int kernel for reduce
    int *hostBuffer = NULL;
    allocate_host(nrow, &hostBuffer);
    ocl_dev2host(nrow, nnz_coo, hostBuffer, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);
    int num_nnz_coo = 0;
    for (int i=0; i<nrow; ++i)
      num_nnz_coo += hostBuffer[i];
    free_host(&hostBuffer);
/*
    // get nnz for COO part by summing up nnz per row array
    int reducesize = (int) this->local_backend_.OCL_computeUnits * 4;
    int *deviceBuffer = NULL;
    int *hostBuffer = NULL;

    allocate_ocl(reducesize, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &deviceBuffer);

    GlobalSize = reducesize * LocalSize;

    int GROUP_SIZE = ((nrow / reducesize + 1) / (int) LocalSize + 1) * (int) LocalSize;
    int LOCAL_SIZE = GROUP_SIZE / (int) LocalSize;

    err = ocl_kernel<int>(CL_KERNEL_REDUCE,
                          OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                          LocalSize, GlobalSize,
                          nrow, nnz_coo, deviceBuffer, GROUP_SIZE, LOCAL_SIZE);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    allocate_host(reducesize, &hostBuffer);
    ocl_dev2host(reducesize, deviceBuffer, hostBuffer, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);
    free_ocl(&deviceBuffer);

    int num_nnz_coo = 0;
    for (int i=0; i<reducesize; ++i)
      num_nnz_coo += hostBuffer[i];

    free_host(&hostBuffer);
*/
    // END TODO

    // allocate ELL and COO matrices
    int num_nnz_ell = max_row * nrow;

    if (num_nnz_ell <= 0 || num_nnz_coo <= 0) {
      free_ocl(&nnz_coo);
      return false;
    }

    this->AllocateHYB(num_nnz_ell, num_nnz_coo, max_row, nrow, ncol);

    ocl_set_to(num_nnz_ell, (int) -1, this->mat_.ELL.col, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    // copy up to num_cols_per_row values of row i into the ELL
    int *nnz_ell = NULL;

    allocate_ocl(nrow, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &nnz_ell);

    GlobalSize = (nrow / LocalSize + 1) * LocalSize;

    err = ocl_kernel<ValueType>(CL_KERNEL_ELL_FILL_ELL,
                                OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                LocalSize, GlobalSize,
                                nrow, max_row,
                                cast_mat_csr->mat_.row_offset, cast_mat_csr->mat_.col, cast_mat_csr->mat_.val,
                                this->mat_.ELL.col, this->mat_.ELL.val, nnz_ell);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    // TODO currently performing partial sum on host
    allocate_host(nrow, &hostBuffer);
    ocl_dev2host(nrow, nnz_ell, hostBuffer, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);

    for (int i=1; i<nrow; ++i)
      hostBuffer[i] += hostBuffer[i-1];

    ocl_host2dev(nrow, hostBuffer, nnz_ell, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue);
    free_host(&hostBuffer);
    // end TODO

    // copy any remaining values in row i into the COO
    err = ocl_kernel<ValueType>(CL_KERNEL_ELL_FILL_COO,
                                OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                LocalSize, GlobalSize,
                                nrow,
                                cast_mat_csr->mat_.row_offset, cast_mat_csr->mat_.col, cast_mat_csr->mat_.val,
                                nnz_coo, nnz_ell,
                                this->mat_.COO.row, this->mat_.COO.col, this->mat_.COO.val);
    CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    free_ocl(&nnz_ell);
    free_ocl(&nnz_coo);

    this->nrow_ = cast_mat_csr->nrow_;
    this->ncol_ = cast_mat_csr->ncol_;
    this->nnz_  = num_nnz_ell + num_nnz_coo;
    this->mat_.ELL.max_row = max_row;
    this->ell_nnz_ = num_nnz_ell;
    this->coo_nnz_ = num_nnz_coo;

    return true;

  }

  return false;

}
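// --------------------------------------------------------------------------
// Worked example for the CSR -> HYB conversion above (host-side arithmetic,
// values chosen for illustration): for a 4x4 CSR matrix with
// row_offset = {0, 1, 4, 5, 6} (nnz = 6), max_row = 6 / 4 = 1, so the ELL
// part takes the first entry of every row (num_nnz_ell = 4) and the per-row
// COO counts are {0, 2, 0, 0}; the host-side sum gives num_nnz_coo = 2, and
// the two overflow entries of row 1 land in the COO arrays via
// CL_KERNEL_ELL_FILL_COO, using the host-computed prefix sums as offsets.
// --------------------------------------------------------------------------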
template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::Apply(const BaseVector<ValueType> &in, BaseVector<ValueType> *out) const {

  if (this->nnz_ > 0) {

    assert (&in != NULL);
    assert (out != NULL);

    assert (in.get_size() >= 0);
    assert (out->get_size() >= 0);
    assert (in.get_size() == this->ncol_);
    assert (out->get_size() == this->nrow_);

    const OCLAcceleratorVector<ValueType> *cast_in = dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in);
    OCLAcceleratorVector<ValueType> *cast_out = dynamic_cast<OCLAcceleratorVector<ValueType>*> (out);

    assert (cast_in  != NULL);
    assert (cast_out != NULL);

    // ELL
    if (this->ell_nnz_ > 0) {

      size_t LocalSize = this->local_backend_.OCL_max_work_group_size;
      size_t GlobalSize = (this->nrow_ / LocalSize + 1) * LocalSize;

      cl_int err = ocl_kernel<ValueType>(CL_KERNEL_ELL_SPMV,
                                         OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                         LocalSize, GlobalSize,
                                         this->nrow_, this->ncol_, this->mat_.ELL.max_row,
                                         this->mat_.ELL.col, this->mat_.ELL.val,
                                         cast_in->vec_, cast_out->vec_);
      CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    }

    // COO
    if (this->coo_nnz_ > 0) {

      // do not support super small matrices
      assert (this->coo_nnz_ > this->local_backend_.OCL_warp_size);

      // ----------------------------------------------------------
      // Modified and adapted from CUSP 0.3.1,
      // http://code.google.com/p/cusp-library/
      // NVIDIA, APACHE LICENSE 2.0
      // ----------------------------------------------------------
      // see __spmv_coo_flat(...)
      // ----------------------------------------------------------
      // CHANGELOG
      // - adapted interface
      // ----------------------------------------------------------

      const int BLOCK_SIZE = int(this->local_backend_.OCL_max_work_group_size);
      //  const unsigned int MAX_BLOCKS = this->local_backend_.GPU_max_blocks;
      const unsigned int MAX_BLOCKS = 32; // cusp::detail::device::arch::max_active_blocks(spmv_coo_flat_kernel<IndexType, ValueType, BLOCK_SIZE, UseCache>, BLOCK_SIZE, (size_t) 0);

      const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / this->local_backend_.OCL_warp_size;
      const unsigned int num_units  = this->coo_nnz_ / this->local_backend_.OCL_warp_size;
      const unsigned int num_warps  = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);
      const unsigned int num_blocks = (num_warps + (WARPS_PER_BLOCK-1)) / WARPS_PER_BLOCK; // (N + (granularity - 1)) / granularity
      const unsigned int num_iters  = (num_units + (num_warps-1)) / num_warps;

      const unsigned int interval_size = this->local_backend_.OCL_warp_size * num_iters;
      // do the last few nonzeros separately (fewer than OCL_warp_size elements)
      const int tail = num_units * this->local_backend_.OCL_warp_size;
      const unsigned int active_warps = (interval_size == 0) ? 0 : ((tail + (interval_size-1)) / interval_size);
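      // Numeric sketch of the sizing above (illustrative, assuming
      // OCL_warp_size = 32 and BLOCK_SIZE = 256): for coo_nnz_ = 10000,
      // num_units = 312, WARPS_PER_BLOCK = 8, num_warps = min(312, 256) = 256,
      // num_blocks = 32, num_iters = 2, interval_size = 64,
      // tail = 312 * 32 = 9984 (the remaining 16 nonzeros go to the serial
      // kernel below), and active_warps = 9984 / 64 = 156.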
      int *temp_rows = NULL;
      ValueType *temp_vals = NULL;

      allocate_ocl(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_rows);
      allocate_ocl(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_vals);

      cl_int err;
      size_t LocalSize  = this->local_backend_.OCL_max_work_group_size;
      size_t GlobalSize = num_blocks * LocalSize;

      ValueType scalar = (ValueType) 1;

      err = ocl_kernel<ValueType>(CL_KERNEL_COO_SPMV_FLAT,
                                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  LocalSize, GlobalSize,
                                  tail, interval_size,
                                  this->mat_.COO.row, this->mat_.COO.col, this->mat_.COO.val,
                                  scalar, cast_in->vec_, cast_out->vec_,
                                  temp_rows, temp_vals);
      CHECK_OCL_ERROR(err, __FILE__, __LINE__);

      err = ocl_kernel<ValueType>(CL_KERNEL_COO_SPMV_REDUCE_UPDATE,
                                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  LocalSize, LocalSize,
                                  active_warps, temp_rows, temp_vals, cast_out->vec_);
      CHECK_OCL_ERROR(err, __FILE__, __LINE__);

      free_ocl(&temp_rows);
      free_ocl(&temp_vals);

      err = ocl_kernel<ValueType>(CL_KERNEL_COO_SPMV_SERIAL,
                                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  1, 1,
                                  this->coo_nnz_,
                                  this->mat_.COO.row, this->mat_.COO.col, this->mat_.COO.val,
                                  scalar, cast_in->vec_, cast_out->vec_, tail);
      CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    }

  }

}
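// --------------------------------------------------------------------------
// ApplyAdd() below computes out := out + scalar * A * in; it mirrors Apply()
// except that the ELL part uses CL_KERNEL_ELL_ADD_SPMV and the COO kernels
// receive the user-supplied scalar instead of the constant 1 used in Apply().
// --------------------------------------------------------------------------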
template <typename ValueType>
void OCLAcceleratorMatrixHYB<ValueType>::ApplyAdd(const BaseVector<ValueType> &in, const ValueType scalar,
                                                  BaseVector<ValueType> *out) const {

  if (this->nnz_ > 0) {

    assert (&in != NULL);
    assert (out != NULL);

    assert (in.get_size() >= 0);
    assert (out->get_size() >= 0);
    assert (in.get_size() == this->ncol_);
    assert (out->get_size() == this->nrow_);

    const OCLAcceleratorVector<ValueType> *cast_in = dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in);
    OCLAcceleratorVector<ValueType> *cast_out = dynamic_cast<OCLAcceleratorVector<ValueType>*> (out);

    assert (cast_in  != NULL);
    assert (cast_out != NULL);

    // ELL
    if (this->ell_nnz_ > 0) {

      size_t LocalSize = this->local_backend_.OCL_max_work_group_size;
      size_t GlobalSize = (this->nrow_ / LocalSize + 1) * LocalSize;

      cl_int err = ocl_kernel<ValueType>(CL_KERNEL_ELL_ADD_SPMV,
                                         OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                         LocalSize, GlobalSize,
                                         this->nrow_, this->ncol_, this->mat_.ELL.max_row,
                                         this->mat_.ELL.col, this->mat_.ELL.val,
                                         scalar, cast_in->vec_, cast_out->vec_);
      CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    }

    // COO
    if (this->coo_nnz_ > 0) {

      // do not support super small matrices
      assert (this->coo_nnz_ > this->local_backend_.OCL_warp_size);

      // ----------------------------------------------------------
      // Modified and adapted from CUSP 0.3.1,
      // http://code.google.com/p/cusp-library/
      // NVIDIA, APACHE LICENSE 2.0
      // ----------------------------------------------------------
      // see __spmv_coo_flat(...)
      // ----------------------------------------------------------
      // CHANGELOG
      // - adapted interface
      // ----------------------------------------------------------

      const int BLOCK_SIZE = int(this->local_backend_.OCL_max_work_group_size);
      //  const unsigned int MAX_BLOCKS = this->local_backend_.GPU_max_blocks;
      const unsigned int MAX_BLOCKS = 32; // cusp::detail::device::arch::max_active_blocks(spmv_coo_flat_kernel<IndexType, ValueType, BLOCK_SIZE, UseCache>, BLOCK_SIZE, (size_t) 0);

      const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / this->local_backend_.OCL_warp_size;
      const unsigned int num_units  = this->coo_nnz_ / this->local_backend_.OCL_warp_size;
      const unsigned int num_warps  = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);
      const unsigned int num_blocks = (num_warps + (WARPS_PER_BLOCK-1)) / WARPS_PER_BLOCK; // (N + (granularity - 1)) / granularity
      const unsigned int num_iters  = (num_units + (num_warps-1)) / num_warps;

      const unsigned int interval_size = this->local_backend_.OCL_warp_size * num_iters;
      // do the last few nonzeros separately (fewer than OCL_warp_size elements)
      const int tail = num_units * this->local_backend_.OCL_warp_size;
      const unsigned int active_warps = (interval_size == 0) ? 0 : ((tail + (interval_size-1)) / interval_size);

      int *temp_rows = NULL;
      ValueType *temp_vals = NULL;

      allocate_ocl(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_rows);
      allocate_ocl(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_vals);

      cl_int err;
      size_t LocalSize  = this->local_backend_.OCL_max_work_group_size;
      size_t GlobalSize = num_blocks * LocalSize;

      err = ocl_kernel<ValueType>(CL_KERNEL_COO_SPMV_FLAT,
                                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  LocalSize, GlobalSize,
                                  tail, interval_size,
                                  this->mat_.COO.row, this->mat_.COO.col, this->mat_.COO.val,
                                  scalar, cast_in->vec_, cast_out->vec_,
                                  temp_rows, temp_vals);
      CHECK_OCL_ERROR(err, __FILE__, __LINE__);

      err = ocl_kernel<ValueType>(CL_KERNEL_COO_SPMV_REDUCE_UPDATE,
                                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  LocalSize, LocalSize,
                                  active_warps, temp_rows, temp_vals, cast_out->vec_);
      CHECK_OCL_ERROR(err, __FILE__, __LINE__);

      free_ocl(&temp_rows);
      free_ocl(&temp_vals);

      err = ocl_kernel<ValueType>(CL_KERNEL_COO_SPMV_SERIAL,
                                  OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue,
                                  1, 1,
                                  this->coo_nnz_,
                                  this->mat_.COO.row, this->mat_.COO.col, this->mat_.COO.val,
                                  scalar, cast_in->vec_, cast_out->vec_, tail);
      CHECK_OCL_ERROR(err, __FILE__, __LINE__);

    }

  }

}

template class OCLAcceleratorMatrixHYB<double>;
template class OCLAcceleratorMatrixHYB<float>;

}
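// --------------------------------------------------------------------------
// Usage sketch (illustrative only, based on PARALUTION's public LocalMatrix
// API; initialization and error handling omitted): this accelerator class is
// normally reached through format conversion plus backend migration, e.g.
//
//   #include <paralution.hpp>
//
//   paralution::LocalMatrix<double> A;
//   A.ReadFileMTX("matrix.mtx"); // load a matrix from MatrixMarket
//   A.ConvertToHYB();            // switch storage to the ELL+COO hybrid
//   A.MoveToAccelerator();       // dispatches to OCLAcceleratorMatrixHYB
//                                // when the OpenCL backend is selected
// --------------------------------------------------------------------------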