// ************************************************************************** // // PARALUTION www.paralution.com // // Copyright (C) 2015 PARALUTION Labs UG (haftungsbeschränkt) & Co. KG // Am Hasensprung 6, 76571 Gaggenau // Handelsregister: Amtsgericht Mannheim, HRA 706051 // Vertreten durch: // PARALUTION Labs Verwaltungs UG (haftungsbeschränkt) // Am Hasensprung 6, 76571 Gaggenau // Handelsregister: Amtsgericht Mannheim, HRB 721277 // Geschäftsführer: Dimitar Lukarski, Nico Trost // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see <http://www.gnu.org/licenses/>. // // ************************************************************************** // PARALUTION version 1.1.0 #include "../../utils/def.hpp" #include "gpu_matrix_csr.hpp" #include "gpu_matrix_coo.hpp" #include "gpu_matrix_dia.hpp" #include "gpu_matrix_ell.hpp" #include "gpu_matrix_hyb.hpp" #include "gpu_matrix_mcsr.hpp" #include "gpu_matrix_bcsr.hpp" #include "gpu_matrix_dense.hpp" #include "gpu_vector.hpp" #include "../host/host_matrix_dense.hpp" #include "../base_matrix.hpp" #include "../base_vector.hpp" #include "../backend_manager.hpp" #include "../../utils/log.hpp" #include "gpu_utils.hpp" #include "cuda_kernels_general.hpp" #include "cuda_kernels_dense.hpp" #include "gpu_allocate_free.hpp" #include "../matrix_formats_ind.hpp" #include <cuda.h> namespace paralution { template <typename ValueType> GPUAcceleratorMatrixDENSE<ValueType>::GPUAcceleratorMatrixDENSE() { // no default constructors LOG_INFO("no default constructor"); FATAL_ERROR(__FILE__, __LINE__); } template <typename ValueType> GPUAcceleratorMatrixDENSE<ValueType>::GPUAcceleratorMatrixDENSE(const Paralution_Backend_Descriptor local_backend) { LOG_DEBUG(this, "GPUAcceleratorMatrixDENSE::GPUAcceleratorMatrixDENSE()", "constructor with local_backend"); this->mat_.val = NULL; this->set_backend(local_backend); CHECK_CUDA_ERROR(__FILE__, __LINE__); } template <typename ValueType> GPUAcceleratorMatrixDENSE<ValueType>::~GPUAcceleratorMatrixDENSE() { LOG_DEBUG(this, "GPUAcceleratorMatrixDENSE::~GPUAcceleratorMatrixDENSE()", "destructor"); this->Clear(); } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::info(void) const { LOG_INFO("GPUAcceleratorMatrixDENSE<ValueType>"); } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::AllocateDENSE(const int nrow, const int ncol) { assert( ncol >= 0); assert( nrow >= 0); if (this->get_nnz() > 0) this->Clear(); if (nrow*ncol > 0) { allocate_gpu(nrow*ncol, &this->mat_.val); set_to_zero_gpu(this->local_backend_.GPU_block_size, this->local_backend_.GPU_max_threads, nrow*ncol, mat_.val); this->nrow_ = nrow; this->ncol_ = ncol; this->nnz_ = nrow*ncol; } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::Clear() { if (this->get_nnz() > 0) { free_gpu(&this->mat_.val); this->nrow_ = 0; this->ncol_ = 0; this->nnz_ = 0; } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::SetDataPtrDENSE(ValueType **val, const int nrow, const int ncol) { assert(*val != NULL); assert(nrow > 0); assert(ncol > 0); this->Clear(); cudaDeviceSynchronize(); this->nrow_ = nrow; this->ncol_ = ncol; this->nnz_ = nrow*ncol; this->mat_.val = *val; } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::LeaveDataPtrDENSE(ValueType **val) { assert(this->nrow_ > 0); assert(this->ncol_ > 0); assert(this->nnz_ > 0); assert(this->nnz_ == this->nrow_*this->ncol_); cudaDeviceSynchronize(); *val = this->mat_.val; this->mat_.val = NULL; this->nrow_ = 0; this->ncol_ = 0; this->nnz_ = 0; } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::CopyFromHost(const HostMatrix<ValueType> &src) { const HostMatrixDENSE<ValueType> *cast_mat; // copy only in the same format assert(this->get_mat_format() == src.get_mat_format()); // CPU to GPU copy if ((cast_mat = dynamic_cast<const HostMatrixDENSE<ValueType>*> (&src)) != NULL) { if (this->get_nnz() == 0) this->AllocateDENSE(src.get_nrow(), src.get_ncol() ); assert(this->get_nnz() == src.get_nnz()); assert(this->get_nrow() == src.get_nrow()); assert(this->get_ncol() == src.get_ncol()); if (this->get_nnz() > 0) { cudaMemcpy(this->mat_.val, // dst cast_mat->mat_.val, // src this->get_nnz()*sizeof(ValueType), // size cudaMemcpyHostToDevice); CHECK_CUDA_ERROR(__FILE__, __LINE__); } } else { LOG_INFO("Error unsupported GPU matrix type"); this->info(); src.info(); FATAL_ERROR(__FILE__, __LINE__); } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::CopyToHost(HostMatrix<ValueType> *dst) const { HostMatrixDENSE<ValueType> *cast_mat; // copy only in the same format assert(this->get_mat_format() == dst->get_mat_format()); // GPU to CPU copy if ((cast_mat = dynamic_cast<HostMatrixDENSE<ValueType>*> (dst)) != NULL) { cast_mat->set_backend(this->local_backend_); if (dst->get_nnz() == 0) cast_mat->AllocateDENSE(this->get_nrow(), this->get_ncol() ); assert(this->get_nnz() == dst->get_nnz()); assert(this->get_nrow() == dst->get_nrow()); assert(this->get_ncol() == dst->get_ncol()); if (this->get_nnz() > 0) { cudaMemcpy(cast_mat->mat_.val, // dst this->mat_.val, // src this->get_nnz()*sizeof(ValueType), // size cudaMemcpyDeviceToHost); CHECK_CUDA_ERROR(__FILE__, __LINE__); } } else { LOG_INFO("Error unsupported GPU matrix type"); this->info(); dst->info(); FATAL_ERROR(__FILE__, __LINE__); } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::CopyFrom(const BaseMatrix<ValueType> &src) { const GPUAcceleratorMatrixDENSE<ValueType> *gpu_cast_mat; const HostMatrix<ValueType> *host_cast_mat; // copy only in the same format assert(this->get_mat_format() == src.get_mat_format()); // GPU to GPU copy if ((gpu_cast_mat = dynamic_cast<const GPUAcceleratorMatrixDENSE<ValueType>*> (&src)) != NULL) { if (this->get_nnz() == 0) this->AllocateDENSE(src.get_nrow(), src.get_ncol() ); assert(this->get_nnz() == src.get_nnz()); assert(this->get_nrow() == src.get_nrow()); assert(this->get_ncol() == src.get_ncol()); if (this->get_nnz() > 0) { cudaMemcpy(this->mat_.val, // dst gpu_cast_mat->mat_.val, // src this->get_nnz()*sizeof(ValueType), // size cudaMemcpyDeviceToDevice); CHECK_CUDA_ERROR(__FILE__, __LINE__); } } else { //CPU to GPU if ((host_cast_mat = dynamic_cast<const HostMatrix<ValueType>*> (&src)) != NULL) { this->CopyFromHost(*host_cast_mat); } else { LOG_INFO("Error unsupported GPU matrix type"); this->info(); src.info(); FATAL_ERROR(__FILE__, __LINE__); } } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::CopyTo(BaseMatrix<ValueType> *dst) const { GPUAcceleratorMatrixDENSE<ValueType> *gpu_cast_mat; HostMatrix<ValueType> *host_cast_mat; // copy only in the same format assert(this->get_mat_format() == dst->get_mat_format()); // GPU to GPU copy if ((gpu_cast_mat = dynamic_cast<GPUAcceleratorMatrixDENSE<ValueType>*> (dst)) != NULL) { gpu_cast_mat->set_backend(this->local_backend_); if (this->get_nnz() == 0) gpu_cast_mat->AllocateDENSE(dst->get_nrow(), dst->get_ncol() ); assert(this->get_nnz() == dst->get_nnz()); assert(this->get_nrow() == dst->get_nrow()); assert(this->get_ncol() == dst->get_ncol()); if (this->get_nnz() > 0) { cudaMemcpy(gpu_cast_mat->mat_.val, // dst this->mat_.val, // src this->get_nnz()*sizeof(ValueType), // size cudaMemcpyDeviceToHost); CHECK_CUDA_ERROR(__FILE__, __LINE__); } } else { //GPU to CPU if ((host_cast_mat = dynamic_cast<HostMatrix<ValueType>*> (dst)) != NULL) { this->CopyToHost(host_cast_mat); } else { LOG_INFO("Error unsupported GPU matrix type"); this->info(); dst->info(); FATAL_ERROR(__FILE__, __LINE__); } } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::CopyFromHostAsync(const HostMatrix<ValueType> &src) { const HostMatrixDENSE<ValueType> *cast_mat; // copy only in the same format assert(this->get_mat_format() == src.get_mat_format()); // CPU to GPU copy if ((cast_mat = dynamic_cast<const HostMatrixDENSE<ValueType>*> (&src)) != NULL) { if (this->get_nnz() == 0) this->AllocateDENSE(src.get_nrow(), src.get_ncol() ); assert(this->get_nnz() == src.get_nnz()); assert(this->get_nrow() == src.get_nrow()); assert(this->get_ncol() == src.get_ncol()); if (this->get_nnz() > 0) { cudaMemcpyAsync(this->mat_.val, // dst cast_mat->mat_.val, // src this->get_nnz()*sizeof(ValueType), // size cudaMemcpyHostToDevice); CHECK_CUDA_ERROR(__FILE__, __LINE__); } } else { LOG_INFO("Error unsupported GPU matrix type"); this->info(); src.info(); FATAL_ERROR(__FILE__, __LINE__); } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::CopyToHostAsync(HostMatrix<ValueType> *dst) const { HostMatrixDENSE<ValueType> *cast_mat; // copy only in the same format assert(this->get_mat_format() == dst->get_mat_format()); // GPU to CPU copy if ((cast_mat = dynamic_cast<HostMatrixDENSE<ValueType>*> (dst)) != NULL) { cast_mat->set_backend(this->local_backend_); if (dst->get_nnz() == 0) cast_mat->AllocateDENSE(this->get_nrow(), this->get_ncol() ); assert(this->get_nnz() == dst->get_nnz()); assert(this->get_nrow() == dst->get_nrow()); assert(this->get_ncol() == dst->get_ncol()); if (this->get_nnz() > 0) { cudaMemcpyAsync(cast_mat->mat_.val, // dst this->mat_.val, // src this->get_nnz()*sizeof(ValueType), // size cudaMemcpyDeviceToHost); CHECK_CUDA_ERROR(__FILE__, __LINE__); } } else { LOG_INFO("Error unsupported GPU matrix type"); this->info(); dst->info(); FATAL_ERROR(__FILE__, __LINE__); } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::CopyFromAsync(const BaseMatrix<ValueType> &src) { const GPUAcceleratorMatrixDENSE<ValueType> *gpu_cast_mat; const HostMatrix<ValueType> *host_cast_mat; // copy only in the same format assert(this->get_mat_format() == src.get_mat_format()); // GPU to GPU copy if ((gpu_cast_mat = dynamic_cast<const GPUAcceleratorMatrixDENSE<ValueType>*> (&src)) != NULL) { if (this->get_nnz() == 0) this->AllocateDENSE(src.get_nrow(), src.get_ncol() ); assert(this->get_nnz() == src.get_nnz()); assert(this->get_nrow() == src.get_nrow()); assert(this->get_ncol() == src.get_ncol()); if (this->get_nnz() > 0) { cudaMemcpy(this->mat_.val, // dst gpu_cast_mat->mat_.val, // src this->get_nnz()*sizeof(ValueType), // size cudaMemcpyDeviceToDevice); CHECK_CUDA_ERROR(__FILE__, __LINE__); } } else { //CPU to GPU if ((host_cast_mat = dynamic_cast<const HostMatrix<ValueType>*> (&src)) != NULL) { this->CopyFromHostAsync(*host_cast_mat); } else { LOG_INFO("Error unsupported GPU matrix type"); this->info(); src.info(); FATAL_ERROR(__FILE__, __LINE__); } } } template <typename ValueType> void GPUAcceleratorMatrixDENSE<ValueType>::CopyToAsync(BaseMatrix<ValueType> *dst) const { GPUAcceleratorMatrixDENSE<ValueType> *gpu_cast_mat; HostMatrix<ValueType> *host_cast_mat; // copy only in the same format assert(this->get_mat_format() == dst->get_mat_format()); // GPU to GPU copy if ((gpu_cast_mat = dynamic_cast<GPUAcceleratorMatrixDENSE<ValueType>*> (dst)) != NULL) { gpu_cast_mat->set_backend(this->local_backend_); if (this->get_nnz() == 0) gpu_cast_mat->AllocateDENSE(dst->get_nrow(), dst->get_ncol() ); assert(this->get_nnz() == dst->get_nnz()); assert(this->get_nrow() == dst->get_nrow()); assert(this->get_ncol() == dst->get_ncol()); if (this->get_nnz() > 0) { cudaMemcpy(gpu_cast_mat->mat_.val, // dst this->mat_.val, // src this->get_nnz()*sizeof(ValueType), // size cudaMemcpyDeviceToHost); CHECK_CUDA_ERROR(__FILE__, __LINE__); } } else { //GPU to CPU if ((host_cast_mat = dynamic_cast<HostMatrix<ValueType>*> (dst)) != NULL) { this->CopyToHostAsync(host_cast_mat); } else { LOG_INFO("Error unsupported GPU matrix type"); this->info(); dst->info(); FATAL_ERROR(__FILE__, __LINE__); } } } template <typename ValueType> bool GPUAcceleratorMatrixDENSE<ValueType>::ConvertFrom(const BaseMatrix<ValueType> &mat) { this->Clear(); // empty matrix is empty matrix if (mat.get_nnz() == 0) return true; const GPUAcceleratorMatrixDENSE<ValueType> *cast_mat_dense; if ((cast_mat_dense = dynamic_cast<const GPUAcceleratorMatrixDENSE<ValueType>*> (&mat)) != NULL) { this->CopyFrom(*cast_mat_dense); return true; } /* const GPUAcceleratorMatrixCSR<ValueType> *cast_mat_csr; if ((cast_mat_csr = dynamic_cast<const GPUAcceleratorMatrixCSR<ValueType>*> (&mat)) != NULL) { this->Clear(); FATAL_ERROR(__FILE__, __LINE__); this->nrow_ = cast_mat_csr->get_nrow(); this->ncol_ = cast_mat_csr->get_ncol(); this->nnz_ = cast_mat_csr->get_nnz(); return true; } */ return false; } template <> void GPUAcceleratorMatrixDENSE<double>::Apply(const BaseVector<double> &in, BaseVector<double> *out) const { if (this->get_nnz() > 0) { assert(in. get_size() >= 0); assert(out->get_size() >= 0); assert(in. get_size() == this->get_ncol()); assert(out->get_size() == this->get_nrow()); const GPUAcceleratorVector<double> *cast_in = dynamic_cast<const GPUAcceleratorVector<double>*> (&in); GPUAcceleratorVector<double> *cast_out = dynamic_cast< GPUAcceleratorVector<double>*> (out); assert(cast_in != NULL); assert(cast_out!= NULL); cublasStatus_t stat_t; const double alpha = double(1.0); const double beta = double(0.0); if (DENSE_IND_BASE == 0) { stat_t = cublasDgemv(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_N, this->get_nrow(), this->get_ncol(), &alpha, this->mat_.val, this->get_nrow(), cast_in->vec_, 1, &beta, cast_out->vec_, 1); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } else { stat_t = cublasDgemv(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_T, this->get_ncol(), this->get_nrow(), &alpha, this->mat_.val, this->get_ncol(), cast_in->vec_, 1, &beta, cast_out->vec_, 1); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } } } template <> void GPUAcceleratorMatrixDENSE<float>::Apply(const BaseVector<float> &in, BaseVector<float> *out) const { if (this->get_nnz() > 0) { assert(in. get_size() >= 0); assert(out->get_size() >= 0); assert(in. get_size() == this->get_ncol()); assert(out->get_size() == this->get_nrow()); const GPUAcceleratorVector<float> *cast_in = dynamic_cast<const GPUAcceleratorVector<float>*> (&in); GPUAcceleratorVector<float> *cast_out = dynamic_cast< GPUAcceleratorVector<float>*> (out); assert(cast_in != NULL); assert(cast_out!= NULL); cublasStatus_t stat_t; const float alpha = float(1.0); const float beta = float(0.0); if (DENSE_IND_BASE == 0) { stat_t = cublasSgemv(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_N, this->get_nrow(), this->get_ncol(), &alpha, this->mat_.val, this->get_nrow(), cast_in->vec_, 1, &beta, cast_out->vec_, 1); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } else { stat_t = cublasSgemv(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_T, this->get_ncol(), this->get_nrow(), &alpha, this->mat_.val, this->get_ncol(), cast_in->vec_, 1, &beta, cast_out->vec_, 1); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } } } template <> void GPUAcceleratorMatrixDENSE<double>::ApplyAdd(const BaseVector<double> &in, const double scalar, BaseVector<double> *out) const { if (this->get_nnz() > 0) { assert(in. get_size() >= 0); assert(out->get_size() >= 0); assert(in. get_size() == this->get_ncol()); assert(out->get_size() == this->get_nrow()); const GPUAcceleratorVector<double> *cast_in = dynamic_cast<const GPUAcceleratorVector<double>*> (&in); GPUAcceleratorVector<double> *cast_out = dynamic_cast< GPUAcceleratorVector<double>*> (out); assert(cast_in != NULL); assert(cast_out!= NULL); cublasStatus_t stat_t; const double alpha = scalar; const double beta = double(0.0); if (DENSE_IND_BASE == 0) { stat_t = cublasDgemv(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_N, this->get_nrow(), this->get_ncol(), &alpha, this->mat_.val, this->get_nrow(), cast_in->vec_, 1, &beta, cast_out->vec_, 1); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } else { stat_t = cublasDgemv(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_T, this->get_ncol(), this->get_nrow(), &alpha, this->mat_.val, this->get_ncol(), cast_in->vec_, 1, &beta, cast_out->vec_, 1); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } } } template <> void GPUAcceleratorMatrixDENSE<float>::ApplyAdd(const BaseVector<float> &in, const float scalar, BaseVector<float> *out) const { FATAL_ERROR(__FILE__, __LINE__); if (this->get_nnz() > 0) { assert(in. get_size() >= 0); assert(out->get_size() >= 0); assert(in. get_size() == this->get_ncol()); assert(out->get_size() == this->get_nrow()); const GPUAcceleratorVector<float> *cast_in = dynamic_cast<const GPUAcceleratorVector<float>*> (&in); GPUAcceleratorVector<float> *cast_out = dynamic_cast< GPUAcceleratorVector<float>*> (out); assert(cast_in != NULL); assert(cast_out!= NULL); cublasStatus_t stat_t; const float alpha = scalar; const float beta = float(0.0); if (DENSE_IND_BASE == 0) { stat_t = cublasSgemv(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_N, this->get_nrow(), this->get_ncol(), &alpha, this->mat_.val, this->get_nrow(), cast_in->vec_, 1, &beta, cast_out->vec_, 1); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } else { stat_t = cublasSgemv(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_T, this->get_ncol(), this->get_nrow(), &alpha, this->mat_.val, this->get_ncol(), cast_in->vec_, 1, &beta, cast_out->vec_, 1); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } } } template <> bool GPUAcceleratorMatrixDENSE<float>::MatMatMult(const BaseMatrix<float> &A, const BaseMatrix<float> &B) { assert((this != &A) && (this != &B)); assert(&A != NULL); assert(&B != NULL); const GPUAcceleratorMatrixDENSE<float> *cast_mat_A = dynamic_cast<const GPUAcceleratorMatrixDENSE<float>*> (&A); const GPUAcceleratorMatrixDENSE<float> *cast_mat_B = dynamic_cast<const GPUAcceleratorMatrixDENSE<float>*> (&B); assert(cast_mat_A != NULL); assert(cast_mat_B != NULL); assert(cast_mat_A->ncol_ == cast_mat_B->nrow_); cublasStatus_t stat_t; const float alpha = float(1.0); const float beta = float(0.0); if (DENSE_IND_BASE == 0) { stat_t = cublasSgemm(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_N, CUBLAS_OP_N, cast_mat_A->nrow_, cast_mat_B->ncol_, cast_mat_A->ncol_, &alpha, cast_mat_A->mat_.val, cast_mat_A->nrow_, cast_mat_B->mat_.val, cast_mat_A->ncol_, &beta, this->mat_.val, cast_mat_A->nrow_); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } else { stat_t = cublasSgemm(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_T, CUBLAS_OP_T, cast_mat_A->nrow_, cast_mat_B->ncol_, cast_mat_A->ncol_, &alpha, cast_mat_A->mat_.val, cast_mat_A->ncol_, cast_mat_B->mat_.val, cast_mat_B->ncol_, &beta, this->mat_.val, cast_mat_A->nrow_); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } return true; } template <> bool GPUAcceleratorMatrixDENSE<double>::MatMatMult(const BaseMatrix<double> &A, const BaseMatrix<double> &B) { assert((this != &A) && (this != &B)); assert(&A != NULL); assert(&B != NULL); const GPUAcceleratorMatrixDENSE<double> *cast_mat_A = dynamic_cast<const GPUAcceleratorMatrixDENSE<double>*> (&A); const GPUAcceleratorMatrixDENSE<double> *cast_mat_B = dynamic_cast<const GPUAcceleratorMatrixDENSE<double>*> (&B); assert(cast_mat_A != NULL); assert(cast_mat_B != NULL); assert(cast_mat_A->ncol_ == cast_mat_B->nrow_); cublasStatus_t stat_t; const double alpha = double(1.0); const double beta = double(0.0); if (DENSE_IND_BASE == 0) { stat_t = cublasDgemm(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_N, CUBLAS_OP_N, cast_mat_A->nrow_, cast_mat_B->ncol_, cast_mat_A->ncol_, &alpha, cast_mat_A->mat_.val, cast_mat_A->nrow_, cast_mat_B->mat_.val, cast_mat_A->ncol_, &beta, this->mat_.val, cast_mat_A->nrow_); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } else { stat_t = cublasDgemm(CUBLAS_HANDLE(this->local_backend_.GPU_cublas_handle), CUBLAS_OP_T, CUBLAS_OP_T, cast_mat_A->nrow_, cast_mat_B->ncol_, cast_mat_A->ncol_, &alpha, cast_mat_A->mat_.val, cast_mat_A->ncol_, cast_mat_B->mat_.val, cast_mat_B->ncol_, &beta, this->mat_.val, cast_mat_A->nrow_); CHECK_CUBLAS_ERROR(stat_t, __FILE__, __LINE__); } return true; } template <typename ValueType> bool GPUAcceleratorMatrixDENSE<ValueType>::ReplaceColumnVector(const int idx, const BaseVector<ValueType> &vec) { assert(&vec != NULL); assert(vec.get_size() == this->get_nrow()); if (this->get_nnz() > 0) { const GPUAcceleratorVector<ValueType> *cast_vec = dynamic_cast<const GPUAcceleratorVector<ValueType>*> (&vec); assert(cast_vec != NULL); const int nrow = this->get_nrow(); const int ncol = this->get_ncol(); dim3 BlockSize(this->local_backend_.GPU_block_size); dim3 GridSize(nrow / this->local_backend_.GPU_block_size + 1); kernel_dense_replace_column_vector<ValueType, int> <<<GridSize, BlockSize>>>(cast_vec->vec_, idx, nrow, ncol, this->mat_.val); CHECK_CUDA_ERROR(__FILE__,__LINE__); } return true; } template <typename ValueType> bool GPUAcceleratorMatrixDENSE<ValueType>::ReplaceRowVector(const int idx, const BaseVector<ValueType> &vec) { assert(&vec != NULL); assert(vec.get_size() == this->get_ncol()); if (this->get_nnz() > 0) { const GPUAcceleratorVector<ValueType> *cast_vec = dynamic_cast<const GPUAcceleratorVector<ValueType>*> (&vec); assert(cast_vec != NULL); const int nrow = this->get_nrow(); const int ncol = this->get_ncol(); dim3 BlockSize(this->local_backend_.GPU_block_size); dim3 GridSize(ncol / this->local_backend_.GPU_block_size + 1); kernel_dense_replace_row_vector<ValueType, int> <<<GridSize, BlockSize>>>(cast_vec->vec_, idx, nrow, ncol, this->mat_.val); CHECK_CUDA_ERROR(__FILE__,__LINE__); } return true; } template <typename ValueType> bool GPUAcceleratorMatrixDENSE<ValueType>::ExtractColumnVector(const int idx, BaseVector<ValueType> *vec) const { assert(vec != NULL); assert(vec->get_size() == this->get_nrow()); if (this->get_nnz() > 0) { GPUAcceleratorVector<ValueType> *cast_vec = dynamic_cast<GPUAcceleratorVector<ValueType>*> (vec); assert(cast_vec != NULL); const int nrow = this->get_nrow(); const int ncol = this->get_ncol(); dim3 BlockSize(this->local_backend_.GPU_block_size); dim3 GridSize(nrow / this->local_backend_.GPU_block_size + 1); kernel_dense_extract_column_vector<ValueType, int> <<<GridSize, BlockSize>>>(cast_vec->vec_, idx, nrow, ncol, this->mat_.val); CHECK_CUDA_ERROR(__FILE__,__LINE__); } return true; } template <typename ValueType> bool GPUAcceleratorMatrixDENSE<ValueType>::ExtractRowVector(const int idx, BaseVector<ValueType> *vec) const { assert(vec != NULL); assert(vec->get_size() == this->get_ncol()); if (this->get_nnz() > 0) { GPUAcceleratorVector<ValueType> *cast_vec = dynamic_cast<GPUAcceleratorVector<ValueType>*> (vec); assert(cast_vec != NULL); const int nrow = this->get_nrow(); const int ncol = this->get_ncol(); dim3 BlockSize(this->local_backend_.GPU_block_size); dim3 GridSize(ncol / this->local_backend_.GPU_block_size + 1); kernel_dense_extract_row_vector<ValueType, int> <<<GridSize, BlockSize>>>(cast_vec->vec_, idx, nrow, ncol, this->mat_.val); CHECK_CUDA_ERROR(__FILE__,__LINE__); } return true; } template class GPUAcceleratorMatrixDENSE<double>; template class GPUAcceleratorMatrixDENSE<float>; }