// **************************************************************************
//
//    PARALUTION   www.paralution.com
//
//    Copyright (C) 2015  PARALUTION Labs UG (haftungsbeschränkt) & Co. KG
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRA 706051
//                        Vertreten durch:
//                        PARALUTION Labs Verwaltungs UG (haftungsbeschränkt)
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRB 721277
//                        Geschäftsführer: Dimitar Lukarski, Nico Trost
//
//    This program is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    This program is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
// **************************************************************************



// PARALUTION version 1.1.0 


#ifndef PARALUTION_KERNELS_OCL_HPP_
#define PARALUTION_KERNELS_OCL_HPP_

#define KERNELCOUNT 74

#if defined(__APPLE__) && defined(__MACH__)
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

static const std::string kernels_ocl[KERNELCOUNT] = {
                "kernel_reverse_index",
                "kernel_buffer_addscalar",
                "kernel_scaleadd",
                "kernel_scaleaddscale",
                "kernel_scaleadd2",
                "kernel_pointwisemult",
                "kernel_pointwisemult2",
                "kernel_copy_offset_from",
                "kernel_permute",
                "kernel_permute_backward",
                "kernel_dot",
                "kernel_axpy",
                "kernel_csr_spmv_scalar",
                "kernel_csr_add_spmv_scalar",
                "kernel_csr_scale_diagonal",
                "kernel_csr_scale_offdiagonal",
                "kernel_csr_add_diagonal",
                "kernel_csr_add_offdiagonal",
                "kernel_csr_extract_diag",
                "kernel_csr_extract_inv_diag",
                "kernel_csr_extract_submatrix_row_nnz",
                "kernel_csr_extract_submatrix_copy",
                "kernel_csr_diagmatmult_r",
                "kernel_csr_add_csr_same_struct",
                "kernel_scale",
                "kernel_mcsr_spmv_scalar",
                "kernel_mcsr_add_spmv_scalar",
                "kernel_ell_spmv",
                "kernel_ell_add_spmv",
                "kernel_dia_spmv",
                "kernel_dia_add_spmv",
                "kernel_coo_permute",
                "kernel_coo_spmv_flat",
                "kernel_coo_spmv_reduce_update",
                "kernel_coo_spmv_serial",
                "kernel_red_recurse",
                "kernel_red_partial_sum",
                "kernel_red_extrapolate",
                "kernel_csr_permute_rows",
                "kernel_csr_permute_cols",
                "kernel_csr_calc_row_nnz",
                "kernel_csr_permute_row_nnz",
                "kernel_reduce",
                "kernel_ell_max_row",
                "kernel_ell_csr_to_ell",
                "kernel_asum",
                "kernel_amax",
                "kernel_dense_spmv",
                "kernel_csr_extract_l_triangular",
                "kernel_csr_slower_nnz_per_row",
                "kernel_csr_supper_nnz_per_row",
                "kernel_csr_lower_nnz_per_row",
                "kernel_csr_upper_nnz_per_row",
                "kernel_csr_compress_count_nrow",
                "kernel_csr_compress_copy",
                "kernel_scaleaddscale_offset",
                "kernel_csr_extract_u_triangular",
                "kernel_norm",
                "kernel_dotc",
                "kernel_csr_diagmatmult_l",
                "kernel_power",
                "kernel_ell_nnz_coo",
                "kernel_ell_fill_ell",
                "kernel_ell_fill_coo",
                "kernel_copy_from_float",
                "kernel_copy_from_double",
                "kernel_dense_replace_column_vector",
                "kernel_dense_replace_row_vector",
                "kernel_dense_extract_column_vector",
                "kernel_dense_extract_row_vector",
                "kernel_csr_extract_column_vector",
                "kernel_csr_replace_column_vector_offset",
                "kernel_csr_replace_column_vector",
                "kernel_coo_csr_to_coo"
};

#define CL_KERNEL_REVERSE_INDEX                     paralution_get_kernel_ocl<ValueType>(  0)
#define CL_KERNEL_BUFFER_ADDSCALAR                  paralution_get_kernel_ocl<ValueType>(  4)
#define CL_KERNEL_SCALEADD                          paralution_get_kernel_ocl<ValueType>(  8)
#define CL_KERNEL_SCALEADDSCALE                     paralution_get_kernel_ocl<ValueType>( 12)
#define CL_KERNEL_SCALEADD2                         paralution_get_kernel_ocl<ValueType>( 16)
#define CL_KERNEL_POINTWISEMULT                     paralution_get_kernel_ocl<ValueType>( 20)
#define CL_KERNEL_POINTWISEMULT2                    paralution_get_kernel_ocl<ValueType>( 24)
#define CL_KERNEL_COPY_OFFSET_FROM                  paralution_get_kernel_ocl<ValueType>( 28)
#define CL_KERNEL_PERMUTE                           paralution_get_kernel_ocl<ValueType>( 32)
#define CL_KERNEL_PERMUTE_BACKWARD                  paralution_get_kernel_ocl<ValueType>( 36)
#define CL_KERNEL_DOT                               paralution_get_kernel_ocl<ValueType>( 40)
#define CL_KERNEL_AXPY                              paralution_get_kernel_ocl<ValueType>( 44)
#define CL_KERNEL_CSR_SPMV_SCALAR                   paralution_get_kernel_ocl<ValueType>( 48)
#define CL_KERNEL_CSR_ADD_SPMV_SCALAR               paralution_get_kernel_ocl<ValueType>( 52)
#define CL_KERNEL_CSR_SCALE_DIAGONAL                paralution_get_kernel_ocl<ValueType>( 56)
#define CL_KERNEL_CSR_SCALE_OFFDIAGONAL             paralution_get_kernel_ocl<ValueType>( 60)
#define CL_KERNEL_CSR_ADD_DIAGONAL                  paralution_get_kernel_ocl<ValueType>( 64)
#define CL_KERNEL_CSR_ADD_OFFDIAGONAL               paralution_get_kernel_ocl<ValueType>( 68)
#define CL_KERNEL_CSR_EXTRACT_DIAG                  paralution_get_kernel_ocl<ValueType>( 72)
#define CL_KERNEL_CSR_EXTRACT_INV_DIAG              paralution_get_kernel_ocl<ValueType>( 76)
#define CL_KERNEL_CSR_EXTRACT_SUBMATRIX_ROW_NNZ     paralution_get_kernel_ocl<ValueType>( 80)
#define CL_KERNEL_CSR_EXTRACT_SUBMATRIX_COPY        paralution_get_kernel_ocl<ValueType>( 84)
#define CL_KERNEL_CSR_DIAGMATMULT_R                 paralution_get_kernel_ocl<ValueType>( 88)
#define CL_KERNEL_CSR_ADD_CSR_SAME_STRUCT           paralution_get_kernel_ocl<ValueType>( 92)
#define CL_KERNEL_SCALE                             paralution_get_kernel_ocl<ValueType>( 96)
#define CL_KERNEL_MCSR_SPMV_SCALAR                  paralution_get_kernel_ocl<ValueType>(100)
#define CL_KERNEL_MCSR_ADD_SPMV_SCALAR              paralution_get_kernel_ocl<ValueType>(104)
#define CL_KERNEL_ELL_SPMV                          paralution_get_kernel_ocl<ValueType>(108)
#define CL_KERNEL_ELL_ADD_SPMV                      paralution_get_kernel_ocl<ValueType>(112)
#define CL_KERNEL_DIA_SPMV                          paralution_get_kernel_ocl<ValueType>(116)
#define CL_KERNEL_DIA_ADD_SPMV                      paralution_get_kernel_ocl<ValueType>(120)
#define CL_KERNEL_COO_PERMUTE                       paralution_get_kernel_ocl<ValueType>(124)
#define CL_KERNEL_COO_SPMV_FLAT                     paralution_get_kernel_ocl<ValueType>(128)
#define CL_KERNEL_COO_SPMV_REDUCE_UPDATE            paralution_get_kernel_ocl<ValueType>(132)
#define CL_KERNEL_COO_SPMV_SERIAL                   paralution_get_kernel_ocl<ValueType>(136)
#define CL_KERNEL_RED_RECURSE                       paralution_get_kernel_ocl<ValueType>(140)
#define CL_KERNEL_RED_PARTIAL_SUM                   paralution_get_kernel_ocl<ValueType>(144)
#define CL_KERNEL_RED_EXTRAPOLATE                   paralution_get_kernel_ocl<ValueType>(148)
#define CL_KERNEL_CSR_PERMUTE_ROWS                  paralution_get_kernel_ocl<ValueType>(152)
#define CL_KERNEL_CSR_PERMUTE_COLS                  paralution_get_kernel_ocl<ValueType>(156)
#define CL_KERNEL_CSR_CALC_ROW_NNZ                  paralution_get_kernel_ocl<ValueType>(160)
#define CL_KERNEL_CSR_PERMUTE_ROW_NNZ               paralution_get_kernel_ocl<ValueType>(164)
#define CL_KERNEL_REDUCE                            paralution_get_kernel_ocl<ValueType>(168)
#define CL_KERNEL_ELL_MAX_ROW                       paralution_get_kernel_ocl<ValueType>(172)
#define CL_KERNEL_ELL_CSR_TO_ELL                    paralution_get_kernel_ocl<ValueType>(176)
#define CL_KERNEL_ASUM                              paralution_get_kernel_ocl<ValueType>(180)
#define CL_KERNEL_AMAX                              paralution_get_kernel_ocl<ValueType>(184)
#define CL_KERNEL_DENSE_SPMV                        paralution_get_kernel_ocl<ValueType>(188)
#define CL_KERNEL_CSR_EXTRACT_L_TRIANGULAR          paralution_get_kernel_ocl<ValueType>(192)
#define CL_KERNEL_CSR_SLOWER_NNZ_PER_ROW            paralution_get_kernel_ocl<ValueType>(196)
#define CL_KERNEL_CSR_SUPPER_NNZ_PER_ROW            paralution_get_kernel_ocl<ValueType>(200)
#define CL_KERNEL_CSR_LOWER_NNZ_PER_ROW             paralution_get_kernel_ocl<ValueType>(204)
#define CL_KERNEL_CSR_UPPER_NNZ_PER_ROW             paralution_get_kernel_ocl<ValueType>(208)
#define CL_KERNEL_CSR_COMPRESS_COUNT_NROW           paralution_get_kernel_ocl<ValueType>(212)
#define CL_KERNEL_CSR_COMPRESS_COPY                 paralution_get_kernel_ocl<ValueType>(216)
#define CL_KERNEL_SCALEADDSCALE_OFFSET              paralution_get_kernel_ocl<ValueType>(220)
#define CL_KERNEL_CSR_EXTRACT_U_TRIANGULAR          paralution_get_kernel_ocl<ValueType>(224)
#define CL_KERNEL_NORM                              paralution_get_kernel_ocl<ValueType>(228)
#define CL_KERNEL_DOTC                              paralution_get_kernel_ocl<ValueType>(232)
#define CL_KERNEL_CSR_DIAGMATMULT_L                 paralution_get_kernel_ocl<ValueType>(236)
#define CL_KERNEL_POWER                             paralution_get_kernel_ocl<ValueType>(240)
#define CL_KERNEL_ELL_NNZ_COO                       paralution_get_kernel_ocl<ValueType>(244)
#define CL_KERNEL_ELL_FILL_ELL                      paralution_get_kernel_ocl<ValueType>(248)
#define CL_KERNEL_ELL_FILL_COO                      paralution_get_kernel_ocl<ValueType>(252)
#define CL_KERNEL_COPY_FROM_FLOAT                   paralution_get_kernel_ocl<double>   (256)
#define CL_KERNEL_COPY_FROM_DOUBLE                  paralution_get_kernel_ocl<float>    (260)
#define CL_KERNEL_DENSE_REPLACE_COLUMN_VECTOR       paralution_get_kernel_ocl<ValueType>(264)
#define CL_KERNEL_DENSE_REPLACE_ROW_VECTOR          paralution_get_kernel_ocl<ValueType>(268)
#define CL_KERNEL_DENSE_EXTRACT_COLUMN_VECTOR       paralution_get_kernel_ocl<ValueType>(272)
#define CL_KERNEL_DENSE_EXTRACT_ROW_VECTOR          paralution_get_kernel_ocl<ValueType>(276)
#define CL_KERNEL_CSR_EXTRACT_COLUMN_VECTOR         paralution_get_kernel_ocl<ValueType>(280)
#define CL_KERNEL_CSR_REPLACE_COLUMN_VECTOR_OFFSET  paralution_get_kernel_ocl<ValueType>(284)
#define CL_KERNEL_CSR_REPLACE_COLUMN_VECTOR         paralution_get_kernel_ocl<ValueType>(288)
#define CL_KERNEL_COO_CSR_TO_COO                    paralution_get_kernel_ocl<ValueType>(292)

template <typename ValueType, typename A0, typename A1, typename A2>
cl_int ocl_kernel(cl_kernel kernel, cl_command_queue cmdQueue, const size_t LocalSize, const size_t GlobalSize, A0 arg0, A1 arg1, A2 arg2, bool sync = true) {

  cl_int err;
  cl_event event;

  err  = clSetKernelArg(kernel, 0, sizeof(A0), (void*) &arg0);
  err |= clSetKernelArg(kernel, 1, sizeof(A1), (void*) &arg1);
  err |= clSetKernelArg(kernel, 2, sizeof(A2), (void*) &arg2);

  if (err != CL_SUCCESS)
    return err;

  size_t szLocalWorkSize[1];
  size_t szGlobalWorkSize[1];

  szLocalWorkSize[0]  = LocalSize;
  szGlobalWorkSize[0] = GlobalSize;

  err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &event);

  if (err != CL_SUCCESS)
    return err;

  if (sync == true) {

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &event);

    if (err != CL_SUCCESS)
      return err;

    // Release event when kernel run finished
    err = clReleaseEvent(event);

    if (err != CL_SUCCESS)
      return err;

  }

  return CL_SUCCESS;

}

template <typename ValueType, typename A0, typename A1, typename A2, typename A3>
cl_int ocl_kernel(cl_kernel kernel, cl_command_queue cmdQueue, const size_t LocalSize, const size_t GlobalSize, A0 arg0, A1 arg1, A2 arg2, A3 arg3, bool sync = true) {

  cl_int err;
  cl_event event;

  err  = clSetKernelArg(kernel, 0, sizeof(A0), (void*) &arg0);
  err |= clSetKernelArg(kernel, 1, sizeof(A1), (void*) &arg1);
  err |= clSetKernelArg(kernel, 2, sizeof(A2), (void*) &arg2);
  err |= clSetKernelArg(kernel, 3, sizeof(A3), (void*) &arg3);

  if (err != CL_SUCCESS)
    return err;

  size_t szLocalWorkSize[1];
  size_t szGlobalWorkSize[1];

  szLocalWorkSize[0]  = LocalSize;
  szGlobalWorkSize[0] = GlobalSize;

  err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &event);

  if (err != CL_SUCCESS)
    return err;

  if (sync == true) {

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &event);

    if (err != CL_SUCCESS)
      return err;

    // Release event when kernel run finished
    err = clReleaseEvent(event);

    if (err != CL_SUCCESS)
      return err;

  }

  return CL_SUCCESS;

}

template <typename ValueType, typename A0, typename A1, typename A2, typename A3, typename A4>
cl_int ocl_kernel(cl_kernel kernel, cl_command_queue cmdQueue, const size_t LocalSize, const size_t GlobalSize, A0 arg0, A1 arg1, A2 arg2, A3 arg3, A4 arg4, bool sync = true) {

  cl_int err;
  cl_event event;

  err  = clSetKernelArg(kernel, 0, sizeof(A0), (void*) &arg0);
  err |= clSetKernelArg(kernel, 1, sizeof(A1), (void*) &arg1);
  err |= clSetKernelArg(kernel, 2, sizeof(A2), (void*) &arg2);
  err |= clSetKernelArg(kernel, 3, sizeof(A3), (void*) &arg3);
  err |= clSetKernelArg(kernel, 4, sizeof(A4), (void*) &arg4);

  if (err != CL_SUCCESS)
    return err;

  size_t szLocalWorkSize[1];
  size_t szGlobalWorkSize[1];

  szLocalWorkSize[0]  = LocalSize;
  szGlobalWorkSize[0] = GlobalSize;

  err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &event);

  if (err != CL_SUCCESS)
    return err;

  if (sync == true) {

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &event);

    if (err != CL_SUCCESS)
      return err;

    // Release event when kernel run finished
    err = clReleaseEvent(event);

    if (err != CL_SUCCESS)
      return err;

  }

  return CL_SUCCESS;

}

template <typename ValueType, typename A0, typename A1, typename A2, typename A3, typename A4, typename A5>
cl_int ocl_kernel(cl_kernel kernel, cl_command_queue cmdQueue, const size_t LocalSize, const size_t GlobalSize, A0 arg0, A1 arg1, A2 arg2, A3 arg3, A4 arg4, A5 arg5, bool sync = true) {

  cl_int err;
  cl_event event;

  err  = clSetKernelArg(kernel, 0, sizeof(A0), (void*) &arg0);
  err |= clSetKernelArg(kernel, 1, sizeof(A1), (void*) &arg1);
  err |= clSetKernelArg(kernel, 2, sizeof(A2), (void*) &arg2);
  err |= clSetKernelArg(kernel, 3, sizeof(A3), (void*) &arg3);
  err |= clSetKernelArg(kernel, 4, sizeof(A4), (void*) &arg4);
  err |= clSetKernelArg(kernel, 5, sizeof(A5), (void*) &arg5);

  if (err != CL_SUCCESS)
    return err;

  size_t szLocalWorkSize[1];
  size_t szGlobalWorkSize[1];

  szLocalWorkSize[0]  = LocalSize;
  szGlobalWorkSize[0] = GlobalSize;

  err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &event);

  if (err != CL_SUCCESS)
    return err;

  if (sync == true) {

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &event);

    if (err != CL_SUCCESS)
      return err;

    // Release event when kernel run finished
    err = clReleaseEvent(event);

    if (err != CL_SUCCESS)
      return err;

  }

  return CL_SUCCESS;

}

template <typename ValueType, typename A0, typename A1, typename A2, typename A3, typename A4, typename A5, typename A6>
cl_int ocl_kernel(cl_kernel kernel, cl_command_queue cmdQueue, const size_t LocalSize, const size_t GlobalSize, A0 arg0, A1 arg1, A2 arg2, A3 arg3, A4 arg4, A5 arg5, A6 arg6, bool sync = true) {

  cl_int err;
  cl_event event;

  err  = clSetKernelArg(kernel, 0, sizeof(A0), (void*) &arg0);
  err |= clSetKernelArg(kernel, 1, sizeof(A1), (void*) &arg1);
  err |= clSetKernelArg(kernel, 2, sizeof(A2), (void*) &arg2);
  err |= clSetKernelArg(kernel, 3, sizeof(A3), (void*) &arg3);
  err |= clSetKernelArg(kernel, 4, sizeof(A4), (void*) &arg4);
  err |= clSetKernelArg(kernel, 5, sizeof(A5), (void*) &arg5);
  err |= clSetKernelArg(kernel, 6, sizeof(A6), (void*) &arg6);

  if (err != CL_SUCCESS)
    return err;

  size_t szLocalWorkSize[1];
  size_t szGlobalWorkSize[1];

  szLocalWorkSize[0]  = LocalSize;
  szGlobalWorkSize[0] = GlobalSize;

  err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &event);

  if (err != CL_SUCCESS)
    return err;

  if (sync == true) {

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &event);

    if (err != CL_SUCCESS)
      return err;

    // Release event when kernel run finished
    err = clReleaseEvent(event);

    if (err != CL_SUCCESS)
      return err;

  }

  return CL_SUCCESS;

}

template <typename ValueType, typename A0, typename A1, typename A2, typename A3, typename A4, typename A5, typename A6, typename A7>
cl_int ocl_kernel(cl_kernel kernel, cl_command_queue cmdQueue, const size_t LocalSize, const size_t GlobalSize, A0 arg0, A1 arg1, A2 arg2, A3 arg3, A4 arg4, A5 arg5, A6 arg6, A7 arg7, bool sync = true) {

  cl_int err;
  cl_event event;

  err  = clSetKernelArg(kernel, 0, sizeof(A0), (void*) &arg0);
  err |= clSetKernelArg(kernel, 1, sizeof(A1), (void*) &arg1);
  err |= clSetKernelArg(kernel, 2, sizeof(A2), (void*) &arg2);
  err |= clSetKernelArg(kernel, 3, sizeof(A3), (void*) &arg3);
  err |= clSetKernelArg(kernel, 4, sizeof(A4), (void*) &arg4);
  err |= clSetKernelArg(kernel, 5, sizeof(A5), (void*) &arg5);
  err |= clSetKernelArg(kernel, 6, sizeof(A6), (void*) &arg6);
  err |= clSetKernelArg(kernel, 7, sizeof(A7), (void*) &arg7);

  if (err != CL_SUCCESS)
    return err;

  size_t szLocalWorkSize[1];
  size_t szGlobalWorkSize[1];

  szLocalWorkSize[0]  = LocalSize;
  szGlobalWorkSize[0] = GlobalSize;

  err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &event);

  if (err != CL_SUCCESS)
    return err;

  if (sync == true) {

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &event);

    if (err != CL_SUCCESS)
      return err;

    // Release event when kernel run finished
    err = clReleaseEvent(event);

    if (err != CL_SUCCESS)
      return err;

  }

  return CL_SUCCESS;

}

template <typename ValueType, typename A0, typename A1, typename A2, typename A3, typename A4, typename A5, typename A6, typename A7, typename A8>
cl_int ocl_kernel(cl_kernel kernel, cl_command_queue cmdQueue, const size_t LocalSize, const size_t GlobalSize, A0 arg0, A1 arg1, A2 arg2, A3 arg3, A4 arg4, A5 arg5, A6 arg6, A7 arg7, A8 arg8, bool sync = true) {

  cl_int err;
  cl_event event;

  err  = clSetKernelArg(kernel, 0, sizeof(A0), (void*) &arg0);
  err |= clSetKernelArg(kernel, 1, sizeof(A1), (void*) &arg1);
  err |= clSetKernelArg(kernel, 2, sizeof(A2), (void*) &arg2);
  err |= clSetKernelArg(kernel, 3, sizeof(A3), (void*) &arg3);
  err |= clSetKernelArg(kernel, 4, sizeof(A4), (void*) &arg4);
  err |= clSetKernelArg(kernel, 5, sizeof(A5), (void*) &arg5);
  err |= clSetKernelArg(kernel, 6, sizeof(A6), (void*) &arg6);
  err |= clSetKernelArg(kernel, 7, sizeof(A7), (void*) &arg7);
  err |= clSetKernelArg(kernel, 8, sizeof(A8), (void*) &arg8);

  if (err != CL_SUCCESS)
    return err;

  size_t szLocalWorkSize[1];
  size_t szGlobalWorkSize[1];

  szLocalWorkSize[0]  = LocalSize;
  szGlobalWorkSize[0] = GlobalSize;

  err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &event);

  if (err != CL_SUCCESS)
    return err;

  if (sync == true) {

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &event);

    if (err != CL_SUCCESS)
      return err;

    // Release event when kernel run finished
    err = clReleaseEvent(event);

    if (err != CL_SUCCESS)
      return err;

  }

  return CL_SUCCESS;

}

template <typename ValueType, typename A0, typename A1, typename A2, typename A3, typename A4, typename A5, typename A6, typename A7, typename A8, typename A9>
cl_int ocl_kernel(cl_kernel kernel, cl_command_queue cmdQueue, const size_t LocalSize, const size_t GlobalSize, A0 arg0, A1 arg1, A2 arg2, A3 arg3, A4 arg4, A5 arg5, A6 arg6, A7 arg7, A8 arg8, A9 arg9, bool sync = true) {

  cl_int err;
  cl_event event;

  err  = clSetKernelArg(kernel, 0, sizeof(A0), (void*) &arg0);
  err |= clSetKernelArg(kernel, 1, sizeof(A1), (void*) &arg1);
  err |= clSetKernelArg(kernel, 2, sizeof(A2), (void*) &arg2);
  err |= clSetKernelArg(kernel, 3, sizeof(A3), (void*) &arg3);
  err |= clSetKernelArg(kernel, 4, sizeof(A4), (void*) &arg4);
  err |= clSetKernelArg(kernel, 5, sizeof(A5), (void*) &arg5);
  err |= clSetKernelArg(kernel, 6, sizeof(A6), (void*) &arg6);
  err |= clSetKernelArg(kernel, 7, sizeof(A7), (void*) &arg7);
  err |= clSetKernelArg(kernel, 8, sizeof(A8), (void*) &arg8);
  err |= clSetKernelArg(kernel, 9, sizeof(A9), (void*) &arg9);

  if (err != CL_SUCCESS)
    return err;

  size_t szLocalWorkSize[1];
  size_t szGlobalWorkSize[1];

  szLocalWorkSize[0]  = LocalSize;
  szGlobalWorkSize[0] = GlobalSize;

  err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &event);

  if (err != CL_SUCCESS)
    return err;

  if (sync == true) {

    // Wait for kernel run to finish
    err = clWaitForEvents(1, &event);

    if (err != CL_SUCCESS)
      return err;

    // Release event when kernel run finished
    err = clReleaseEvent(event);

    if (err != CL_SUCCESS)
      return err;

  }

  return CL_SUCCESS;

}

#endif // PARALUTION_KERNELS_OCL_HPP_