// **************************************************************************
//
//    PARALUTION   www.paralution.com
//
//    Copyright (C) 2015  PARALUTION Labs UG (haftungsbeschränkt) & Co. KG
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRA 706051
//                        Vertreten durch:
//                        PARALUTION Labs Verwaltungs UG (haftungsbeschränkt)
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRB 721277
//                        Geschäftsführer: Dimitar Lukarski, Nico Trost
//
//    This program is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    This program is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
// **************************************************************************



// PARALUTION version 1.1.0 


#ifndef PARALUTION_OCL_KERNELS_VECTOR_HPP_
#define PARALUTION_OCL_KERNELS_VECTOR_HPP_

namespace paralution {

const char *ocl_kernels_vector =
	"__kernel void kernel_scale(const IndexType size, const ValueType alpha, __global ValueType *x) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    x[gid] = alpha * x[gid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_scaleadd(         const IndexType size,\n"
	"                                       const ValueType alpha,\n"
	"                              __global const ValueType *x,\n"
	"                              __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid] = alpha * out[gid] + x[gid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_scaleaddscale(         const IndexType size,\n"
	"                                            const ValueType alpha,\n"
	"                                            const ValueType beta, \n"
	"                                   __global const ValueType *x,\n"
	"                                   __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid] = alpha * out[gid] + beta * x[gid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_scaleaddscale_offset(         const IndexType size,\n"
	"                                                   const IndexType src_offset,\n"
	"                                                   const IndexType dst_offset, \n"
	"                                                   const ValueType alpha,\n"
	"                                                   const ValueType beta, \n"
	"                                          __global const ValueType *x,\n"
	"                                          __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid+dst_offset] = alpha * out[gid+dst_offset] + beta * x[gid+src_offset];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_scaleadd2(         const IndexType size,\n"
	"                                        const ValueType alpha,\n"
	"                                        const ValueType beta,\n"
	"                                        const ValueType gamma,\n"
	"                               __global const ValueType *x,\n"
	"                               __global const ValueType *y,\n"
	"                               __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid] = alpha * out[gid] + beta * x[gid] + gamma * y[gid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_pointwisemult(         const IndexType size,\n"
	"                                   __global const ValueType *x,\n"
	"                                   __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid] = out[gid] * x[gid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_pointwisemult2(         const IndexType size,\n"
	"                                    __global const ValueType *x,\n"
	"                                    __global const ValueType *y,\n"
	"                                    __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid] = y[gid] * x[gid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_copy_offset_from(         const IndexType size,\n"
	"                                               const IndexType src_offset,\n"
	"                                               const IndexType dst_offset,\n"
	"                                      __global const ValueType *in,\n"
	"                                      __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid+dst_offset] = in[gid+src_offset];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_permute(         const IndexType size,\n"
	"                             __global const IndexType *permute,\n"
	"                             __global const ValueType *in,\n"
	"                             __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[permute[gid]] = in[gid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_permute_backward(         const IndexType size,\n"
	"                                      __global const IndexType *permute,\n"
	"                                      __global const ValueType *in,\n"
	"                                      __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid] = in[permute[gid]];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_dot(         const IndexType  size,\n"
	"                         __global const ValueType *x,\n"
	"                         __global const ValueType *y,\n"
	"                         __global       ValueType *out,\n"
	"                                  const IndexType GROUP_SIZE,\n"
	"                                  const IndexType  LOCAL_SIZE) {\n"
	"\n"
	"    IndexType tid = get_local_id(0);\n"
	"\n"
	"    __local ValueType sdata[BLOCK_SIZE];\n"
	"    sdata[tid] = (ValueType) 0;\n"
	"\n"
	"    IndexType group_id = GROUP_SIZE * get_group_id(0);\n"
	"    IndexType gid = group_id + tid;\n"
	"\n"
	"    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
	"\n"
	"      if (gid < size)\n"
	"        sdata[tid] += x[gid] * y[gid];\n"
	"      else\n"
	"        i = LOCAL_SIZE;\n"
	"\n"
	"    }\n"
	"\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
	"\n"
	"      if (tid < i)\n"
	"        sdata[tid] += sdata[tid + i];\n"
	"\n"
	"      barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    }\n"
	"\n"
	"    if (tid == 0)\n"
	"      out[get_group_id(0)] = sdata[tid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_dotc(         const IndexType size,\n"
	"                          __global const ValueType *x,\n"
	"                          __global const ValueType *y,\n"
	"                          __global       ValueType *out,\n"
	"                                   const IndexType GROUP_SIZE,\n"
	"                                   const IndexType LOCAL_SIZE) {\n"
	"\n"
	"    IndexType tid = get_local_id(0);\n"
	"\n"
	"    __local ValueType sdata[BLOCK_SIZE];\n"
	"    sdata[tid] = (ValueType) 0;\n"
	"\n"
	"    IndexType group_id = GROUP_SIZE * get_group_id(0);\n"
	"    IndexType gid = group_id + tid;\n"
	"\n"
	"    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
	"\n"
	"      if (gid < size)\n"
	"        sdata[tid] += x[gid] * y[gid];\n"
	"      else\n"
	"        i = LOCAL_SIZE;\n"
	"\n"
	"    }\n"
	"\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
	"\n"
	"      if (tid < i)\n"
	"        sdata[tid] += sdata[tid + i];\n"
	"\n"
	"      barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    }\n"
	"\n"
	"    if (tid == 0)\n"
	"      out[get_group_id(0)] = sdata[tid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_norm(        const IndexType size,\n"
	"                         __global const ValueType *x,\n"
	"                         __global       ValueType *out,\n"
	"                                  const IndexType GROUP_SIZE,\n"
	"                                  const IndexType  LOCAL_SIZE) {\n"
	"\n"
	"    IndexType tid = get_local_id(0);\n"
	"    __local ValueType sdata[BLOCK_SIZE];\n"
	"    sdata[tid] = (ValueType) 0;\n"
	"\n"
	"    IndexType group_id = GROUP_SIZE * get_group_id(0);\n"
	"    IndexType gid = group_id + tid;\n"
	"\n"
	"    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
	"\n"
	"      if (gid < size)\n"
	"        sdata[tid] += x[gid] * x[gid];\n"
	"      else\n"
	"        i = LOCAL_SIZE;\n"
	"\n"
	"    }\n"
	"\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
	"\n"
	"      if (tid < i)\n"
	"        sdata[tid] += sdata[tid + i];\n"
	"\n"
	"      barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    }\n"
	"\n"
	"    if (tid == 0)\n"
	"      out[get_group_id(0)] = sdata[tid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_axpy(         const IndexType size,\n"
	"                                   const ValueType alpha,\n"
	"                          __global const ValueType *x,\n"
	"                          __global       ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < size)\n"
	"    out[gid] += alpha * x[gid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_reduce(         const IndexType size,\n"
	"                            __global const ValueType *data,\n"
	"                            __global       ValueType *out,\n"
	"                                     const IndexType GROUP_SIZE,\n"
	"                                     const IndexType LOCAL_SIZE) {\n"
	"\n"
	"    IndexType tid = get_local_id(0);\n"
	"    __local ValueType sdata[BLOCK_SIZE];\n"
	"    sdata[tid] = (ValueType) 0;\n"
	"\n"
	"    IndexType gid = GROUP_SIZE * get_group_id(0) + tid;\n"
	"\n"
	"    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
	"\n"
	"      if (gid < size)\n"
	"        sdata[tid] += data[gid];\n"
	"      else\n"
	"        i = LOCAL_SIZE;\n"
	"\n"
	"    }\n"
	"\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
	"\n"
	"      if ( tid < i )\n"
	"        sdata[tid] += sdata[tid + i];\n"
	"\n"
	"      barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    }\n"
	"\n"
	"    if (tid == 0)\n"
	"      out[get_group_id(0)] = sdata[tid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_asum(         const IndexType size,\n"
	"                          __global const ValueType *data,\n"
	"                          __global       ValueType *out,\n"
	"                                   const IndexType GROUP_SIZE,\n"
	"                                   const IndexType LOCAL_SIZE) {\n"
	"\n"
	"    IndexType tid = get_local_id(0);\n"
	"    __local ValueType sdata[BLOCK_SIZE];\n"
	"    sdata[tid] = (ValueType) 0;\n"
	"\n"
	"    IndexType gid = GROUP_SIZE * get_group_id(0) + tid;\n"
	"\n"
	"    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
	"\n"
	"      if (gid < size)\n"
	"        sdata[tid] += fabs(data[gid]);\n"
	"      else\n"
	"        i = LOCAL_SIZE;\n"
	"\n"
	"    }\n"
	"\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
	"\n"
	"      if (tid < i)\n"
	"        sdata[tid] += sdata[tid + i];\n"
	"\n"
	"      barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    }\n"
	"\n"
	"    if (tid == 0)\n"
	"      out[get_group_id(0)] = sdata[tid];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_amax(         const IndexType size,\n"
	"                          __global const ValueType *data,\n"
	"                          __global       ValueType *out,\n"
	"                          __global       IndexType *iout,\n"
	"                                   const IndexType GROUP_SIZE,\n"
	"                                   const IndexType LOCAL_SIZE) {\n"
	"\n"
	"    IndexType tid = get_local_id(0);\n"
	"    __local IndexType idata[BLOCK_SIZE];\n"
	"    __local ValueType sdata[BLOCK_SIZE];\n"
	"    sdata[tid] = (ValueType) 0;\n"
	"    idata[tid] = 0;\n"
	"\n"
	"    IndexType gid = GROUP_SIZE * get_group_id(0) + tid;\n"
	"\n"
	"    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
	"\n"
	"      if (gid < size) {\n"
	"        ValueType tmp = data[gid];\n"
	"        if (fabs(tmp) > fabs(sdata[tid])) {\n"
	"          sdata[tid] = fabs(tmp);\n"
	"          idata[tid] = gid;\n"
	"        }\n"
	"      }\n"
	"\n"
	"    }\n"
	"\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
	"\n"
	"      if (tid < i) {\n"
	"        ValueType tmp = sdata[tid+i];\n"
	"        if (fabs(tmp) > fabs(sdata[tid])) {\n"
	"          sdata[tid] = fabs(tmp);\n"
	"          idata[tid] = idata[tid+i];\n"
	"        }\n"
	"      }\n"
	"\n"
	"      barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    }\n"
	"\n"
	"    if (tid == 0) {\n"
	"      out[get_group_id(0)] = sdata[tid];\n"
	"      iout[get_group_id(0)] = idata[tid];\n"
	"    }\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_power(const IndexType n, const double power, __global ValueType *out) {\n"
	"\n"
	"  IndexType gid = get_global_id(0);\n"
	"\n"
	"  if (gid < n)\n"
	"    out[gid] = pow(out[gid], (ValueType) power);\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_copy_from_float(const IndexType n, __global const float *in, __global ValueType *out) {\n"
	"\n"
	"  IndexType ind = get_global_id(0);\n"
	"\n"
	"  if (ind < n)\n"
	"    out[ind] = (ValueType) in[ind];\n"
	"\n"
	"}\n"
	"\n"
	"__kernel void kernel_copy_from_double(const IndexType n, __global const double *in, __global ValueType *out) {\n"
	"\n"
	"  IndexType ind = get_global_id(0);\n"
	"\n"
	"  if (ind < n)\n"
	"    out[ind] = (ValueType) in[ind];\n"
	"\n"
	"}\n"
	"\n"
;
}

#endif // PARALUTION_OCL_KERNELS_VECTOR_HPP_