// **************************************************************************
//
//    PARALUTION   www.paralution.com
//
//    Copyright (C) 2015  PARALUTION Labs UG (haftungsbeschränkt) & Co. KG
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRA 706051
//                        Vertreten durch:
//                        PARALUTION Labs Verwaltungs UG (haftungsbeschränkt)
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRB 721277
//                        Geschäftsführer: Dimitar Lukarski, Nico Trost
//
//    This program is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    This program is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
// **************************************************************************



// PARALUTION version 1.1.0 


__kernel void kernel_scale(const IndexType size, const ValueType alpha, __global ValueType *x) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    x[gid] = alpha * x[gid];

}

__kernel void kernel_scaleadd(         const IndexType size,
                                       const ValueType alpha,
                              __global const ValueType *x,
                              __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid] = alpha * out[gid] + x[gid];

}

__kernel void kernel_scaleaddscale(         const IndexType size,
                                            const ValueType alpha,
                                            const ValueType beta, 
                                   __global const ValueType *x,
                                   __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid] = alpha * out[gid] + beta * x[gid];

}

__kernel void kernel_scaleaddscale_offset(         const IndexType size,
                                                   const IndexType src_offset,
                                                   const IndexType dst_offset, 
                                                   const ValueType alpha,
                                                   const ValueType beta, 
                                          __global const ValueType *x,
                                          __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid+dst_offset] = alpha * out[gid+dst_offset] + beta * x[gid+src_offset];

}

__kernel void kernel_scaleadd2(         const IndexType size,
                                        const ValueType alpha,
                                        const ValueType beta,
                                        const ValueType gamma,
                               __global const ValueType *x,
                               __global const ValueType *y,
                               __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid] = alpha * out[gid] + beta * x[gid] + gamma * y[gid];

}

__kernel void kernel_pointwisemult(         const IndexType size,
                                   __global const ValueType *x,
                                   __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid] = out[gid] * x[gid];

}

__kernel void kernel_pointwisemult2(         const IndexType size,
                                    __global const ValueType *x,
                                    __global const ValueType *y,
                                    __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid] = y[gid] * x[gid];

}

__kernel void kernel_copy_offset_from(         const IndexType size,
                                               const IndexType src_offset,
                                               const IndexType dst_offset,
                                      __global const ValueType *in,
                                      __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid+dst_offset] = in[gid+src_offset];

}

__kernel void kernel_permute(         const IndexType size,
                             __global const IndexType *permute,
                             __global const ValueType *in,
                             __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[permute[gid]] = in[gid];

}

__kernel void kernel_permute_backward(         const IndexType size,
                                      __global const IndexType *permute,
                                      __global const ValueType *in,
                                      __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid] = in[permute[gid]];

}

__kernel void kernel_dot(         const IndexType  size,
                         __global const ValueType *x,
                         __global const ValueType *y,
                         __global       ValueType *out,
                                  const IndexType GROUP_SIZE,
                                  const IndexType  LOCAL_SIZE) {

    IndexType tid = get_local_id(0);

    __local ValueType sdata[BLOCK_SIZE];
    sdata[tid] = (ValueType) 0;

    IndexType group_id = GROUP_SIZE * get_group_id(0);
    IndexType gid = group_id + tid;

    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

      if (gid < size)
        sdata[tid] += x[gid] * y[gid];
      else
        i = LOCAL_SIZE;

    }

    barrier(CLK_LOCAL_MEM_FENCE);

    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

      if (tid < i)
        sdata[tid] += sdata[tid + i];

      barrier(CLK_LOCAL_MEM_FENCE);

    }

    if (tid == 0)
      out[get_group_id(0)] = sdata[tid];

}

__kernel void kernel_dotc(         const IndexType size,
                          __global const ValueType *x,
                          __global const ValueType *y,
                          __global       ValueType *out,
                                   const IndexType GROUP_SIZE,
                                   const IndexType LOCAL_SIZE) {

    IndexType tid = get_local_id(0);

    __local ValueType sdata[BLOCK_SIZE];
    sdata[tid] = (ValueType) 0;

    IndexType group_id = GROUP_SIZE * get_group_id(0);
    IndexType gid = group_id + tid;

    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

      if (gid < size)
        sdata[tid] += x[gid] * y[gid];
      else
        i = LOCAL_SIZE;

    }

    barrier(CLK_LOCAL_MEM_FENCE);

    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

      if (tid < i)
        sdata[tid] += sdata[tid + i];

      barrier(CLK_LOCAL_MEM_FENCE);

    }

    if (tid == 0)
      out[get_group_id(0)] = sdata[tid];

}

__kernel void kernel_norm(        const IndexType size,
                         __global const ValueType *x,
                         __global       ValueType *out,
                                  const IndexType GROUP_SIZE,
                                  const IndexType  LOCAL_SIZE) {

    IndexType tid = get_local_id(0);
    __local ValueType sdata[BLOCK_SIZE];
    sdata[tid] = (ValueType) 0;

    IndexType group_id = GROUP_SIZE * get_group_id(0);
    IndexType gid = group_id + tid;

    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

      if (gid < size)
        sdata[tid] += x[gid] * x[gid];
      else
        i = LOCAL_SIZE;

    }

    barrier(CLK_LOCAL_MEM_FENCE);

    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

      if (tid < i)
        sdata[tid] += sdata[tid + i];

      barrier(CLK_LOCAL_MEM_FENCE);

    }

    if (tid == 0)
      out[get_group_id(0)] = sdata[tid];

}

__kernel void kernel_axpy(         const IndexType size,
                                   const ValueType alpha,
                          __global const ValueType *x,
                          __global       ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < size)
    out[gid] += alpha * x[gid];

}

__kernel void kernel_reduce(         const IndexType size,
                            __global const ValueType *data,
                            __global       ValueType *out,
                                     const IndexType GROUP_SIZE,
                                     const IndexType LOCAL_SIZE) {

    IndexType tid = get_local_id(0);
    __local ValueType sdata[BLOCK_SIZE];
    sdata[tid] = (ValueType) 0;

    IndexType gid = GROUP_SIZE * get_group_id(0) + tid;

    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

      if (gid < size)
        sdata[tid] += data[gid];
      else
        i = LOCAL_SIZE;

    }

    barrier(CLK_LOCAL_MEM_FENCE);

    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

      if ( tid < i )
        sdata[tid] += sdata[tid + i];

      barrier(CLK_LOCAL_MEM_FENCE);

    }

    if (tid == 0)
      out[get_group_id(0)] = sdata[tid];

}

__kernel void kernel_asum(         const IndexType size,
                          __global const ValueType *data,
                          __global       ValueType *out,
                                   const IndexType GROUP_SIZE,
                                   const IndexType LOCAL_SIZE) {

    IndexType tid = get_local_id(0);
    __local ValueType sdata[BLOCK_SIZE];
    sdata[tid] = (ValueType) 0;

    IndexType gid = GROUP_SIZE * get_group_id(0) + tid;

    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

      if (gid < size)
        sdata[tid] += fabs(data[gid]);
      else
        i = LOCAL_SIZE;

    }

    barrier(CLK_LOCAL_MEM_FENCE);

    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

      if (tid < i)
        sdata[tid] += sdata[tid + i];

      barrier(CLK_LOCAL_MEM_FENCE);

    }

    if (tid == 0)
      out[get_group_id(0)] = sdata[tid];

}

__kernel void kernel_amax(         const IndexType size,
                          __global const ValueType *data,
                          __global       ValueType *out,
                          __global       IndexType *iout,
                                   const IndexType GROUP_SIZE,
                                   const IndexType LOCAL_SIZE) {

    IndexType tid = get_local_id(0);
    __local IndexType idata[BLOCK_SIZE];
    __local ValueType sdata[BLOCK_SIZE];
    sdata[tid] = (ValueType) 0;
    idata[tid] = 0;

    IndexType gid = GROUP_SIZE * get_group_id(0) + tid;

    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

      if (gid < size) {
        ValueType tmp = data[gid];
        if (fabs(tmp) > fabs(sdata[tid])) {
          sdata[tid] = fabs(tmp);
          idata[tid] = gid;
        }
      }

    }

    barrier(CLK_LOCAL_MEM_FENCE);

    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

      if (tid < i) {
        ValueType tmp = sdata[tid+i];
        if (fabs(tmp) > fabs(sdata[tid])) {
          sdata[tid] = fabs(tmp);
          idata[tid] = idata[tid+i];
        }
      }

      barrier(CLK_LOCAL_MEM_FENCE);

    }

    if (tid == 0) {
      out[get_group_id(0)] = sdata[tid];
      iout[get_group_id(0)] = idata[tid];
    }

}

__kernel void kernel_power(const IndexType n, const double power, __global ValueType *out) {

  IndexType gid = get_global_id(0);

  if (gid < n)
    out[gid] = pow(out[gid], (ValueType) power);

}

__kernel void kernel_copy_from_float(const IndexType n, __global const float *in, __global ValueType *out) {

  IndexType ind = get_global_id(0);

  if (ind < n)
    out[ind] = (ValueType) in[ind];

}

__kernel void kernel_copy_from_double(const IndexType n, __global const double *in, __global ValueType *out) {

  IndexType ind = get_global_id(0);

  if (ind < n)
    out[ind] = (ValueType) in[ind];

}