// **************************************************************************
//
//    PARALUTION   www.paralution.com
//
//    Copyright (C) 2015  PARALUTION Labs UG (haftungsbeschränkt) & Co. KG
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRA 706051
//                        Vertreten durch:
//                        PARALUTION Labs Verwaltungs UG (haftungsbeschränkt)
//                        Am Hasensprung 6, 76571 Gaggenau
//                        Handelsregister: Amtsgericht Mannheim, HRB 721277
//                        Geschäftsführer: Dimitar Lukarski, Nico Trost
//
//    This program is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    This program is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
// **************************************************************************



// PARALUTION version 1.1.0 


#include "../utils/def.hpp"
#include "version.hpp" 
#include "backend_manager.hpp" 
#include "base_paralution.hpp"
#include "base_vector.hpp"
#include "base_matrix.hpp"
#include "host/host_affinity.hpp"
#include "host/host_vector.hpp"
#include "host/host_matrix_csr.hpp"
#include "host/host_matrix_coo.hpp"
#include "host/host_matrix_dia.hpp"
#include "host/host_matrix_ell.hpp"
#include "host/host_matrix_hyb.hpp"
#include "host/host_matrix_dense.hpp"
#include "host/host_matrix_mcsr.hpp"
#include "host/host_matrix_bcsr.hpp"
#include "../utils/log.hpp"

#include <stdlib.h>
#include <string.h>

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef SUPPORT_MKL
#include <mkl.h>
#include <mkl_spblas.h>
#endif

#ifdef SUPPORT_CUDA
#include "gpu/backend_gpu.hpp"
#endif

#ifdef SUPPORT_OCL
#include "ocl/backend_ocl.hpp"
#endif

#ifdef SUPPORT_MIC
#include "mic/backend_mic.hpp"
#endif

namespace paralution {

// Global backend descriptor holding the library defaults.
// The initializers are positional, so their order has to follow the member
// order of Paralution_Backend_Descriptor (declared outside of this file).
Paralution_Backend_Descriptor _Backend_Descriptor = {
  false, // Init
  // default backend: the first accelerator compiled in (CUDA, OpenCL, MIC)
#if defined(SUPPORT_CUDA)
  GPU,
#elif defined(SUPPORT_OCL)
  OCL,
#elif defined(SUPPORT_MIC)
  MIC,
#else
  None,
#endif
  false, // use accelerator
  false, // disable accelerator
  1,     // OpenMP threads
  -1,    // pre-init OpenMP threads
  0,     // NOTE(review): presumably the pre-init OpenMP nested setting - confirm against the struct
  true,  // host affinity (active)
  10000, // threshold size
  // GPU section
  NULL,  // *GPU_cublas_handle
  NULL,  // *GPU_cusparse_handle
  -1,    // GPU_dev;
  32,    // GPU_warp;
  256,   // GPU_blocksize;
  65535, // Maximum threads in the block
  // OCL section
  NULL,  // OCL_handle
  -1,    // OCL_platform;
  -1,    // OCL_device;
  0,     // OCL_max_work_group_size;
  0,     // OCL_max_compute_units
  -1,    // OCL_warp_size
  // MIC
  0,     // default is zero device
  // NOTE(review): 15-byte table; its purpose is not visible in this file -
  // confirm against the struct declaration
  {0x65, 0x64, 0x6e, 0x6f, 0x6e,
   0x75, 0x6c, 0x61, 0x76, 0x65,
   0x72, 0x73, 0x69, 0x61, 0x42},
  // LOG
  NULL   // FILE, file log
};

/// Display name of the host (CPU) backend; it mentions MKL when the library
/// is compiled with SUPPORT_MKL
const std::string _paralution_host_name [1] = {
#ifdef SUPPORT_MKL
  "CPU(MKL/OpenMP)"
#else
  "CPU(OpenMP)"
#endif
};

/// Display names of the accelerator backends. The table is indexed with the
/// descriptor's `backend` value (presumably None, GPU, OCL, MIC in this order)
const std::string _paralution_backend_name [4] = {
  "None",
  "GPU(CUDA)",
  "OpenCL",
  "MIC(OpenMP)"
};

int init_paralution(void) {
  
  _paralution_open_log_file();

  LOG_DEBUG(0, "init_paralution()",
            "* begin");

  if (_get_backend_descriptor()->init == true) {
    LOG_INFO("PARALUTION platform has been initialized - restarting");
    stop_paralution();
  }

  if (strcmp(__PARALUTION_VER_TYPE, "B") == 0) {
    LOG_INFO("This version of PARALUTION is released under GPL.");
    LOG_INFO("By downloading this package you fully agree with the GPL license.");
  }

#ifdef SUPPORT_CUDA
  _get_backend_descriptor()->backend = GPU;
#else
  #ifdef SUPPORT_OCL
    _get_backend_descriptor()->backend = OCL;
  #else
    #ifdef SUPPORT_MIC
    _get_backend_descriptor()->backend = MIC;
   #else
    _get_backend_descriptor()->backend = None;
  #endif
 #endif
#endif

#ifdef _OPENMP
  _get_backend_descriptor()->OpenMP_def_threads = omp_get_max_threads();
  _get_backend_descriptor()->OpenMP_threads = omp_get_max_threads();
  _get_backend_descriptor()->OpenMP_def_nested = omp_get_nested();

  // the default in PARALUTION is 0
  omp_set_nested(0);

  paralution_set_omp_affinity(_get_backend_descriptor()->OpenMP_affinity);
#else 
  _get_backend_descriptor()->OpenMP_threads = 1;
#endif

  if (_get_backend_descriptor()->disable_accelerator == false) {

#ifdef SUPPORT_CUDA
    _get_backend_descriptor()->accelerator = paralution_init_gpu();
#endif
    
#ifdef SUPPORT_OCL
    _get_backend_descriptor()->accelerator = paralution_init_ocl();
#endif
    
#ifdef SUPPORT_MIC
    
#ifdef __INTEL_OFFLOAD
    
    _get_backend_descriptor()->accelerator = paralution_init_mic();
    
#else

  LOG_INFO("The MIC backend is compiled without __INTEL_OFFLOAD - Double check the compilation process!");
  FATAL_ERROR(__FILE__, __LINE__);

#endif

#endif

  } else {

    LOG_INFO("Warning: the accelerator is disabled");

  }

  if (_paralution_check_if_any_obj() == false) {
    LOG_INFO("Error: PARALUTION objects have been created before calling the init_paralution()!");
    FATAL_ERROR(__FILE__, __LINE__);

  }

  LOG_DEBUG(0, "init_paralution()",
            "* end");

  _get_backend_descriptor()->init = true ;
  return 0;

}

int stop_paralution(void) {

  LOG_DEBUG(0, "stop_paralution()",
            "* begin");

  _paralution_delete_all_obj();

#ifdef SUPPORT_CUDA
  paralution_stop_gpu();
#endif

#ifdef SUPPORT_OCL
  paralution_stop_ocl();
#endif

#ifdef SUPPORT_MIC
  paralution_stop_mic();
#endif

#ifdef _OPENMP
  assert(_get_backend_descriptor()->OpenMP_def_threads > 0);
  omp_set_num_threads(_get_backend_descriptor()->OpenMP_def_threads);

  assert((_get_backend_descriptor()->OpenMP_def_nested == 0) ||
         (_get_backend_descriptor()->OpenMP_def_nested == 1));

  omp_set_nested(_get_backend_descriptor()->OpenMP_def_nested);
#endif

  _get_backend_descriptor()->init = false;

  LOG_DEBUG(0, "stop_paralution()",
            "* end");

  _paralution_close_log_file();

  return 0;
}

/// Stores the device index `dev` for every accelerator backend that is
/// compiled in. Asserts that the platform is not initialized.
/// Always returns 0.
int set_device_paralution(int dev) {

  LOG_DEBUG(0, "set_device_paralution()", dev);

  assert(_get_backend_descriptor()->init == false);

#if defined(SUPPORT_CUDA)
  set_gpu_cuda_paralution(dev);
#endif

#if defined(SUPPORT_OCL)
  _get_backend_descriptor()->OCL_dev = dev;
#endif

#if defined(SUPPORT_MIC)
  _get_backend_descriptor()->MIC_dev = dev;
#endif

  return 0;

}

void set_omp_threads_paralution(int nthreads) {

  LOG_DEBUG(0, "set_omp_threads_paralution()",
            nthreads);

  assert(_get_backend_descriptor()->init == true);

#ifdef _OPENMP
  _get_backend_descriptor()->OpenMP_threads = nthreads;

  omp_set_num_threads(nthreads);  

 #if defined(__gnu_linux__) || defined(linux) || defined(__linux) || defined(__linux__)

  paralution_set_omp_affinity(_get_backend_descriptor()->OpenMP_affinity);

#endif // linux

#else // !omp
  LOG_INFO("No OpenMP support");
  _get_backend_descriptor()->OpenMP_threads = 1;
#endif // omp


}

void set_gpu_cuda_paralution(int ngpu) {

  LOG_DEBUG(0, "set_gpu_cuda_paralution()",
            ngpu);

  assert(_get_backend_descriptor()->init == false);

  _get_backend_descriptor()->GPU_dev = ngpu;

}

/// Stores the OpenCL platform and device indices in the backend descriptor.
/// Asserts that the platform is not initialized.
void set_ocl_paralution(int nplatform, int ndevice) {

  // Both values are logged as key=value pairs
  LOG_DEBUG(0, "set_ocl_paralution()",
            "nplatform=" << nplatform <<
            " ndevice=" << ndevice);

  assert(_get_backend_descriptor()->init == false);

  _get_backend_descriptor()->OCL_plat = nplatform;
  _get_backend_descriptor()->OCL_dev = ndevice;

}

void set_ocl_platform_paralution(int platform) {

  LOG_DEBUG(0, "set_ocl_platform_paralution()",
            "platform=" << platform);

  assert(_get_backend_descriptor()->init == false);

  _get_backend_descriptor()->OCL_plat = platform;

}

void set_ocl_work_group_size_paralution(size_t size) {

  LOG_DEBUG(0, "set_ocl_work_group_size()",
            "size=" << size);

  assert(_get_backend_descriptor()->init == false);

  _get_backend_descriptor()->OCL_max_work_group_size = size;

}

void set_ocl_compute_units_paralution(size_t cu) {

  LOG_DEBUG(0, "set_ocl_compute_units()",
            "cu=" << cu);

  assert(_get_backend_descriptor()->init == false);

  _get_backend_descriptor()->OCL_computeUnits = cu;

}

void set_ocl_warp_size_paralution(int size) {

  LOG_DEBUG(0, "set_ocl_warp_size()",
            "size=" << size);

  assert(_get_backend_descriptor()->init == false);

  _get_backend_descriptor()->OCL_warp_size = size;

}

/// Prints the library version, the operating system the library was compiled
/// for (verbose level 3) and the information of the global backend descriptor.
void info_paralution(void) {

  LOG_INFO("PARALUTION ver " <<
           __PARALUTION_VER_TYPE <<
           __PARALUTION_VER_MAJOR << "." <<
           __PARALUTION_VER_MINOR << "." <<
           __PARALUTION_VER_REV <<
           __PARALUTION_VER_PRE);

#if defined(__gnu_linux__) || defined(linux) || defined(__linux) || defined(__linux__)

  LOG_VERBOSE_INFO(3, "Compiled for Linux/Unix OS");

#elif defined(__APPLE__)

  LOG_VERBOSE_INFO(3, "Compiled for Mac OS");

  // The Windows macros are grouped explicitly: '&&' binds tighter than '||',
  // so without the parentheses the Cygwin exclusion would only apply to __WIN64
#elif (defined(WIN32) || defined(_WIN32) || defined(__WIN32) || defined(__WIN64)) && !defined(__CYGWIN__)

  LOG_VERBOSE_INFO(3, "Compiled for Windows OS");

#else

  // unknown
  LOG_VERBOSE_INFO(3, "Compiled for unknown OS");

#endif

  info_paralution(_Backend_Descriptor);

}

/// Prints a summary of the given backend descriptor: initialization state,
/// selected accelerator backend, OpenMP and MKL threading information and the
/// details of every accelerator backend that is compiled in.
void info_paralution(const struct Paralution_Backend_Descriptor backend_descriptor) {

  if (backend_descriptor.init != true) {
    LOG_INFO("PARALUTION platform is NOT initialized");
  } else {
    LOG_INFO("PARALUTION platform is initialized");
  }

  LOG_INFO("Accelerator backend: " << _paralution_backend_name[backend_descriptor.backend]);

  // Host threading
#ifndef _OPENMP
  LOG_INFO("No OpenMP support");
#else
  LOG_INFO("OpenMP threads:" << backend_descriptor.OpenMP_threads);
#endif

#ifndef SUPPORT_MKL
  LOG_VERBOSE_INFO(3, "No MKL support");
#else
  LOG_INFO("MKL threads:" << mkl_get_max_threads() );
#endif

  if (backend_descriptor.disable_accelerator == true) {
    LOG_INFO("The accelerator is disabled");
  }

  // CUDA
#ifndef SUPPORT_CUDA
  LOG_VERBOSE_INFO(3, "No CUDA/GPU support");
#else
  if (backend_descriptor.accelerator) {
    paralution_info_gpu(backend_descriptor);
  } else {
    LOG_INFO("GPU is not initialized");
  }
#endif

  // OpenCL
#ifndef SUPPORT_OCL
  LOG_VERBOSE_INFO(3, "No OpenCL support");
#else
  if (backend_descriptor.accelerator) {
    paralution_info_ocl(backend_descriptor);
  } else {
    LOG_INFO("OpenCL is not initialized");
  }
#endif

  // MIC
#ifndef SUPPORT_MIC
  LOG_VERBOSE_INFO(3, "No MIC/OpenMP support");
#else
  if (backend_descriptor.accelerator) {
    paralution_info_mic(backend_descriptor);
  } else {
    LOG_INFO("MIC/OpenMP is not initialized");
  }
#endif

}


void set_omp_affinity(bool affinity) {

  assert(_get_backend_descriptor()->init == false);
  _get_backend_descriptor()->OpenMP_affinity = affinity;

}

void set_omp_threshold(const int threshold) {

  _get_backend_descriptor()->OpenMP_threshold = threshold;

}


/// Returns the `accelerator` flag of the global backend descriptor
bool _paralution_available_accelerator(void) {

  const bool has_accelerator = _get_backend_descriptor()->accelerator;

  return has_accelerator;

}

void disable_accelerator_paralution(const bool onoff) {

  assert(_get_backend_descriptor()->init == false);

  _get_backend_descriptor()->disable_accelerator = onoff;
 
}

/// Returns a pointer to the global backend descriptor (never NULL)
struct Paralution_Backend_Descriptor *_get_backend_descriptor(void) {

  struct Paralution_Backend_Descriptor *global_descriptor = &_Backend_Descriptor;

  return global_descriptor;

}


/// Overwrites the global backend descriptor with a copy of the given one
void _set_backend_descriptor(const struct Paralution_Backend_Descriptor backend_descriptor) {

  struct Paralution_Backend_Descriptor *target = _get_backend_descriptor();

  *target = backend_descriptor;

}


/// Creates the accelerator vector for the backend selected in the descriptor.
/// Only the backends compiled in (SUPPORT_CUDA / SUPPORT_OCL / SUPPORT_MIC)
/// are handled; every other backend value ends in FATAL_ERROR.
template <typename ValueType>
AcceleratorVector<ValueType>* _paralution_init_base_backend_vector(const struct Paralution_Backend_Descriptor backend_descriptor) {

  LOG_DEBUG(0, "_paralution_init_base_backend_vector()", "");

  switch (backend_descriptor.backend) {

#ifdef SUPPORT_CUDA
  case GPU:
    // CUDA
    return _paralution_init_base_gpu_vector<ValueType>(backend_descriptor);
#endif

#ifdef SUPPORT_OCL
  case OCL:
    // OpenCL
    return _paralution_init_base_ocl_vector<ValueType>(backend_descriptor);
#endif

#ifdef SUPPORT_MIC
  case MIC:
    // MIC
    return _paralution_init_base_mic_vector<ValueType>(backend_descriptor);
#endif

  case 979753345:
    // Dummy label: guarantees at least one 'case' when no backend is
    // compiled in (see the message below)
    LOG_INFO("This is the impossible but VS cannot handle switch statement with 'default' but no 'case' labels");
    FATAL_ERROR(__FILE__, __LINE__);
    return NULL;

  default:
    // The requested backend is not part of this build
    LOG_INFO("Paralution was not compiled with " << _paralution_backend_name[backend_descriptor.backend] << " support");
    LOG_INFO("Building Vector on " << _paralution_backend_name[backend_descriptor.backend] << " failed"); 
    FATAL_ERROR(__FILE__, __LINE__);
    return NULL;

  }

}
  
/// Creates the accelerator matrix of the given format for the backend
/// selected in the descriptor. Only the backends compiled in are handled;
/// every other backend value ends in FATAL_ERROR.
template <typename ValueType>
AcceleratorMatrix<ValueType>* _paralution_init_base_backend_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                                   const unsigned int matrix_format) {

  LOG_DEBUG(0, "_paralution_init_base_backend_matrix()", matrix_format);

  switch (backend_descriptor.backend) {

#ifdef SUPPORT_CUDA
  case GPU:
    // CUDA
    return _paralution_init_base_gpu_matrix<ValueType>(backend_descriptor, matrix_format);
#endif

#ifdef SUPPORT_OCL
  case OCL:
    // OpenCL
    return _paralution_init_base_ocl_matrix<ValueType>(backend_descriptor, matrix_format);
#endif

#ifdef SUPPORT_MIC
  case MIC:
    // MIC
    return _paralution_init_base_mic_matrix<ValueType>(backend_descriptor, matrix_format);
#endif

  case 979753345:
    // Dummy label: guarantees at least one 'case' when no backend is
    // compiled in (see the message below)
    LOG_INFO("This is the impossible but VS cannot handle switch statement with 'default' but no 'case' labels");
    FATAL_ERROR(__FILE__, __LINE__);
    return NULL;

  default:
    // The requested backend is not part of this build
    LOG_INFO("Paralution was not compiled with " << _paralution_backend_name[backend_descriptor.backend] << " support");
    LOG_INFO("Building " << _matrix_format_names[matrix_format] << " Matrix on " << _paralution_backend_name[backend_descriptor.backend] << " failed"); 
    FATAL_ERROR(__FILE__, __LINE__);
    return NULL;

  }

}


/// Allocates a host matrix of the requested format with `new`.
/// Returns NULL when the format is none of the handled ones.
template <typename ValueType>
HostMatrix<ValueType>* _paralution_init_base_host_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                         const unsigned int matrix_format) {

  LOG_DEBUG(0, "_paralution_init_base_host_matrix()", matrix_format);

  switch (matrix_format) {

  case CSR:
    return new HostMatrixCSR<ValueType>(backend_descriptor);

  case COO:
    return new HostMatrixCOO<ValueType>(backend_descriptor);

  case DIA:
    return new HostMatrixDIA<ValueType>(backend_descriptor);

  case ELL:
    return new HostMatrixELL<ValueType>(backend_descriptor);

  case HYB:
    return new HostMatrixHYB<ValueType>(backend_descriptor);

  case DENSE:
    return new HostMatrixDENSE<ValueType>(backend_descriptor);

  case MCSR:
    return new HostMatrixMCSR<ValueType>(backend_descriptor);

  case BCSR:
    return new HostMatrixBCSR<ValueType>(backend_descriptor);

  default:
    // Unknown format: no matrix is created
    return NULL;

  }

}


/// Synchronizes with the accelerator when one is available. Only the CUDA
/// backend is synchronized here.
void _paralution_sync(void) {

  // Without an accelerator there is nothing to wait for
  if (_paralution_available_accelerator() != true)
    return;

#ifdef SUPPORT_CUDA
  paralution_gpu_sync();
#endif

  // The OpenCL and MIC counterparts (paralution_ocl_sync() and
  // paralution_mic_sync()) are commented out in this version, so these
  // backends are not synchronized here

}

/// Chooses the OpenMP thread count for an operation on `size` elements:
/// one thread when the threshold is enabled (> 0) and 0 <= size <= threshold,
/// otherwise the thread count stored in the descriptor. Without OpenMP
/// support nothing is changed.
void _set_omp_backend_threads(const struct Paralution_Backend_Descriptor backend_descriptor,
                              const int size) {

  // Small problems run on a single thread
  const bool below_threshold = (backend_descriptor.OpenMP_threshold > 0) &&
                               (size >= 0) &&
                               (size <= backend_descriptor.OpenMP_threshold);

  const int nthreads = below_threshold ? 1 : backend_descriptor.OpenMP_threads;

#ifdef _OPENMP
  omp_set_num_threads(nthreads);
#else
  (void) nthreads;
#endif

}

/// Registers an object in the global tracking list and returns its id, which
/// is the position of the new entry. With OBJ_TRACKING_OFF nothing is stored
/// and 0 is returned.
size_t _paralution_add_obj(class ParalutionObj* ptr) {

#ifdef OBJ_TRACKING_OFF

  return 0;

#else

  LOG_DEBUG(0, "Creating new PARALUTION object, ptr=", ptr);

  Paralution_Object_Data_Tracking.all_obj.push_back(ptr);

  // The id is the index of the entry that was just appended
  const size_t obj_id = Paralution_Object_Data_Tracking.all_obj.size() - 1;

  LOG_DEBUG(0, "Creating new PARALUTION object, id=", obj_id);

  return obj_id;

#endif

}

/// Removes an object from the global tracking list.
/// `id` is the value returned by _paralution_add_obj() for `ptr`.
/// Returns true when slot `id` holds `ptr` (the slot is then set to NULL);
/// returns false, leaving the list untouched, when `id` is out of range or
/// the slot holds a different pointer. With OBJ_TRACKING_OFF it always
/// returns true.
bool _paralution_del_obj(class ParalutionObj* ptr,
                         size_t id) {

#ifndef OBJ_TRACKING_OFF

  LOG_DEBUG(0, "Deleting PARALUTION object, ptr=", ptr);

  LOG_DEBUG(0, "Deleting PARALUTION object, id=", id);

  // An id that was never handed out must not be used as an index
  if (id >= Paralution_Object_Data_Tracking.all_obj.size())
    return false;

  // Do not wipe the slot when it belongs to another object; otherwise that
  // object would silently drop out of the tracking list
  if (Paralution_Object_Data_Tracking.all_obj[id] != ptr)
    return false;

  Paralution_Object_Data_Tracking.all_obj[id] = NULL;

  return true;

#else

  return true;

#endif

}

/// Calls Clear() on every object that is still registered in the global
/// tracking list (NULL slots are skipped). The list itself is not modified
/// here. With OBJ_TRACKING_OFF this is a no-op.
void _paralution_delete_all_obj(void) {

#ifndef OBJ_TRACKING_OFF

  LOG_DEBUG(0, "_paralution_delete_all_obj()", "* begin");

  // size_t matches the id type used by _paralution_add_obj()/_paralution_del_obj()
  // and cannot wrap before reaching size()
  for (size_t i = 0;
       i < Paralution_Object_Data_Tracking.all_obj.size();
       ++i) {

    if (Paralution_Object_Data_Tracking.all_obj[i] != NULL)
      Paralution_Object_Data_Tracking.all_obj[i]->Clear();

    LOG_DEBUG(0, "clearing PARALUTION obj ptr=",
              Paralution_Object_Data_Tracking.all_obj[i]);

  }

  LOG_DEBUG(0, "_paralution_delete_all_obj()", "* end");

#endif

}

/// Returns true when the global tracking list has no entries and false when
/// at least one entry exists (note the inverted meaning with respect to the
/// function name). With OBJ_TRACKING_OFF it always returns true.
bool _paralution_check_if_any_obj(void) {

#ifndef OBJ_TRACKING_OFF

  const bool list_is_empty = !(Paralution_Object_Data_Tracking.all_obj.size() > 0);

  return list_is_empty;

#else

  return true;

#endif

}


// Explicit instantiations of the factory templates defined above.
// The complex variants are only instantiated when SUPPORT_COMPLEX is defined.

// Accelerator vectors: float, double, (complex) and int
template AcceleratorVector<float>* _paralution_init_base_backend_vector(const struct Paralution_Backend_Descriptor backend_descriptor);
template AcceleratorVector<double>* _paralution_init_base_backend_vector(const struct Paralution_Backend_Descriptor backend_descriptor);
#ifdef SUPPORT_COMPLEX
template AcceleratorVector<std::complex<float> >* _paralution_init_base_backend_vector(const struct Paralution_Backend_Descriptor backend_descriptor);
template AcceleratorVector<std::complex<double> >* _paralution_init_base_backend_vector(const struct Paralution_Backend_Descriptor backend_descriptor);
#endif
template AcceleratorVector<int>* _paralution_init_base_backend_vector(const struct Paralution_Backend_Descriptor backend_descriptor);

// Accelerator matrices: float, double and (complex)
template AcceleratorMatrix<float>* _paralution_init_base_backend_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                                        const unsigned int matrix_format);
template AcceleratorMatrix<double>* _paralution_init_base_backend_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                                         const unsigned int matrix_format);
#ifdef SUPPORT_COMPLEX
template AcceleratorMatrix<std::complex<float> >* _paralution_init_base_backend_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                                                       const unsigned int matrix_format);
template AcceleratorMatrix<std::complex<double> >* _paralution_init_base_backend_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                                                        const unsigned int matrix_format);
#endif
// Host matrices: float, double and (complex)
template HostMatrix<float>* _paralution_init_base_host_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                              const unsigned int matrix_format);
template HostMatrix<double>* _paralution_init_base_host_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                               const unsigned int matrix_format);
#ifdef SUPPORT_COMPLEX
template HostMatrix<std::complex<float> >* _paralution_init_base_host_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                                             const unsigned int matrix_format);
template HostMatrix<std::complex<double> >* _paralution_init_base_host_matrix(const struct Paralution_Backend_Descriptor backend_descriptor,
                                                                              const unsigned int matrix_format);
#endif
}