//+------------------------------------------------------------------+
//|                        OCL_pi_double2_several_simple_kernels.mq5 |
//+------------------------------------------------------------------+
#property copyright "Copyright (c) 2012, Mthmt"
#property link      "http://www.mql5.com"
#property version   "1.00"
#property script_show_inputs;

//--- Script parameters and the values derived from them.
//--- The derived globals below are initialized in declaration order, so _step and
//--- _itInKern must stay after _num_steps (and after the _divisor macro).
//--- index of the OpenCL device passed to CLContextCreate() in OnStart()
input int   _device = 0;                                     // OpenCL device number (0, I have CPU)
//--- total number of sample points used to approximate pi
long        _num_steps        = 1000000000; 
//--- number of kernel work items; also the element count of the kernel output buffer
#define     _divisor            10000
//--- vector width of the kernel; OnStart() has kernel sources for 4, 8, 16 and 32
input int   _ch               = 32;                          // number of parallel channels
//--- width of one integration step
double      _step             = 1.0 / _num_steps;
long        _itInKern         = _num_steps / _divisor / _ch; // iterations in the kernel; division of _num_steps by ( _divisor * _ch ) shall obviously result in an integer
string      _clSrc;                                          // kernel code will be here

//--- Shorthand for DoubleToString(): renders arg as text, forwarding dig as the digits argument.
string d2s(double arg,int dig)
  {
   return(DoubleToString(arg,dig));
  }
//--- Shorthand for IntegerToString(): renders the integer arg as text.
string i2s(long arg)
  {
   return(IntegerToString(arg));
  }

//--- OpenCL source of the 16-lane kernel (selected in OnStart() when _ch == 16).
//--- _ITERATIONS and _STEP are written into the text from _itInKern and _step
//--- (_step is formatted with 12 digits after the decimal point).
//--- Work item i runs _ITERATIONS loop passes; pass j covers the 16 consecutive sample
//--- indices starting at 16*(i*_ITERATIONS+j), evaluates 4/(x*x+1) at
//--- x=(index+0.5)*_STEP for every lane and accumulates per lane in sumVect.
//--- dot16() multiplies its arguments element-wise and adds up all 16 products;
//--- the result is stored in out[i]. The host later multiplies the grand total by _step.
//--- NOTE(review): dot16() is called with the scalar 1. where a double16 is declared -
//--- confirm the target OpenCL compiler accepts this.
const string clSrc16=
                     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable                                  \r\n"
                     "#define _ITERATIONS "+i2s(_itInKern)+"                                         \r\n"
                     "#define _STEP "+d2s(_step,12)+"                                                \r\n"
                     "                                                                               \r\n"
                     "inline double dot16( double16 a, double16 b )                                  \r\n"
                     "{                                                                              \r\n"
                     "     double16 c  = a * b;                                                      \r\n"
                     "     double4 _1  = ( double4 ) ( 1., 1., 1., 1. );                             \r\n"
                     "     return dot( c.lo.lo + c.lo.hi + c.hi.lo  + c.hi.hi, _1 );                 \r\n"
                     "}                                                                              \r\n"
                     "                                                                               \r\n"
                     "__kernel void pi( __global double *out )                                       \r\n"
                     "{                                                                              \r\n"
                     "  int i = get_global_id( 0 );                                                  \r\n"
                     "                                                                               \r\n"
                     "  /// define vector constants                                           \r\n"
                     "  double16 _v16  = ( double16 ) ( 0., 1.,  2.,  3.,  4.,  5.,  6.,  7.,        \r\n"
                     "                                  8., 9., 10., 11., 12., 13., 14., 15. );      \r\n"
                     "                                                                               \r\n"
                     "  /// all vector-related with undefined type                                     \r\n"
                     "  double16 xVect;                                                              \r\n"
                     "  double16 sumVect           = ( double16 ) ( 0.0 );                           \r\n"
                     "  double16 doubleSTEPVect    = ( double16 ) ( _STEP );                         \r\n"
                     "                                                                               \r\n"
                     "  double16 in;                                                                 \r\n"
                     "  for( long j = 0; j < _ITERATIONS; j ++ )                                     \r\n"
                     "  {                                                                            \r\n"
                     "     in =  _v16 + 16. * ( i * _ITERATIONS + j );                               \r\n"
                     "     xVect = ( in  + 0.5 ) * doubleSTEPVect;          /// here                  \r\n"
                     "     sumVect += 4. / ( xVect * xVect + 1. );                                   \r\n"
                     "  }                                                                            \r\n"
                     "  out[ i ] = dot16( sumVect, 1. );                                             \r\n"
                     "}                                                                              \r\n";

//--- OpenCL source of the 8-lane kernel (selected in OnStart() when _ch == 8).
//--- _ITERATIONS and _STEP are written into the text from _itInKern and _step
//--- (_step is formatted with 12 digits after the decimal point).
//--- Work item i runs _ITERATIONS loop passes; pass j covers the 8 consecutive sample
//--- indices starting at 8*(i*_ITERATIONS+j), evaluates 4/(x*x+1) at
//--- x=(index+0.5)*_STEP for every lane and accumulates per lane in sumVect.
//--- dot8() adds the built-in dot() of the low halves to that of the high halves;
//--- the result is stored in out[i]. The host later multiplies the grand total by _step.
//--- NOTE(review): dot8() is called with the scalar 1. where a double8 is declared -
//--- confirm the target OpenCL compiler accepts this.
const string clSrc8=
                    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable                                   \r\n"
                    "#define _ITERATIONS "+i2s(_itInKern)+"                                          \r\n"
                    "#define _STEP "+d2s(_step,12)+"                                                 \r\n"
                    "                                                                                \r\n"
                    "inline double dot8( double8 a, double8 b )                                      \r\n"
                    "{                                                                               \r\n"
                    "	   return      dot( a.lo, b.lo ) + dot( a.hi, b.hi );                        \r\n"
                    "}                                                                               \r\n"
                    "                                                                                \r\n"
                    "__kernel void pi( __global double *out )                                        \r\n"
                    "{                                                                               \r\n"
                    "  int i = get_global_id( 0 );                                                   \r\n"
                    "                                                                                \r\n"
                    "  /// define vector constants                                            \r\n"
                    "  double8 _v8  = ( double8 ) ( 0., 1.,  2.,  3.,  4.,  5.,  6.,  7. );          \r\n"
                    "                                                                                \r\n"
                    "  /// all vector-related with undefined type                                      \r\n"
                    "  double8 xVect;                                                                \r\n"
                    "  double8 sumVect           = ( double8 ) ( 0.0 );                              \r\n"
                    "  double8 doubleSTEPVect    = ( double8 ) ( _STEP );                            \r\n"
                    "                                                                                \r\n"
                    "  double8 in;                                                                   \r\n"
                    "  for( long j = 0; j < _ITERATIONS; j ++ )                                      \r\n"
                    "  {                                                                             \r\n"
                    "     in =  _v8 + 8. * ( i * _ITERATIONS + j );                                  \r\n"
                    "     xVect = ( in  + 0.5 ) * doubleSTEPVect;                          // here    \r\n"
                    "     sumVect += 4. / ( xVect * xVect + 1. );                                    \r\n"
                    "  }                                                                             \r\n"
                    "  out[ i ] = dot8( sumVect, 1. );                                               \r\n"
                    "}                                                                               \r\n";

//--- OpenCL source of the 4-lane kernel (selected in OnStart() when _ch == 4).
//--- _ITERATIONS and _STEP are written into the text from _itInKern and _step
//--- (_step is formatted with 12 digits after the decimal point).
//--- Work item i runs _ITERATIONS loop passes; pass j covers the 4 consecutive sample
//--- indices starting at 4*(i*_ITERATIONS+j), evaluates 4/(x*x+1) at
//--- x=(index+0.5)*_STEP for every lane and accumulates per lane in sumVect.
//--- The kernel stores dot( sumVect, 1. ) in out[i]; the dot4() wrapper is defined
//--- in the text but the kernel calls the built-in dot() directly.
//--- The host later multiplies the grand total by _step.
const string clSrc4=
                    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable                                   \r\n"
                    "#define _ITERATIONS "+i2s(_itInKern)+"                                          \r\n"
                    "#define _STEP "+d2s(_step,12)+"                                                 \r\n"
                    "                                                                                \r\n"
                    "inline double dot4( double4 a, double4 b )                                      \r\n"
                    "{                                                                               \r\n"
                    "	   return      dot( a, b );                                                  \r\n"
                    "}                                                                               \r\n"
                    "                                                                                \r\n"
                    "__kernel void pi( __global double *out )                                        \r\n"
                    "{                                                                               \r\n"
                    "  int i = get_global_id( 0 );                                                   \r\n"
                    "                                                                                \r\n"
                    "  /// define vector constants                                            \r\n"
                    "  double4 _v4  = ( double4 ) ( 0., 1.,  2.,  3. );                              \r\n"
                    "                                                                                \r\n"
                    "  /// all vector-related with undefined type                                      \r\n"
                    "  double4 xVect;                                                                \r\n"
                    "  double4 sumVect           = ( double4 ) ( 0.0 );                              \r\n"
                    "  double4 doubleSTEPVect    = ( double4 ) ( _STEP );                            \r\n"
                    "                                                                                \r\n"
                    "  double4 in;                                                                   \r\n"
                    "  for( long j = 0; j < _ITERATIONS; j ++ )                                      \r\n"
                    "  {                                                                             \r\n"
                    "     in =  _v4 + 4. * ( i * _ITERATIONS + j );                                  \r\n"
                    "     xVect = ( in  + 0.5 ) * doubleSTEPVect;                          // here    \r\n"
                    "     sumVect += 4. / ( xVect * xVect + 1. );                                    \r\n"
                    "  }                                                                             \r\n"
                    "  out[ i ] = dot( sumVect, 1. );                                                \r\n"
                    "}                                                                               \r\n";

//--- this kernel is experimental but it works!
//--- OpenCL source of the 32-lane kernel (selected in OnStart() when _ch == 32).
//--- The kernel text declares its own double32 struct made of two double16 halves
//--- (lo, hi), plus convert2double32() to fill both halves with one scalar and
//--- dot32() to multiply two double32 values element-wise and add up all 32 products.
//--- _ITERATIONS and _STEP are written into the text from _itInKern and _step
//--- (_step is formatted with 12 digits after the decimal point).
//--- Work item i runs _ITERATIONS loop passes; pass j covers the 32 consecutive sample
//--- indices starting at 32*(i*_ITERATIONS+j), handling the lo and hi halves with
//--- separate double16 statements, and accumulates 4/(x*x+1) at x=(index+0.5)*_STEP.
//--- out[i] receives dot32( sumVect, ones ); the host later multiplies the total by _step.
const string clSrc32=
                     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable                                  \r\n"
                     "#define _ITERATIONS "+i2s(_itInKern)+"                                         \r\n"
                     "#define _STEP "+d2s(_step,12)+"                                                \r\n"
                     "                                                                               \r\n"
                     "typedef struct                                                                 \r\n"
                     "{                                                                              \r\n"
                     "  double16    lo;                                                              \r\n"
                     "  double16    hi;                                                              \r\n"
                     "} double32;                                                                    \r\n"
                     "                                                                               \r\n"
                     "inline double32 convert2double32( double a )                                   \r\n"
                     "{                                                                              \r\n"
                     "  double32 b;                                                                  \r\n"
                     "	b.lo = ( double16 )( a );                                                    \r\n"
                     "	b.hi = ( double16 )( a );                                                    \r\n"
                     "	return b;                                                                    \r\n"
                     "}                                                                              \r\n"
                     "                                                                               \r\n"
                     "inline double dot32( double32 a, double32 b )                                  \r\n"
                     "{                                                                              \r\n"
                     "     double32 c;                                                               \r\n"
                     "     c.lo = a.lo * b.lo;                                                       \r\n"
                     "     c.hi = a.hi * b.hi;                                                       \r\n"
                     "     double4 _1  = ( double4 ) ( 1., 1., 1., 1. );                             \r\n"
                     "     return dot( c.lo.lo.lo + c.lo.lo.hi + c.lo.hi.lo  + c.lo.hi.hi +          \r\n"
                     "                 c.hi.lo.lo + c.hi.lo.hi + c.hi.hi.lo  + c.hi.hi.hi, _1 );     \r\n"
                     "}                                                                              \r\n"
                     "                                                                               \r\n"
                     "__kernel void pi( __global double *out )                                       \r\n"
                     "{                                                                              \r\n"
                     "  int i = get_global_id( 0 );                                                  \r\n"
                     "                                                                               \r\n"
                     "  /// define vector constants                                           \r\n"
                     "  double32 _v32;                                                               \r\n"
                     "  _v32.lo = ( double16 ) (  0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,             \r\n"
                     "                	           8.,  9., 10., 11., 12., 13., 14., 15. );           \r\n"
                     "  _v32.hi = ( double16 ) ( 16., 17., 18., 19., 20., 21., 22., 23.,             \r\n"
                     "                     		 24., 25., 26., 27., 28., 29., 30., 31. );           \r\n"
                     "                                                                               \r\n"
                     "  /// all vector-related with undefined type                                     \r\n"
                     "  double32 xVect;                                                              \r\n"
                     "  double32 sumVect           = convert2double32( 0.0 );                        \r\n"
                     "  double32 double1Vect       = convert2double32( 1.0 );                        \r\n"
                     "                                                                               \r\n"
                     "  double32 in;                                                                 \r\n"
                     " /// work only with 16-vectors in the loop!                                   \r\n"
                     "  for( long j = 0; j < _ITERATIONS; j ++ )                                     \r\n"
                     "  {                                                                            \r\n"
                     "     in.lo = _v32.lo + 32. * ( i * _ITERATIONS + j );                          \r\n"
                     "     in.hi = _v32.hi + 32. * ( i * _ITERATIONS + j );                          \r\n"
                     "     xVect.lo = ( in.lo  + 0.5 ) * _STEP;                                      \r\n"
                     "     xVect.hi = ( in.hi  + 0.5 ) * _STEP;                                      \r\n"
                     "     sumVect.lo += 4. / ( xVect.lo * xVect.lo + 1. );                          \r\n"
                     "     sumVect.hi += 4. / ( xVect.hi * xVect.hi + 1. );                          \r\n"
                     "  }                                                                            \r\n"
                     "  out[ i ] = dot32( sumVect, double1Vect );                                    \r\n"
                     "}                                                                              \r\n";
//+------------------------------------------------------------------+
//| CPUcalc                                                          |
//+------------------------------------------------------------------+
//--- Computes pi on the CPU twice and prints value and elapsed time for each pass:
//---   "DULL"    - one flat loop over all _num_steps sample points;
//---   "SMARTER" - the same points split into 40000 consecutive chunks, each summed
//---               separately before being added to the total.
//--- Returns the elapsed time of the second pass in seconds.
double CPUcalc()
  {
   double x,pi;

//--- pass 1: single accumulator over the whole range
   long tickBegin=GetTickCount();
   double total=0.0;
   long k=0;
   while(k<_num_steps)
     {
      x=(k+0.5)*_step;
      total+=4.0/(1.+x*x);
      k++;
     }
   pi=total*_step;
   long tickEnd=GetTickCount();

   Print("DULL: The value of PI is "+DoubleToString(pi,12));
   Print("DULL: The time to calculate PI was "+d2s(( tickEnd-tickBegin)/1000.0,3)+" seconds");

//--- pass 2: per-chunk partial sums; the chunk count plays the role of the number of kernel work items
   tickBegin=GetTickCount();
   total=0.;
   long chunks=40000;
   long chunkLen=_num_steps/chunks;
   for(long c=0; c<chunks; c++)
     {
      double chunkSum=0.;
      long chunkEnd=(c+1)*chunkLen;
      for(long j=c*chunkLen; j<chunkEnd; j++)
        {
         x=(j+0.5)*_step;
         chunkSum+=4.0/(1.+x*x);
        }
      total+=chunkSum;
     }
   pi=total*_step;
   tickEnd=GetTickCount();

   double elapsed=(tickEnd-tickBegin)/1000.0;
   Print("SMARTER: The value of PI is "+d2s(pi,12));
   Print("SMARTER: The time to calculate PI was "+d2s(elapsed,3)+" seconds");

//--- only the second (chunked) timing is reported to the caller
   return(elapsed);
  }
//+------------------------------------------------------------------+
//| WriteCLProgram                                                   |
//+------------------------------------------------------------------+
//--- Dumps the currently selected kernel source (_clSrc) to the text file
//--- "pi_several_simple_kernels.cl" so it can be inspected outside the script.
//--- The dump is best effort: if the file cannot be opened the problem is reported
//--- and the function returns without writing, instead of using an invalid handle.
void WriteCLProgram()
  {
   const string fileName="pi_several_simple_kernels.cl";
   int h=FileOpen(fileName,FILE_WRITE|FILE_TXT|FILE_ANSI);
   if(h==INVALID_HANDLE)
     {
      Print("WriteCLProgram: cannot open "+fileName+" for writing, GetLastError returned .. ",GetLastError());
      //--- clear the code so later GetLastError() checks do not misreport this failure
      ResetLastError();
      return;
     }
   FileWrite(h,_clSrc);
   FileClose(h);
  }
//+------------------------------------------------------------------+
//| Script program start function                                    |
//+------------------------------------------------------------------+
//--- Script entry point: builds the kernel selected by _ch, runs it with _divisor
//--- work items, sums the per-work-item results and prints pi and the elapsed time.
//--- Every OpenCL handle created here is released on every exit path, including
//--- the failure paths. Returns 0 in all cases (the original failure path returned
//--- false, which is also 0).
int OnStart()
  {
   Print("=================================================================");
   Print("DOUBLE2: _step = "+d2s(_step,12)+"; _itInKern = "+i2s(_itInKern)+"; vectorization channels - "+i2s(_ch));

//--- choose the kernel text first, so an unsupported channel count fails before any OpenCL object exists
   switch(_ch)
     {
      case 4:  _clSrc = clSrc4;  break;
      case 8:  _clSrc = clSrc8;  break;
      case 16: _clSrc = clSrc16; break;
      case 32: _clSrc = clSrc32; break;
      default:
         Print("Unsupported number of parallel channels: "+i2s(_ch)+"; expected 4, 8, 16 or 32.");
         return(0);
     }
   WriteCLProgram();

   int clCtx=CLContextCreate(_device);
   if(clCtx==INVALID_HANDLE)
     {
      Print("OpenCL context create failed.");
      Print("GetLastError returned .. ",GetLastError());
      return(0);
     }

//--- reset first so the code reported below belongs to CLProgramCreate and not to an earlier call
   ResetLastError();
   int clPrg=CLProgramCreate(clCtx,_clSrc);
   int err=GetLastError();
   Print("GetLastError returned .. ",err);
   switch(err)
     {
      case ERR_OPENCL_INVALID_HANDLE:        Print("CLProgramCreate: invalid handle to the OpenCL program");                   break;
      case ERR_INVALID_PARAMETER:            Print( "CLProgramCreate: invalid string parameter" );                        break;
      case ERR_NOT_ENOUGH_MEMORY:            Print( "CLProgramCreate: not enough memory to complete operation" );          break;
      case ERR_OPENCL_PROGRAM_CREATE:        Print("CLProgramCreate: internal OpenCL error or compilation error.");      break;
      default:                               Print("CLProgramCreate: unknown error or no error.");
     }
   if(clPrg==INVALID_HANDLE)
     {
      ReleaseCLHandles(clCtx,INVALID_HANDLE,INVALID_HANDLE,INVALID_HANDLE);
      return(0);
     }

   ResetLastError();
   int clKrn=CLKernelCreate(clPrg,"pi");
   if(clKrn==INVALID_HANDLE)
     {
      err=GetLastError();
      Print("OpenCL kernel create failed.");
      Print("GetLastError returned .. ",err);
      switch(err)
        {
         case ERR_OPENCL_INVALID_HANDLE:        Print("CLKernelCreate: invalid handle to the OpenCL program");                break;
         case ERR_INVALID_PARAMETER:            Print("CLKernelCreate: invalid string parameter");                     break;
         case ERR_OPENCL_TOO_LONG_KERNEL_NAME:  Print("CLKernelCreate: kernel name exceeds 127 characters");           break;
         case ERR_OPENCL_KERNEL_CREATE:         Print("CLKernelCreate: internal error while creating an OpenCL object.");   break;
         default:                               Print("CLKernelCreate: unknown error or no error.");
        }
      //--- the program and the context were created successfully and must not leak
      ReleaseCLHandles(clCtx,clPrg,INVALID_HANDLE,INVALID_HANDLE);
      return(0);
     }

//--- one double per work item
   int clMemOut=CLBufferCreate(clCtx,_divisor*sizeof(double),CL_MEM_READ_WRITE);
   if(clMemOut==INVALID_HANDLE)
     {
      Print("OpenCL buffer create failed.");
      Print("GetLastError returned .. ",GetLastError());
      ReleaseCLHandles(clCtx,clPrg,clKrn,INVALID_HANDLE);
      return(0);
     }

   uint st=GetTickCount();

   if(!CLSetKernelArgMem(clKrn,0,clMemOut))
     {
      Print("CLSetKernelArgMem failed.");
      Print("GetLastError returned .. ",GetLastError());
      ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);
      return(0);
     }

   const uint offs[1]={ 0 };
   const uint works[1]={ _divisor };         // the number of tasks remains the same, even if every task is vectorized
   if(!CLExecute(clKrn,1,offs,works))
     {
      Print("CLExecute failed.");
      Print("GetLastError returned .. ",GetLastError());
      ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);
      return(0);
     }

//--- the host array must hold exactly as many elements as the kernel buffer
   double buf[];
   if(ArrayResize(buf,_divisor)!=_divisor)
     {
      Print("ArrayResize failed.");
      Print("GetLastError returned .. ",GetLastError());
      ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);
      return(0);
     }
   uint read=CLBufferRead(clMemOut,buf);
   Print("read = "+i2s(read)+" elements");
   if(read!=(uint)_divisor)
     {
      //--- a short read would leave part of buf[] unset and give a wrong sum
      Print("CLBufferRead failed or returned fewer elements than expected.");
      Print("GetLastError returned .. ",GetLastError());
      ArrayResize(buf,0);
      ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);
      return(0);
     }

//--- calculate pi by adding up the per-work-item partial sums
   double sum=0.0;
   for(int cnt=0; cnt<_divisor; cnt++) sum+=buf[cnt];
   double pi=sum*_step;
   double gone=(GetTickCount()-st)/1000.;

   ArrayResize(buf,0);

   ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);

   Print("OPENCL: pi = "+d2s(pi,12));
   Print("OPENCL: gone = "+d2s(gone,3)+" sec.");

//--- optional CPU comparison, disabled by default
/*
   double time = CPUcalc( );
   Print( "CPUtime / GPUtime = " + d2s( time / gone, 3 ) );
   Print( "==================================================" );         
   */
   return(0);
  }
//+------------------------------------------------------------------+
//| ReleaseCLHandles                                                 |
//+------------------------------------------------------------------+
//--- Frees the given OpenCL handles in the order buffer, kernel, program, context.
//--- Pass INVALID_HANDLE for any object that was never created; such entries are skipped.
void ReleaseCLHandles(int ctx,int prg,int krn,int mem)
  {
   if(mem!=INVALID_HANDLE) CLBufferFree(mem);
   if(krn!=INVALID_HANDLE) CLKernelFree(krn);
   if(prg!=INVALID_HANDLE) CLProgramFree(prg);
   if(ctx!=INVALID_HANDLE) CLContextFree(ctx);
  }
//+------------------------------------------------------------------+