//+------------------------------------------------------------------+
//|                             OCL_pi_double2_parallel_straight.mq5 |
//+------------------------------------------------------------------+
#property copyright "Copyright (c) 2012, Mthmt"
#property link      "http://www.mql5.com"
#property version   "1.00"
#property script_show_inputs;

input int  _device = 0;  // OpenCL device number (0, I have CPU)
//--- total number of integration steps used to approximate pi
long       _num_steps    = 1000000000; 
//--- number of OpenCL work items; the host sums one partial result per work item
#define    _divisor      20000
//--- width of a single integration step
double     _step         = 1.0 / _num_steps;
input int  _ch           = 16;                      /// number of parallel channels
//--- number of loop iterations executed by each work item (each iteration handles _ch steps)
long       _intrnCnt     = _num_steps / _divisor / _ch;
//--- parenthesized so that the macro expands as a single value inside larger expressions
//--- (e.g. "x / _tot" would otherwise expand to "x / _divisor / _ch")
#define    _tot          (_divisor / _ch)

//--- shorthand for DoubleToString(): converts arg to text, passing dig as the digits argument
string d2s(double arg,int dig)
  {
   return(DoubleToString(arg,dig));
  }
//--- shorthand for IntegerToString(): converts an integer value to text
string i2s(long arg)
  {
   return(IntegerToString(arg));
  }

//--- OpenCL source of the "pi" kernel, assembled as text from the script settings.
//--- _ITERATIONS, _STEP, _CH and _DOUBLETYPE are substituted from _intrnCnt, _step and _ch.
//--- The kernel refers to "v<_ch>" and "dot<_ch>", so _ch must be one of 2, 4, 8 or 16;
//--- dot2 is defined explicitly because without it the _ch = 2 variant does not compile.
//--- Each work item i accumulates _ITERATIONS vector terms of 4/(1+x*x) and stores their
//--- horizontal sum in out[ i ]; the host multiplies the grand total by the step width.
const string clSrc=
                   "/// enable extensions with doubles                                                            \r\n"
                   "#pragma OPENCL EXTENSION cl_khr_fp64 : enable                                               \r\n"
                   "#define _ITERATIONS "+i2s(_intrnCnt)+"                                                      \r\n"
                   "#define _STEP "+d2s(_step,12)+"                                                             \r\n"
                   "#define _CH "+i2s(_ch)+"                                                                    \r\n"
                   "#define _DOUBLETYPE double"+i2s(_ch)+"                                                      \r\n"
                   "                                                                                            \r\n"
                   "/// extensions for 4-, 8- and 16- scalar products                                       \r\n"
                   "#define dot2( a, b )         dot( a, b )                                                    \r\n"
                   "#define dot4( a, b )         dot( a, b )                                                    \r\n"
                   "                                                                                            \r\n"
                   "inline double dot8( double8 a, double8 b )                                                  \r\n"
                   "{                                                                                           \r\n"
                   "     return dot4( a.lo, b.lo ) + dot4( a.hi, b.hi );                                        \r\n"
                   "}                                                                                           \r\n"
                   "                                                                                            \r\n"
                   "inline double dot16( double16 a, double16 b )                                               \r\n"
                   "{                                                                                           \r\n"
                   "     double16 c  = a * b;                                                                   \r\n"
                   "     double4 _1  = ( double4 ) ( 1., 1., 1., 1. );                                          \r\n"
                   "     return dot4( c.lo.lo + c.lo.hi + c.hi.lo  + c.hi.hi, _1 );                             \r\n"
                   "}                                                                                           \r\n"
                   "                                                                                            \r\n"
                   "__kernel void pi( __global double *out )                                                    \r\n"
                   "{                                                                                           \r\n"
                   "  int i = get_global_id( 0 );                                                               \r\n"
                   "                                                                                            \r\n"
                   "  /// define vector constants                                                        \r\n"
                   "  double16 v16  = ( double16 ) ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );    \r\n"
                   "  double8  v8   = v16.lo;                                                                   \r\n"
                   "  double4  v4   = v16.lo.lo;                                                                \r\n"
                   "  double2  v2   = v16.lo.lo.lo;                                                             \r\n"
                   "                                                                                            \r\n"
                   "  /// all vector-related with the calculated type                                                     \r\n"
                   "  _DOUBLETYPE in;                                                                           \r\n"
                   "  _DOUBLETYPE xVect;                                                                        \r\n"
                   "  _DOUBLETYPE sumVect          = ( _DOUBLETYPE ) ( 0.0 );                                   \r\n"
                   "  _DOUBLETYPE doubleOneVect    = ( _DOUBLETYPE ) ( 1.0 );                                   \r\n"
                   "  _DOUBLETYPE doubleCHVect     = ( _DOUBLETYPE ) ( _CH + 0. );                              \r\n"
                   "  _DOUBLETYPE doubleSTEPVect   = ( _DOUBLETYPE ) ( _STEP );                                 \r\n"
                   "                                                                                            \r\n"
                   "  for( long j = 0; j < _ITERATIONS; j ++ )                                                  \r\n"
                   "  {                                                                                         \r\n"
                   "     in =  v"+i2s(_ch)+" + doubleCHVect * ( i * _ITERATIONS + j );                          \r\n"
                   "     xVect = ( in + 0.5 ) * doubleSTEPVect;                                                 \r\n"
                   "     sumVect += 4.0 / ( xVect * xVect + 1. );                                               \r\n"
                   "  }                                                                                         \r\n"
                   "  out[ i ] = dot"+i2s(_ch)+"(  sumVect, doubleOneVect );                                    \r\n"
                   "}                                                                                           \r\n";
//+------------------------------------------------------------------+
//| CPUcalc                                                          |
//+------------------------------------------------------------------+
//--- Computes pi on the CPU twice by summing 4/(1+x*x) over _num_steps midpoints
//--- and logs the value and the elapsed time of each pass.
//--- Returns the elapsed time (in seconds) of the second, chunked pass.
double CPUcalc()
  {
   double x,pi;
   double total=0.0;

//--- pass 1: a single flat loop adding every term to one running total
   long t0=GetTickCount();
   for(long k=0; k<_num_steps; k++)
     {
      x=(k+0.5)*_step;
      total+=4.0/(1.+x*x);
     }
   pi=total*_step;
   long t1=GetTickCount();

   Print("DULL: The value of PI is "+d2s(pi,12));
   Print("DULL: The time to calculate PI was "+d2s((t1-t0)/1000.0,3)+" seconds");

//--- pass 2: the same terms, accumulated per chunk first and then added to the total
   t0=GetTickCount();
   total=0.0;
   long chunks=40000;
   long chunkLen=_num_steps/chunks;
   for(long c=0; c<chunks; c++)
     {
      double chunkSum=0.;
      for(long k=c*chunkLen; k<(c+1)*chunkLen; k++)
        {
         x=(k+0.5)*_step;
         chunkSum+=4.0/(1.+x*x);
        }
      total+=chunkSum;
     }
   pi=total*_step;
   t1=GetTickCount();

   double elapsed=(t1-t0)/1000.0;
   Print("SMARTER: The value of PI is "+d2s(pi,12));
   Print("SMARTER: The time to calculate PI was "+d2s(elapsed,3)+" seconds");

//--- only the timing of the second pass is reported to the caller
   return(elapsed);
  }
//+------------------------------------------------------------------+
//| WriteCLProgram                                                   |
//+------------------------------------------------------------------+
//--- Dumps the generated OpenCL source (clSrc) into a text file so it can be inspected.
//--- A failure to open the file is logged and otherwise ignored: the dump is only a
//--- convenience and the calculation itself does not depend on it.
void WriteCLProgram()
  {
   ResetLastError();
   int h=FileOpen("pi_double2_parallel_straight.cl",FILE_WRITE|FILE_TXT|FILE_ANSI);
//--- never pass an invalid handle on to FileWrite/FileClose
   if(h==INVALID_HANDLE)
     {
      Print("WriteCLProgram: cannot open pi_double2_parallel_straight.cl, GetLastError returned .. ",GetLastError());
      return;
     }
   FileWrite(h,clSrc);
   FileClose(h);
  }
//+------------------------------------------------------------------+
//| Script program start function                                    |
//+------------------------------------------------------------------+
//--- Script entry point: runs the pi kernel through OpenCL, then the CPU reference,
//--- and logs both results together with the CPU/OpenCL time ratio.
//--- Returns 0 on success and -1 when any OpenCL step fails; every handle created
//--- up to the point of failure is released before returning.
int OnStart()
  {
   WriteCLProgram();
   Print("=================================================================");
   Print("DOUBLE2: _step = "+d2s(_step,12)+"; _intrnCnt = "+i2s(_intrnCnt));

//--- OpenCL context
   ResetLastError();
   int clCtx=CLContextCreate(_device);
   if(clCtx==INVALID_HANDLE)
     {
      Print("OpenCL context create failed. GetLastError returned .. ",GetLastError());
      return(-1);
     }

//--- OpenCL program; the error code is examined only when creation really failed,
//--- so a successful run no longer logs a bogus "unknown error"
   ResetLastError();
   int clPrg=CLProgramCreate(clCtx,clSrc);
   if(clPrg==INVALID_HANDLE)
     {
      switch(GetLastError())
        {
         case ERR_OPENCL_INVALID_HANDLE:        Print("CLProgramCreate: invalid handle to the OpenCL program");                   break;
         case ERR_INVALID_PARAMETER:            Print( "CLProgramCreate: invalid string parameter" );                        break;
         case ERR_NOT_ENOUGH_MEMORY:            Print( "CLProgramCreate: not enough memory to complete operation" );          break;
         case ERR_OPENCL_PROGRAM_CREATE:        Print("CLProgramCreate: internal OpenCL error or compilation error.");      break;
         default:                               Print("CLProgramCreate: unknown error.");
        }
      ReleaseCLHandles(clCtx,INVALID_HANDLE,INVALID_HANDLE,INVALID_HANDLE);
      return(-1);
     }

//--- OpenCL kernel
   ResetLastError();
   int clKrn=CLKernelCreate(clPrg,"pi");
   if(clKrn==INVALID_HANDLE)
     {
      int err=GetLastError();
      Print("OpenCL kernel cl_Krn create failed.");
      Print("GetLastError returned .. ",err);
      switch(err)
        {
         case ERR_OPENCL_INVALID_HANDLE:        Print("CLKernelCreate: invalid handle to the OpenCL program");                break;
         case ERR_INVALID_PARAMETER:            Print("CLKernelCreate: invalid string parameter");                     break;
         case ERR_OPENCL_TOO_LONG_KERNEL_NAME:  Print("CLKernelCreate: kernel name exceeds 127 characters");           break;
         case ERR_OPENCL_KERNEL_CREATE:         Print("CLKernelCreate: internal error while creating an OpenCL object.");   break;
         default:                               Print("CLKernelCreate: unknown error.");
        }
      ReleaseCLHandles(clCtx,clPrg,INVALID_HANDLE,INVALID_HANDLE);
      return(-1);
     }

//--- output buffer: one double per work item
   ResetLastError();
   int clMemOut=CLBufferCreate(clCtx,_divisor*sizeof(double),CL_MEM_READ_WRITE);
   if(clMemOut==INVALID_HANDLE)
     {
      Print("OpenCL buffer create failed. GetLastError returned .. ",GetLastError());
      ReleaseCLHandles(clCtx,clPrg,clKrn,INVALID_HANDLE);
      return(-1);
     }

//--- the measured interval covers argument binding, execution, read-back and release
   uint st=GetTickCount();

   ResetLastError();
   if(!CLSetKernelArgMem(clKrn,0,clMemOut))
     {
      Print("CLSetKernelArgMem failed. GetLastError returned .. ",GetLastError());
      ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);
      return(-1);
     }

//--- one-dimensional task space with _divisor work items
   const uint offs[ 1 ]  = { 0 };
   const uint works[ 1 ] = { _divisor };
   ResetLastError();
   if(!CLExecute(clKrn,1,offs,works))
     {
      Print("CLExecute failed. GetLastError returned .. ",GetLastError());
      ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);
      return(-1);
     }

//--- read the partial sums back to the host
   double buf[];
   ArrayResize(buf,_divisor);
   ResetLastError();
   uint read=CLBufferRead(clMemOut,buf);
   Print("read = "+i2s(read)+" elements");
   if(read!=(uint)_divisor)
     {
      //--- an incomplete read would silently produce a wrong value of pi
      Print("CLBufferRead failed. GetLastError returned .. ",GetLastError());
      ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);
      return(-1);
     }

//--- the kernel returns raw sums; scaling by the step width yields pi
   double sum=0.0;
   for(int cnt=0; cnt<_divisor; cnt++)
      sum+=buf[cnt];
   double pi=sum*_step;
   ArrayResize(buf,0);

   ReleaseCLHandles(clCtx,clPrg,clKrn,clMemOut);

   double gone=(GetTickCount()-st)/1000.;

   Print("OPENCL: pi = "+d2s(pi,12));
   Print("OPENCL: gone = "+d2s(gone,3)+" sec.");

   double cpuTime=CPUcalc();
//--- GetTickCount has millisecond granularity, so gone may be exactly zero; avoid a zero divide
   if(gone>0.0)
      Print("CPUtime / GPUtime = "+d2s(cpuTime/gone,3));
   else
      Print("CPUtime / GPUtime = n/a (OpenCL time is below the timer resolution)");
   Print("==================================================");

   return(0);
  }
//+------------------------------------------------------------------+
//| Releases the given OpenCL handles, skipping INVALID_HANDLE ones  |
//+------------------------------------------------------------------+
void ReleaseCLHandles(int ctx,int prg,int krn,int mem)
  {
//--- release in reverse order of creation: buffer, kernel, program, context
   if(mem!=INVALID_HANDLE)
      CLBufferFree(mem);
   if(krn!=INVALID_HANDLE)
      CLKernelFree(krn);
   if(prg!=INVALID_HANDLE)
      CLProgramFree(prg);
   if(ctx!=INVALID_HANDLE)
      CLContextFree(ctx);
  }
//+------------------------------------------------------------------+