/// \file
/// \brief NeuroNet.cl
/// Library consist OpenCL kernels
/// \author <A HREF="https://www.mql5.com/en/users/dng"> DNG </A>
/// \copyright Copyright 2019, DNG
//---
//--- by default some GPU doesn't support doubles
//--- cl_khr_fp64 directive is used to enable work with doubles
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
//+------------------------------------------------------------------+
///\ingroup neuron_base_ff Feed forward process kernel
/// Describes the forward path process for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8435#para41">the link.</A>
//+------------------------------------------------------------------+
__kernel void FeedForward(__global double *matrix_w,///<[in] Weights matrix (m+1)*n, where m - number of neurons in layer and n - number of outputs (neurons in next layer)
                          __global double *matrix_i,///<[in] Inputs tesor
                          __global double *matrix_o,///<[out] Output tensor
                          int inputs,///< Number of inputs
                          int activation///< Activation type (#ENUM_ACTIVATION)
                          )
  {
   int i=get_global_id(0);
   double sum=0.0;
   double4 inp, weight;
   int shift=(inputs+1)*i;
   for(int k=0; k<=inputs; k=k+4)
     {
      switch(inputs-k)
        {
         case 0:
           inp=(double4)(1,0,0,0);
           weight=(double4)(matrix_w[shift+k],0,0,0);
           break;
         case 1:
           inp=(double4)(matrix_i[k],1,0,0);
           weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],0,0);
           break;
         case 2:
           inp=(double4)(matrix_i[k],matrix_i[k+1],1,0);
           weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],0);
           break;
         case 3:
           inp=(double4)(matrix_i[k],matrix_i[k+1],matrix_i[k+2],1);
           weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],matrix_w[shift+k+3]);
           break;
         default:
           inp=(double4)(matrix_i[k],matrix_i[k+1],matrix_i[k+2],matrix_i[k+3]);
           weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],matrix_w[shift+k+3]);
           break;
        }
      sum+=dot(inp,weight);
     }
   if(isnan(sum))
      sum=0;
   switch(activation)
     {
      case 0:
        sum=tanh(sum);
        break;
      case 1:
        sum=1/(1+exp(-clamp(sum,-20.0,20.0)));
        break;
      case 2:
        if(sum<0)
           sum*=0.01;
        break;
      default:
        break;
     }
   matrix_o[i]=sum;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_gr  Neuron Base Output Gradients Calculation kernel
/// Describes the process of output gradients calculation for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8435#para42">the link.</A>
//+------------------------------------------------------------------+
__kernel void CalcOutputGradient(__global double *matrix_t,///<[in] Target tensor
                                 __global double *matrix_o,///<[in] Output tensor
                                 __global double *matrix_ig,///<[out] Tensor of gradients
                                 int activation///< Activation type (#ENUM_ACTIVATION)
                                 )
  {
   int i=get_global_id(0);
   double temp=0;
   double out=matrix_o[i];
   switch(activation)
     {
      case 0:
        temp=clamp(matrix_t[i],-1.0,1.0)-out;
        temp=temp*(1-pow(out==1 || out==-1 ? 0.99999999 : out,2));
        break;
      case 1:
        temp=clamp(matrix_t[i],0.0,1.0)-out;
        temp=temp*(out==0 || out==1 ? 0.00000001 : (out*(1-out)));
        break;
      case 2:
        temp=(matrix_t[i]-out)*(out>=0 ? 1.0 : 0.01);
        break;
      default:
        temp=(matrix_t[i]-out);
        break;
     }
   matrix_ig[i]=clamp(temp,-1.0,1.0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_gr  Neuron Base Hidden Gradients Calculation kernel
/// Describes the process of hidden gradients calculation for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8435#para42">the link.</A>
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradient(__global double *matrix_w,///<[in] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                 __global double *matrix_g,///<[in] Tensor of gradients at current layer
                                 __global double *matrix_o,///<[in] Previous layer Output tensor
                                 __global double *matrix_ig,///<[out] Tensor of gradients at previous layer
                                 int outputs,///< Number of outputs
                                 int activation///< Activation type (#ENUM_ACTIVATION)
                                 )
  {
   int i=get_global_id(0);
   int inputs=get_global_size(0);
   double sum=0;
   double out=matrix_o[i];
   double4 grad, weight;
   for(int k=0;k<outputs;k+=4)
     {
      switch(outputs-k)
        {
         case 1:
           grad=(double4)(matrix_g[k],0,0,0);
           weight=(double4)(matrix_w[k*inputs+i],0,0,0);
           break;
         case 2:
           grad=(double4)(matrix_g[k],matrix_g[k+1],0,0);
           weight=(double4)(matrix_w[k*inputs+i],matrix_w[k*inputs+i+1],0,0);
           break;
         case 3:
           grad=(double4)(matrix_g[k],matrix_g[k+1],matrix_g[k+2],0);
           weight=(double4)(matrix_w[k*inputs+i],matrix_w[k*inputs+i+1],matrix_w[k*inputs+i+2],0);
           break;
         default:
           grad=(double4)(matrix_g[k],matrix_g[k+1],matrix_g[k+2],matrix_g[k+3]);
           weight=(double4)(matrix_w[k*inputs+i],matrix_w[k*inputs+i+1],matrix_w[k*inputs+i+2],matrix_w[k*inputs+i+3]);
           break;
        }
      sum+=dot(grad,weight);
     }
   if(isnan(sum))
      sum=0;
   switch(activation)
     {
      case 0:
        sum=clamp(sum+out,-1.0,1.0)-out;
        sum=sum*(1-pow(out==1 || out==-1 ? 0.99999999 : out,2));
        break;
      case 1:
        sum=clamp(sum+out,0.0,1.0)-out;
        sum=sum*(out==0 || out==1 ? 0.00000001 : (out*(1-out)));
        break;
      case 2:
        if(out<0)
           sum*=0.01;
        break;
      default:
        break;
     }
   matrix_ig[i]=clamp(sum,-1.0,1.0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt  Neuron Base SGD Updating Weights Calculation kernel
/// Describes the process of SGD optimization weights for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8435#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void UpdateWeightsMomentum(__global double *matrix_w, ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                            __global double *matrix_g,         ///<[in] Tensor of gradients at current layer
                            __global double *matrix_i,         ///<[in] Inputs tesor
                            __global double *matrix_dw,        ///<[in,out] Matrix of delta weights in last correction
                            int inputs,                        ///< Number of inputs
                            double learning_rates,             ///< Learning rates
                            double momentum                    ///< Momentum multiplier
                            )
  {
   int i=get_global_id(0);
   int j=get_global_id(1);
   int wi=i*(inputs+1)+j; 
   double delta=learning_rates*matrix_g[i]*(j<inputs ? matrix_i[j] : 1) + momentum*matrix_dw[wi];
   if(!isnan(delta))
     {
      matrix_dw[wi]=delta;
      matrix_w[wi]+=delta;
     }
  };
//+------------------------------------------------------------------+
///\ingroup neuron_base_opt  Neuron Base Adam Updating Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Neuron Base (#CNeuronBaseOCL).
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8598#para31">the link.</A>
//+------------------------------------------------------------------+
__kernel void UpdateWeightsAdam(__global double *matrix_w,        ///<[in,out] Weights matrix (m+1)*n, where m - number of neurons in previous layer and n - number of neurons in current layer
                                __global const double *matrix_g,  ///<[in] Tensor of gradients at current layer
                                __global const double *matrix_i,  ///<[in] Inputs tesor
                                __global double *matrix_m,        ///<[in,out] Matrix of first momentum
                                __global double *matrix_v,        ///<[in,out] Matrix of seconfd momentum
                                const int inputs,                 ///< Number of inputs
                                const double l,                   ///< Learning rates
                                const double b1,                  ///< First momentum multiplier
                                const double b2                   ///< Second momentum multiplier
                                )
  {
   const int i=get_global_id(0);
   const int j=get_global_id(1);
   const int wi=i*(inputs+1)+j*4;
   double4 m, v, weight, inp;
   switch(inputs+1-j*4)
     {
      case 0:
        inp=(double4)(1,0,0,0);
        weight=(double4)(matrix_w[wi],0,0,0);
        m=(double4)(matrix_m[wi],0,0,0);
        v=(double4)(matrix_v[wi],0,0,0);
        break;
      case 1:
        inp=(double4)(matrix_i[j],1,0,0);
        weight=(double4)(matrix_w[wi],matrix_w[wi+1],0,0);
        m=(double4)(matrix_m[wi],matrix_m[wi+1],0,0);
        v=(double4)(matrix_v[wi],matrix_v[wi+1],0,0);
        break;
      case 2:
        inp=(double4)(matrix_i[j],matrix_i[j+1],1,0);
        weight=(double4)(matrix_w[wi],matrix_w[wi+1],matrix_w[wi+2],0);
        m=(double4)(matrix_m[wi],matrix_m[wi+1],matrix_m[wi+2],0);
        v=(double4)(matrix_v[wi],matrix_v[wi+1],matrix_v[wi+2],0);
        break;
      case 3:
        inp=(double4)(matrix_i[j],matrix_i[j+1],matrix_i[j+2],1);
        weight=(double4)(matrix_w[wi],matrix_w[wi+1],matrix_w[wi+2],matrix_w[wi+3]);
        m=(double4)(matrix_m[wi],matrix_m[wi+1],matrix_m[wi+2],matrix_m[wi+3]);
        v=(double4)(matrix_v[wi],matrix_v[wi+1],matrix_v[wi+2],matrix_v[wi+3]);
        break;
      default:
        inp=(double4)(matrix_i[j],matrix_i[j+1],matrix_i[j+2],matrix_i[j+3]);
        weight=(double4)(matrix_w[wi],matrix_w[wi+1],matrix_w[wi+2],matrix_w[wi+3]);
        m=(double4)(matrix_m[wi],matrix_m[wi+1],matrix_m[wi+2],matrix_m[wi+3]);
        v=(double4)(matrix_v[wi],matrix_v[wi+1],matrix_v[wi+2],matrix_v[wi+3]);
        break;
     }
   double4 g=(double4)(matrix_g[i])*inp;
   double4 mt=b1*m+(1-b1)*g;
   double4 vt=b2*v+(1-b2)*pow(g,2);
   double4 delta=l*mt/(vt>0  ? sqrt(vt) : l*10);
   long4 lmt=isnan(mt);
   long4 lvt=isnan(vt);
   long4 ldt=isnan(delta);
   switch(inputs+1-j*4)
     {
      case 2:
        if(lmt.s2==0 && lvt.s2==0 && ldt.s2==0)
          {
           matrix_w[wi+2]+=delta.s2;
           matrix_m[wi+2]=mt.s2;
           matrix_v[wi+2]=vt.s2;
          }
      case 1:
        if(lmt.s1==0 && lvt.s1==0 && ldt.s1==0)
          {
           matrix_w[wi+1]+=delta.s1;
           matrix_m[wi+1]=mt.s1;
           matrix_v[wi+1]=vt.s1;
          }
      case 0:
        if(lmt.s0==0 && lvt.s0==0 && ldt.s0==0)
          {
           matrix_w[wi]+=delta.s0;
           matrix_m[wi]=mt.s0;
           matrix_v[wi]=vt.s0;
          }
        break;
      default:
        if(lmt.s0==0 && lvt.s0==0 && ldt.s0==0)
          {
           matrix_w[wi]+=delta.s0;
           matrix_m[wi]=mt.s0;
           matrix_v[wi]=vt.s0;
          }
        if(lmt.s1==0 && lvt.s1==0 && ldt.s1==0)
          {
           matrix_w[wi+1]+=delta.s1;
           matrix_m[wi+1]=mt.s1;
           matrix_v[wi+1]=vt.s1;
          }
        if(lmt.s2==0 && lvt.s2==0 && ldt.s2==0)
          {
           matrix_w[wi+2]+=delta.s2;
           matrix_m[wi+2]=mt.s2;
           matrix_v[wi+2]=vt.s2;
          }
        if(lmt.s3==0 && lvt.s3==0 && ldt.s3==0)
          {
           matrix_w[wi+3]+=delta.s3;
           matrix_m[wi+3]=mt.s3;
           matrix_v[wi+3]=vt.s3;
          }
        break;
     }
  };
//+------------------------------------------------------------------+
///\ingroup neuron_proof_ff
/// Kernel of the Pooling neuron for Feed forward process (#CNeuronProofOCL)
//+------------------------------------------------------------------+
__kernel void FeedForwardProof(__global double *matrix_i,   ///<[in] Inputs tesor
                               __global double *matrix_o,   ///<[out] Output tensor
                              int inputs,                   ///< Number of inputs
                              int window,                   ///< Size of input window
                              int step                      ///< Step size
                              )
  {
   int i=get_global_id(0);
   int pos=i*step;
   double result=matrix_o[pos];
   for(int k=1; k<window; k=k+1)
     {
      int shift=k+pos;
      if(shift>=inputs)
         break;
      result=max(result,matrix_o[shift]);
     }
   matrix_o[i]=result;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_proof_gr
/// Kernel of the Pooling neuron to transfer gradient to previous layer (#CNeuronProofOCL)
//+------------------------------------------------------------------+
__kernel void CalcInputGradientProof(__global double *matrix_i,   ///<[in] Inputs tesor
                                      __global double *matrix_g,  ///<[in] Tensor of gradients at current layer
                                      __global double *matrix_o,  ///<[in] Output tensor
                                      __global double *matrix_ig, ///<[out] Tensor of gradients at previous layer
                                      int outputs,                ///< Number of outputs
                                      int window,                 ///< Size of input window
                                      int step                    ///< Step size
                                      )
  {
   int i=get_global_id(0);
   double prev_gradient=0;
   double value=matrix_i[i];
   int start=i-window+step;
   start=(start-start%step)/step;
   int stop=(i-i%step)/step+1;
   for(int out=max(0,start); out<min(outputs,stop); out++)
     {
      if(value==matrix_o[out])
         prev_gradient+=matrix_g[out];
     }
   matrix_ig[i]=prev_gradient;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_ff
/// Kernel of the Convolution neuron for Feed forward process (#CNeuronConvOCL)
//+------------------------------------------------------------------+
__kernel void FeedForwardConv(__global double *matrix_w,             ///<[in] Weights matrix (m+1)*n, where m - input window and n - output window
                              __global double *matrix_i,             ///<[in] Inputs tesor
                              __global double *matrix_o,             ///<[out] Output tensor
                              int inputs,                            ///< Number of inputs
                              int step,                              ///< Step size
                              int window_in,                         ///< Size of input window
                              int window_out,                        ///< Size of output window
                              uint activation                        ///< Activation type (#ENUM_ACTIVATION)
                              )
  {
   int i=get_global_id(0);
   int w_in=window_in;
   int w_out=window_out;
   double sum=0.0;
   double4 inp, weight;
   int shift_out=w_out*i;
   int shift_in=step*i;
   for(int out=0;out<w_out;out++)
     {
      int shift=(w_in+1)*out;
      int stop=(w_in<=(inputs-shift_in) ? w_in : (inputs-shift_in));
      for(int k=0; k<=stop; k+=4)
        {
         switch(stop-k)
           {
            case 0:
              inp=(double4)(1,0,0,0);
              weight=(double4)(matrix_w[shift+k],0,0,0);
              break;
            case 1:
              inp=(double4)(matrix_i[shift_in+k],1,0,0);
              weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],0,0);
              break;
            case 2:
              inp=(double4)(matrix_i[shift_in+k],matrix_i[shift_in+k+1],1,0);
              weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],0);
              break;
            case 3:
              inp=(double4)(matrix_i[shift_in+k],matrix_i[shift_in+k+1],matrix_i[shift_in+k+2],1);
              weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],matrix_w[shift+k+3]);
              break;
            default:
              inp=(double4)(matrix_i[shift_in+k],matrix_i[shift_in+k+1],matrix_i[shift_in+k+2],matrix_i[shift_in+k+3]);
              weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],matrix_w[shift+k+3]);
              break;
           }
         sum+=dot(inp,weight);
        }
      if(isnan(sum))
         sum=0;
      switch(activation)
        {
         case 0:
           sum=tanh(sum);
           break;
         case 1:
           sum=1/(1+exp(-clamp(sum,-20.0,20.0)));
           break;
         case 2:
           if(sum<0)
              sum*=0.01;
           break;
         default:
           break;
        }
      matrix_o[out+shift_out]=sum;
     }
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_gr
/// Kernel of the Convolution neuron to transfer gradient to previous layer (#CNeuronConvOCL)
//+------------------------------------------------------------------+
__kernel void CalcHiddenGradientConv(__global double *matrix_w,   ///<[in] Weights matrix (m+1)*n, where m - input window and n - output window
                                     __global double *matrix_g,   ///<[in] Tensor of gradients at current layer
                                    __global double *matrix_o,    ///<[in] Output tensor
                                    __global double *matrix_ig,   ///<[out] Tensor of gradients at previous layer
                                    int outputs,                  ///< Number of outputs
                                    int step,                     ///< Step size
                                    int window_in,                ///< Size of input window
                                    int window_out,               ///< Size of output window
                                    uint activation               ///< Activation type (#ENUM_ACTIVATION)
                                    )
  {
   int i=get_global_id(0);
   int inputs=get_global_size(0);
   double sum=0;
   double out=matrix_o[i];
   int start=i-window_in+step;
   start=(start-start%step)/step;
   int stop=(i-i%step)/step+1;
   if(stop>(outputs/window_out))
      stop=outputs;
   for(int h=0;h<window_out;h+=4)
     {
      for(int k=start;k<stop;k++)
        {
         int shift_w=(stop-k-1)*step+i%step+h*(window_in+1);
         int shift_g=k*window_out+h;
         if(shift_g>=outputs || shift_w>=(window_in+1)*window_out)
            break;
         sum+=matrix_g[k*window_out+h]*matrix_w[shift_w];
        }
     }
   if(isnan(sum))
      sum=0;
   switch(activation)
     {
      case 0:
        sum=clamp(sum+out,-1.0,1.0)-out;
        sum=sum*(1-pow(out==1 || out==-1 ? 0.99999999 : out,2));
        break;
      case 1:
        sum=clamp(sum+out,0.0,1.0)-out;
        sum=sum*(out==0 || out==1 ? 0.00000001 : (out*(1-out)));
        break;
      case 2:
        if(out<0)
           sum*=0.01;
        break;
      default:
        break;
     }
   matrix_ig[i]=clamp(sum,-1.0,1.0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron SGD optimization Updating Weights Calculation kernel
/// Describes the process of SGD optimization weights for the Convolution Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvMomentum(__global double *matrix_w,   ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
                                       __global double *matrix_g,    ///<[in] Tensor of gradients at current layer
                                       __global double *matrix_i,    ///<[in] Inputs tesor
                                       __global double *matrix_dw,   ///<[in,out] Matrix of delta weights in last correction
                                       int inputs,                   ///< Number of inputs
                                       double learning_rates,        ///< Learning rates
                                       double momentum,              ///< Momentum multiplier
                                       int window_in,                ///< Size of input window
                                       int window_out,               ///< Size of output window
                                       int step                      ///< Step size
                                       )
  {
   const int i=get_global_id(0);
   const int shift=i%(window_in+1);
   const int shift_out=(i-shift)/(window_in+1);
   int total=(inputs-window_in)%step;
   total=(inputs-window_in-total)/step+(total>0 ? 1 : 0);
   double grad=0;
   for(int t=0;t<total;t++)
     {
      if(shift!=window_in && (shift+t*window_in)>=inputs)
         break;
      grad+=matrix_g[t*window_out+shift_out]*(shift==window_in ? 1 : matrix_i[shift+t*step]);
     }
   double delta=learning_rates*grad + momentum*matrix_dw[i];
   if(!isnan(delta))
     {
      matrix_dw[i]=delta;
      matrix_w[i]+=delta;
     }
  };
//+------------------------------------------------------------------+
///\ingroup neuron_conv_opt Convolution Neuron Adam optimization Updating Weights Calculation kernel
/// Describes the process of Adam optimization weights for the Convolution Neuron (#CNeuronConvOCL).
//+------------------------------------------------------------------+
__kernel void UpdateWeightsConvAdam(__global double *matrix_w,    ///<[in,out] Weights matrix (m+1)*n, where m - input window and n - output window
                                __global const double *matrix_g,  ///<[in] Tensor of gradients at current layer
                                __global const double *matrix_i,  ///<[in] Inputs tesor
                                __global double *matrix_m,        ///<[in] Matrix of first momentum
                                __global double *matrix_v,        ///<[in] Matrix of seconfd momentum
                                const int inputs,                 ///< Number of inputs
                                const double l,                   ///< Learning rates
                                const double b1,                  ///< First momentum multiplier
                                const double b2,                  ///< Second momentum multiplier
                                int window_in,                    ///< Size of input window
                                int window_out,                   ///< Size of output window
                                int step                          ///< Step size
                                )
  {
   const int i=get_global_id(0);
   const int shift=i%(window_in+1);
   const int shift_out=(i-shift)/(window_in+1);
   int total=(inputs-(window_in-step))%step;
   total=(inputs-(window_in-step)-total)/step+(total>0 ? 1 : 0);
   double grad=0;
   for(int t=0;t<total;t++)
     {
      if(shift!=window_in && (shift+t*window_in)>=inputs)
         break;
      grad+=matrix_g[t*window_out+shift_out]*(shift==window_in ? 1 : matrix_i[shift+t*step]);
     }
   double mt=b1*matrix_m[i]+(1-b1)*grad;
   double vt=b2*matrix_v[i]+(1-b2)*pow(grad,2);
   if(!isnan(mt) && !isnan(vt))
     {
      double delta=l*mt/(vt>0 ? sqrt(vt) : l*10);
      matrix_w[i]+=delta;
      matrix_m[i]=mt;
      matrix_v[i]=vt;
     }
  };
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Attention Neuron Score calculation kernel                                                                  |
/// Describes the Score calculation process for the Neuron of attention layer (#CNeuronAttentionOCL).
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8765#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void AttentionScore(__global double *querys, ///<[in] Matrix of Querys
                             __global double *keys,   ///<[in] Matrix of Keys
                             __global double *score,  ///<[out] Matrix of Scores
                             int dimension            ///< Dimension of Key
                             )
  {
   int q=get_global_id(0);
   int shift_q=q*dimension;
   int units=get_global_size(0);
   int shift_s=q*units;
   double koef=sqrt((double)(/*units*/dimension));
   if(koef<1)
      koef=1;
   double sum=0;
   for(int k=0;k<units;k++)
     {
      double result=0;
      int shift_k=k*dimension;
      for(int i=0;i<dimension;i++)
         result+=(querys[shift_q+i]*keys[shift_k+i]);
      result=exp(clamp(result/koef,-20.0,20.0));
      if(isnan(result))
         result=0;
      score[shift_s+k]=result;
      sum+=result;   
     }
   for(int k=0;(k<units && sum>1);k++)
      score[shift_s+k]/=sum;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Attention Neuron Out calculation kernel
/// Describes the Attention out calculation process for the Neuron of attention layer (#CNeuronAttentionOCL).
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8765#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void AttentionOut(__global double *scores,   ///<[in] Matrix of Scores
                           __global double *values,   ///<[in] Matrix of Values
                           __global double *inputs,   ///<[in] Inputs tesor
                           __global double *out       ///<[out] Output tesor
                           )
  {
   int units=get_global_size(0);
   int u=get_global_id(0);
   int d=get_global_id(1);
   int dimension=get_global_size(1);
   int shift=u*dimension+d;
   double result=0;
   for(int i=0;i<units;i++)
      result+=scores[u*units+i]*values[i*dimension+d];
   out[shift]=(isnan(result) ? 0 : result)+inputs[shift];
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Kernel for calculation Sum of 2 matrixs with multiplyer.
/// Describes the calculation Sum of 2 matrixs.
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8765#para43">the link.</A>
//+------------------------------------------------------------------+
__kernel void SumMatrix(__global double *matrix1,     ///<[in] First matrix
                        __global double *matrix2,     ///<[in] Second matrix
                        __global double *matrix_out,  ///<[out] Output matrix
                        int dimension,                ///< Dimension of matrix
                        double multiplyer             ///< Multiplyer for output
                        )
  {
   const int i=get_global_id(0)*dimension;
   for(int k=0;k<dimension;k++)
      matrix_out[i+k]=(matrix1[i+k]+matrix2[i+k])*multiplyer;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff Kernel for calculation Sum of 4 matrixs with multiplyer.
/// Describes the calculation Sum of 4 matrixs.
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8909#para53">the link.</A>
//+------------------------------------------------------------------+
__kernel void Sum5Matrix(__global double *matrix1,     ///<[in] First matrix
                        __global double *matrix2,     ///<[in] Second matrix
                        __global double *matrix3,     ///<[in] Third matrix
                        __global double *matrix4,     ///<[in] Fourth matrix
                        __global double *matrix5,     ///<[in] Fifth matrix
                        __global double *matrix_out,  ///<[out] Output matrix
                        int dimension,                ///< Dimension of matrix
                        double multiplyer             ///< Multiplyer for output
                        )
  {
   const int i=get_global_id(0)*dimension;
   for(int k=0;k<dimension;k++)
      matrix_out[i+k]=(matrix1[i+k]+matrix2[i+k]+matrix3[i+k]+matrix4[i+k]+matrix5[i+k])*multiplyer;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr Attention layer's neuron Gradients Calculation kernel
/// Describes the gradients calculation process for the Neuron of attention layer (#CNeuronAttentionOCL).
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8765#para44">the link.</A>
/// @param[in] querys Matrix of Querys
/// @param[out] querys_g Matrix of Querys' Gradients
/// @param[in] keys Matrix of Keys
/// @param[out] keys_g Matrix of Keys' Gradients
/// @param[in] values Matrix of Values
/// @param[out] values_g Matrix of Values' Gradients
/// @param[in] scores Matrix of Scores
/// @param[in] gradient Matrix of Gradients from previous iteration
//+------------------------------------------------------------------+
__kernel void AttentionInsideGradients(__global double *querys,__global double *querys_g,
                                      __global double *keys,__global double *keys_g,
                                      __global double *values,__global double *values_g,
                                      __global double *scores,
                                      __global double *gradient)
  {
   int u=get_global_id(0);
   int d=get_global_id(1);
   int units=get_global_size(0);
   int dimension=get_global_size(1);
   double koef=sqrt((double)(units*dimension));
   if(koef<1)
      koef=1;
   
   double vg=0;
   double qg=0;
   double kg=0;
   
   for(int iu=0;iu<units;iu++)
     {
      double g=gradient[iu*dimension+d];
      double sc=scores[iu*units+u];
      vg+=sc*g;
//---
      double sqg=0;
      double skg=0;
      for(int id=0;id<dimension;id++)
        {
         sqg+=values[iu*dimension+id]*gradient[u*dimension+id];
         skg+=values[u*dimension+id]*gradient[iu*dimension+id];
        }
      qg+=(scores[u*units+iu]==0 || scores[u*units+iu]==1 ? 0.0001 : scores[u*units+iu]*(1-scores[u*units+iu]))*sqg*keys[iu*dimension+d]/koef;
//---
      kg+=(scores[iu*units+u]==0 || scores[iu*units+u]==1 ? 0.0001 : scores[iu*units+u]*(1-scores[iu*units+u]))*skg*querys[iu*dimension+d]/koef;
     }
   int shift=u*dimension+d;
   values_g[shift]=clamp((isnan(vg) ? 0.0 : vg),-1.0,1.0);
   querys_g[shift]=clamp((isnan(qg) ? 0.0 : qg),-1.0,1.0);
   keys_g[shift]=clamp((isnan(kg) ? 0.0 : kg),-1.0,1.0);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of matrix normalization process
/// Describes the process of matrix normalization.
///\details Detailed description on <A HREF="https://arxiv.org/abs/1607.06450">the link.</A>
/// @param[in,out] buffer In/Out Matrix
/// @param[in] dimension Dimension of matrix
//+------------------------------------------------------------------+
__kernel void Normalize(__global double *buffer,
                        int dimension)
  {
   int n=get_global_id(0);
   int shift=n*dimension;
   double mean=0;
   for(int i=0;i<dimension;i++)
      mean+=buffer[shift+i];
   if(isnan(mean))
      mean=0;
   else
      mean/=dimension;
   double variance=0;
   for(int i=0;i<dimension;i++)
      variance+=pow(buffer[shift+i]-mean,2);
   variance=sqrt((isnan(variance) ? 0 : variance)/dimension);
   for(int i=0;i<dimension;i++)
      buffer[shift+i]=(buffer[shift+i]-mean)/(variance==0 ? 1 : variance);
  }
//+------------------------------------------------------------------+
///\ingroup neuron_norm Kernels of weights matrix normalization process
/// Describes the process of weights matrix normalization.
///\details Detailed description on <A HREF="https://arxiv.org/abs/1607.06450">the link.</A>
/// @param[in,out] buffer In/Out Matrix
/// @param[in] dimension Dimension of matrix
//+------------------------------------------------------------------+
__kernel void NormalizeWeights(__global double *buffer,
                                 int dimension)
  {
   int n=get_global_id(0);
   int shift=n*dimension;
   double sum=0;
   for(int i=0;i<dimension;i++)
      sum=pow(buffer[shift+i],2);
   sum=sqrt(isnan(sum) ? 0 : sum);
   if(sum>1)
      for(int i=0;i<dimension;i++)
         buffer[shift+i]/=sum;
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_ff 
/// Describes the process of concatenate 4 matrices.
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8909#para52">the link.</A>
/// @param[in] input1, input2, input3, input4 Input buffers
/// @param[in] window1, window2, window3, window4 Windows for every buffer
/// @param[out] output Output buffer
//+------------------------------------------------------------------+
__kernel void ConcatenateBuffers(__global double *input1, int window1,
                                 __global double *input2, int window2,
                                 __global double *input3, int window3,
                                 __global double *input4, int window4,
                                 __global double *output)
  {
   int n=get_global_id(0);
   int shift=n*(window1+window2+window3+window4);
   int shift_in=n*window1;
   for(int i=0;i<window1;i++)
      output[shift+i]=input1[shift_in+i];
//---
   shift+=window1;
   shift_in=n*window2;
   for(int i=0;i<window2;i++)
      output[shift+i]=input2[shift_in+i];
//---
   shift+=window2;
   shift_in=n*window3;
   for(int i=0;i<window3;i++)
      output[shift+i]=input3[shift_in+i];
//---
   shift+=window3;
   shift_in=n*window4;
   for(int i=0;i<window4;i++)
      output[shift+i]=input4[shift_in+i];
  }
//+------------------------------------------------------------------+
///\ingroup neuron_atten_gr 
/// Describes the process of deconcatenate matrix.
///\details Detailed description on <A HREF="https://www.mql5.com/ru/articles/8909#para53">the link.</A>
/// @param[in] output1, output2, output3, output4 Output buffers
/// @param[in] window1, window2, window3, window4 Windows for every buffer
/// @param[out] inputs Input buffer
//+------------------------------------------------------------------+
__kernel void DeconcatenateBuffers(__global double *output1, int window1,
                                 __global double *output2, int window2,
                                 __global double *output3, int window3,
                                 __global double *output4, int window4,
                                 __global double *inputs)
  {
   int n=get_global_id(0);
//--- Head 1
   int shift=n*(window1+window2+window3+window4);
   int shift_out=n*window1;
   for(int i=0;i<window1;i++)
      output1[shift_out+i]=inputs[shift+i];
//--- Head 2
   shift+=window1;
   shift_out=n*window2;
   for(int i=0;i<window2;i++)
      output2[shift_out+i]=inputs[shift+i];
//--- Head 3
   shift+=window2;
   shift_out=n*window3;
   for(int i=0;i<window3;i++)
      output3[shift_out+i]=inputs[shift+i];
//--- Head 4
   shift+=window3;
   shift_out=n*window4;
   for(int i=0;i<window4;i++)
      output4[shift_out+i]=inputs[shift+i];
  }
//+------------------------------------------------------------------+