9 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Fragment of a layer forward pass; the kernel header and several body lines
// (including the conditions selecting the alternatives below) are not visible here.
// Visible parameters: matrix_i and matrix_o are __global double buffers.
16 __global
double *matrix_i,
17 __global
double *matrix_o,
// i is this work-item's index in dimension 0.
22 int i=get_global_id(0);
// Work-item i reads its weights from matrix_w starting at offset (inputs+1)*i.
25 int shift=(inputs+1)*i;
// k runs from 0 to inputs inclusive in steps of 4, so each pass handles up to four weights.
26 for(
int k=0; k<=inputs; k=k+4)
// The alternatives below differ in how many matrix_i elements are loaded. A constant 1 is placed
// in the lane right after the last loaded input and unused lanes are 0.
// NOTE(review): presumably chosen by how many inputs remain at k, with the 1 acting as the
// bias input paired with the last weight of the row — confirm against the full kernel.
// No input element loaded: only the constant 1 paired with one weight.
31 inp=(double4)(1,0,0,0);
32 weight=(double4)(matrix_w[shift+k],0,0,0);
// One input element followed by the constant 1.
35 inp=(double4)(matrix_i[k],1,0,0);
36 weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],0,0);
// Two input elements followed by the constant 1.
39 inp=(double4)(matrix_i[k],matrix_i[k+1],1,0);
40 weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],0);
// Three input elements followed by the constant 1.
43 inp=(double4)(matrix_i[k],matrix_i[k+1],matrix_i[k+2],1);
44 weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],matrix_w[shift+k+3]);
// Four input elements, no constant lane.
47 inp=(double4)(matrix_i[k],matrix_i[k+1],matrix_i[k+2],matrix_i[k+3]);
48 weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],matrix_w[shift+k+3]);
// Logistic function 1/(1+e^-x); the argument is clamped to [-50, 50] before exp().
59 sum=1/(1+exp(-clamp(sum,-50.0,50.0)));
// Fragment that derives a per-element error term from targets (matrix_t) and outputs (matrix_o).
// The kernel header, the declaration of temp and the selection between the four
// alternatives below are not visible here.
76 __global
double *matrix_o,
77 __global
double *matrix_ig,
// One work-item per element; out is that element's output value.
81 int i=get_global_id(0);
83 double out=matrix_o[i];
// Alternative 1: target clamped to [-1, 1], error scaled by 1 - out^2.
// When out is exactly 1 or -1, 0.99999999 is squared instead so the factor is not zero.
// NOTE(review): 1 - out^2 has the form of a tanh derivative — presumably the tanh case; confirm.
87 temp=clamp(matrix_t[i],-1.0,1.0)-out;
88 temp=temp*(1-pow(out==1 || out==-1 ? 0.99999999 : out,2));
// Alternative 2: target clamped to [0, 1], error scaled by out*(1-out),
// replaced by 0.00000001 when out is exactly 0 or 1.
// NOTE(review): out*(1-out) has the form of a sigmoid derivative — presumably the sigmoid case; confirm.
91 temp=clamp(matrix_t[i],0.0,1.0)-out;
92 temp=temp*(out==0 || out==1 ? 0.00000001 : (out*(1-out)));
// Alternative 3: unclamped error scaled by 1.0 for out >= 0 and by 0.01 for negative out.
95 temp=(matrix_t[i]-out)*(out>=0 ? 1.0 : 0.01);
// Alternative 4: plain difference between target and output.
98 temp=(matrix_t[i]-out);
// Fragment that accumulates dot(grad, weight) over `outputs` entries of matrix_g and then
// rescales the sum using this element's output value. The kernel header, the declaration of sum
// and the conditions selecting the alternatives below are not visible here.
109 __global
double *matrix_g,
110 __global
double *matrix_o,
111 __global
double *matrix_ig,
// i is this work-item's index; `inputs` is taken from the launch size in dimension 0.
116 int i=get_global_id(0);
117 int inputs=get_global_size(0);
119 double out=matrix_o[i];
120 double4 grad, weight;
// k advances through matrix_g four entries per pass.
121 for(
int k=0;k<outputs;k+=4)
// The alternatives below load one to four gradient entries; unused lanes are 0.
// NOTE(review): grad lanes step through matrix_g[k], [k+1], ... but the matching weight lanes
// step by +1 from k*inputs+i rather than by a full row of `inputs` elements. If the weights for
// gradient entry k+1 start at (k+1)*inputs, these lanes read the wrong weights — confirm the layout.
126 grad=(double4)(matrix_g[k],0,0,0);
127 weight=(double4)(matrix_w[k*inputs+i],0,0,0);
130 grad=(double4)(matrix_g[k],matrix_g[k+1],0,0);
131 weight=(double4)(matrix_w[k*inputs+i],matrix_w[k*inputs+i+1],0,0);
134 grad=(double4)(matrix_g[k],matrix_g[k+1],matrix_g[k+2],0);
135 weight=(double4)(matrix_w[k*inputs+i],matrix_w[k*inputs+i+1],matrix_w[k*inputs+i+2],0);
138 grad=(double4)(matrix_g[k],matrix_g[k+1],matrix_g[k+2],matrix_g[k+3]);
139 weight=(double4)(matrix_w[k*inputs+i],matrix_w[k*inputs+i+1],matrix_w[k*inputs+i+2],matrix_w[k*inputs+i+3]);
// Accumulate the lane-wise products of this pass.
142 sum+=dot(grad,weight);
// Alternative A: limit out+sum to [-1, 1], then scale by 1 - out^2
// (0.99999999 is squared instead when out is exactly 1 or -1).
147 sum=clamp(sum+out,-1.0,1.0)-out;
148 sum=sum*(1-pow(out==1 || out==-1 ? 0.99999999 : out,2));
// Alternative B: limit out+sum to [0, 1], then scale by out*(1-out)
// (0.00000001 when out is exactly 0 or 1).
151 sum=clamp(sum+out,0.0,1.0)-out;
152 sum=sum*(out==0 || out==1 ? 0.00000001 : (out*(1-out)));
// Fragment of a weight update that combines a gradient step with a momentum term.
// The kernel header and the lines that store delta are not visible here.
169 __global
double *matrix_g,
170 __global
double *matrix_i,
171 __global
double *matrix_dw,
173 double learning_rates,
// Two-dimensional launch: i comes from dimension 0 and j from dimension 1.
177 int i=get_global_id(0);
178 int j=get_global_id(1);
// Flat weight index: (inputs+1) weights per i, j selects within that run.
179 int wi=i*(inputs+1)+j;
// delta = learning_rates * gradient[i] * input[j] + momentum * matrix_dw[wi];
// for j == inputs (the extra slot in each run) the input factor is the constant 1.
180 double delta=learning_rates*matrix_g[i]*(j<inputs ? matrix_i[j] : 1) +
momentum*matrix_dw[wi];
// Fragment of a weight update that keeps two running averages per weight (matrix_m, matrix_v)
// and processes up to four weights per work-item. The kernel header, the declarations of
// l, b1, b2 and the conditions selecting the alternatives below are not visible here.
190 __global
const double *matrix_g,
191 __global
const double *matrix_i,
192 __global
double *matrix_m,
193 __global
double *matrix_v,
// Two-dimensional launch: i comes from dimension 0 and j from dimension 1.
200 const int i=get_global_id(0);
201 const int j=get_global_id(1);
// Each j covers four consecutive weights inside run i of (inputs+1) weights.
202 const int wi=i*(inputs+1)+j*4;
203 double4 m, v, weight, inp;
// The alternatives below load 0..4 input elements; a constant 1 follows the last loaded
// input and unused lanes are 0.
// NOTE(review): weights are addressed with j*4 (via wi) but matrix_i is addressed with j, so
// for j > 0 the input lanes do not line up with the weight lanes. matrix_i[j*4 + n] looks
// intended — confirm against the launch configuration.
207 inp=(double4)(1,0,0,0);
208 weight=(double4)(matrix_w[wi],0,0,0);
209 m=(double4)(matrix_m[wi],0,0,0);
210 v=(double4)(matrix_v[wi],0,0,0);
213 inp=(double4)(matrix_i[j],1,0,0);
214 weight=(double4)(matrix_w[wi],matrix_w[wi+1],0,0);
215 m=(double4)(matrix_m[wi],matrix_m[wi+1],0,0);
216 v=(double4)(matrix_v[wi],matrix_v[wi+1],0,0);
219 inp=(double4)(matrix_i[j],matrix_i[j+1],1,0);
220 weight=(double4)(matrix_w[wi],matrix_w[wi+1],matrix_w[wi+2],0);
221 m=(double4)(matrix_m[wi],matrix_m[wi+1],matrix_m[wi+2],0);
222 v=(double4)(matrix_v[wi],matrix_v[wi+1],matrix_v[wi+2],0);
225 inp=(double4)(matrix_i[j],matrix_i[j+1],matrix_i[j+2],1);
226 weight=(double4)(matrix_w[wi],matrix_w[wi+1],matrix_w[wi+2],matrix_w[wi+3]);
227 m=(double4)(matrix_m[wi],matrix_m[wi+1],matrix_m[wi+2],matrix_m[wi+3]);
228 v=(double4)(matrix_v[wi],matrix_v[wi+1],matrix_v[wi+2],matrix_v[wi+3]);
231 inp=(double4)(matrix_i[j],matrix_i[j+1],matrix_i[j+2],matrix_i[j+3]);
232 weight=(double4)(matrix_w[wi],matrix_w[wi+1],matrix_w[wi+2],matrix_w[wi+3]);
233 m=(double4)(matrix_m[wi],matrix_m[wi+1],matrix_m[wi+2],matrix_m[wi+3]);
234 v=(double4)(matrix_v[wi],matrix_v[wi+1],matrix_v[wi+2],matrix_v[wi+3]);
// Per-lane gradient: matrix_g[i] broadcast to all lanes, times the input lanes.
237 double4 g=(double4)(matrix_g[i])*inp;
// Running average of the gradient with weight b1.
238 double4 mt=
b1*m+(1-
b1)*g;
// Running average of the squared gradient with weight b2.
239 double4 vt=
b2*v+(1-
b2)*pow(g,2);
// Step is l*mt/sqrt(vt) where vt > 0; otherwise the divisor is l*10, giving mt/10.
240 double4 delta=l*mt/(vt>0 ? sqrt(vt) : l*10);
// Write-back of the updated weight and both running averages, lane by lane.
// NOTE(review): the descending order s2, s1, s0 suggests fall-through cases for a partial
// group of weights; the surrounding control flow is not visible — confirm.
244 matrix_w[wi+2]+=delta.s2;
245 matrix_m[wi+2]=mt.s2;
246 matrix_v[wi+2]=vt.s2;
248 matrix_w[wi+1]+=delta.s1;
249 matrix_m[wi+1]=mt.s1;
250 matrix_v[wi+1]=vt.s1;
252 matrix_w[wi]+=delta.s0;
// Second write-back sequence, covering all four lanes (lane 0 first).
257 matrix_w[wi]+=delta.s0;
260 matrix_w[wi+1]+=delta.s1;
261 matrix_m[wi+1]=mt.s1;
262 matrix_v[wi+1]=vt.s1;
263 matrix_w[wi+2]+=delta.s2;
264 matrix_m[wi+2]=mt.s2;
265 matrix_v[wi+2]=vt.s2;
266 matrix_w[wi+3]+=delta.s3;
267 matrix_m[wi+3]=mt.s3;
268 matrix_v[wi+3]=vt.s3;
// Fragment that takes the maximum over a window of values. The kernel header, the computation
// of pos and shift, and the final store are not visible here.
277 __global
double *matrix_o,
283 int i=get_global_id(0);
// Start from the element at pos, then compare with the remaining window-1 elements.
285 double result=matrix_o[pos];
286 for(
int k=1; k<window; k=k+1)
// NOTE(review): the window is read from matrix_o in the visible lines — confirm this is the
// intended source buffer.
291 result=max(result,matrix_o[shift]);
// Fragment that routes gradients back through a windowed maximum: element i receives the
// gradient of every output whose stored value equals its own. The kernel header is not visible here.
300 __global
double *matrix_g,
301 __global
double *matrix_o,
302 __global
double *matrix_ig,
308 int i=get_global_id(0);
309 double prev_gradient=0;
310 double value=matrix_i[i];
// Candidate outputs form the half-open range [start, stop): start is (i-window+step)
// divided by step (integer division written out with %), stop is i/step + 1.
311 int start=i-window+step;
312 start=(start-start%step)/step;
313 int stop=(i-i%step)/step+1;
// The range is limited to valid output indices 0..outputs-1.
314 for(
int out=max(0,start); out<min(outputs,stop); out++)
// Exact floating-point equality decides whether this element supplied the output's value.
316 if(value==matrix_o[out])
317 prev_gradient+=matrix_g[out];
319 matrix_ig[i]=prev_gradient;
// Fragment of a windowed forward pass: for each work-item, w_out results are produced by
// dotting a slice of matrix_i with one run of (w_in+1) weights each. The kernel header, the
// definitions of w_in and shift_in, and the conditions selecting the alternatives below are
// not visible here.
326 __global
double *matrix_i,
327 __global
double *matrix_o,
335 int i=get_global_id(0);
337 int w_out=window_out;
// Work-item i writes w_out consecutive results starting at shift_out.
340 int shift_out=w_out*i;
342 for(
int out=0;out<w_out;out++)
// Result `out` uses the weights starting at (w_in+1)*out.
344 int shift=(w_in+1)*out;
// Number of inputs available to this window: w_in, or fewer when the slice would run past `inputs`.
345 int stop=(w_in<=(inputs-shift_in) ? w_in : (inputs-shift_in));
// k runs from 0 to stop inclusive in steps of 4.
346 for(
int k=0; k<=stop; k=k+4)
// The alternatives below load 0..4 input elements; a constant 1 follows the last loaded
// input and unused lanes are 0.
// NOTE(review): presumably chosen by how many inputs remain at k, with the 1 acting as the
// bias input — confirm against the full kernel.
351 inp=(double4)(1,0,0,0);
352 weight=(double4)(matrix_w[shift+k],0,0,0);
355 inp=(double4)(matrix_i[shift_in+k],1,0,0);
356 weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],0,0);
359 inp=(double4)(matrix_i[shift_in+k],matrix_i[shift_in+k+1],1,0);
360 weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],0);
363 inp=(double4)(matrix_i[shift_in+k],matrix_i[shift_in+k+1],matrix_i[shift_in+k+2],1);
364 weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],matrix_w[shift+k+3]);
367 inp=(double4)(matrix_i[shift_in+k],matrix_i[shift_in+k+1],matrix_i[shift_in+k+2],matrix_i[shift_in+k+3]);
368 weight=(double4)(matrix_w[shift+k],matrix_w[shift+k+1],matrix_w[shift+k+2],matrix_w[shift+k+3]);
// Accumulate the lane-wise products of this pass.
371 sum+=dot(inp,weight);
// Logistic function 1/(1+e^-x); the argument is clamped to [-50, 50] before exp().
379 sum=1/(1+exp(-clamp(sum,-50.0,50.0)));
// Store result `out` of work-item i.
388 matrix_o[out+shift_out]=sum;
// Fragment that accumulates, for input element i, the products of output gradients and the
// weights that connected i to those outputs, then rescales the sum using matrix_o[i].
// The kernel header, the declaration of sum, the bodies of the two `if` statements and the
// selection between the final alternatives are not visible here.
396 __global
double *matrix_g,
397 __global
double *matrix_o,
398 __global
double *matrix_ig,
// i is this work-item's index; `inputs` is taken from the launch size in dimension 0.
406 int i=get_global_id(0);
407 int inputs=get_global_size(0);
409 double out=matrix_o[i];
// Candidate windows form the half-open range [start, stop): start is (i-window_in+step)
// divided by step, stop is i/step + 1.
410 int start=i-window_in+step;
411 start=(start-start%step)/step;
412 int stop=(i-i%step)/step+1;
// Guard for stop exceeding the number of windows (outputs/window_out); its body is not visible.
413 if(stop>(outputs/window_out))
// NOTE(review): h advances by 4, but the visible body only accumulates element h itself
// (scalar multiply at the matrix_g/matrix_w line). Elements h+1..h+3 appear to be skipped —
// confirm whether h++ (or a 4-lane accumulation) was intended.
415 for(
int h=0;h<window_out;h+=4)
417 for(
int k=start;k<stop;k++)
// Weight offset for window k and element h: position inside the window plus h runs of (window_in+1).
419 int shift_w=(stop-k-1)*step+i%step+h*(window_in+1);
420 int shift_g=k*window_out+h;
// Bounds check on both indices; the action taken is not visible here.
421 if(shift_g>=outputs || shift_w>=(window_in+1)*window_out)
423 sum+=matrix_g[k*window_out+h]*matrix_w[shift_w];
// Alternative A: limit out+sum to [-1, 1], then scale by 1 - out^2
// (0.99999999 is squared instead when out is exactly 1 or -1).
429 sum=clamp(sum+out,-1.0,1.0)-out;
430 sum=sum*(1-pow(out==1 || out==-1 ? 0.99999999 : out,2));
// Alternative B: limit out+sum to [0, 1], then scale by out*(1-out)
// (0.00000001 when out is exactly 0 or 1).
433 sum=clamp(sum+out,0.0,1.0)-out;
434 sum=sum*(out==0 || out==1 ? 0.00000001 : (out*(1-out)));
// Fragment of a shared-weight update with momentum: the gradient for weight i is summed over
// `total` window positions before the step is formed. The kernel header, the declaration of
// grad, the body of the `if` guard and the lines storing delta are not visible here.
450 __global
double *matrix_g,
451 __global
double *matrix_i,
452 __global
double *matrix_dw,
454 double learning_rates,
// Weight i is split into its position inside a run of (window_in+1) weights (shift)
// and the index of that run (shift_out).
461 const int i=get_global_id(0);
462 const int shift=i%(window_in+1);
463 const int shift_out=(i-shift)/(window_in+1);
// total = ceil((inputs-window_in)/step), written with % and integer division.
// NOTE(review): another fragment in this file (near original line 498) computes total from
// (inputs-(window_in-step)) instead — one of the two is likely off by one window; confirm.
464 int total=(inputs-window_in)%step;
465 total=(inputs-window_in-total)/step+(total>0 ? 1 : 0);
467 for(
int t=0;t<total;t++)
// NOTE(review): this guard tests shift+t*window_in, while the read below uses shift+t*step.
// When step != window_in the guard does not bound the index actually read — confirm.
469 if(shift!=window_in && (shift+t*window_in)>=inputs)
// For shift == window_in (the last slot of a run) the input factor is the constant 1.
471 grad+=matrix_g[t*window_out+shift_out]*(shift==window_in ? 1 : matrix_i[shift+t*step]);
// delta = learning_rates * accumulated gradient + momentum * matrix_dw[i].
473 double delta=learning_rates*grad +
momentum*matrix_dw[i];
// Fragment of a shared-weight update using two running averages per weight (matrix_m, matrix_v):
// the gradient for weight i is summed over `total` window positions first. The kernel header,
// the declarations of grad, l, b1, b2, the body of the `if` guard and the write-back lines
// are not visible here.
482 __global
const double *matrix_g,
483 __global
const double *matrix_i,
484 __global
double *matrix_m,
485 __global
double *matrix_v,
// Weight i is split into its position inside a run of (window_in+1) weights (shift)
// and the index of that run (shift_out).
495 const int i=get_global_id(0);
496 const int shift=i%(window_in+1);
497 const int shift_out=(i-shift)/(window_in+1);
// total = ceil((inputs-(window_in-step))/step), written with % and integer division.
// NOTE(review): another fragment in this file (near original line 464) uses (inputs-window_in)
// here instead — confirm which window count is correct.
498 int total=(inputs-(window_in-step))%step;
499 total=(inputs-(window_in-step)-total)/step+(total>0 ? 1 : 0);
501 for(
int t=0;t<total;t++)
// NOTE(review): this guard tests shift+t*window_in, while the read below uses shift+t*step.
// When step != window_in the guard does not bound the index actually read — confirm.
503 if(shift!=window_in && (shift+t*window_in)>=inputs)
// For shift == window_in (the last slot of a run) the input factor is the constant 1.
505 grad+=matrix_g[t*window_out+shift_out]*(shift==window_in ? 1 : matrix_i[shift+t*step]);
// Running average of the gradient with weight b1.
507 double mt=
b1*matrix_m[i]+(1-
b1)*grad;
// Running average of the squared gradient with weight b2.
508 double vt=
b2*matrix_v[i]+(1-
b2)*pow(grad,2);
// Step is l*mt/sqrt(vt); when vt is not positive the step is 0.
509 double delta=(vt>0 ? l*mt/sqrt(vt) : 0);
// Fragment that fills one row of `score`: exponentiated, scaled dot products between row q of
// querys and every row of keys, then divided by `sum`. The kernel header, the declarations of
// result, sum and shift_s, and the line that accumulates sum are not visible here.
520 __global
double *keys,
521 __global
double *score,
// One work-item per query row q; `units` is the launch size in dimension 0.
525 int q=get_global_id(0);
526 int shift_q=q*dimension;
527 int units=get_global_size(0);
// Scaling factor applied to every dot product before exp().
// NOTE(review): this is sqrt(units*dimension); scaled dot-product attention is usually
// scaled by sqrt(dimension) alone — confirm the choice is deliberate.
529 double koef=sqrt((
double)(units*dimension));
533 for(
int k=0;k<units;k++)
536 int shift_k=k*dimension;
// Dot product of query row q with key row k.
537 for(
int i=0;i<dimension;i++)
538 result+=(querys[shift_q+i]*keys[shift_k+i]);
// The scaled value is clamped to [-500, 500] before exponentiation.
539 result=exp(clamp(result/koef,-500.0,500.0));
540 score[shift_s+k]=result;
// Second pass: divide every entry of the row by sum.
// NOTE(review): presumably sum is the row total, making this a softmax — the accumulation
// line is not visible; confirm.
543 for(
int k=0;k<units;k++)
544 score[shift_s+k]/=sum;
// Fragment that computes one element of out: the scores-weighted sum of a column of values,
// plus the matching element of inputs. The kernel header and the declaration of result
// are not visible here.
552 __global
double *values,
553 __global
double *inputs,
// Two-dimensional launch: u indexes dimension 0 (size `units`), d indexes dimension 1
// (size `dimension`).
557 int units=get_global_size(0);
558 int u=get_global_id(0);
559 int d=get_global_id(1);
560 int dimension=get_global_size(1);
// Flat index of element (u, d).
561 int shift=u*dimension+d;
// result += scores[u][i] * values[i][d] over all i.
563 for(
int i=0;i<units;i++)
564 result+=scores[u*units+i]*values[i*dimension+d];
// The element of inputs at the same position is added to the weighted sum.
565 out[shift]=result+inputs[shift];
// Fragment that writes (matrix1 + matrix2) * multiplyer element-wise into matrix_out.
// The kernel header is not visible here.
573 __global
double *matrix2,
574 __global
double *matrix_out,
// Each work-item handles `dimension` consecutive elements starting at i.
579 const int i=get_global_id(0)*dimension;
580 for(
int k=0;k<dimension;k++)
581 matrix_out[i+k]=(matrix1[i+k]+matrix2[i+k])*multiplyer;
// Fragment that accumulates gradient terms for element (u, d) of the attention inputs.
// The kernel header, the declarations of sqg, skg, qg, kg, any value-gradient accumulation
// using g and sc, and the final stores are not visible here.
597 __global
double *keys,__global
double *keys_g,
598 __global
double *values,__global
double *values_g,
599 __global
double *scores,
600 __global
double *gradient)
// Two-dimensional launch: u indexes dimension 0 (size `units`), d indexes dimension 1
// (size `dimension`).
602 int u=get_global_id(0);
603 int d=get_global_id(1);
604 int units=get_global_size(0);
605 int dimension=get_global_size(1);
// Same scaling factor form as used when dividing by koef below.
606 double koef=sqrt((
double)(units*dimension));
613 for(
int iu=0;iu<units;iu++)
// Gradient at (iu, d) and the score at row iu, column u.
615 double g=gradient[iu*dimension+d];
616 double sc=scores[iu*units+u];
// Two dot products over the feature index id:
// sqg pairs values row iu with gradient row u; skg pairs values row u with gradient row iu.
621 for(
int id=0;
id<dimension;
id++)
623 sqg+=values[iu*dimension+id]*gradient[u*dimension+id];
624 skg+=values[u*dimension+id]*gradient[iu*dimension+id];
// qg term: factor s*(1-s) with s = scores[u][iu] (0.0001 when s is exactly 0 or 1),
// times sqg, times keys[iu][d], divided by koef.
626 qg+=(scores[u*units+iu]==0 || scores[u*units+iu]==1 ? 0.0001 : scores[u*units+iu]*(1-scores[u*units+iu]))*sqg*keys[iu*dimension+d]/koef;
// kg term: factor s*(1-s) with s = scores[iu][u] (0.0001 when s is exactly 0 or 1),
// times skg, times querys[iu][d], divided by koef.
628 kg+=(scores[iu*units+u]==0 || scores[iu*units+u]==1 ? 0.0001 : scores[iu*units+u]*(1-scores[iu*units+u]))*skg*querys[iu*dimension+d]/koef;
// Flat index of element (u, d), presumably used for the stores that follow (not visible).
630 int shift=u*dimension+d;
// Fragment that normalizes one run of `dimension` elements of buffer in place: subtract the
// mean and divide by the standard deviation. The kernel header, the declarations of mean and
// variance, and the step that turns the accumulated mean into an average are not visible here.
// Work-item n handles elements [n*dimension, (n+1)*dimension).
645 int n=get_global_id(0);
646 int shift=n*dimension;
// Sum of the run's elements.
648 for(
int i=0;i<dimension;i++)
649 mean+=buffer[shift+i];
// Sum of squared deviations from mean.
652 for(
int i=0;i<dimension;i++)
653 variance+=pow(buffer[shift+i]-mean,2);
// From here on `variance` holds the square root of the mean squared deviation.
654 variance=sqrt(variance/dimension);
// A zero divisor is replaced by 1 so constant runs map to zeros instead of dividing by zero.
655 for(
int i=0;i<dimension;i++)
656 buffer[shift+i]=(buffer[shift+i]-mean)/(variance==0 ? 1 : variance);
// Fragment that divides one run of `dimension` elements of buffer by a value derived from
// their squares. The kernel header, the declaration of sum and any step between the two
// loops are not visible here.
// Work-item n handles elements [n*dimension, (n+1)*dimension).
668 int n=get_global_id(0);
669 int shift=n*dimension;
// NOTE(review): this assigns with '=' rather than accumulating with '+=', so after the loop
// sum holds only the square of the LAST element of the run. A sum of squares (for a norm)
// looks intended — confirm and fix in the full kernel.
671 for(
int i=0;i<dimension;i++)
672 sum=pow(buffer[shift+i],2);
// NOTE(review): no guard against sum == 0 is visible before this division.
674 for(
int i=0;i<dimension;i++)
675 buffer[shift+i]/=sum;