I0216 16:14:26.759470 2475 caffe.cpp:391] Use GPU with device ID 0
I0216 16:14:26.773934 2475 device.cpp:62] CL_DEVICE_HOST_UNIFIED_MEMORY: 1
Build Status = -2 ( Err = -11 )
Log: 1:37:26: warning: OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
^
1:59:26: warning: OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
^
fcl build 1 succeeded.
error: undefined reference to `_Z12atom_cmpxchgPVU3AS1mmm()'
error: backend compiler failed build.
Sources:
// ---------------------------------------------------------------------------
// header.cl prelude: host-side stubs + extension / type configuration.
// NOTE(review): the backend link error above is an unresolved 64-bit
// atom_cmpxchg; presumably the device lacks cl_khr_int64_base_atomics while
// some (not-shown) kernel uses a 64-bit atomic — confirm against the full
// program source.
// ---------------------------------------------------------------------------
#define ENABLE_DOUBLE_SUPPORT
// When compiled outside an OpenCL compiler (e.g. for host-side syntax
// checking), stub out all OpenCL qualifiers and builtins.
#ifndef __OPENCL_VERSION__
#define __kernel
#define __global
#define __constant
#define __local
#define get_global_id(x) 0
#define get_global_size(x) 0
#define get_local_id(x) 0
#define get_local_size(x) 0
#define FLT_MAX 0
#define FLT_MIN 0
#define cl_khr_fp64
#define cl_amd_fp64
#ifndef DISABLE_DOUBLE_SUPPORT
#define DOUBLE_SUPPORT_AVAILABLE
#endif //DISABLE_DOUBLE_SUPPORT
#define CLK_LOCAL_MEM_FENCE
#define CLK_GLOBAL_MEM_FENCE
#define Dtype float
#define barrier(x)
#define atomic_cmpxchg(x, y, z) x
#define signbit(x) x
#define int_tp long
#define uint_tp unsigned long
#define int_tpc long
#define uint_tpc unsigned long
#endif
// TEMPLATE(name, type) expands to name_type so one source serves float/double.
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define TYPE_FLOAT 1
#define TYPE_DOUBLE 2
// Enable fp64 via whichever extension the device exposes.
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#ifndef DISABLE_DOUBLE_SUPPORT
#define DOUBLE_SUPPORT_AVAILABLE
#endif //DISABLE_DOUBLE_SUPPORT
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#ifndef DISABLE_DOUBLE_SUPPORT
#define DOUBLE_SUPPORT_AVAILABLE
#endif //DISABLE_DOUBLE_SUPPORT
#endif
// Enable whatever integer atomics the device reports.
#if defined(cl_khr_int64_base_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define ATOMICS_64_AVAILABLE
#endif
#if defined(cl_khr_int32_base_atomics)
#pragma OPENCL EXTENSION cl_khr_int32_base_atomics : enable
#define ATOMICS_32_AVAILABLE
#endif
#if defined(cl_khr_global_int32_base_atomics)
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define ATOMICS_32_AVAILABLE
#endif
#define ENABLE_DOUBLE_SUPPORT
// Types used for parameters, offset computations and so on
#define int_tp int
#define uint_tp unsigned int
// Definitions used to cast the types above as needed
#define int_tpc int
#define uint_tpc unsigned int
// This program instance is the float (TYPE_FLOAT) specialization.
#define Dtype float
#define Dtype2 float2
#define Dtype4 float4
#define Dtype8 float8
#define Dtype16 float16
#define TYPE TYPE_FLOAT
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// ---------------------------------------------------------------------------
// Activation kernels. All use a grid-stride style loop over n elements:
// index starts at get_global_id(0) and advances by get_global_size(0).
// ---------------------------------------------------------------------------
// Leaky ReLU forward: out = in > 0 ? in : in * negative_slope.
__kernel void TEMPLATE(relu_forward,Dtype)(const int_tp n,
                                           __global const Dtype* in,
                                           __global Dtype* out,
                                           Dtype negative_slope) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
  }
}

// Leaky ReLU backward: scales the incoming gradient by 1 where the forward
// input was positive, by negative_slope otherwise.
__kernel void TEMPLATE(relu_backward,Dtype)(const int_tp n,
                                            __global const Dtype* in_diff,
                                            __global const Dtype* in_data,
                                            __global Dtype* out_diff,
                                            Dtype negative_slope) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    out_diff[index] = in_diff[index]
        * ((in_data[index] > 0?1.0:0.0)
           + (in_data[index] <= 0?1.0:0.0) * negative_slope);
  }
}

// Elementwise tanh.
__kernel void TEMPLATE(tanh_forward,Dtype)(const int_tp n,
                                           __global const Dtype* in,
                                           __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    out[index] = tanh(in[index]);
  }
}

// tanh backward, using the forward *output*: d/dx tanh = 1 - tanh^2.
__kernel void TEMPLATE(tanh_backward,Dtype)(const int_tp n,
                                            __global const Dtype* in_diff,
                                            __global const Dtype* out_data,
                                            __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    Dtype tanhx = out_data[index];
    out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
  }
}

// Elementwise logistic sigmoid: 1 / (1 + exp(-x)).
__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int_tp n,
                                              __global const Dtype* in,
                                              __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    out[index] = 1.0 / (1.0 + exp(-in[index]));
  }
}

// Sigmoid backward, using the forward output: s * (1 - s).
__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int_tp n,
                                               __global const Dtype* in_diff,
                                               __global const Dtype* out_data,
                                               __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    const Dtype sigmoid_x = out_data[index];
    out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
  }
}

// Binary threshold: out = (in > threshold) ? 1 : 0.
__kernel void TEMPLATE(threshold,Dtype)(const int_tp n, const Dtype threshold,
                                        __global const Dtype* in,
                                        __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    out[index] = in[index] > threshold ? 1.0 : 0.0;
  }
}

// PReLU forward: per-channel learned slope; div_factor folds channel-shared
// slopes (channel index is (index / dim) % channels / div_factor).
__kernel void TEMPLATE(prelu_forward,Dtype)(const int_tp n,
                                            const int_tp channels,
                                            const int_tp dim,
                                            __global const Dtype* in,
                                            __global Dtype* out,
                                            __global const Dtype* slope_data,
                                            const int_tp div_factor) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    int_tp c = (index / dim) % channels / div_factor;
    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
  }
}

// PReLU backward w.r.t. the bottom data.
__kernel void TEMPLATE(prelu_backward,Dtype)(const int_tp n,
                                             const int_tp channels,
                                             const int_tp dim,
                                             __global const Dtype* in_diff,
                                             __global const Dtype* in_data,
                                             __global Dtype* out_diff,
                                             __global const Dtype* slope_data,
                                             const int_tp div_factor) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    int_tp c = (index / dim) % channels / div_factor;
    out_diff[index] = in_diff[index]
        * ((Dtype)(in_data[index] > 0?1.0:0.0)
           + (Dtype)(in_data[index] <= 0?1.0:0.0) * slope_data[c]);
  }
}

// PReLU backward w.r.t. the slope parameter: each work item reduces its
// column over `rows` batch items (stride rowPitch), accumulating
// in_diff * in_data where in_data <= 0.
__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int_tp n,
                                                   const int_tp rows,
                                                   const int_tp rowPitch,
                                                   __global const Dtype* in_diff,
                                                   __global const Dtype* in_data,
                                                   __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    out_diff[index] = in_diff[index] * in_data[index]
        * (in_data[index] <= 0?1.0:0.0);
    for (int k = 1; k < rows; k++) {
      out_diff[index] += in_diff[index + k * rowPitch]
          * in_data[index + k * rowPitch]
          * (in_data[index + k * rowPitch] <= 0?1.0:0.0);
    }
  }
}

// Sigmoid cross-entropy loss forward (numerically-stable log1p(exp) form).
// Entries matching ignore_label_ contribute 0 loss and 0 count.
__kernel void TEMPLATE(sce_loss_forward,Dtype)(const int_tp nthreads,
                                               __global const Dtype* input_data,
                                               __global const Dtype* target,
                                               __global Dtype* loss,
                                               const int_tp has_ignore_label_,
                                               const int_tp ignore_label_,
                                               __global Dtype* counts) {
  for (int_tp i = get_global_id(0); i < nthreads; i += get_global_size(0)) {
    const int_tp target_value = (int_tp)(target[i]);
    if (has_ignore_label_ == 1 && target_value == ignore_label_) {
      loss[i] = 0.0;
      counts[i] = 0.0;
    } else {
      loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0.0))
          - log((Dtype)1.0 + exp(input_data[i]
                - (Dtype)2.0 * input_data[i] * (input_data[i] >= 0.0)));
      counts[i] = 1.0;
    }
  }
}

// Zero out diffs for entries whose target equals ignore_label.
__kernel void TEMPLATE(sce_loss_ignore_diff,Dtype)(const int_tp count,
                                                   const int_tp ignore_label,
                                                   __global const Dtype* target,
                                                   __global Dtype* diff) {
  for (int_tp i = get_global_id(0); i < count; i += get_global_size(0)) {
    const int_tp target_value = (int_tp)(target[i]);
    if (target_value == ignore_label) {
      diff[i] = 0.0;
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// Fill y[0..n) with the constant alpha.
__kernel void TEMPLATE(gpu_set,Dtype)(const int_tp n, const Dtype alpha,
                                      __global Dtype* y) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    y[index] = alpha;
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// Batch-norm inference (global stats), in place: one work item per
// (num, channel, spatial) element via a 3-D NDRange; normalizes
// top = (top - scale*mean) / sqrt(scale*var + eps).
__kernel void TEMPLATE(batch_norm_use_global_stats_in_place,Dtype)(
    const int_tp num, const int_tp channels, const int_tp spatial_dim,
    const Dtype scale, const Dtype eps,
    __global const Dtype* mean, __global const Dtype* variance,
    __global Dtype* top) {
  const int_tp idx_num = get_global_id(0);
  const int_tp idx_chans = get_global_id(1);
  const int_tp idx_spatial_dim = get_global_id(2);
  Dtype m = mean[idx_chans];
  Dtype v = variance[idx_chans];
  m = -scale * m;
  // native_powr(x, -0.5) == 1/sqrt(x); NOTE(review): native_* precision is
  // implementation-defined.
  v = (Dtype)native_powr((float)mad(scale, v, eps), (float)-0.5);
  const int_tp out_off = (idx_num * channels + idx_chans) * spatial_dim
      + idx_spatial_dim;
  top[out_off] = v * (top[out_off] + m);
}

// Same as above, but reads from bottom and writes to top (out of place).
__kernel void TEMPLATE(batch_norm_use_global_stats,Dtype)(
    const int_tp num, const int_tp channels, const int_tp spatial_dim,
    const Dtype scale, const Dtype eps,
    __global const Dtype* mean, __global const Dtype* variance,
    __global const Dtype* bottom, __global Dtype* top) {
  const int_tp idx_num = get_global_id(0);
  const int_tp idx_chans = get_global_id(1);
  const int_tp idx_spatial_dim = get_global_id(2);
  Dtype m = mean[idx_chans];
  Dtype v = variance[idx_chans];
  m = -scale * m;
  v = (Dtype)native_powr((float)mad(scale, v, eps), (float)-0.5);
  const int_tp out_off = (idx_num * channels + idx_chans) * spatial_dim
      + idx_spatial_dim;
  top[out_off] = v * (bottom[out_off] + m);
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// Batch-reindex forward: out[n, :] = in[permut[n], :] with inner_dim-wide rows.
__kernel void TEMPLATE(br_forward,Dtype)(const int_tp count,
                                         const int_tp inner_dim,
                                         __global const Dtype* in,
                                         __global const Dtype* permut,
                                         __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < count;
       index += get_global_size(0)) {
    int_tp n = index / (inner_dim);
    int_tp in_n = (int_tp) (permut[n]);
    out[index] = in[in_n * (inner_dim) + index % (inner_dim)];
  }
}

// Batch-reindex backward: sums gradients from every top row that selected
// this bottom row (rows listed in top_indexes[begins[n] .. begins[n]+counts[n])).
__kernel void TEMPLATE(br_backward,Dtype)(const int_tp count,
                                          const int_tp inner_dim,
                                          __global const Dtype* in,
                                          __global const Dtype* top_indexes,
                                          __global const Dtype* begins,
                                          __global const Dtype* counts,
                                          __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < count;
       index += get_global_size(0)) {
    int_tp n = index / (inner_dim);
    out[index] = 0;
    int_tp lower = (int_tp) (begins[n]);
    int_tp upper = lower + (int_tp) (counts[n]);
    for (int_tp i = lower; i < upper; ++i) {
      int_tp in_n = (int_tp) (top_indexes[i]);
      out[index] += in[in_n * (inner_dim) + index % (inner_dim)];
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// No-op kernel (keeps the queue/compiler busy without touching memory).
__kernel void TEMPLATE(null_kernel,Dtype)(Dtype arg) {
  Dtype out = arg;
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// Broadcast-add a per-channel bias: out = in + bias[(index/inner_dim) % bias_dim].
__kernel void TEMPLATE(bias_forward,Dtype)(const int_tp n,
                                           __global const Dtype* in,
                                           __global const Dtype* bias,
                                           const int_tp bias_dim,
                                           const int_tp inner_dim,
                                           __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    const int_tp bias_index = (index / inner_dim) % bias_dim;
    out[index] = in[index] + bias[bias_index];
  }
}

// Broadcast-multiply by a per-channel scale.
__kernel void TEMPLATE(scale_forward,Dtype)(const int_tp n,
                                            __global const Dtype* in,
                                            __global const Dtype* scale,
                                            const int_tp scale_dim,
                                            const int_tp inner_dim,
                                            __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    const int_tp scale_index = (index / inner_dim) % scale_dim;
    out[index] = in[index] * scale[scale_index];
  }
}

// Fused scale + bias: out = in * scale[c] + bias[c].
__kernel void TEMPLATE(scale_bias_forward,Dtype)(const int_tp n,
                                                 __global const Dtype* in,
                                                 __global const Dtype* scale,
                                                 __global const Dtype* bias,
                                                 const int_tp scale_dim,
                                                 const int_tp inner_dim,
                                                 __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    const int_tp scale_index = (index / inner_dim) % scale_dim;
    out[index] = in[index] * scale[scale_index] + bias[scale_index];
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// BNLL forward: log(1 + exp(x)), branch keeps exp() argument non-positive
// for numerical stability.
__kernel void TEMPLATE(bnll_forward,Dtype)(const int_tp n,
                                           __global const Dtype* in,
                                           __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    if (in[index] > 0.0f) {
      out[index] = in[index] + log((Dtype) (1.0 + exp(-in[index])));
    } else {
      out[index] = log((Dtype) (1.0 + exp(in[index])));
    }
  }
}

// BNLL backward: sigmoid(x) * in_diff, with exp clamped at 50 to avoid overflow.
__kernel void TEMPLATE(bnll_backward,Dtype)(const int_tp n,
                                            __global const Dtype* in_diff,
                                            __global const Dtype* in_data,
                                            __global Dtype* out_diff) {
  Dtype kBNLL_THRESHOLD = 50.;
  for (int_tp index = get_global_id(0); index < n;
       index += get_global_size(0)) {
    Dtype expval = exp(min(in_data[index], kBNLL_THRESHOLD));
    out_diff[index] = in_diff[index] * expval / (expval + 1.);
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// Softmax helper: per-(n, s) maximum over the channel axis.
__kernel void TEMPLATE(kernel_channel_max,Dtype)(const int_tp num,
                                                 const int_tp channels,
                                                 const int_tp spatial_dim,
                                                 __global const Dtype* data,
                                                 __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < num * spatial_dim;
       index += get_global_size(0)) {
    int_tp n = index / spatial_dim;
    int_tp s = index % spatial_dim;
    float maxval = -FLT_MAX;
    for (int_tp c = 0; c < channels; ++c) {
      maxval = max((Dtype)(data[(n * channels + c) * spatial_dim + s]),
                   (Dtype)maxval);
    }
    out[index] = maxval;
  }
}

// Softmax helper: subtract the per-(n, s) channel max from every element.
__kernel void TEMPLATE(kernel_channel_subtract,Dtype)(const int_tp count,
                                                      const int_tp num,
                                                      const int_tp channels,
                                                      const int_tp spatial_dim,
                                                      __global const Dtype* channel_max,
                                                      __global Dtype* data) {
  for (int_tp index = get_global_id(0); index < count;
       index += get_global_size(0)) {
    int_tp n = index / channels / spatial_dim;
    int_tp s = index % spatial_dim;
    data[index] -= channel_max[n * spatial_dim + s];
  }
}

// Elementwise exp.
__kernel void TEMPLATE(kernel_exp,Dtype)(const int_tp count,
                                         __global const Dtype* data,
                                         __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < count;
       index += get_global_size(0)) {
    out[index] = exp(data[index]);
  }
}

// Softmax helper: per-(n, s) sum over the channel axis.
__kernel void TEMPLATE(kernel_channel_sum,Dtype)(const int_tp num,
                                                 const int_tp channels,
                                                 const int_tp spatial_dim,
                                                 __global const Dtype* data,
                                                 __global Dtype* channel_sum) {
  for (int_tp index = get_global_id(0); index < num * spatial_dim;
       index += get_global_size(0)) {
    int_tp n = index / spatial_dim;
    int_tp s = index % spatial_dim;
    Dtype sum = 0;
    for (int_tp c = 0; c < channels; ++c) {
      sum += data[(n * channels + c) * spatial_dim + s];
    }
    channel_sum[index] = sum;
  }
}

// Softmax helper: divide every element by its (n, s) channel sum.
__kernel void TEMPLATE(kernel_channel_div,Dtype)(const int_tp count,
                                                 const int_tp num,
                                                 const int_tp channels,
                                                 const int_tp spatial_dim,
                                                 __global const Dtype* channel_sum,
                                                 __global Dtype* data) {
  for (int_tp index = get_global_id(0); index < count;
       index += get_global_size(0)) {
    int_tp n = index / channels / spatial_dim;
    int_tp s = index % spatial_dim;
    data[index] /= channel_sum[n * spatial_dim + s];
  }
}

// Softmax backward helper: per-(n, s) dot product across channels.
__kernel void TEMPLATE(kernel_channel_dot,Dtype)(const int_tp num,
                                                 const int_tp channels,
                                                 const int_tp spatial_dim,
                                                 __global const Dtype* data_1,
                                                 __global const Dtype* data_2,
                                                 __global Dtype* channel_dot) {
  for (int_tp index = get_global_id(0); index < num * spatial_dim;
       index += get_global_size(0)) {
    int_tp n = index / spatial_dim;
    int_tp s = index % spatial_dim;
    Dtype dot = 0;
    for (int_tp c = 0; c < channels; ++c) {
      dot += (data_1[(n * channels + c) * spatial_dim + s]
          * data_2[(n * channels + c) * spatial_dim + s]);
    }
    channel_dot[index] = dot;
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// Concat layer copy: forward==1 scatters bottom into the concatenated top;
// otherwise gathers the matching slice of top back into bottom.
__kernel void TEMPLATE(concat,Dtype)(const int_tp nthreads,
                                     __global const Dtype* in_data,
                                     const int forward,
                                     const int_tp num_concats,
                                     const int_tp concat_size,
                                     const int_tp top_concat_axis,
                                     const int_tp bottom_concat_axis,
                                     const int_tp offset_concat_axis,
                                     __global Dtype* out_data) {
  for (int_tp index = get_global_id(0); index < nthreads;
       index += get_global_size(0)) {
    const int_tp total_concat_size = concat_size * bottom_concat_axis;
    const int_tp concat_num = index / total_concat_size;
    const int_tp concat_index = index % total_concat_size;
    const int_tp top_index = concat_index
        + (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
    if (forward == 1) {
      out_data[top_index] = in_data[index];
    } else {
      out_data[index] = in_data[top_index];
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// Contrastive loss backward (margin on distance): similar pairs pull together,
// dissimilar pairs push apart while closer than `margin`.
__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count,
                                           const int_tp channels,
                                           const Dtype margin,
                                           const Dtype alpha,
                                           __global const Dtype* y,
                                           __global const Dtype* diff,
                                           __global const Dtype* dist_sq,
                                           __global Dtype *bottom_diff) {
  for (int_tp i = get_global_id(0); i < count; i += get_global_size(0)) {
    int_tp n = i / channels;  // the num index, to access y and dist_sq
    if (trunc(y[n]) != 0.) {  // similar pairs
      bottom_diff[i] = alpha * diff[i];
    } else {  // dissimilar pairs
      Dtype mdist = 0.;
      Dtype beta = 0.;
      Dtype dist = sqrt(dist_sq[n]);
      mdist = (margin - dist);
      // 1e-4 guards against division by zero when dist == 0.
      beta = -alpha * mdist / (dist + 1e-4) * diff[i];
      if (mdist > 0.) {
        bottom_diff[i] = beta;
      } else {
        bottom_diff[i] = 0;
      }
    }
  }
}

// Legacy contrastive loss backward (margin on squared distance).
__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count,
                                                  const int channels,
                                                  const Dtype margin,
                                                  const Dtype alpha,
                                                  __global Dtype* y,
                                                  __global Dtype* diff,
                                                  __global Dtype* dist_sq,
                                                  __global Dtype* bottom_diff) {
  for (int_tp i = get_global_id(0); i < count; i += get_global_size(0)) {
    int n = i / channels;  // the num index, to access y and dist_sq
    if (trunc(y[n]) != 0.) {  // similar pairs
      bottom_diff[i] = alpha * diff[i];
    } else {  // dissimilar pairs
      Dtype mdist = 0.;
      Dtype beta = 0.;
      mdist = (margin - dist_sq[n]);
      beta = -alpha;
      if (mdist > 0.) {
        bottom_diff[i] = beta;
      } else {
        bottom_diff[i] = 0;
      }
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
// Phony kernel so the spatial-conv program is never empty.
__kernel void TEMPLATE(conv_layer_spatial_phony,Dtype)(Dtype arg) {
  Dtype out = arg;
}
// Store helper: plain write (activation fusion point).
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_) do { (_dst_)[(_offset_)] = (_data_);} while(0)
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
// LOOP(N, VAR, STMT) manually unrolls STMT N times (N must be 0..16).
#define LOOP0(VAR, STMT)
#define LOOP1(VAR, STMT) (STMT); (VAR)++;
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#ifdef MULTI
// Direct convolution: each work item computes one output pixel for ZPAR
// consecutive output channels (z dimension is packed by ZPAR).
__kernel void CFMultiNoPadding(
    __global Dtype* image_data, int_tp image_offset,
    __global Dtype* kernel_data, int_tp kernel_offset,
    __global Dtype* bias, const int_tp bias_offset,
    __global Dtype* convolved_image, const int_tp convolved_image_offset,
    const ushort input_width, const ushort input_height,
    const ushort output_width, const ushort output_height,
    const ushort pad_w, const ushort pad_h) {
  const int_tp outputX = get_global_id(0);
  const int_tp outputY = get_global_id(1);
  const int_tp kernelNum = get_global_id(2)*ZPAR;
  if(outputX < output_width && outputY < output_height) {
    Dtype sum[ZPAR];
    for(int_tp kern =0; kern < ZPAR; kern++) {
      sum[kern] = 0.0f;
    }
    // Top-left input coordinate of this output pixel's receptive field
    // (may be negative because of padding).
    const int_tp org_y = outputY * STRIDE_H - pad_h;
    const int_tp org_x = outputX * STRIDE_W - pad_w;
    const int_tp currentKernelOffset = kernel_offset
        + kernelNum*KERNEL_H*KERNEL_W*CHANNELS;
    const int_tp biasIndex=bias_offset + kernelNum;
    const int_tp local_image_offset = org_y*input_width + org_x;
    const int_tp imageSize = input_width*input_height;
    __global Dtype* image_dataPtrFloat =
        (image_data + (image_offset + local_image_offset));
    __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
    for(int_tp c = 0; c < CHANNELS; c++) {
      for(int_tp y = 0; y < KERNEL_H; y++) {
        for(int_tp x = 0; x < KERNEL_W; x++) {
          // Skip taps that fall outside the (unpadded) input.
          if(!(org_y + y * DILATION_Y >= 0
               && org_y + y * DILATION_Y < input_height
               && org_x + x * DILATION_X >= 0
               && org_x + x * DILATION_X < input_width)) {
            continue;
          }
          for(int_tp kern =0; kern < ZPAR; kern++) {
            sum[kern] += image_dataPtrFloat[x * DILATION_X]
                * kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS + x];
          }
        }
        image_dataPtrFloat += input_width * DILATION_Y;
        kernel_dataPtrFloat += KERNEL_W;
      }
      // Rewind to the top of the window and advance one input channel.
      image_dataPtrFloat += imageSize - input_width*KERNEL_H*DILATION_Y;
    }
    if(APPLY_BIAS == 1) {
      for(int_tp kern = 0; kern < ZPAR; kern++) {
        if(kernelNum+kern < OUTPUT_Z) {
          int_tp offset = convolved_image_offset
              + (kernelNum+kern)*output_height*output_width
              + outputY*output_width + outputX;
          ACTIVATION_FUNCTION(convolved_image, offset,
                              sum[kern] + bias[biasIndex +kern]);
        }
      }
    } else {
      for(int_tp kern = 0; kern < ZPAR; kern++) {
        if(kernelNum+kern < OUTPUT_Z) {
          int_tp offset = convolved_image_offset
              + (kernelNum+kern)*output_height*output_width
              + outputY*output_width + outputX;
          ACTIVATION_FUNCTION(convolved_image, offset, sum[kern]);
        }
      }
    }
  }
}
#endif
//Begin IDLF kernels below here
#ifdef IDLF
#define activation_function(x) (x)
#define OUT_BLOCK_SIZE (OUT_BLOCK_WIDTH*OUT_BLOCK_HEIGHT)
// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the imput image.
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16/8 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
kernel void convolve_simd(
    // __global float *inputs, __global float* weights, __global float* outputs
    __global float* inputs_base, filter_qualifier float* weights_base,
    __global float* biases_base, __global float* outputs_base,
    const ushort input_width, const ushort input_height,
    const ushort output_width, const ushort output_height) {
  __global float* outputs = outputs_base;
  __global float* inputs = inputs_base;
  filter_qualifier float* weights = weights_base;
  __global float* biases = biases_base;
  uint_tp oc = get_global_id(0) * OUT_BLOCK_WIDTH;   // oc = Output Column
  uint_tp or = get_global_id(1) * OUT_BLOCK_HEIGHT;  // or = Output Row
  uint_tp fm = get_global_id(2);  // fm = Feature Map = od = Output Depth
  uint_tp fmg = get_group_id(2);
  uint_tp lid = get_local_id(2);
  float out[OUT_BLOCK_SIZE];
  int_tp in_addr;
  // find weights adress of given neuron (lid is index)
  uint_tp weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE))
      * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
  // NOTE(review): the pasted source is GARBLED from here — everything between
  // "for(int_tp i=0;i" and "= INPUT_PAD_H" (the out[] zero-init loop, the
  // in_addr / num_in_batch setup, the in_buf union declaration and the patch
  // loop headers) was eaten, presumably by HTML stripping of a "<...>" span.
  // TODO: recover the original text from the generating .cl file before
  // treating this listing as authoritative.
  for(int_tp i=0;i= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H
      && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) {
    if (curr_x < INPUT_PAD_W) {
      // Left edge: zero the lanes that fall inside the padding.
      in_buf.in_vec[reg].s0 = 0;
      if (curr_x + 1 >= INPUT_PAD_W)
        in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1);
      else
        in_buf.in_vec[reg].s1 = 0;
      if (curr_x + 2 >= INPUT_PAD_W)
        in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
      else
        in_buf.in_vec[reg].s2 = 0;
      in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
    } else {
      in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset);  // read SIMD_SIZE elements
      // Right edge: zero the lanes past the padded input width.
      if (curr_x + 1 >= input_width + INPUT_PAD_W)
        in_buf.in_vec[reg].s1 = 0;
      if (curr_x + 2 >= input_width + INPUT_PAD_W)
        in_buf.in_vec[reg].s2 = 0;
      if (curr_x + 3 >= input_width + INPUT_PAD_W)
        in_buf.in_vec[reg].s3 = 0;
    }
  } else {
    // Entire float4 is outside the padded input: contribute zeros.
    in_buf.in_vec[reg] = 0;
  }
  curr_y += TILE_Y_STRIDE;
#else
  in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset);  // read SIMD_SIZE elements
#endif
  in_offset += input_width * TILE_Y_STRIDE;
  });
  in_addr += input_height * input_width;
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
  curr_y = saved_y;
#endif
// Prefetch 8 weights at a time via sub-group block reads, unless the kernel
// is 1x1 (then a single scalar read suffices).
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
#define WEIGHT_PREF 8
#else
#define WEIGHT_PREF 1
#endif
  union {
    float w[WEIGHT_PREF];
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
    uint8 ui8;
#endif
  } weight_buf;
  int_tp w_idx=0;
  uint_tp orig_weight_addr = weight_addr;
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
  weight_buf.ui8 =
      intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
  weight_addr += SIMD_SIZE * WEIGHT_PREF;
#else
  weight_buf.w[0] = as_float(
      intel_sub_group_block_read((__global uint *)&weights[weight_addr]));
  weight_addr += SIMD_SIZE * 1;
#endif
// BLOCK_IN(n): fetch input element n of the tile from another work item's
// in_buf lanes via sub_group_broadcast.
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))
  int_tp kr = 0;  // kr = Kernel Row
  LOOP(KERNEL_HEIGHT, kr,  // LOOP is a macro that unrolls the loop.
  {
    int_tp kc = 0;  // kc = Kernel Column
    LOOP(KERNEL_WIDTH, kc,
    {
      for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) {
        for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
          float input = BLOCK_IN((br * STRIDEY + kr * DILATION_Y) * TILE_X
              + bc * STRIDEX + kc * DILATION_X);
          out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF],
              input, out[br * OUT_BLOCK_WIDTH + bc]);
        }
      }
#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF
      // We assume KERNEL_W is equal to KERNEL_H here.
      if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0
          && ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))
#endif
      ) {
        weight_buf.ui8 =
            intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
        weight_addr += SIMD_SIZE * WEIGHT_PREF;
        // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
      }
#if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0
      // need to do nothing
#else
      // Tail load: fewer than 8 weights remain; pick the matching block read.
      else if ((w_idx + 1) % WEIGHT_PREF == 0
               && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)))
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1
        weight_buf.w[0] = weights[weight_addr];
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2
        weight_buf.ui8.s01 =
            intel_sub_group_block_read2((__global uint *)&weights[weight_addr]);
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4
        weight_buf.ui8.s0123 =
            intel_sub_group_block_read4((__global uint *)&weights[weight_addr]);
#else
        weight_buf.ui8 =
            intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
#endif
#endif
#endif
      ++w_idx;
    });
  });
  weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE;
  }
  // dead code to work around possible compiler bug.
  if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) {
    outputs[0] = BLOCK_IN(fm % SIMD_SIZE);
  }
  fm = fm % ALIGNED_NUM_FILTERS;
  if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
    uint_tp out_addr = OUT_BUFF_OFFSET
        + ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
    out_addr += or * output_width + oc;
    float bias = biases[fm];
    for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
      if (r + or >= output_height) break;
      for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
        if (c + oc >= output_width) break;
        // this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
        outputs[out_addr + r * output_width + c] =
            activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
      }
    }
  }
}
#endif
/*******************************************************************************
Copyright © 2016, Intel Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*******************************************************************************/
#ifdef Conv_Interleaved
// Odd-width float tuples used by the interleaved GEMM-like conv kernels;
// OpenCL only provides float2/4/8/16 natively.
typedef struct float1 { float s0; } float1;
typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;
typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;
typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;
typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;
typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9;} float10;
typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa;} float11;
typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; } float12;
typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;
typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;
typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy.
#define OUT_PITCH_X output_width
#define ROW_PITCH input_width
// Shared argument list for the GEMM-like conv kernels; the fused-eltwise
// variant prepends the eltwise operand buffer.
#ifdef FUSED_CONV_ELTWISE
#define GEMM_LIKE_KERNEL_ARGS __global Dtype* eltwise_data, const __global Dtype *src0, const __global Dtype *src1, const __global Dtype *biases, __global Dtype *dst, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height, const int_tp out_pitch_y, const int_tp out_pitch_z, const int_tp aligned_input_size, const int_tp slice_pitch
#else
#define GEMM_LIKE_KERNEL_ARGS const __global Dtype *src0, const __global Dtype *src1, const __global Dtype *biases, __global Dtype *dst, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height, const int_tp out_pitch_y, const int_tp out_pitch_z, const int_tp aligned_input_size, const int_tp slice_pitch
#endif
#endif
#ifdef GEMM_LIKE_CONV_32_1
//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_1_flex
//
// Convolution: each workitem computes 1 patch x 32 filters worth of output
// data.  Kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row.  Filter
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
// by rows and then by slices.  Relies on sub_group extension for block
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
#define TILE_M 1 #define TILE_K KERNEL_WIDTH #define TILE_N 32 #ifdef __BEIGNET__ __attribute__((intel_reqd_sub_group_size(8))) #endif __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); const int global_y = get_global_id(1); const int global_z = get_global_id(2); int interleaved_y; int kernel_y; int kernel_idx; #define DOT_PRODUCT_8( _result, _rowA, colB ) { _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); } typedef CAT( float, KERNEL_WIDTH ) float_t; // True for all threads if filter_width is multiple of TILE_N // else, true for all but right-most column of threads. if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N ) { // Result ctile (*dst) is M rows x N columns // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. float8 blockC00 = 0.f; float8 blockC10 = 0.f; float8 blockC20 = 0.f; float8 blockC30 = 0.f; // Src0 (patch input) is directly used as atile. // Each work item points to the start of a different patch. // atile is M rows x K columns. 
int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 int saved_y = curr_y; #endif const __global float *src0_read = src0 + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + (curr_x - INPUT_PAD_W); // x offset // Src1 (filter) is directly used as btile. // It starts at the top of src1 and walks down. // btile is K rows x N columns. const __global float *src1_read = src1 + ( global_x * TILE_N * 2); // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch // and KERNEL_WIDTH/2 rows of interleaved filter. int patch_depth = 0; do { int patch_row = 0; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 curr_y = saved_y; #endif do { // Load atile and btile. // Kernel data is partially interleaved. Every 2 rows are interleaved at float8 granularity. // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non // interleaved row is padded with zero to ensure same size as interleaved rows. This // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3. // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) .. // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ... // (0, 2) (8, 2) (16, 2) (24, 2) ... ... // ... 
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 float_t blockA00 = ( (const __global float_t*)src0_read )[ 0 ]; float* pblockA00 = (float*)(&blockA00); #else float_t blockA00; float* pblockA00 = (float*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; }) curr_y += DILATION_Y; #endif src0_read += (ROW_PITCH * DILATION_Y); float blockB00[KERNEL_WIDTH*4]; float8* p8BlockB00 = (float8*)blockB00; float4* p4BlockB00 = (float4*)blockB00; float* pBlockB00 = (float* )blockB00; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1_read ) ); src1_read += WIDTH1 * 2; } ) if ( kernel_width_is_odd ) { p4BlockB00[KERNEL_WIDTH - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); src1_read += WIDTH1 * 2; } // Perform MADs kernel_idx = 0; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { kernel_y = interleaved_y * 2; DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; } ) kernel_y = 
interleaved_y * 2; if ( kernel_width_is_odd ) { DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; } } //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. __global float *out = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset float bias[4]; float4 *bias_vec; bias_vec = (float4*)bias; *bias_vec = as_float4(intel_sub_group_block_read4((__global uint *)biases + group_x * TILE_N)); if (global_y * TILE_M < output_width * output_height ) { for (int i = 0; i < 8; i++) { out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i); out[( 8+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i); out[(16+i) * out_pitch_y] = blockC20[i] + intel_sub_group_shuffle(bias[2], i); out[(24+i) * out_pitch_y] = blockC30[i] + intel_sub_group_shuffle(bias[3], i); } } } #if TILE_N_LAST > 0 else { // Result ctile (*dst) is M rows x N columns // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. int i = 0; float8 blockC[TILE_N_LAST_DIV8]; LOOP(TILE_N_LAST_DIV8, i, { blockC[i] = 0.f; } ) // Src0 (patch input) is directly used as atile. 
// Each work item points to the start of a different patch. // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 int saved_y = curr_y; #endif const __global float *src0_read = src0 + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + (curr_x - INPUT_PAD_W); // x offset // Src1 (filter) is directly used as btile. // It starts at the top of src1 and walks down. // btile is K rows x N columns. const __global float *src1_read = src1 + ( global_x * TILE_N * 2); // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch // and KERNEL_WIDTH/2 rows of interleaved filter. int patch_depth = 0; do { int patch_row = 0; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 curr_y = saved_y; #endif do { // Load atile and interleaved btile. 
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 float_t blockA00 = ( (const __global float_t*)src0_read )[ 0 ]; float* pblockA00 = (float*)(&blockA00); #else float_t blockA00; float* pblockA00 = (float*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; }) curr_y += DILATION_Y; #endif src0_read += (ROW_PITCH * DILATION_Y); float blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8]; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { #if TILE_N_LAST_DIV8 == 1 float2* p2BlockB = (float2* )blockB; p2BlockB[interleaved_y] = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 2 float4* p4BlockB = (float4* )blockB; p4BlockB[interleaved_y] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 3 //TODO: broken. 
No block_read6 float6* p6BlockB = (float6* )blockB; (*((float8*)(&p6BlockB[interleaved_y]))).s0123 = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); (*((float8*)(&p6BlockB[interleaved_y]))).s45 = as_float2( intel_sub_group_block_read2( (const __global uint*)(src1_read + 4 * 8) ) ); #endif src1_read += WIDTH1 * 2; } ) if ( kernel_width_is_odd ) { #if TILE_N_LAST_DIV8 == 1 float* pBlockB = (float* )blockB; pBlockB[KERNEL_WIDTH - 1] = as_float( intel_sub_group_block_read( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 2 float2* p2BlockB = (float2* )blockB; p2BlockB[KERNEL_WIDTH - 1] = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 3 float3* p3BlockB = (float3* )blockB; p3BlockB[KERNEL_WIDTH - 1].s01 = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); p3BlockB[KERNEL_WIDTH - 1].s2 = as_float( intel_sub_group_block_read( (const __global uint*) (src1_read + 2 * 8) ) ); #endif src1_read += WIDTH1 * 2; } // Perform MADs float* pBlockB = (float*)blockB; kernel_idx = 0; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { kernel_y = interleaved_y * 2; DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 2 DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 3 DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #endif #endif } ) kernel_y = interleaved_y * 2; if ( kernel_width_is_odd ) { DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 2 DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], 
pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 3 DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++; #endif #endif } } //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. __global float *out = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset float bias[4]; float4 *bias_vec; bias_vec = (float4*)bias; *bias_vec = as_float4(intel_sub_group_block_read4((__global uint *)biases + group_x * TILE_N)); if (global_y * TILE_M < output_width * output_height ) { for (int i = 0; i < 8; i++) { if ( TILE_N_LAST_DIV8 > 0 ) out[( 0+i) * out_pitch_y] = blockC[0][i] + intel_sub_group_shuffle(bias[0], i); if ( TILE_N_LAST_DIV8 > 1 ) out[( 8+i) * out_pitch_y] = blockC[1][i] + intel_sub_group_shuffle(bias[1], i); if ( TILE_N_LAST_DIV8 > 2 ) out[(16+i) * out_pitch_y] = blockC[2][i] + intel_sub_group_shuffle(bias[2], i); if ( TILE_N_LAST_DIV8 > 3 ) out[(24+i) * out_pitch_y] = blockC[3][i] + intel_sub_group_shuffle(bias[3], i); } } } #endif } #endif #ifdef GEMM_LIKE_CONV_32_1_SIMD16 #define TILE_M 1 #define TILE_K KERNEL_WIDTH #define TILE_N 32 #ifndef __BEIGNET__ __attribute__((intel_reqd_sub_group_size(16))) #endif __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); const int global_y = get_global_id(1); const int global_z = 
get_global_id(2); int interleaved_y; int kernel_y; int kernel_idx; // Result ctile (*dst) is M rows x N columns // LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile. Dtype16 blockC00 = 0.f; Dtype16 blockC10 = 0.f; // Src0 (patch input) is directly used as atile. // Each work item points to the start of a different patch. // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x - INPUT_PAD_W; // x offset const __global Dtype *src0_read_orig = src0_read; // Src1 (filter) is directly used as btile. // It starts at the top of src1 and walks down. // btile is K rows x N columns. const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 ); #define DOT_PRODUCT_16( _result, _rowA, colB ) { _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); _result.sd = mad( 
_rowA, sub_group_broadcast( colB, 13 ), _result.sd ); _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); } typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t; // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch // and KERNEL_WIDTH/2 rows of interleaved filter. int patch_depth = 0; #ifndef __BEIGNET__ __attribute__((opencl_unroll_hint(1))) #endif do { int patch_row = 0; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 curr_y = saved_y; #endif #ifndef __BEIGNET__ __attribute__((opencl_unroll_hint(1))) #endif do { // Load atile and btile. // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype16 granularity. // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non // interleaved row is padded with zero to ensure same size as interleaved rows. This // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3. // (0, 0) (16, 0) (32, 0) (48, 0) ... (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ... // (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ... // (0, 2) (16, 2) (32, 2) (48, 2) ... ... // ... 
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); #else Dtype_t blockA00; Dtype* pblockA00 = (Dtype*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; }) curr_y += DILATION_Y; #endif src0_read += ROW_PITCH * DILATION_X; uint blockB00[KERNEL_WIDTH * 2]; uint4* p4BlockB00 = (uint4*)blockB00; uint2* p2BlockB00 = (uint2*)blockB00; Dtype* pBlockB00 = (Dtype*)blockB00; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { p4BlockB00[interleaved_y] = intel_sub_group_block_read4( (const __global uint*)src1_read ); src1_read += WIDTH1 * 2; } ) if ( kernel_width_is_odd ) { p2BlockB00[KERNEL_WIDTH - 1] = intel_sub_group_block_read2( (const __global uint*)src1_read ); src1_read += WIDTH1 * 2; } // Perform MADs kernel_idx = 0; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { kernel_y = interleaved_y * 2; DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; } ) if ( kernel_width_is_odd ) { kernel_y = interleaved_y * 2; DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; } } //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to 
start of next slice of patch } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. __global Dtype *out = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset Dtype bias[2]; Dtype2 *bias_vec; bias_vec = (Dtype2*)bias; *bias_vec = as_float2(intel_sub_group_block_read2((__global uint *)biases + group_x * TILE_N)); // Work around a potential compiler bug. if (group_x > 0xFFFFFFFEul) out[0] = bias[0] + bias[1]; if (global_y * TILE_M < output_width * output_height ) { #if ( ( OUT_DEPTH % TILE_N ) == 0 ) for (int i = 0; i < 16; i++) { out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i); out[(16+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i);; } #elif ( ( OUT_DEPTH % 16 ) == 0 ) if ( ( global_x + 1 ) < get_global_size(0) ) { for ( int i = 0; i < 16; i++ ) { out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);; out[(16+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i);; } } else { for (int i = 0; i < 16; i++) { out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);; } } #else if ( ( global_x + 1 ) < get_global_size(0) ) { for ( int i = 0; i < 16; i++ ) { out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);; out[(16+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i);; } } else { #if ( (OUT_DEPTH % TILE_N) > 16 ) { for (int i = 0; i < 16 ; i++) { out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);; } for (int i = 0; i < OUT_DEPTH % 16 ; i++) { out[(16+i) * out_pitch_y] = blockC10[i] 
+ intel_sub_group_shuffle(bias[1], i);; } } #else { for (int i = 0; i < OUT_DEPTH % 16 ; i++) { out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);; } } #endif } #endif } } #endif #ifdef GEMM_LIKE_CONV_32_2 ////////////////////////////////////////////////////////////////////////////// // Conv_Interleaved_32_2_flex // // Convolution: each workitem computes 1 patch x 32 filters worth of output // data. Kernel's inner loop works on a single tile consisting of one // row from each patch and the filter data corresponding to that row. Filter // matrix is interleaved to reduce GRF bank conflicts. Patches are walked // by rows and then by slices. Relies on sub_group extension for block // reads and SIMD broadcast. Allows flexible sizing of TILE width (TILE_N) // by dynamically selecting one of two code paths: one uses TILE_N = 32 and // the other uses TILE_N = 8, 16, or 24. #define TILE_M 2 #define TILE_K KERNEL_WIDTH #define TILE_N 32 #ifdef __BEIGNET__ __attribute__((intel_reqd_sub_group_size(8))) #endif __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); const int global_y = get_global_id(1); const int global_z = get_global_id(2); int interleaved_y; int kernel_y; int kernel_idx; #define DOT_PRODUCT_8( _result, _rowA, colB ) { _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); } typedef CAT( float, KERNEL_WIDTH ) float_t; 
// True for all threads if filter_width is multiple of TILE_N // else, true for all but right-most column of threads. if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N ) { // Result ctile (*dst) is M rows x N columns // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. float8 blockC00 = 0.f; float8 blockC10 = 0.f; float8 blockC20 = 0.f; float8 blockC30 = 0.f; float8 blockC01 = 0.f; float8 blockC11 = 0.f; float8 blockC21 = 0.f; float8 blockC31 = 0.f; // Src0 (patch input) is directly used as atile. // Each work item points to the start of a different patch. // atile is M rows x K columns. int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X; int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 int saved_y0 = curr_y0; int saved_y1 = curr_y1; #endif const __global float *src0_read0 = src0 + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global float *src0_read1 = src0 + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset // Src1 (filter) is directly used as btile. // It starts at the top of src1 and walks down. // btile is K rows x N columns. const __global float *src1_read = src1 + ( global_x * TILE_N * 2); // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch // and KERNEL_WIDTH/2 rows of interleaved filter. int patch_depth = 0; do { int patch_row = 0; do { // Load atile and btile. // Kernel data is partially interleaved. Every 2 rows are interleaved at float8 granularity. 
// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non // interleaved row is padded with zero to ensure same size as interleaved rows. This // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3. // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) .. // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ... // (0, 2) (8, 2) (16, 2) (24, 2) ... ... // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 float_t blockA00 = ( (const __global float_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; float_t blockA01 = ( (const __global float_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; float* pblockA00 = (float*)(&blockA00); float* pblockA01 = (float*)(&blockA01); #else float_t blockA00; float* pblockA00 = (float*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; }) curr_y0 += DILATION_Y; float_t blockA01; float* pblockA01 = (float*)(&blockA01); pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; }) curr_y1 += DILATION_Y; src0_read0 += ROW_PITCH * DILATION_Y; src0_read1 += ROW_PITCH * DILATION_Y; #endif float blockB00[KERNEL_WIDTH*4]; float8* p8BlockB00 = (float8*)blockB00; float4* p4BlockB00 = (float4*)blockB00; float* pBlockB00 = (float* )blockB00; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { p8BlockB00[interleaved_y] 
= as_float8( intel_sub_group_block_read8( (const __global uint*)src1_read ) ); src1_read += WIDTH1 * 2; } ) if ( kernel_width_is_odd ) { p4BlockB00[KERNEL_WIDTH - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); src1_read += WIDTH1 * 2; } // Perform MADs kernel_idx = 0; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { kernel_y = interleaved_y * 2; DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; } ) if ( kernel_width_is_odd ) { kernel_y = interleaved_y * 2; DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC11, 
pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; } } //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 curr_y0 = saved_y0; curr_y1 = saved_y1; #endif src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
__global float *out0 = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset __global float *out1 = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset float bias[4]; float4 *bias_vec; bias_vec = (float4*)bias; *bias_vec = as_float4(intel_sub_group_block_read4((__global uint *)biases + group_x * TILE_N)); if( global_y * TILE_M < output_width * output_height ) { for( int i = 0; i < 8; i++ ) { out0[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i); out0[( 8+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i); out0[(16+i) * out_pitch_y] = blockC20[i] + intel_sub_group_shuffle(bias[2], i); out0[(24+i) * out_pitch_y] = blockC30[i] + intel_sub_group_shuffle(bias[3], i); } } if( global_y * TILE_M + 1 < output_width * output_height ) { for( int i = 0; i < 8; i++ ) { out1[( 0+i) * out_pitch_y] = blockC01[i] + intel_sub_group_shuffle(bias[0], i); out1[( 8+i) * out_pitch_y] = blockC11[i] + intel_sub_group_shuffle(bias[1], i); out1[(16+i) * out_pitch_y] = blockC21[i] + intel_sub_group_shuffle(bias[2], i); out1[(24+i) * out_pitch_y] = blockC31[i] + intel_sub_group_shuffle(bias[3], i); } } } #if TILE_N_LAST > 0 else { // Result ctile (*dst) is M rows x N columns // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. int i = 0; float8 blockC0[TILE_N_LAST_DIV8]; float8 blockC1[TILE_N_LAST_DIV8]; LOOP(TILE_N_LAST_DIV8, i, { blockC0[i] = 0.f; blockC1[i] = 0.f; } ) // Src0 (patch input) is directly used as atile. // Each work item points to the start of a different patch. 
// atile is M rows x K columns. int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X; int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 int saved_y0 = curr_y0; int saved_y1 = curr_y1; #endif const __global float *src0_read0 = src0 + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global float *src0_read1 = src0 + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset // Src1 (filter) is directly used as btile. // It starts at the top of src1 and walks down. // btile is K rows x N columns. const __global float *src1_read = src1 + ( global_x * TILE_N * 2); // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch // and KERNEL_WIDTH/2 rows of interleaved filter. int patch_depth = 0; do { int patch_row = 0; do { // Load atile and interleaved btile. 
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 float_t blockA00 = ( (const __global float_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; float_t blockA01 = ( (const __global float_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; float* pblockA00 = (float*)(&blockA00); float* pblockA01 = (float*)(&blockA01); #else float_t blockA00; float* pblockA00 = (float*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; }) curr_y0 += DILATION_Y; float_t blockA01; float* pblockA01 = (float*)(&blockA01); pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; }) curr_y1 += DILATION_Y; src0_read0 += (ROW_PITCH * DILATION_Y); src0_read1 += (ROW_PITCH * DILATION_Y); #endif float blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8]; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { #if TILE_N_LAST_DIV8 == 1 float2* p2BlockB = (float2* )blockB; p2BlockB[interleaved_y] = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 2 float4* p4BlockB = (float4* )blockB; p4BlockB[interleaved_y] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 3 //TODO: broken. 
No block_read6 float6* p6BlockB = (float6* )blockB; (*((float8*)(&p6BlockB[interleaved_y]))).s0123 = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); (*((float8*)(&p6BlockB[interleaved_y]))).s45 = as_float2( intel_sub_group_block_read2( (const __global uint*)(src1_read + 4 * 8) ) ); #endif src1_read += WIDTH1 * 2; } ) if ( kernel_width_is_odd ) { #if TILE_N_LAST_DIV8 == 1 float* pBlockB = (float* )blockB; pBlockB[KERNEL_WIDTH - 1] = as_float( intel_sub_group_block_read( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 2 float2* p2BlockB = (float2* )blockB; p2BlockB[KERNEL_WIDTH - 1] = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 3 float3* p3BlockB = (float3* )blockB; p3BlockB[KERNEL_WIDTH - 1].s01 = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); p3BlockB[KERNEL_WIDTH - 1].s2 = as_float( intel_sub_group_block_read( (const __global uint*) (src1_read + 8) ) ); #endif src1_read += WIDTH1 * 2; } // Perform MADs float* pBlockB = (float*)blockB; kernel_idx = 0; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { kernel_y = interleaved_y * 2; DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y ], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 2 DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y ], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 3 DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y ], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[2], 
pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #endif #endif } ) kernel_y = interleaved_y * 2; if ( kernel_width_is_odd ) { DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 2 DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 3 DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++; #endif #endif } } //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 curr_y0 = saved_y0; curr_y1 = saved_y1; #endif src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
// --- tail of the tiled convolution kernel: store the TILE_N_LAST result
// tile, adding the per-channel bias to every element before writing. ---
__global float *out0 = dst + global_z * out_pitch_z                                   // batch offset
    + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
    + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
    + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
__global float *out1 = dst + global_z * out_pitch_z                                   // batch offset
    + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
    + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
    + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset

// One sub-group block read fetches the bias values for this channel group.
float bias[4];
float4 *bias_vec;
bias_vec = (float4*)bias;
*bias_vec = as_float4(intel_sub_group_block_read4((__global uint *)biases + group_x * TILE_N));

// Each row store is guarded so positions past the end of the output map
// (partial tiles at the right/bottom edge) are not written.
if( global_y * TILE_M < output_width * output_height )
{
    for( int i = 0; i < 8; i++ )
    {
        if ( TILE_N_LAST_DIV8 > 0 ) out0[( 0+i) * out_pitch_y] = blockC0[0][i] + intel_sub_group_shuffle(bias[0], i);
        if ( TILE_N_LAST_DIV8 > 1 ) out0[( 8+i) * out_pitch_y] = blockC0[1][i] + intel_sub_group_shuffle(bias[1], i);
        if ( TILE_N_LAST_DIV8 > 2 ) out0[(16+i) * out_pitch_y] = blockC0[2][i] + intel_sub_group_shuffle(bias[2], i);
        if ( TILE_N_LAST_DIV8 > 3 ) out0[(24+i) * out_pitch_y] = blockC0[3][i] + intel_sub_group_shuffle(bias[3], i);
    }
}
if( global_y * TILE_M + 1 < output_width * output_height )
{
    for( int i = 0; i < 8; i++ )
    {
        if ( TILE_N_LAST_DIV8 > 0 ) out1[( 0+i) * out_pitch_y] = blockC1[0][i] + intel_sub_group_shuffle(bias[0], i);
        if ( TILE_N_LAST_DIV8 > 1 ) out1[( 8+i) * out_pitch_y] = blockC1[1][i] + intel_sub_group_shuffle(bias[1], i);
        if ( TILE_N_LAST_DIV8 > 2 ) out1[(16+i) * out_pitch_y] = blockC1[2][i] + intel_sub_group_shuffle(bias[2], i);
        if ( TILE_N_LAST_DIV8 > 3 ) out1[(24+i) * out_pitch_y] = blockC1[3][i] + intel_sub_group_shuffle(bias[3], i);
    }
}
}
#endif
}
#endif

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Copies a batch of images into a larger, zero-padded buffer of
// adjustedHeight x adjustedWidth per channel.  One work item per destination
// pixel: x = id0, y = id1, channel = id2; the batch dimension is looped.
// Pixels that land in the padding border are written as zero.
__kernel void TEMPLATE(copyImage, Dtype)
    (__global Dtype* image_data, int_tp image_offset,
     const int_tp channels, const int_tp height, const int_tp width,
     const int_tp adjustedHeight, const int_tp adjustedWidth,
     const int_tp pad_h, const int_tp pad_w,
     __global Dtype* output_image, const int_tp output_offset,
     const int_tp batch_size)
{
    uint_tp sX = get_global_id(0);
    uint_tp sY = get_global_id(1);
    uint_tp sZ = get_global_id(2);

    int_tp in_y = sY - pad_h;
    int_tp in_x = sX - pad_w;

    int_tp batch_offset = 0;
    int_tp adjusted_batch_offset = 0;
    for (uint_tp batch_idx = 0; batch_idx < batch_size; batch_idx++)
    {
        int_tp dst_offset = adjusted_batch_offset + output_offset
            + sZ * adjustedHeight * adjustedWidth + sY * adjustedWidth + sX;
        int_tp src_offset = batch_offset + image_offset
            + sZ * height * width + in_y * width + in_x;
        if ((in_y >= 0 && in_y < height && in_x >= 0 && in_x < width))
            output_image[dst_offset] = image_data[src_offset];
        else
            output_image[dst_offset] = 0;
        batch_offset += height * width * channels;
        adjusted_batch_offset += adjustedHeight * adjustedWidth * channels;
    }
}

// Reorders a weight blob laid out as [filter][channel][ky][kx] so that
// filters are interleaved in groups of swizzleFactor (presumably the layout
// expected by the sub-group convolution kernels above — confirm against the
// host-side setup).  One work item per source element.
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
    (__global Dtype* weightIn, __global Dtype* weightOut,
     const int_tp kernel_w, const int_tp kernel_h,
     const int_tp channels, const int_tp outputs,
     const int_tp swizzleFactor)
{
    uint_tp sX = get_global_id(0);

    //Original location
    //Output location
    int_tp outputSublayer = channels / swizzleFactor;      // NOTE(review): unused
    int_tp outputSublayerIndex = channels % swizzleFactor; // NOTE(review): unused

    // Decompose the flat source index into (filter, kernel_C, kernel_Y, kernel_X).
    int_tp filter = sX / (kernel_w*kernel_h*channels);
    int_tp kernel_X = sX % kernel_w;
    int_tp kernel_Y = (sX / kernel_w) % kernel_h;
    int_tp kernel_C = (sX / (kernel_w * kernel_h)) % channels;

    int_tp FP = filter / swizzleFactor;  // filter group
    int_tp F1 = filter % swizzleFactor;  // position inside the group

    weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor)
              + kernel_C*(kernel_w*kernel_h*swizzleFactor)
              + kernel_Y*(kernel_w*swizzleFactor)
              + kernel_X*swizzleFactor + F1] =
        weightIn[filter*(kernel_w*kernel_h*channels)
                 + kernel_C*(kernel_w*kernel_h)
                 + kernel_Y*kernel_w + kernel_X];
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Maps a flat index into the cropped (dst) tensor onto the corresponding
// flat index in the uncropped (src) tensor via per-dimension strides/offsets.
inline int_tp
TEMPLATE(compute_uncropped_index,Dtype)(
    int_tp index, const int_tp ndims,
    __global const int_tp* src_strides,
    __global const int_tp* dst_strides,
    __global const int_tp* offsets)
{
    // Peel off one coordinate per dimension from the flat dst index and
    // re-accumulate it (shifted by the crop offset) with the src stride.
    int_tp dest_index = index;
    int_tp src_index = 0;
    for (int_tp i = 0; i < ndims; ++i)
    {
        int_tp coord = dest_index / dst_strides[i];
        dest_index -= coord * dst_strides[i];
        src_index += src_strides[i] * (coord + offsets[i]);
    }
    return src_index;
}

// Crop forward: gather src values into the (smaller) dst tensor.
__kernel void TEMPLATE(crop_forward,Dtype)(const int_tp nthreads,
                                           const int_tp ndims,
                                           __global const int_tp* src_strides,
                                           __global const int_tp* dst_strides,
                                           __global const int_tp* offsets,
                                           __global const Dtype* src,
                                           const int_tp src_off,
                                           __global Dtype* dst,
                                           const int_tp dst_off)
{
    for (int_tp index = get_global_id(0); index < nthreads;
         index += get_global_size(0))
    {
        int_tp src_index = TEMPLATE(compute_uncropped_index,Dtype)(
            index, ndims, src_strides, dst_strides, offsets);
        dst[dst_off + index] = src[src_off + src_index];
    }
}

// Crop backward: scatter dst gradients back into the uncropped src tensor.
__kernel void TEMPLATE(crop_backward,Dtype)(const int_tp nthreads,
                                            const int_tp ndims,
                                            __global const int_tp* src_strides,
                                            __global const int_tp* dst_strides,
                                            __global const int_tp* offsets,
                                            __global Dtype* src,
                                            const int_tp src_off,
                                            __global const Dtype* dst,
                                            const int_tp dst_off)
{
    for (int_tp index = get_global_id(0); index < nthreads;
         index += get_global_size(0))
    {
        int_tp src_index = TEMPLATE(compute_uncropped_index,Dtype)(
            index, ndims, src_strides, dst_strides, offsets);
        src[src_off + src_index] = dst[dst_off + index];
    }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Dropout forward: keep inputs whose mask value exceeds the threshold,
// rescaled by `scale`; zero out the rest.
__kernel void TEMPLATE(dropout_forward,Dtype)(const int_tp n,
                                              __global const Dtype* in,
                                              __global const uint_tp* mask,
                                              const uint_tp threshold,
                                              const Dtype scale,
                                              __global Dtype* out)
{
    for (int_tp index = get_global_id(0); index < n;
         index += get_global_size(0))
    {
        out[index] = in[index] * ((mask[index] > threshold)?1.0:0.0) * scale;
    }
}

// Dropout backward: gradients pass through only where the mask kept the unit.
__kernel void TEMPLATE(dropout_backward,Dtype)(
    const int_tp n,
    __global const Dtype* in_diff,
    __global const
uint_tp* mask,
    const uint_tp threshold,
    const Dtype scale,
    __global Dtype* out_diff)
{
    for (int_tp index = get_global_id(0); index < n;
         index += get_global_size(0))
    {
        out_diff[index] = in_diff[index] * ((mask[index] > threshold)?1.0:0.0) * scale;
    }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Eltwise MAX forward: running pairwise max across bottom blobs.
// The kernel is invoked once per additional blob; `mask` records the index
// of the winning blob for the backward pass.
__kernel void TEMPLATE(eltwise_max_forward,Dtype)(
    const int_tp nthreads,
    __global const Dtype* bottom_data_a,
    __global const Dtype* bottom_data_b,
    const int_tp blob_idx,
    __global Dtype* top_data,
    __global int_tp* mask)
{
    for (int_tp index = get_global_id(0); index < nthreads;
         index += get_global_size(0))
    {
        Dtype maxval = -FLT_MAX;
        int_tp maxidx = -1;
        if (bottom_data_a[index] > bottom_data_b[index])
        {
            // only update for very first bottom_data blob (blob_idx == 0)
            if (blob_idx == 0)
            {
                maxval = bottom_data_a[index];
                top_data[index] = maxval;
                maxidx = blob_idx;
                mask[index] = maxidx;
            }
        }
        else
        {
            maxval = bottom_data_b[index];
            top_data[index] = maxval;
            maxidx = blob_idx + 1;
            mask[index] = maxidx;
        }
    }
}

// Eltwise MAX backward: the gradient flows only to the blob that won the max.
__kernel void TEMPLATE(eltwise_max_backward,Dtype)(
    const int_tp nthreads,
    __global const Dtype* top_diff,
    const int_tp blob_idx,
    __global const int_tp* mask,
    __global Dtype* bottom_diff)
{
    for (int_tp index = get_global_id(0); index < nthreads;
         index += get_global_size(0))
    {
        Dtype gradient = 0;
        if (mask[index] == blob_idx)
        {
            gradient += top_diff[index];
        }
        bottom_diff[index] = gradient;
    }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// ELU forward: identity for positive inputs, alpha*(exp(x)-1) otherwise.
// FIX: the element count was declared `const int n` while every other kernel
// in this file takes `const int_tp n`; with 64-bit int_tp the host-side
// clSetKernelArg size would not match the kernel signature.
__kernel void TEMPLATE(elu_forward,Dtype)(const int_tp n,
                                          __global const Dtype* in,
                                          __global Dtype* out,
                                          Dtype alpha)
{
    for (int_tp index = get_global_id(0); index < n;
         index += get_global_size(0))
    {
        out[index] = in[index] > 0 ? in[index] : alpha * (exp(in[index]) - 1.0);
    }
}

// ELU backward: pass-through for positive inputs; otherwise scale by
// (out + alpha), which equals alpha*exp(in) for the negative branch.
// FIX: `const int n` -> `const int_tp n` (see elu_forward above).
__kernel void TEMPLATE(elu_backward,Dtype)(const int_tp n,
                                           __global const Dtype* in_diff,
                                           __global const Dtype* out_data,
                                           __global const Dtype* in_data,
                                           __global Dtype* out_diff,
                                           Dtype alpha)
{
    for (int_tp index = get_global_id(0); index < n;
         index += get_global_size(0))
    {
        out_diff[index] = in_data[index] > 0 ?
            in_diff[index] : in_diff[index] * (out_data[index] + alpha);
    }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Embed forward: row lookup — bottom_data holds (floating-point encoded)
// row indices into the weight matrix [K x N]; copies the selected rows.
__kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads,
                                            __global const Dtype* bottom_data,
                                            __global const Dtype* weight,
                                            const int_tp M,
                                            const int_tp N,
                                            const int_tp K,
                                            __global Dtype* top_data)
{
    for (int_tp top_index = get_global_id(0); top_index < nthreads;
         top_index += get_global_size(0))
    {
        const int_tp n = top_index / N;
        const int_tp d = top_index % N;
        const int_tp index = (int_tp)(bottom_data[n]);
        const int_tp weight_index = index * N + d;
        top_data[top_index] = weight[weight_index];
    }
}

// atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html
#if (TYPE == TYPE_FLOAT)
#ifdef ATOMICS_32_AVAILABLE
// Float atomic add emulated via a 32-bit compare-and-swap loop.
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source,
                                       const Dtype operand)
{
    union { uint_tp intVal; Dtype floatVal; } newVal;
    union { uint_tp intVal; Dtype floatVal; } prevVal;
    do
    {
        prevVal.floatVal = *source;
        newVal.floatVal = prevVal.floatVal + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source,
                            prevVal.intVal, newVal.intVal) != prevVal.intVal);
}

// Embed backward: scatter-add the top gradient into the rows of weight_diff
// selected by bottom_data; atomic because rows can repeat.
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads,
                                             __global const Dtype* bottom_data,
                                             __global const Dtype* top_diff,
                                             const int_tp M,
                                             const int_tp N,
                                             const int_tp K,
                                             __global Dtype* weight_diff)
{
    for (int_tp top_index = get_global_id(0); top_index < nthreads;
         top_index += get_global_size(0))
    {
        const int_tp n = top_index / N;
        const int_tp d = top_index % N;
        const int_tp index = (int_tp)(bottom_data[n]);
        const int_tp weight_index = index * N
+ d;
        TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index),
                                   *(top_diff + top_index));
    }
}
#endif
#endif

#if (TYPE == TYPE_DOUBLE)
#ifdef ATOMICS_64_AVAILABLE
// Double atomic add emulated via a 64-bit compare-and-swap loop.
// NOTE(review): the build log shows some drivers advertise
// cl_khr_int64_base_atomics yet fail to link atom_cmpxchg — verify the
// extension really works on the target device before enabling doubles.
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source,
                                       const Dtype operand)
{
    union { unsigned long intVal; Dtype floatVal; } newVal;
    union { unsigned long intVal; Dtype floatVal; } prevVal;
    do
    {
        prevVal.floatVal = *source;
        newVal.floatVal = prevVal.floatVal + operand;
    } while (atom_cmpxchg((volatile __global unsigned long *)source,
                          prevVal.intVal, newVal.intVal) != prevVal.intVal);
}

// Embed backward (double variant): scatter-add top gradients into the
// weight_diff rows selected by bottom_data.
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads,
                                             __global const Dtype* bottom_data,
                                             __global const Dtype* top_diff,
                                             const int_tp M,
                                             const int_tp N,
                                             const int_tp K,
                                             __global Dtype* weight_diff)
{
    for (int_tp top_index = get_global_id(0); top_index < nthreads;
         top_index += get_global_size(0))
    {
        const int_tp n = top_index / N;
        const int_tp d = top_index % N;
        const int_tp index = (int_tp)(bottom_data[n]);
        const int_tp weight_index = index * N + d;
        TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index),
                                   *(top_diff + top_index));
    }
}
#endif
#endif

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Placeholder kernel so the FFT program is never empty when FFT is disabled.
__kernel void TEMPLATE(fft_phony,Dtype)(Dtype arg)
{
    Dtype out = arg;
}

#ifdef FFT
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

#define DtypeComplex Dtype2

// Copies one weight element into the zero-centered ("cyclically shifted")
// real FFT buffer: coordinates are shifted by the kernel center and wrapped
// modulo the FFT dimensions.  One work item per weight element.
__kernel void TEMPLATE(copy2buffer_cyclic_shift_in,Dtype)(
    __global Dtype* fft_gpu_weights_real,
    const int_tp offset_fft_gpu_weights_real,
    __global Dtype* weight, const int_tp offset_weight,
    const int_tp ker_size, const int_tp ch_gr, const int_tp ker_size_ch_gr,
    const int_tp ker_w, const int_tp ker_c_h, const int_tp ker_c_w,
    const int_tp fft_height, const int_tp fft_width,
    const int_tp complex_w_len)
{
    fft_gpu_weights_real += offset_fft_gpu_weights_real;
    weight += offset_weight;
    int_tp gId = get_global_id(0);
    int_tp out = gId / ker_size_ch_gr;
    int_tp c = (gId - out * ker_size_ch_gr) / ker_size;
    int_tp map_offset = out * ch_gr + c;
    int_tp
map_offset_ker_size = map_offset * ker_size;
    int_tp pos_in_map = gId - map_offset_ker_size;
    int_tp h = pos_in_map / ker_w;
    int_tp h_ker_w = h * ker_w;
    int_tp w = pos_in_map - h_ker_w;
    int_tp src_idx = map_offset_ker_size + h_ker_w + w;
    // Shift by the kernel center and wrap negative coordinates around the
    // FFT dimensions (cyclic shift).
    int_tp ky = h - ker_c_h;
    if (ky < 0) ky += fft_height;
    int_tp kx = w - ker_c_w;
    if (kx < 0) kx += fft_width;
    int_tp dst_idx = (map_offset * fft_height + ky) * complex_w_len + kx;
    fft_gpu_weights_real[dst_idx] = weight[src_idx];
}

/* Use when width < 4 */
// Scalar copy of one input element into the padded/strided FFT buffer.
__kernel void TEMPLATE(copy2buffer_left_top_in_naive,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp size, const int_tp height_out, const int_tp width_out,
    const int_tp height, const int_tp width,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId = get_global_id(0);
    int_tp h = gId / width;
    int_tp w = gId - (h * width);
    int_tp dst_idx = (h*stride_h + pad_h)*width_out + (w*stride_w + pad_w);
    map_out[dst_idx] = map_in[gId];
}

/* Use when width < 4 */
// Same as above, with a second global dimension indexing independent maps.
__kernel void TEMPLATE(copy2buffer_left_top_in_naive_2d,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp map_out_size, const int_tp size, const int_tp count,
    const int_tp height_out, const int_tp width_out,
    const int_tp height, const int_tp width,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId_x = get_global_id(0);
    int_tp gId_y = get_global_id(1);
    int_tp h = gId_x / width;
    int_tp w = gId_x - (h * width);
    int_tp src_idx = gId_y * size + gId_x;
    int_tp dst_idx = gId_y * map_out_size
        + (h * stride_h + pad_h) * width_out + (w * stride_w + pad_w);
    map_out[dst_idx] = map_in[src_idx];
}

/* Use when width >= 4 */
// Vectorized (float4) copy into the padded buffer; the scalar fall-back path
// handles the 1-3 elements that straddle a row boundary, and the final work
// item handles the size%4 remainder.
__kernel void TEMPLATE(copy2buffer_left_top_in,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp size, const int_tp height_out, const int_tp width_out,
    const int_tp height, const int_tp width,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId = get_global_id(0);
    int_tp count = size >> 2;
    int_tp gId4 = gId << 2;
    int_tp h = gId4 / width;
    int_tp w = gId4 - (h * width);
    int_tp dst_h = h*stride_h + pad_h;
    int_tp dst_w = w*stride_w + pad_w;
    int_tp dst_idx = dst_h*width_out + dst_w;
    if (gId < count)
    {
        Dtype4 map_in_cache4 = vload4(gId, map_in);
        int_tp has_pad = width - dst_w;
        if (has_pad >= 4)
        {
            vstore4(map_in_cache4, dst_idx >> 2, map_out);
        }
        else
        {
            if (0 == has_pad) { dst_idx += width_out + pad_w - dst_w; }
            map_out[dst_idx] = map_in_cache4.x;
            if (1 == has_pad) { dst_idx += width_out + pad_w - dst_w - 1; }
            map_out[dst_idx+1] = map_in_cache4.y;
            if (2 == has_pad) { dst_idx += width_out + pad_w - dst_w - 2; }
            map_out[dst_idx+2] = map_in_cache4.z;
            if (3 == has_pad) { dst_idx += width_out + pad_w - dst_w - 3; }
            map_out[dst_idx+3] = map_in_cache4.w;
            dst_h += 1;
            dst_w = pad_w;
        }
    }
    else if (gId == count)
    {
        int_tp res = size - (count << 2); /* size % 4 */
        if (res > 0)
        {
            Dtype4 map_in_cache4 = 0.f;
            if (res >= 1) map_in_cache4.x = map_in[gId4];
            if (res >= 2) map_in_cache4.y = map_in[gId4+1];
            if (res == 3) map_in_cache4.z = map_in[gId4+2];
            int_tp has_pad = width - dst_w;
            if (has_pad >= 4)
            {
                vstore4(map_in_cache4, dst_idx >> 2, map_out);
            }
            else
            {
                if (0 == has_pad) { dst_idx += width_out + pad_w - dst_w; }
                map_out[dst_idx] = map_in_cache4.x;
                if (1 == has_pad) { dst_idx += width_out + pad_w - dst_w - 1; }
                map_out[dst_idx+1] = map_in_cache4.y;
                if (2 == has_pad) { dst_idx += width_out + pad_w - dst_w - 2; }
                map_out[dst_idx+2] = map_in_cache4.z;
                if (3 == has_pad) { dst_idx += width_out + pad_w - dst_w - 3; }
                map_out[dst_idx+3] = map_in_cache4.w;
                dst_h += 1;
                dst_w = pad_w;
            }
        }
    }
}

/* Use when width >= 4 */
// Vectorized 2-D variant: second global dimension selects the map.
__kernel void TEMPLATE(copy2buffer_left_top_in_2d,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp map_out_size, const int_tp size, const int_tp count,
    const int_tp height_out, const int_tp width_out,
    const int_tp height, const int_tp width,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId = get_global_id(0);
    int_tp gId_y = get_global_id(1);
    int_tp gId4 = gId << 2;
    int_tp h = gId4 / width;
    int_tp w = gId4 - (h * width);
    int_tp dst_h = h*stride_h + pad_h;
    int_tp dst_w = w*stride_w + pad_w;
    int_tp dst_idx = dst_h*width_out + dst_w;
    const __global Dtype* map_in_2d = map_in + gId_y * size;
    __global Dtype* map_out_2d = map_out + gId_y * map_out_size;
    if (gId < count)
    {
        Dtype4 map_in_cache4 = vload4(gId, map_in_2d);
        int_tp has_pad = width - dst_w;
        if (has_pad >= 4)
        {
            vstore4(map_in_cache4, dst_idx >> 2, map_out_2d);
        }
        else
        {
            if (0 == has_pad) { dst_idx += width_out + pad_w - dst_w; }
            map_out_2d[dst_idx] = map_in_cache4.x;
            if (1 == has_pad) { dst_idx += width_out + pad_w - dst_w - 1; }
            map_out_2d[dst_idx+1] = map_in_cache4.y;
            if (2 == has_pad) { dst_idx += width_out + pad_w - dst_w - 2; }
            map_out_2d[dst_idx+2] = map_in_cache4.z;
            if (3 == has_pad) { dst_idx += width_out + pad_w - dst_w - 3; }
            map_out_2d[dst_idx+3] = map_in_cache4.w;
            dst_h += 1;
            dst_w = pad_w;
        }
    }
    else if (gId == count)
    {
        int_tp res = size - (count << 2); /* size % 4 */
        if (res > 0)
        {
            Dtype4 map_in_cache4 = 0.f;
            if (res >= 1) map_in_cache4.x = map_in_2d[gId4];
            if (res >= 2) map_in_cache4.y = map_in_2d[gId4+1];
            if (res == 3) map_in_cache4.z = map_in_2d[gId4+2];
            int_tp has_pad = width - dst_w;
            if (has_pad >= 4)
            {
                vstore4(map_in_cache4, dst_idx >> 2, map_out_2d);
            }
            else
            {
                if (0 == has_pad) { dst_idx += width_out + pad_w - dst_w; }
                map_out_2d[dst_idx] = map_in_cache4.x;
                if (1 == has_pad) { dst_idx +=
width_out + pad_w - dst_w - 1; }
                map_out_2d[dst_idx+1] = map_in_cache4.y;
                if (2 == has_pad) { dst_idx += width_out + pad_w - dst_w - 2; }
                map_out_2d[dst_idx+2] = map_in_cache4.z;
                if (3 == has_pad) { dst_idx += width_out + pad_w - dst_w - 3; }
                map_out_2d[dst_idx+3] = map_in_cache4.w;
                dst_h += 1;
                dst_w = pad_w;
            }
        }
    }
}

/* Use when width_out < 4 */
// Scalar copy out of the FFT buffer back into a dense output map, applying
// stride and the kernel-center offset.  One work item per output element.
__kernel void TEMPLATE(copy2buffer_left_top_out_naive,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp size, const int_tp height_out, const int_tp width_out,
    const int_tp fft_height, const int_tp fft_width,
    const int_tp ker_center_h, const int_tp ker_center_w,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId = get_global_id(0);
    int_tp h_out = gId / width_out;
    int_tp w_out = gId - (h_out * width_out);
    int_tp h = h_out * stride_h + ker_center_h;
    int_tp w = w_out * stride_w + ker_center_w;
    int_tp src_idx = h*fft_width + w;
    map_out[gId] = map_in[src_idx];
}

/* Use when width_out < 4 */
// Same as above; the second global dimension selects the map.
__kernel void TEMPLATE(copy2buffer_left_top_out_naive_2d,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp size, const int_tp count, const int_tp map_in_size,
    const int_tp height_out, const int_tp width_out,
    const int_tp fft_height, const int_tp fft_width,
    const int_tp ker_center_h, const int_tp ker_center_w,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId = get_global_id(0);
    int_tp out = get_global_id(1);
    int_tp h_out = gId / width_out;
    int_tp w_out = gId - (h_out * width_out);
    int_tp h = h_out * stride_h + ker_center_h;
    int_tp w = w_out * stride_w + ker_center_w;
    int_tp src_idx = out * map_in_size + h*fft_width + w;
    int_tp dst_idx = out * size + gId;
    map_out[dst_idx] = map_in[src_idx];
}

/* Use when width_out >= 4 */
// Vectorized (float4) copy out of the FFT buffer; the scalar fall-back path
// skips the extra right-hand FFT columns when a load straddles a row, and
// the final work item copies the size%4 remainder.
__kernel void TEMPLATE(copy2buffer_left_top_out,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp size, const int_tp height_out, const int_tp width_out,
    const int_tp fft_height, const int_tp fft_width,
    const int_tp ker_c_h, const int_tp ker_c_w,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId = get_global_id(0);
    int_tp count = size >> 2;
    int_tp gId4 = gId << 2;
    int_tp h_out = gId4 / width_out;
    int_tp w_out = gId4 - (h_out * width_out);
    int_tp h = h_out * stride_h + ker_c_h;
    int_tp w = w_out * stride_w + ker_c_w;
    int_tp src_idx = h*fft_width + w;
    if (gId < count)
    {
        Dtype4 map_in_cache4;
        int_tp has_pad = width_out - (w - pad_w);
        if (has_pad >= 4)
        {
            map_in_cache4 = vload4(src_idx >> 2, map_in);
        }
        else
        {
            int_tp right_elements = fft_width - width_out;
            if (0 == has_pad) { src_idx += right_elements; }
            map_in_cache4.x = map_in[src_idx];
            if (1 == has_pad) { src_idx += right_elements; }
            map_in_cache4.y = map_in[src_idx+1];
            if (2 == has_pad) { src_idx += right_elements; }
            map_in_cache4.z = map_in[src_idx+2];
            if (3 == has_pad) { src_idx += right_elements; }
            map_in_cache4.w = map_in[src_idx+3];
        }
        vstore4(map_in_cache4, gId, map_out);
    }
    else if (gId == count)
    {
        int_tp res = size - (count << 2); /* size % 4 */
        if (res > 0)
        {
            for (int_tp i = gId4; i < size; ++i)
            {
                map_out[i] = map_in[src_idx];
                src_idx++;
            }
        }
    }
}

/* Use when width_out >= 4 */
// Vectorized 2-D variant: second global dimension selects the map.
__kernel void TEMPLATE(copy2buffer_left_top_out_2d,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp size, const int_tp count, const int_tp map_in_size,
    const int_tp height_out, const int_tp width_out,
    const int_tp fft_height, const int_tp fft_width,
    const int_tp ker_c_h, const int_tp ker_c_w,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId = get_global_id(0);
    int_tp out = get_global_id(1);
    int_tp gId4 = gId << 2;
    int_tp h_out = gId4 / width_out;
    int_tp w_out = gId4 - (h_out * width_out);
    int_tp h = h_out * stride_h + ker_c_h;
    int_tp w = w_out * stride_w + ker_c_w;
    int_tp src_idx = h*fft_width + w;
    const __global Dtype* map_in_2d = map_in + out * map_in_size;
    __global Dtype* map_out_2d = map_out + out * size;
    if (gId < count)
    {
        Dtype4 map_in_cache4;
        int_tp has_pad = width_out - (w - pad_w);
        if (has_pad >= 4)
        {
            map_in_cache4 = vload4(src_idx >> 2, map_in_2d);
        }
        else
        {
            int_tp right_elements = fft_width - width_out;
            if (0 == has_pad) { src_idx += right_elements; }
            map_in_cache4.x = map_in_2d[src_idx];
            if (1 == has_pad) { src_idx += right_elements; }
            map_in_cache4.y = map_in_2d[src_idx+1];
            if (2 == has_pad) { src_idx += right_elements; }
            map_in_cache4.z = map_in_2d[src_idx+2];
            if (3 == has_pad) { src_idx += right_elements; }
            map_in_cache4.w = map_in_2d[src_idx+3];
        }
        vstore4(map_in_cache4, gId, map_out_2d);
    }
    else if (gId == count)
    {
        int_tp res = size - (count << 2); /* size % 4 */
        if (res > 0)
        {
            const __global Dtype4* map_in_2d_4 =
                (const __global Dtype4*)(map_in_2d + src_idx);
            __global Dtype4* map_out_2d_4 =
                (__global Dtype4*)(map_out_2d + gId4);
            if (res == 3)      { map_out_2d_4[0].xyz = map_in_2d_4[0].xyz; }
            else if (res == 2) { map_out_2d_4[0].xy  = map_in_2d_4[0].xy; }
            else if (res == 1) { map_out_2d_4[0].x   = map_in_2d_4[0].x; }
        }
    }
}

// Copies one output element out of the FFT buffer, undoing the cyclic shift
// (coordinates shifted by the kernel center, wrapped modulo FFT dims).
__kernel void TEMPLATE(copy2buffer_cyclic_shift_out,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp width_out, const int_tp fft_height, const int_tp fft_width,
    const int_tp ker_center_h, const int_tp ker_center_w,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId =
get_global_id(0);
    int_tp h_out = gId / width_out;
    int_tp w_out = gId - (h_out * width_out);
    int_tp h = h_out * stride_h + pad_h;
    int_tp w = w_out * stride_w + pad_w;
    // Undo the cyclic shift: offset by the kernel center, wrap negatives.
    int_tp ky = h - ker_center_h;
    if (ky < 0) ky += fft_height;
    int_tp kx = w - ker_center_w;
    if (kx < 0) kx += fft_width;
    int_tp src_idx = ky*fft_width + kx;
    map_out[gId] = map_in[src_idx];
}

// 2-D variant of the cyclic-shift copy-out; id1 selects the map.
__kernel void TEMPLATE(copy2buffer_cyclic_shift_out_2d,Dtype)(
    __global Dtype* map_out, const int_tp offset_map_out,
    const __global Dtype* map_in, const int_tp offset_map_in,
    const int_tp map_out_size, const int_tp map_in_size,
    const int_tp width_out, const int_tp fft_height, const int_tp fft_width,
    const int_tp ker_center_h, const int_tp ker_center_w,
    const int_tp stride_h, const int_tp stride_w,
    const int_tp pad_h, const int_tp pad_w)
{
    map_out += offset_map_out;
    map_in += offset_map_in;
    int_tp gId = get_global_id(0);
    int_tp gId_y = get_global_id(1);
    int_tp h_out = gId / width_out;
    int_tp w_out = gId - (h_out * width_out);
    int_tp h = h_out * stride_h + pad_h;
    int_tp w = w_out * stride_w + pad_w;
    int_tp ky = h - ker_center_h;
    if (ky < 0) ky += fft_height;
    int_tp kx = w - ker_center_w;
    if (kx < 0) kx += fft_width;
    int_tp src_idx = gId_y * map_in_size + ky*fft_width + kx;
    int_tp dst_idx = gId_y * map_out_size + gId;
    map_out[dst_idx] = map_in[src_idx];
}

// Accumulates conj(src1)*src2 over ch_gr channel maps, two complex numbers
// (xy and zw pairs of a float4) per work item, into dst.
__kernel void TEMPLATE(complex_conjugate_multiplication_1d,Dtype)(
    __global Dtype* dst, const int_tp offset_dst,
    const __global Dtype* src1, const int_tp offset_src1,
    const __global Dtype* src2, const int_tp offset_src2,
    const int_tp ch_gr)
{
    dst += offset_dst;
    src1 += offset_src1;
    src2 += offset_src2;
    int_tp gId = get_global_id(0);
    int_tp size = get_global_size(0);
    Dtype4 dst_cache = 0.f;
    int_tp src_idx;
    Dtype4 s1_cache;
    Dtype4 s2_cache;
    for (int_tp c = 0; c < ch_gr; ++c)
    {
        src_idx = size * c + gId;
        s1_cache = vload4(src_idx, src1);
        s2_cache = vload4(src_idx, src2);
        // (a - bi)(c + di) = (ac + bd) + (ad - bc)i, done for both pairs.
        dst_cache.x +=  s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y;
        dst_cache.y += -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x;
        dst_cache.z +=  s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w;
        dst_cache.w += -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z;
    }
    ((__global Dtype4*)(&dst[gId<<2]))[0] += dst_cache;
}

// 2-D variant: id1 selects the output map; src2 holds ch_gr maps per output.
__kernel void TEMPLATE(complex_conjugate_multiplication_2d,Dtype)(
    __global Dtype* dst, const int_tp offset_dst,
    const __global Dtype* src1, const int_tp offset_src1,
    const __global Dtype* src2, const int_tp offset_src2,
    const int_tp out_gr, const int_tp map_size, const int_tp ch_gr)
{
    dst += offset_dst;
    src1 += offset_src1;
    src2 += offset_src2;
    int_tp gId = get_global_id(0);
    int_tp out = get_global_id(1);
    int_tp src1_idx, src2_idx;
    int_tp dst_map_offset = map_size * out;
    int_tp dst_idx = dst_map_offset + gId;
    Dtype4 s1_cache, s2_cache;
    Dtype4 dst_cache = 0.f;
    int_tp map_offset = dst_map_offset * ch_gr;
    for (int_tp i = 0; i < ch_gr; ++i)
    {
        src1_idx = map_size * i + gId;
        src2_idx = map_offset + src1_idx;
        s1_cache = vload4(src1_idx, src1);
        s2_cache = vload4(src2_idx, src2);
        dst_cache.xz += mad( s1_cache.xz, s2_cache.xz, s1_cache.yw * s2_cache.yw);
        dst_cache.yw += mad(-s1_cache.xz, s2_cache.yw, s1_cache.yw * s2_cache.xz);
    }
    vstore4(dst_cache, dst_idx, dst);
}

// SLM variant: the first row of the work group stages src1 into local
// memory so every output map in the group reuses it.
__kernel void TEMPLATE(complex_conjugate_multiplication_2d_SLM,Dtype)(
    __global Dtype* restrict dst, const int_tp offset_dst,
    const __global Dtype* restrict src1, const int_tp offset_src1,
    __local Dtype* local_src1,
    const __global Dtype* restrict src2, const int_tp offset_src2,
    const int_tp out_gr, const int_tp map_size, const int_tp ch_gr)
{
    int_tp gId = get_global_id(0);
    if (gId >= map_size) return;  /* Do not remove this */
    int_tp out = get_global_id(1);
    if (out >= out_gr) return;    /* Do not remove this */
    dst += offset_dst;
    src1 += offset_src1;
    src2 += offset_src2;
    int_tp tId = get_local_id(0);
    int_tp local_out = get_local_id(1);
    int_tp tile_size = get_local_size(0);
    Dtype4 s1_cache;
    if (local_out == 0)
    {
        for (int_tp c = 0; c < ch_gr; ++c)
        {
            s1_cache = vload4(map_size * c + gId, src1);
vstore4(s1_cache, tile_size * c + tId, local_src1); } } barrier(CLK_LOCAL_MEM_FENCE); int_tp dst_map_offset = map_size * out; int_tp dst_idx = (dst_map_offset + gId) << 2; Dtype4 dst_cache = 0.f; Dtype4 s2_cache; int_tp ch_offset = 0; int_tp map_offset = dst_map_offset * ch_gr; for (int_tp c = 0; c < ch_gr; ++c) { ch_offset = map_size * c; s1_cache = vload4(tile_size * c + tId, local_src1); s2_cache = vload4(map_offset + ch_offset + gId, src2); dst_cache.xz += mad( s1_cache.xz, s2_cache.xz, s1_cache.yw * s2_cache.yw); dst_cache.yw += mad(-s1_cache.xz, s2_cache.yw, s1_cache.yw * s2_cache.xz); } ((__global Dtype4*)(&dst[dst_idx]))[0] += dst_cache; } __kernel void TEMPLATE(complex_conjugate_multiplication_3d,Dtype)(__global Dtype* dst, const int_tp offset_dst, const __global Dtype* src1, const int_tp offset_src1, const __global Dtype* src2, const int_tp offset_src2, const int_tp out_gr, const int_tp size, const int_tp ch_gr) { dst += offset_dst; src1 += offset_src1; src2 += offset_src2; int_tp gId = get_global_id(0); int_tp out = get_global_id(1); int_tp ch = get_global_id(2); Dtype4 dst_cache = 0.f; Dtype4 s1_cache = ((__global Dtype4*)(&(src1[(size*ch+gId)<<2])))[0]; Dtype4 s2_cache = ((__global Dtype4*)(&(src2[(size*(out*ch_gr+ch)+gId)<<2])))[0]; dst_cache.x = s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y; dst_cache.y = -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; dst_cache.z = s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w; dst_cache.w = -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z; ((__global Dtype4*)(&dst[(size*out+gId)<<2]))[0] += dst_cache; } __kernel void TEMPLATE(complex_conjugate_multiplication_3d_SLM,Dtype)(__global Dtype* dst, const int_tp offset_dst, __local Dtype* local_dst, const __global Dtype* src1, const int_tp offset_src1, __local Dtype* local_src1, const __global Dtype* src2, const int_tp offset_src2, const int_tp out_gr, const int_tp map_size, const int_tp ch_gr) { int_tp gId = get_global_id(0); if (gId >= map_size) return; /* 
Do not remove this */ int_tp out = get_global_id(1); if (out >= out_gr) return; /* Do not remove this */ int_tp ch = get_global_id(2); if (ch >= ch_gr) return; /* Do not remove this */ dst += offset_dst; src1 += offset_src1; src2 += offset_src2; int_tp tId = get_local_id(0); int_tp local_out = get_local_id(1); int_tp tile_size = get_local_size(0); Dtype4 s1_cache; if (local_out == 0) { s1_cache = vload4(map_size * ch + gId, src1); vstore4(s1_cache, tile_size * ch + tId, local_src1); } barrier(CLK_LOCAL_MEM_FENCE); int_tp dst_map_offset = map_size * out; int_tp dst_idx = (dst_map_offset + gId) << 2; Dtype4 dst_cache = 0.f; Dtype4 s2_cache; s1_cache = vload4(tile_size * ch + tId, local_src1); s2_cache = vload4((dst_map_offset * ch_gr) + (map_size * ch) + gId, src2); dst_cache.x += s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y; dst_cache.y += -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; dst_cache.z += s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w; dst_cache.w += -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z; ((__global Dtype4*)(&dst[dst_idx]))[0] += dst_cache; } __kernel void TEMPLATE(complex_multiplication_1d,Dtype)(__global Dtype* dst, const int_tp offset_dst, const __global Dtype* src1, const int_tp offset_src1, const __global Dtype* src2, const int_tp offset_src2, const int_tp size, const int_tp ch_gr) { dst += offset_dst; src1 += offset_src1; src2 += offset_src2; int_tp gId = get_global_id(0); Dtype4 s2_cache; Dtype4 dst_cache = 0.f; int_tp idx_with_ch; Dtype4 s1_cache = vload4(gId, src1); for (int_tp ch = 0; ch < ch_gr; ++ch) { idx_with_ch = size * ch + gId; s2_cache = vload4(idx_with_ch, src2); dst_cache.xz = s1_cache.xz * s2_cache.xz - s1_cache.yw * s2_cache.yw; dst_cache.yw = s1_cache.xz * s2_cache.yw + s1_cache.yw * s2_cache.xz; ((__global Dtype4*)(&dst[idx_with_ch<<2]))[0] += dst_cache; } } __kernel void TEMPLATE(complex_multiplication_2d_SLM,Dtype)(__global Dtype* restrict dst, const int_tp offset_dst, __local Dtype* local_dst, const 
__global Dtype* restrict src1, const int_tp offset_src1, const __global Dtype* restrict src2, const int_tp offset_src2, const int_tp num_output, const int_tp size, const int_tp ch_gr) { int_tp gId = get_global_id(0); if (gId >= size) return; int_tp out = get_global_id(1); if (out >= num_output) return; dst += offset_dst; src1 += offset_src1; src2 += offset_src2; int_tp tId = get_local_id(0); int_tp tOut = get_local_id(1); int_tp tile_size = get_local_size(0); int_tp local_out_size = get_local_size(1); int_tp out_offset = out * size; int_tp out_ch_offset = out_offset * ch_gr; int_tp tile_size_in_all_ch = tile_size * ch_gr; int_tp local_out_ch_offset = tOut * tile_size_in_all_ch; int_tp src2_idx, local_dst_idx; Dtype4 s2_cache, dst_cache; int_tp src1_idx = out_offset + gId; Dtype4 s1_cache = vload4(src1_idx, src1); for (int_tp ch = 0; ch < ch_gr; ++ch) { src2_idx = out_ch_offset + ch * size + gId; s2_cache = vload4(src2_idx, src2); dst_cache.xz = s1_cache.xz * s2_cache.xz - s1_cache.yw * s2_cache.yw; dst_cache.yw = s1_cache.xz * s2_cache.yw + s1_cache.yw * s2_cache.xz; local_dst_idx = local_out_ch_offset + ch * tile_size + tId; vstore4(dst_cache, local_dst_idx, local_dst); } barrier(CLK_LOCAL_MEM_FENCE); int_tp start_idx, half_start_idx; int_tp ch_offset; int_tp this_idx, that_idx; for (int_tp offset = local_out_size >>= 1; offset > 0; offset >>=1) { if (tOut < offset) { start_idx = tOut * tile_size_in_all_ch + tId; half_start_idx = (tOut + offset) * tile_size_in_all_ch + tId; for (int_tp ch = 0; ch < ch_gr; ++ch) { ch_offset = ch * tile_size; this_idx = (start_idx + ch_offset) << 2; that_idx = (half_start_idx + ch_offset) << 2; ((__local Dtype4*)(&local_dst[this_idx]))[0] += ((__local Dtype4*)(&local_dst[that_idx]))[0]; } } barrier(CLK_LOCAL_MEM_FENCE); } if (tOut == 0) { for (int_tp ch = 0; ch < ch_gr; ++ch) { dst_cache = vload4(tile_size * ch + tId, local_dst); ((__global Dtype4*)(&dst[(size * ch + gId)<<2]))[0] += dst_cache; } } } __kernel void 
TEMPLATE(complex_multiplication_3d,Dtype)(__global Dtype* dst, const int_tp offset_dst, const __global Dtype* src1, const int_tp offset_src1, const __global Dtype* src2, const int_tp offset_src2, const int_tp size, const int_tp ch_gr, const int_tp out_gr, const int_tp num_output) { dst += offset_dst; src1 += offset_src1; src2 += offset_src2; int_tp gId = get_global_id(0); int_tp ch = get_global_id(1); int_tp out = get_global_id(2); int_tp g = out / out_gr; ch += (g * ch_gr); int_tp c_offset = ch - ((ch / ch_gr) * ch_gr); __global Dtype2* dst_ch = ((__global Dtype2*)(dst)) + (size * ch); __global Dtype2* src1_out = ((__global Dtype2*)(src1)) + (size * out); __global Dtype2* src2_out_ch = ((__global Dtype2*)(src2)) + (size * (out * ch_gr + c_offset)); Dtype2 s1_cache = src1_out[gId]; Dtype2 s2_cache = src2_out_ch[gId]; Dtype2 dst_cache = 0.f; dst_cache.x = s1_cache.x * s2_cache.x - s1_cache.y * s2_cache.y; dst_cache.y = s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; dst_ch[gId] += dst_cache; } /* Convert [RRRR...GGGG...BBBB...] to [RGBRGBRGBRGB...] */ /* Reshape 2 */ __kernel void TEMPLATE(convert_data_to_channel_major,Dtype)(__global Dtype2* dst, const __global Dtype2* src, const int_tp size, const int_tp ch_gr) { int_tp gId = get_global_id(0); __global Dtype* dst_ptr = (__global Dtype*)(dst + (gId * ch_gr)); const __global Dtype* src_ptr = (const __global Dtype*)(src + gId); Dtype2 s; int_tp src_idx = 0; for (int_tp i = 0; i < ch_gr; ++i) { s = vload2(src_idx, src_ptr); vstore2(s, i, dst_ptr); src_idx += size; } } /* Reshape 1 */ /*__kernel void TEMPLATE(convert_data_to_channel_major(__global Dtype4* dst, const __global Dtype4* src, const int_tp size, const int_tp ch_gr) { int_tp gId = get_global_id(0); const __global Dtype4* src_ptr4 = src + gId; __global Dtype4* dst_ptr4 = dst + (gId * ch_gr); for (int_tp i = 0; i < ch_gr; ++i) { dst_ptr4[i] = src_ptr4[i*size]; } } */ /* Convert multiple [RRRR...GGGG...BBBB...] to multiple [RGBRGBRGBRGB...] 
*/ /* Reshape 2 */ __kernel void TEMPLATE(convert_weight_to_channel_major,Dtype)(__global Dtype2* dst, const __global Dtype2* src, const int_tp size, const int_tp ch_gr, const int_tp num_output) { int_tp gId = get_global_id(0); int_tp out = get_global_id(1); int_tp out_offset = out * (size * ch_gr); __global Dtype* dst_ptr = (__global Dtype*)(dst + out_offset + (gId * ch_gr)); const __global Dtype* src_ptr = (const __global Dtype*)(src + out_offset + gId); Dtype2 s; int_tp src_idx = 0; for (int_tp i = 0; i < ch_gr; ++i) { s = vload2(src_idx, src_ptr); vstore2(s, i, dst_ptr); src_idx += size; } } /* Reshape 1 */ /* __kernel void TEMPLATE(convert_weight_to_channel_major(__global Dtype4* dst, const __global Dtype4* src, const int_tp size, const int_tp ch_gr, const int_tp out_gr) { int_tp gId = get_global_id(0); int_tp out = get_global_id(1); int_tp out_offset = out * (size * ch_gr); __global Dtype4* dst_ptr4 = dst + out_offset + (gId * ch_gr); const __global Dtype4* src_ptr4 = src + out_offset + gId; for (int_tp i = 0; i < ch_gr; ++i) { dst_ptr4[i] = src_ptr4[size * i]; } } */ /* Cdotc per element */ /* Reshape 1 */ /* __kernel void TEMPLATE(batchedCdotc(__global Dtype4* dst, const __global Dtype4* src1, const __global Dtype4* src2, const int_tp size, const int_tp ch_gr, const int_tp out_gr) { int_tp gId = get_global_id(0); int_tp out = get_global_id(1); int_tp ch_offset = gId * ch_gr; int_tp out_offset = out * size; const __global Dtype* src1_ptr = (const __global Dtype*)(src1 + ch_offset); const __global Dtype* src2_ptr = (const __global Dtype*)(src2 + (out_offset * ch_gr) + ch_offset); Dtype4 cdotc = 0.f; Dtype4 s1, s2; for (int_tp c = 0; c < ch_gr; ++c) { s1 = vload4(c, src1_ptr); s2 = vload4(c, src2_ptr); cdotc.xz += mad( s1.xz, s2.xz, s1.yw * s2.yw); cdotc.yw += mad(-s1.xz, s2.yw, s1.yw * s2.xz); } __global Dtype4* dst_ptr4 = dst + out_offset + gId; dst_ptr4[0] += cdotc; } */ /* Cdotc per two elements */ /* Reshape 2 */ __kernel void 
TEMPLATE(batchedCdotc,Dtype)(__global Dtype2* dst, const __global Dtype2* src1, const __global Dtype2* src2, const int_tp size, const int_tp ch_gr, const int_tp out_gr) { int_tp gId = get_global_id(0); int_tp out = get_global_id(1); int_tp ch_offset = gId * ch_gr; const __global Dtype* src1_ptr = (const __global Dtype*)(src1 + ch_offset); const __global Dtype* src2_ptr = (const __global Dtype*)(src2 + (out * size * ch_gr) + ch_offset); Dtype4 cdotc4 = 0.f; Dtype2 cdotc = 0.f; Dtype4 s1, s2; int_tp n = ch_gr >> 1; int_tp r = ch_gr - (n << 1); for (int_tp i = 0; i < n; ++i) { s1 = vload4(i, src1_ptr); s2 = vload4(i, src2_ptr); cdotc4.xz += mad( s1.xz, s2.xz, s1.yw * s2.yw); cdotc4.yw += mad(-s1.xz, s2.yw, s1.yw * s2.xz); } cdotc.x += dot(cdotc4.xz, (float2)(1)); cdotc.y += dot(cdotc4.yw, (float2)(1)); if (r == 1) { const __global Dtype* src1_ptr2 = (const __global Dtype*)(((const __global Dtype4*)(src1_ptr)) + n); const __global Dtype* src2_ptr2 = (const __global Dtype*)(((const __global Dtype4*)(src2_ptr)) + n); Dtype2 t1 = vload2(0, src1_ptr2); Dtype2 t2 = vload2(0, src2_ptr2); cdotc.x += mad( t1.x, t2.x, t1.y * t2.y); cdotc.y += mad(-t1.x, t2.y, t1.y * t2.x); } __global Dtype* dst_ptr = (__global Dtype*)(dst + (out * size) + gId); vstore2(cdotc, 0, dst_ptr); } #endif #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(fillbuffer,Dtype)(const int_tp n, const char alpha, __global char* x, const int_tp offx) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { x[index + offx] = alpha; } } __kernel void TEMPLATE(fill,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* x, const int_tp offx) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { x[index + offx] = alpha; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(im2col,Dtype)(const int_tp n, __global const Dtype* data_im, const int_tp data_im_off, const int_tp height, const int_tp width, const 
int_tp kernel_h, const int_tp kernel_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp height_col, const int_tp width_col, __global Dtype* data_col, const int_tp data_col_off) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { const int_tp h_index = index / width_col; const int_tp h_col = h_index % height_col; const int_tp w_col = index % width_col; const int_tp c_im = h_index / height_col; const int_tp c_col = c_im * kernel_h * kernel_w; const int_tp h_offset = h_col * stride_h - pad_h; const int_tp w_offset = w_col * stride_w - pad_w; __global Dtype* data_col_ptr = data_col + data_col_off; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; __global const Dtype* data_im_ptr = data_im + data_im_off; data_im_ptr += (c_im * height + h_offset) * width + w_offset; for (int_tp i = 0; i < kernel_h; ++i) { for (int_tp j = 0; j < kernel_w; ++j) { int_tp h_im = h_offset + i * dilation_h; int_tp w_im = w_offset + j * dilation_w; *data_col_ptr = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; data_col_ptr += height_col * width_col; } } } } __kernel void TEMPLATE(col2im,Dtype)(const int_tp n, __global const Dtype* data_col, const int_tp data_col_off, const int_tp height, const int_tp width, const int_tp channels, const int_tp kernel_h, const int_tp kernel_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp height_col, const int_tp width_col, __global Dtype* data_im, const int_tp data_im_off) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { Dtype val = 0; const int_tp w_im = index % width + pad_w; const int_tp h_im = (index / width) % height + pad_h; const int_tp c_im = index / (width * height); int_tp kernel_extent_w = (kernel_w - 1) * dilation_w + 1; int_tp kernel_extent_h = (kernel_h - 1) * dilation_h + 1; // compute the start and end of the output const int_tp w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; const int_tp w_col_end = min(w_im / stride_w + 1, width_col); const int_tp h_col_start = (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; const int_tp h_col_end = min(h_im / stride_h + 1, height_col); // TODO: use LCM of stride and dilation to avoid unnecessary loops for (int_tp h_col = h_col_start; h_col < h_col_end; h_col += 1) { for (int_tp w_col = w_col_start; w_col < w_col_end; w_col += 1) { int_tp h_k = (h_im - h_col * stride_h); int_tp w_k = (w_im - w_col * stride_w); if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { h_k /= dilation_h; w_k /= dilation_w; int_tp data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * height_col + h_col) * width_col + w_col; val += data_col[data_col_off + data_col_index]; } } } data_im[data_im_off + index] = val; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(im2col_nd, Dtype)(const int_tp n, const int_tp num_axes, const int_tp channel_axis, __global const Dtype* data_im, const int_tp data_im_off, __global const int_tp* im_shape, __global const int_tp* col_shape, __global const int_tp* kernel_shape, __global const int_tp* pad, __global const int_tp* stride, __global const int_tp* dilation, __global Dtype* data_col, const int_tp data_col_off) { int_tp d_temp[6]; int_tp d_iter[6]; int_tp i; __global const int_tp* im_shape_ptr = im_shape + channel_axis; __global const int_tp* col_shape_ptr = col_shape + channel_axis; __local int_tp shared_dilation[6]; __local int_tp shared_kernel_shape[6]; __local int_tp shared_pad[6]; __local int_tp shared_stride[6]; __local int_tp shared_col_shape[6 + 1]; __local int_tp shared_im_shape[6 + 1]; for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { shared_dilation[li] = dilation[li]; shared_kernel_shape[li] = kernel_shape[li]; shared_pad[li] = pad[li]; shared_stride[li] = stride[li]; } for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { shared_col_shape[li] = col_shape_ptr[li]; shared_im_shape[li] = im_shape_ptr[li]; } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < n; 
index += get_global_size(0)) { // Initialize channel_in, computed in the loop below, with intermediate // computations used to compute the spatial indices. int_tp channel_in = index; int_tp channel_out = 1; for (i = num_axes - 1; i >= 0; --i) { d_temp[i] = channel_in % shared_col_shape[i + 1]; channel_in /= shared_col_shape[i + 1]; channel_out *= shared_kernel_shape[i]; } channel_out *= channel_in; int_tp data_col_inc = 1; for (i = 0; i < num_axes; ++i) { channel_out *= shared_col_shape[i + 1]; channel_out += d_temp[i]; d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i]; channel_in *= shared_im_shape[i + 1]; channel_in += d_temp[i]; data_col_inc *= shared_col_shape[i + 1]; d_iter[i] = 0; } __global Dtype* data_col_ptr = data_col + data_col_off + channel_out; __global const Dtype* data_im_ptr = data_im + data_im_off + channel_in; bool incremented; do { bool in_range = true; for (i = 0; i < num_axes; ++i) { const int_tp d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i]; in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1]; if (!in_range) { break; } } if (in_range) { int_tp data_im_offset = d_iter[0] * shared_dilation[0]; for (i = 1; i < num_axes; ++i) { data_im_offset *= shared_im_shape[i + 1]; data_im_offset += d_iter[i] * shared_dilation[i]; } *data_col_ptr = data_im_ptr[data_im_offset]; } else { *data_col_ptr = 0; } data_col_ptr += data_col_inc; incremented = false; for (i = num_axes - 1; i >= 0; --i) { const int_tp d_max = shared_kernel_shape[i]; if (d_iter[i] == d_max - 1) { d_iter[i] = 0; } else { // d_iter[i] < d_max - 1 ++d_iter[i]; incremented = true; break; } } // for (int_tp i = num_axes - 1; i >= 0; --i) } while (incremented); // do } } __kernel void TEMPLATE(col2im_nd, Dtype)(const int_tp n, const int_tp num_axes, const int_tp channel_axis, __global const Dtype* data_col, const int_tp data_col_off, __global const int_tp* im_shape, __global const int_tp* col_shape, __global const int_tp* kernel_shape, __global const int_tp* pad, 
__global const int_tp* stride, __global const int_tp* dilation, __global Dtype* data_im, const int_tp data_im_off) { int_tp d_im[6]; int_tp d_col_iter[6]; int_tp d_col_start[6]; int_tp d_col_end[6]; __global const int_tp* im_shape_ptr = im_shape + channel_axis; __global const int_tp* col_shape_ptr = col_shape + channel_axis; __local int_tp shared_dilation[6]; __local int_tp shared_kernel_shape[6]; __local int_tp shared_pad[6]; __local int_tp shared_stride[6]; __local int_tp shared_col_shape[6 + 1]; __local int_tp shared_im_shape[6 + 1]; for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { shared_dilation[li] = dilation[li]; shared_kernel_shape[li] = kernel_shape[li]; shared_pad[li] = pad[li]; shared_stride[li] = stride[li]; } for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { shared_col_shape[li] = col_shape_ptr[li]; shared_im_shape[li] = im_shape_ptr[li]; } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { // Initialize channel_in, computed in the loop below, with intermediate // computations used to compute the spatial indices. int_tp c_im = index; // Calculate d_im (image dimensions). for (int_tp i = num_axes - 1; i >= 0; --i) { d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i]; c_im /= shared_im_shape[i + 1]; } // Calculate col start/end indices. bool done = false; for (int_tp i = 0; i < num_axes; ++i) { const int_tp kernel_extent = shared_dilation[i] * (shared_kernel_shape[i] - 1) + 1; d_col_start[i] = d_col_iter[i] = (d_im[i] < kernel_extent) ? 0 : (d_im[i] - kernel_extent) / shared_stride[i] + 1; d_col_end[i] = min(d_im[i] / shared_stride[i] + 1, shared_col_shape[i + 1]); if (d_col_start[i] >= d_col_end[i]) { // Skip computation if the dimension is 0 at any spatial axis -- // final val will be 0. data_im[index] = (Dtype)0.0; done = true; break; // for (int_tp i = 0; i < num_axes; ++i) } } if (!done) { // Loop over the col to compute the output val. 
Dtype val = (Dtype)0.0; bool incremented = true; bool skip = false; do { // Compute the final offset. int_tp final_offset = 0; int_tp kernel_shape_prod = 1; int_tp kernel_index; for (int_tp i = num_axes - 1; i >= 0; --i) { kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i]; if (kernel_index % shared_dilation[i]) { skip = true; break; } else { kernel_index /= shared_dilation[i]; final_offset += kernel_index * kernel_shape_prod; kernel_shape_prod *= shared_kernel_shape[i]; } } if (!skip) { final_offset += kernel_shape_prod * c_im; for (int_tp i = 0; i < num_axes; ++i) { final_offset *= shared_col_shape[i + 1]; final_offset += d_col_iter[i]; } val += data_col[data_col_off + final_offset]; } skip = false; incremented = false; for (int_tp i = num_axes - 1; i >= 0; --i) { const int_tp d_max = d_col_end[i]; if (d_col_iter[i] == d_max - 1) { d_col_iter[i] = d_col_start[i]; } else { // d_col_iter[i] < d_max - 1 ++d_col_iter[i]; incremented = true; break; // for (int_tp i = num_axes - 1; i >= 0; --i) } } // for (int_tp i = num_axes - 1; i >= 0; --i) } while (incremented); data_im[data_im_off + index] = val; } } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(lrn_compute_output,Dtype)(const int_tp nthreads, __global const Dtype* in, __global const Dtype* scale, const Dtype negative_beta, __global Dtype* out) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { out[index] = in[index] * pow(scale[index], negative_beta); } } __kernel void TEMPLATE(lrn_fill_scale,Dtype)(const int_tp nthreads, __global const Dtype* in, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp size, const Dtype alpha_over_size, const Dtype k, __global Dtype* const scale) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp n = index / 
width / height; const int_tp offset = (n * channels * height + h) * width + w; const int_tp step = height * width; __global const Dtype* in_off = in + offset; __global Dtype* scale_off = scale + offset; int_tp head = 0; const int_tp pre_pad = (size - 1) / 2; const int_tp post_pad = size - pre_pad - 1; Dtype accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; ++head; } // both add and subtract while (head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; ++head; } // subtract only while (head < channels + post_pad) { if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; ++head; } } } __kernel void TEMPLATE(lrn_compute_diff,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, __global const Dtype* top_data, __global const Dtype* scale, __global const Dtype* top_diff, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp size, const Dtype negative_beta, const Dtype cache_ratio, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp n = index / width / height; const int_tp offset = (n * channels * height + h) * width + w; const int_tp step = height * width; __global const Dtype* bottom_off = bottom_data + offset; __global const Dtype* top_off = top_data + offset; __global const Dtype* scale_off = scale + offset; __global const Dtype* top_diff_off = top_diff + offset; __global Dtype* bottom_diff_off = 
bottom_diff + offset; int_tp head = 0; const int_tp pre_pad = size - (size + 1) / 2; const int_tp post_pad = size - pre_pad - 1; Dtype accum_ratio = 0; // accumulate values while (head < post_pad && head < channels) { accum_ratio += top_diff_off[head * step] * top_off[head * step] / scale_off[head * step]; ++head; } // both add and subtract while (head < channels) { accum_ratio += top_diff_off[head * step] * top_off[head * step] / scale_off[head * step]; if (head - size >= 0) { accum_ratio -= top_diff_off[(head - size) * step] * top_off[(head - size) * step] / scale_off[(head - size) * step]; } bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) * step] * pow(scale_off[(head - post_pad) * step], negative_beta) - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; ++head; } // subtract only while (head < channels + post_pad) { if (head - size >= 0) { accum_ratio -= top_diff_off[(head - size) * step] * top_off[(head - size) * step] / scale_off[(head - size) * step]; } bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) * step] * pow(scale_off[(head - post_pad) * step], negative_beta) - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; ++head; } } } __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int_tp nthreads, __global const Dtype* in, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp size, const Dtype alpha_over_size, const Dtype k, __global Dtype* const out, const Dtype negative_beta) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp n = index / width / height; const int_tp offset = (n * channels * height + h) * width + w; const int_tp step = height * width; __global const Dtype* in_off = in + offset; __global Dtype* out_off = out + offset; Dtype scale_val; int_tp head = 0; const 
int_tp pre_pad = (size - 1) / 2; const int_tp post_pad = size - pre_pad - 1; Dtype accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; ++head; } // both add and subtract while (head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_val = k + accum_scale * alpha_over_size; out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); ++head; } // subtract only while (head < channels + post_pad) { if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_val = k + accum_scale * alpha_over_size; out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); ++head; } } } __kernel void TEMPLATE(lrn_full,Dtype)(const int_tp nthreads, __global const Dtype* in, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp size, const Dtype alpha_over_size, const Dtype k, __global Dtype* const scale, __global Dtype* const out, const Dtype negative_beta) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp n = index / width / height; const int_tp offset = (n * channels * height + h) * width + w; const int_tp step = height * width; __global const Dtype* in_off = in + offset; __global Dtype* out_off = out + offset; __global Dtype* scale_off = scale + offset; Dtype scale_val; int_tp head = 0; const int_tp pre_pad = (size - 1) / 2; const int_tp post_pad = size - pre_pad - 1; Dtype accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head 
// NOTE(review): this chunk opens inside a sliding-window (LRN-style) kernel
// whose header lies above the visible region — presumably Caffe's LRN
// scale-fill kernel; confirm against the full file. Code tokens below are
// unchanged; only line breaks are restored and comments added.
< post_pad && head < channels) {
  accum_scale += in_off[head * step] * in_off[head * step];
  ++head;
}
// both add and subtract
while (head < channels) {
  accum_scale += in_off[head * step] * in_off[head * step];
  if (head - size >= 0) {
    accum_scale -= in_off[(head - size) * step]
        * in_off[(head - size) * step];
  }
  scale_val = k + accum_scale * alpha_over_size;
  scale_off[(head - post_pad) * step] = scale_val;
  // native_powr trades accuracy for speed; result cast back to Dtype.
  out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step]
      * (Dtype)native_powr((float)scale_val, (float)negative_beta);
  ++head;
}
// subtract only
while (head < channels + post_pad) {
  if (head - size >= 0) {
    accum_scale -= in_off[(head - size) * step]
        * in_off[(head - size) * step];
  }
  scale_val = k + accum_scale * alpha_over_size;
  scale_off[(head - post_pad) * step] = scale_val;
  out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step]
      * (Dtype)native_powr((float)scale_val, (float)negative_beta);
  ++head;
}
}
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Logistic sigmoid helper used by the LSTM kernels below.
inline Dtype TEMPLATE(lstm_sigmoid,Dtype)(const Dtype x) {
  return (Dtype)1 / ((Dtype)1 + exp(-x));
}

// tanh expressed through the sigmoid: tanh(x) = 2*sigmoid(2x) - 1.
inline Dtype TEMPLATE(lstm_tanh,Dtype)(const Dtype x) {
  return (Dtype)2 * TEMPLATE(lstm_sigmoid,Dtype)((Dtype)2 * x) - (Dtype)1;
}

// Gate activations: X packs 4 gate blocks of length `dim` per batch item.
// The first 3*dim entries (i, f, o gates) get sigmoid; the last dim (g) tanh.
__kernel void TEMPLATE(lstm_acts_forward,Dtype)(const int_tp nthreads,
    const int_tp dim, __global const Dtype* X, __global Dtype* X_acts) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    const int_tp x_dim = 4 * dim;
    const int_tp d = index % x_dim;
    if (d < 3 * dim) {
      X_acts[index] = TEMPLATE(lstm_sigmoid,Dtype)(X[index]);
    } else {
      X_acts[index] = TEMPLATE(lstm_tanh,Dtype)(X[index]);
    }
  }
}

// One LSTM step: c = cont[n]*f*c_prev + i*g;  h = o*tanh(c).
// cont[n] is the per-sequence continuation flag (zeros reset the cell).
__kernel void TEMPLATE(lstm_unit_forward,Dtype)(const int_tp nthreads,
    const int_tp dim, __global const Dtype* C_prev, __global const Dtype* X,
    __global const Dtype* cont, __global Dtype* C, __global Dtype* H) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    const int_tp n = index / dim;
    const int_tp d = index % dim;
    __global const Dtype* X_offset = X + 4 * dim * n;
    const Dtype i = X_offset[d];
    const Dtype f = X_offset[1 * dim + d];
    const Dtype o = X_offset[2 * dim + d];
    const Dtype g = X_offset[3 * dim + d];
    const Dtype c_prev = C_prev[index];
    const Dtype c = cont[n] * f * c_prev + i * g;
    C[index] = c;
    const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c);
    H[index] = o * tanh_c;
  }
}

// Backward of lstm_unit_forward: distributes dC and dH onto the previous
// cell state and the four gate activations.
__kernel void TEMPLATE(lstm_unit_backward,Dtype)(const int_tp nthreads,
    const int_tp dim, __global const Dtype* C_prev, __global const Dtype* X,
    __global const Dtype* C, __global const Dtype* H,
    __global const Dtype* cont, __global const Dtype* C_diff,
    __global const Dtype* H_diff, __global Dtype* C_prev_diff,
    __global Dtype* X_diff) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    const int_tp n = index / dim;
    const int_tp d = index % dim;
    __global const Dtype* X_offset = X + 4 * dim * n;
    const Dtype i = X_offset[d];
    const Dtype f = X_offset[1 * dim + d];
    const Dtype o = X_offset[2 * dim + d];
    const Dtype g = X_offset[3 * dim + d];
    const Dtype c_prev = C_prev[index];
    const Dtype c = C[index];
    const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c);
    __global Dtype* c_prev_diff = C_prev_diff + index;
    __global Dtype* X_diff_offset = X_diff + 4 * dim * n;
    __global Dtype* i_diff = X_diff_offset + d;
    __global Dtype* f_diff = X_diff_offset + 1 * dim + d;
    __global Dtype* o_diff = X_diff_offset + 2 * dim + d;
    __global Dtype* g_diff = X_diff_offset + 3 * dim + d;
    // Gradient w.r.t. the cell: incoming dC plus dH through o * tanh'(c).
    const Dtype c_term_diff = C_diff[index]
        + H_diff[index] * o * (1 - tanh_c * tanh_c);
    const Dtype cont_n = cont[n];
    *c_prev_diff = cont_n * c_term_diff * f;
    *i_diff = c_term_diff * g;
    *f_diff = cont_n * c_term_diff * c_prev;
    *o_diff = H_diff[index] * tanh_c;
    *g_diff = c_term_diff * i;
  }
}

// Backward of lstm_acts_forward, using the already-activated values:
// sigmoid' = s*(1-s) for the first 3*dim entries, tanh' = 1 - t^2 for the rest.
__kernel void TEMPLATE(lstm_acts_backward,Dtype)(const int_tp nthreads,
    const int_tp dim, __global const Dtype* X_acts,
    __global const Dtype* X_acts_diff, __global Dtype* X_diff) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    const int_tp x_dim = 4 * dim;
    const int_tp d = index % x_dim;
    const Dtype X_act = X_acts[index];
    if (d < 3 * dim) {
      X_diff[index] = X_acts_diff[index] * X_act * ((Dtype)1 - X_act);
    } else {
      X_diff[index] = X_acts_diff[index] * ((Dtype)1 - X_act * X_act);
    }
  }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Elementwise y = a * b, each buffer addressed with its own offset.
__kernel void TEMPLATE(mul,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, __global Dtype* b, const int_tp offb,
    __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[index + offy] = a[index + offa] * b[index + offb];
  }
}

// Elementwise y = a / b.
__kernel void TEMPLATE(div,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, __global Dtype* b, const int_tp offb,
    __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[index + offy] = a[index + offa] / b[index + offb];
  }
}

// In-place Y += alpha over N elements.
__kernel void TEMPLATE(add_scalar,Dtype)(const int_tp N, const Dtype alpha,
__global Dtype* Y,
const int_tp offY) {
  for (int_tp index = get_global_id(0); index < N;
      index += get_global_size(0)) {
    Y[offY + index] += alpha;
  }
}

// Elementwise y = a + b.
__kernel void TEMPLATE(add,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, __global const Dtype* b, const int_tp offb,
    __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[offy + index] = a[offa + index] + b[offb + index];
  }
}

// Elementwise y = a - b.
__kernel void TEMPLATE(sub,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, __global const Dtype* b, const int_tp offb,
    __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[offy + index] = a[offa + index] - b[offb + index];
  }
}

// Elementwise absolute value.
__kernel void TEMPLATE(abs,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[offy + index] = fabs((Dtype)(a[offa + index]));
  }
}

// Elementwise exponential.
__kernel void TEMPLATE(exp,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[offy + index] = exp(a[offa + index]);
  }
}

// Elementwise natural logarithm.
__kernel void TEMPLATE(log,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[offy + index] = log((Dtype)(a[offa + index]));
  }
}

// Elementwise square root.
__kernel void TEMPLATE(sqrt,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[offy + index] = sqrt((Dtype)a[offa + index]);
  }
}

// Elementwise y = a^alpha. For alpha == 2 the base goes through fabs()
// first — presumably a driver workaround for pow() on negative bases;
// the result is identical since the exponent is even.
__kernel void TEMPLATE(powx,Dtype)(const int_tp n, __global const Dtype* a,
    const int_tp offa, Dtype alpha, __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    if(alpha == 2.0) {
      y[offy + index] = pow((Dtype)fabs(a[offa + index]), (Dtype)alpha);
    } else {
      y[offy + index] = pow((Dtype)a[offa + index], (Dtype)alpha);
    }
  }
}

// y = sign(x) in {-1, 0, +1}, branch-free via two comparisons.
__kernel void TEMPLATE(sign,Dtype)(const int_tp n, __global const Dtype* x,
    const int_tp offx, __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[index + offy] = (0.0 < x[index + offx]) - (x[index + offx] < 0.0);
  }
}

// y = signbit(x) (nonzero for negative inputs).
__kernel void TEMPLATE(sgnbit,Dtype)(const int_tp n, __global const Dtype* x,
    const int_tp offx, __global Dtype* y, const int_tp offy) {
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    y[index + offy] = signbit(x[index + offx]);
  }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Stacks blob A and blob B along the channel axis (signature continues
// on the next chunk).
__kernel void TEMPLATE(merge_copy_forward_stack, Dtype)(const int_tp nthreads,
const int_tp dims, __global const Dtype* bottom_a, const int_tp forward_a,
    __global const Dtype* bottom_b, const int_tp forward_b,
    __global Dtype* top, const int_tp num, const int_tp channels_a,
    const int_tp channels_b, __global const int_tp* shape_a,
    __global const int_tp* shape_b) {
  // Up to 6 spatial axes; pad[] centres the (smaller) A window inside B.
  int_tp pad[6];
  int_tp tmp_idx[6];
  int_tp size_a = 1;
  int_tp size_b = 1;
  for (int_tp i = 0; i < dims; ++i) {
    pad[i] = (shape_b[i] - shape_a[i]) / 2;
    size_a *= shape_a[i];
    size_b *= shape_b[i];
  }
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    int_tp batch_id = index / ((channels_a + channels_b) * size_a);
    // bottom_id selects the source: 0 -> bottom_a block, 1 -> bottom_b block.
    int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a)
        / (channels_a * size_a)) % 2;
    int_tp counter = index;
    for (int_tp i = dims - 1; i >= 0; --i) {
      tmp_idx[i] = counter % shape_a[i];
      counter /= shape_a[i];
    }
    if (bottom_id == 0) {
      int_tp channel_id = (index / size_a) % channels_a;
      int_tp aidx = batch_id * channels_a + channel_id;
      for (int_tp i = 0; i < dims; ++i) {
        aidx *= shape_a[i];
        aidx += tmp_idx[i];
      }
      top[index] = (forward_a == 1) ? bottom_a[aidx] : 0;
    } else {
      int_tp channel_id = (index / size_a) % channels_b;
      int_tp bidx = (batch_id * channels_b + channel_id) * size_b;
      int_tp btemp = 1;
      for (int_tp i = dims - 1; i >= 0; --i) {
        // Shift B coordinates by pad[] to crop the centre of shape_b.
        bidx += btemp * (tmp_idx[i] + pad[i]);
        btemp *= shape_b[i];
      }
      top[index] = (forward_b == 1) ? bottom_b[bidx] : 0;
    }
  }
}

// Backward of merge_copy_forward_stack: routes top gradients back into the
// two bottom blobs (gated by backward_a / backward_b).
__kernel void TEMPLATE(merge_copy_backward_stack,Dtype)(const int_tp nthreads,
    const int_tp dims, __global Dtype* bottom_a, const int_tp backward_a,
    __global Dtype* bottom_b, const int_tp backward_b,
    __global const Dtype* top, const int_tp num, const int_tp channels_a,
    const int_tp channels_b, __global const int_tp* shape_a,
    __global const int_tp* shape_b) {
  int_tp pad[6];
  int_tp tmp_idx[6];
  int_tp size_a = 1;
  int_tp size_b = 1;
  for (int_tp i = 0; i < dims; ++i) {
    pad[i] = (shape_b[i] - shape_a[i]) / 2;
    size_a *= shape_a[i];
    size_b *= shape_b[i];
  }
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    int_tp batch_id = index / ((channels_a + channels_b) * size_a);
    int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a)
        / (channels_a * size_a)) % 2;
    int_tp counter = index;
    for (int_tp i = dims - 1; i >= 0; --i) {
      tmp_idx[i] = counter % shape_a[i];
      counter /= shape_a[i];
    }
    if (bottom_id == 0) {
      int_tp channel_id = (index / size_a) % channels_a;
      int_tp aidx = batch_id * channels_a + channel_id;
      for (int_tp i = 0; i < dims; ++i) {
        aidx *= shape_a[i];
        aidx += tmp_idx[i];
      }
      bottom_a[aidx] = (backward_a == 1) ? top[index] : 0;
    } else {
      int_tp channel_id = (index / size_a) % channels_b;
      int_tp bidx = (batch_id * channels_b + channel_id) * size_b;
      int_tp btemp = 1;
      for (int_tp i = dims - 1; i >= 0; --i) {
        bidx += btemp * (tmp_idx[i] + pad[i]);
        btemp *= shape_b[i];
      }
      bottom_b[bidx] = (backward_b == 1) ? top[index] : 0;
    }
  }
}

// Elementwise-add merge: top = (optional) bottom_a + (optional) bottom_b,
// with B cropped by pad[] to A's shape. Channel counts are shared here.
__kernel void TEMPLATE(merge_copy_forward_add, Dtype)(const int_tp nthreads,
    const int_tp dims, __global const Dtype* bottom_a, const int_tp forward_a,
    __global const Dtype* bottom_b, const int_tp forward_b,
    __global Dtype* top, const int_tp num, const int_tp channels,
    __global const int_tp* shape_a, __global const int_tp* shape_b) {
  int_tp pad[6];
  int_tp tmp_idx[6];
  int_tp size_a = 1;
  int_tp size_b = 1;
  for (int_tp i = 0; i < dims; ++i) {
    pad[i] = (shape_b[i] - shape_a[i]) / 2;
    size_a *= shape_a[i];
    size_b *= shape_b[i];
  }
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    int_tp batch_id = index / (channels * size_a);
    int_tp counter = index;
    for (int_tp i = dims - 1; i >= 0; --i) {
      tmp_idx[i] = counter % shape_a[i];
      counter /= shape_a[i];
    }
    top[index] = 0;
    int_tp channel_id = (index / size_a) % channels;
    int_tp aidx = batch_id * channels + channel_id;
    for (int_tp i = 0; i < dims; ++i) {
      aidx *= shape_a[i];
      aidx += tmp_idx[i];
    }
    top[index] = forward_a ? top[index] + bottom_a[aidx] : top[index];
    int_tp bidx = (batch_id * channels + channel_id) * size_b;
    int_tp btemp = 1;
    for (int_tp i = dims - 1; i >= 0; --i) {
      bidx += btemp * (tmp_idx[i] + pad[i]);
      btemp *= shape_b[i];
    }
    top[index] = forward_b ? top[index] + bottom_b[bidx] : top[index];
  }
}

// Backward of merge_copy_forward_add: copies (or zeros) the top gradient
// into both bottoms.
__kernel void TEMPLATE(merge_copy_backward_add,Dtype)(const int_tp nthreads,
    const int_tp dims, __global Dtype* bottom_a, const int_tp backward_a,
    __global Dtype* bottom_b, const int_tp backward_b,
    __global const Dtype* top, const int_tp num, const int_tp channels,
    __global const int_tp* shape_a, __global const int_tp* shape_b) {
  int_tp pad[6];
  int_tp tmp_idx[6];
  int_tp size_a = 1;
  int_tp size_b = 1;
  for (int_tp i = 0; i < dims; ++i) {
    pad[i] = (shape_b[i] - shape_a[i]) / 2;
    size_a *= shape_a[i];
    size_b *= shape_b[i];
  }
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    int_tp batch_id = index / (channels * size_a);
    int_tp counter = index;
    for (int_tp i = dims - 1; i >= 0; --i) {
      tmp_idx[i] = counter % shape_a[i];
      counter /= shape_a[i];
    }
    int_tp channel_id = (index / size_a) % channels;
    int_tp aidx = batch_id * channels + channel_id;
    for (int_tp i = 0; i < dims; ++i) {
      aidx *= shape_a[i];
      aidx += tmp_idx[i];
    }
    bottom_a[aidx] = backward_a ? top[index] : 0;
    int_tp bidx = (batch_id * channels + channel_id) * size_b;
    int_tp btemp = 1;
    for (int_tp i = dims - 1; i >= 0; --i) {
      bidx += btemp * (tmp_idx[i] + pad[i]);
      btemp *= shape_b[i];
    }
    bottom_b[bidx] = backward_b ?
top[index] : 0;
  }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// 2D max pooling forward. Records the argmax either into the integer mask
// (use_mask == 1) or into top_mask as a Dtype.
__kernel void TEMPLATE(max_pool_forward,Dtype)(
    const int_tp nthreads, __global const Dtype* bottom_data,
    const int_tp num, const int_tp channels, const int_tp height,
    const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
    const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h,
    const int_tp stride_w, const int_tp pad_h, const int_tp pad_w,
    __global Dtype* top_data, const int use_mask, __global int_tp* mask,
    __global Dtype* top_mask) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    const int_tp pw = index % pooled_width;
    const int_tp ph = (index / pooled_width) % pooled_height;
    const int_tp c = (index / pooled_width / pooled_height) % channels;
    const int_tp n = index / pooled_width / pooled_height / channels;
    int_tp hstart = ph * stride_h - pad_h;
    int_tp wstart = pw * stride_w - pad_w;
    const int_tp hend = min(hstart + kernel_h, height);
    const int_tp wend = min(wstart + kernel_w, width);
    // Clip the window to the image after computing hend/wend.
    hstart = max(hstart, (int_tp)0);
    wstart = max(wstart, (int_tp)0);
    Dtype maxval = -FLT_MAX;
    int_tp maxidx = -1;
    __global const Dtype* bottom_slice = bottom_data
        + (n * channels + c) * height * width;
    for (int_tp h = hstart; h < hend; ++h) {
      for (int_tp w = wstart; w < wend; ++w) {
        if (bottom_slice[h * width + w] > maxval) {
          maxidx = h * width + w;
          maxval = bottom_slice[maxidx];
        }
      }
    }
    top_data[index] = maxval;
    if (use_mask == 1) {
      mask[index] = maxidx;
    } else {
      top_mask[index] = maxidx;
    }
  }
}

// 2D average pooling forward. pool_size is taken before clipping to the
// image so padded positions count toward the divisor (Caffe semantics).
__kernel void TEMPLATE(ave_pool_forward,Dtype)(
    const int_tp nthreads, __global const Dtype* const bottom_data,
    const int_tp num, const int_tp channels, const int_tp height,
    const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
    const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h,
    const int_tp stride_w, const int_tp pad_h, const int_tp pad_w,
    __global Dtype* top_data) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    {
      const int_tp pw = index % pooled_width;
      const int_tp ph = (index / pooled_width) % pooled_height;
      const int_tp c = (index / pooled_width / pooled_height) % channels;
      const int_tp n = index / pooled_width / pooled_height / channels;
      int_tp hstart = ph * stride_h - pad_h;
      int_tp wstart = pw * stride_w - pad_w;
      int_tp hend = min(hstart + kernel_h, height + pad_h);
      int_tp wend = min(wstart + kernel_w, width + pad_w);
      const int_tp pool_size = (hend - hstart) * (wend - wstart);
      hstart = max(hstart, (int_tp)0);
      wstart = max(wstart, (int_tp)0);
      hend = min(hend, height);
      wend = min(wend, width);
      Dtype aveval = 0;
      __global const Dtype* bottom_slice = bottom_data
          + (n * channels + c) * height * width;
      for (int_tp h = hstart; h < hend; ++h) {
        for (int_tp w = wstart; w < wend; ++w) {
          aveval += bottom_slice[h * width + w];
        }
      }
      top_data[index] = aveval / pool_size;
    }
  }
}

// Stochastic pooling (training): pick a position in the window with
// probability proportional to its value, driven by rand_idx in [0,1).
__kernel void TEMPLATE(sto_pool_forward_train,Dtype)(
    const int_tp nthreads, __global const Dtype* bottom_data,
    const int_tp num, const int_tp channels, const int_tp height,
    const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
    const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h,
    const int_tp stride_w, __global Dtype* rand_idx,
    __global Dtype* top_data) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    const int_tp pw = index % pooled_width;
    const int_tp ph = (index / pooled_width) % pooled_height;
    const int_tp c = (index / pooled_width / pooled_height) % channels;
    const int_tp n = index / pooled_width / pooled_height / channels;
    const int_tp hstart = ph * stride_h;
    const int_tp hend = min(hstart + kernel_h, height);
    const int_tp wstart = pw * stride_w;
    const int_tp wend = min(wstart + kernel_w, width);
    Dtype cumsum = 0.;
    __global const Dtype* bottom_slice = bottom_data
        + (n * channels + c) * height * width;
    // First pass: get sum
    for (int_tp h = hstart; h < hend; ++h) {
      for (int_tp w = wstart; w < wend; ++w) {
        cumsum += bottom_slice[h * width + w];
      }
    }
    const float thres = rand_idx[index] * cumsum;
    // Second pass: get value, and set index.
    cumsum = 0;
    for (int_tp h = hstart; h < hend; ++h) {
      for (int_tp w = wstart; w < wend; ++w) {
        cumsum += bottom_slice[h * width + w];
        if (cumsum >= thres) {
          // rand_idx is overwritten with the chosen flat index
          // (used by the backward kernel).
          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
          top_data[index] = bottom_slice[h * width + w];
          // Break out of both loops by exhausting the counters.
          h = hend;
          w = wend;
        }
      }
    }
  }
}

// Stochastic pooling (testing): probability-weighted average,
// sum(x^2)/sum(x).
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    const int_tp nthreads, __global const Dtype* const bottom_data,
    const int_tp num, const int_tp channels, const int_tp height,
    const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
    const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h,
    const int_tp stride_w, __global Dtype* top_data) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    const int_tp pw = index % pooled_width;
    const int_tp ph = (index / pooled_width) % pooled_height;
    const int_tp c = (index / pooled_width / pooled_height) % channels;
    const int_tp n = index / pooled_width / pooled_height / channels;
    const int_tp hstart = ph * stride_h;
    const int_tp hend = min(hstart + kernel_h, height);
    const int_tp wstart = pw * stride_w;
    const int_tp wend = min(wstart + kernel_w, width);
    // Seed cumsum with FLT_MIN to avoid divide-by-zero on all-zero windows.
    Dtype cumsum = FLT_MIN;
    Dtype cumvalues = 0.;
    __global const Dtype* bottom_slice = bottom_data
        + (n * channels + c) * height * width;
    // First pass: get sum
    for (int_tp h = hstart; h < hend; ++h) {
      for (int_tp w = wstart; w < wend; ++w) {
        cumsum += bottom_slice[h * width + w];
        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
      }
    }
    top_data[index] = cumvalues / cumsum;
  }
}

// 2D max pooling backward: each bottom position accumulates the top
// gradients of every pooling window whose recorded argmax is this position.
__kernel void TEMPLATE(max_pool_backward,Dtype)(const int_tp nthreads,
    __global const Dtype* top_diff, const int use_mask,
    __global const int_tp* mask, __global const Dtype* top_mask,
    const int_tp num, const int_tp channels, const int_tp height,
    const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
    const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h,
    const int_tp stride_w, const int_tp pad_h, const int_tp pad_w,
    __global Dtype* bottom_diff) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    // find out the local index
    // find out the local offset
    const int_tp w = index % width;
    const int_tp h = (index / width) % height;
    const int_tp c = (index / width / height) % channels;
    const int_tp n = index / width / height / channels;
    const int_tp phstart =
        (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const int_tp phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const int_tp pwstart =
        (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const int_tp pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    Dtype gradient = 0;
    const int_tp offset = (n * channels + c) * pooled_height * pooled_width;
    __global const Dtype* top_diff_slice = top_diff + offset;
    if (use_mask == 1) {
      __global const int_tp* mask_slice = mask + offset;
      for (int_tp ph = phstart; ph < phend; ++ph) {
        for (int_tp pw = pwstart; pw < pwend; ++pw) {
          if (mask_slice[ph * pooled_width + pw] == h * width + w) {
            gradient += top_diff_slice[ph * pooled_width + pw];
          }
        }
      }
    } else {
      __global const Dtype* top_mask_slice = top_mask + offset;
      for (int_tp ph = phstart; ph < phend; ++ph) {
        for (int_tp pw = pwstart; pw < pwend; ++pw) {
          if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {
            gradient += top_diff_slice[ph * pooled_width + pw];
          }
        }
      }
    }
    bottom_diff[index] = gradient;
  }
}

// 2D average pooling backward (signature continues on the next chunk).
__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int_tp nthreads,
    __global const Dtype* top_diff, const int_tp num, const int_tp channels,
    const int_tp height, const int_tp width, const int_tp pooled_height,
    const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w,
    const int_tp stride_h, const int_tp stride_w, const int_tp pad_h,
const int_tp pad_w, __global Dtype* bottom_diff) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    // find out the local index
    // find out the local offset
    const int_tp w = index % width + pad_w;
    const int_tp h = (index / width) % height + pad_h;
    const int_tp c = (index / width / height) % channels;
    const int_tp n = index / width / height / channels;
    const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
    const int_tp phend = min(h / stride_h + 1, pooled_height);
    const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
    const int_tp pwend = min(w / stride_w + 1, pooled_width);
    Dtype gradient = 0.0;
    __global const Dtype* const top_diff_slice = top_diff
        + (n * channels + c) * pooled_height * pooled_width;
    for (int_tp ph = phstart; ph < phend; ++ph) {
      for (int_tp pw = pwstart; pw < pwend; ++pw) {
        // figure out the pooling size
        int_tp hstart = ph * stride_h - pad_h;
        int_tp wstart = pw * stride_w - pad_w;
        int_tp hend = min(hstart + kernel_h, height + pad_h);
        int_tp wend = min(wstart + kernel_w, width + pad_w);
        int_tp pool_size = (hend - hstart) * (wend - wstart);
        gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
      }
    }
    bottom_diff[index] = gradient;
  }
}

// Stochastic pooling backward: the gradient flows only to the position
// whose flat index was recorded in rand_idx during the forward pass.
__kernel void TEMPLATE(sto_pool_backward,Dtype)(
    const int_tp nthreads, __global const Dtype* rand_idx,
    __global const Dtype* const top_diff, const int_tp num,
    const int_tp channels, const int_tp height, const int_tp width,
    const int_tp pooled_height, const int_tp pooled_width,
    const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h,
    const int_tp stride_w, __global Dtype* bottom_diff) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    // find out the local index
    // find out the local offset
    const int_tp w = index % width;
    const int_tp h = (index / width) % height;
    const int_tp c = (index / width / height) % channels;
    const int_tp n = index / width / height / channels;
    const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
    const int_tp phend = min(h / stride_h + 1, pooled_height);
    const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
    const int_tp pwend = min(w / stride_w + 1, pooled_width);
    Dtype gradient = 0.0;
    __global const Dtype* rand_idx_slice = rand_idx
        + (n * channels + c) * pooled_height * pooled_width;
    __global const Dtype* top_diff_slice = top_diff
        + (n * channels + c) * pooled_height * pooled_width;
    for (int_tp ph = phstart; ph < phend; ++ph) {
      for (int_tp pw = pwstart; pw < pwend; ++pw) {
        gradient += top_diff_slice[ph * pooled_width + pw]
            * (index == (int_tp) (rand_idx_slice[ph * pooled_width + pw])
                ? 1.0 : 0.0);
      }
    }
    bottom_diff[index] = gradient;
  }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// N-dimensional (up to 6 axes) dilated max pooling forward. d_iter walks
// the window via an odometer-style carry loop.
__kernel void TEMPLATE(max_pool_forward_nd, Dtype)(const int_tp n,
    const int_tp num_axes, __global const Dtype* bottom_data,
    const int_tp channels, __global const int_tp* size,
    __global const int_tp* pooled_size, __global const int_tp* kernel_size,
    __global const int_tp* ext_kernel_size, __global const int_tp* stride,
    __global const int_tp* dilation, __global const int_tp* pad,
    __global Dtype* top_data, const int use_mask, __global int_tp* mask,
    __global Dtype* top_mask) {
  int_tp d_idx[6];
  int_tp d_start[6];
  int_tp d_end[6];
  int_tp d_iter[6];
  int_tp i;
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    int_tp offset = 1;
    int_tp num = index;
    bool do_continue = false;
    for (i = num_axes - 1; i >= 0; --i) {
      d_idx[i] = num % pooled_size[i];
      d_start[i] = d_idx[i] * stride[i] - pad[i];
      d_end[i] = min(d_start[i] + ext_kernel_size[i], size[i]);
      // Step past the left padding in dilation-sized increments.
      while (d_start[i] < 0) {
        d_start[i] += dilation[i];
      }
      num /= pooled_size[i];
      offset *= size[i];
      d_iter[i] = d_start[i];
      if (d_start[i] >= d_end[i]) {
        // Empty window on this axis: emit -FLT_MAX / invalid index.
        top_data[index] = -FLT_MAX;
        if (use_mask) {
          mask[index] = -1;
        } else {
          top_mask[index] = -1;
        }
        do_continue = true;
      }
    }
    if(do_continue) {
      continue;
    }
    int_tp chan = num % channels;
    num /= channels;
    offset *= (num * channels + chan);
    Dtype maxval = -FLT_MAX;
    int_tp maxidx = -1;
    int_tp final_offset = 0;
    bool incremented;
    do {
      final_offset = 0;
      int_tp size_prod = 1;
      for (i = num_axes - 1; i >= 0; --i) {
        final_offset += d_iter[i] * size_prod;
        size_prod *= size[i];
      }
      if (bottom_data[final_offset + offset] > maxval) {
        maxidx = final_offset;
        maxval = bottom_data[offset + final_offset];
      }
      // Odometer increment over the window coordinates.
      incremented = false;
      for (i = num_axes - 1; i >= 0; --i) {
        if (d_iter[i] >= d_end[i] - dilation[i]) {
          d_iter[i] = d_start[i];
        } else {
          d_iter[i] += dilation[i];
          incremented = true;
          break;
        }
      }
    } while (incremented);
    top_data[index] = maxval;
    if (use_mask == 1) {
      mask[index] = maxidx;
    } else {
      top_mask[index] = maxidx;
    }
  }
}

// N-dimensional max pooling backward: scans every pooling window covering
// this bottom element and accumulates top_diff where the mask matches.
__kernel void TEMPLATE(max_pool_backward_nd, Dtype)(const int_tp n,
    const int_tp num_axes, __global const Dtype* top_diff,
    const int use_mask, __global const int_tp* mask,
    __global const Dtype* top_mask, const int_tp channels,
    __global const int_tp* size, __global const int_tp* pooled_size,
    __global const int_tp* kernel_size,
    __global const int_tp* ext_kernel_size, __global const int_tp* stride,
    __global const int_tp* dilation, __global const int_tp* pad,
    __global Dtype* bottom_diff) {
  int_tp d_idx[6];
  int_tp d_start[6];
  int_tp d_end[6];
  int_tp d_iter[6];
  int_tp i;
  for (int_tp index = get_global_id(0); index < n;
      index += get_global_size(0)) {
    // find out the local index
    // find out the local offset
    int_tp offset = 1;
    int_tp num = index;
    bool do_continue = false;
    for (i = num_axes - 1; i >= 0; --i) {
      d_idx[i] = num % size[i];
      d_start[i] = (d_idx[i] + pad[i] < ext_kernel_size[i]) ?
          0 : (d_idx[i] + pad[i] - ext_kernel_size[i]) / stride[i] + 1;
      d_end[i] = min((int_tp) ((d_idx[i] + pad[i]) / stride[i]),
                     (int_tp) (pooled_size[i] - 1));
      num /= size[i];
      offset *= pooled_size[i];
      d_iter[i] = d_start[i];
      if (d_start[i] > d_end[i]) {
        bottom_diff[index] = 0;
        do_continue = true;
      }
    }
    if (do_continue) {
      continue;
    }
    int_tp chan = num % channels;
    num /= channels;
    offset *= (num * channels + chan);
    Dtype gradient = 0.0;
    int_tp final_offset = 0;
    int_tp im_offset = 0;
    bool incremented;
    do {
      final_offset = offset;
      im_offset = 0;
      int_tp size_prod = 1;
      int_tp pooled_size_prod = 1;
      for (i = num_axes - 1; i >= 0; --i) {
        final_offset += d_iter[i] * pooled_size_prod;
        im_offset += d_idx[i] * size_prod;
        size_prod *= size[i];
        pooled_size_prod *= pooled_size[i];
      }
      if (use_mask) {
        if (mask[final_offset] == im_offset) {
          gradient += top_diff[final_offset];
        }
      } else {
        if (top_mask[final_offset] == im_offset) {
          gradient += top_diff[final_offset];
        }
      }
      incremented = false;
      for (i = num_axes - 1; i >= 0; --i) {
        if (d_iter[i] >= d_end[i]) {
          d_iter[i] = d_start[i];
        } else {
          ++d_iter[i];
          incremented = true;
          break;
        }
      }
    } while (incremented);
    bottom_diff[index] = gradient;
  }
}

#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Strided-kernel (dilated, "sk") max pooling forward; body continues
// on the next chunk.
__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int_tp nthreads,
    __global Dtype* bottom_data, const int_tp num, const int_tp channels,
    const int_tp height, const int_tp width, const int_tp pooled_height,
    const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w,
    const int_tp ext_kernel_h, const int_tp ext_kernel_w,
    const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h,
    const int_tp dilation_w, const int_tp pad_h, const int_tp pad_w,
    __global Dtype* top_data, const int use_mask, __global int_tp* mask,
    __global Dtype* top_mask) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    int_tp pw = index % pooled_width;
    int_tp ph = (index / pooled_width) % pooled_height;
    int_tp c = (index /
pooled_width / pooled_height) % channels; int_tp n = index / pooled_width / pooled_height / channels; int_tp hstart = ph * stride_h - pad_h; int_tp wstart = pw * stride_w - pad_w; int_tp hend = min(hstart + ext_kernel_h, height); int_tp wend = min(wstart + ext_kernel_w, width); while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } Dtype maxval = -FLT_MAX; int_tp maxidx = -1; __global Dtype* bottom_data_ptr = bottom_data + (n * channels + c) * height * width; for (int_tp h = hstart; h < hend; h += dilation_h) { for (int_tp w = wstart; w < wend; w += dilation_w) { if (bottom_data_ptr[h * width + w] > maxval) { maxidx = h * width + w; maxval = bottom_data_ptr[maxidx]; } } } top_data[index] = maxval; if (use_mask == 1) { mask[index] = maxidx; } else { top_mask[index] = maxidx; } } } __kernel void TEMPLATE(max_pool_backward_sk,Dtype)( const int_tp nthreads, __global const Dtype* top_diff, const int use_mask, __global const int_tp* mask, __global const Dtype* top_mask, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { __global const int_tp* mask_ptr = mask; __global const Dtype* top_diff_ptr = top_diff; // find out the local index // find out the local offset int_tp w = index % width; int_tp h = (index / width) % height; int_tp c = (index / width / height) % channels; int_tp n = index / width / height / channels; int_tp phstart = (h + pad_h < ext_kernel_h) ? 
0 : (h + pad_h - ext_kernel_h) / stride_h + 1; int_tp phend = min(((h + pad_h) / stride_h + 1), pooled_height); int_tp pwstart = (w + pad_w < ext_kernel_w) ? 0 : (w + pad_w - ext_kernel_w) / stride_w + 1; int_tp pwend = min(((w + pad_w) / stride_w + 1), pooled_width); Dtype gradient = 0.0; int_tp offset = (n * channels + c) * pooled_height * pooled_width; top_diff_ptr += offset; if (use_mask == 1) { mask_ptr += offset; for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { if (mask_ptr[ph * pooled_width + pw] == h * width + w) { gradient += top_diff_ptr[ph * pooled_width + pw]; } } } } else { for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { if (top_mask[ph * pooled_width + pw] == h * width + w) { gradient += top_diff_ptr[ph * pooled_width + pw]; } } } } bottom_diff[index] = gradient; } } __kernel void TEMPLATE(ave_pool_forward_sk,Dtype)( const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* top_data) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp pool_size = 0; int_tp pw = index % pooled_width; int_tp ph = (index / pooled_width) % pooled_height; int_tp c = (index / pooled_width / pooled_height) % channels; int_tp n = index / pooled_width / pooled_height / channels; int_tp hstart = ph * stride_h - pad_h; int_tp wstart = pw * stride_w - pad_w; int_tp hend = hstart + ext_kernel_h; int_tp wend = wstart + ext_kernel_w; // Overspill over the image + pad does // not contribute to pool size while (hend > height + pad_h) { hend -= dilation_h; } while (wend > 
width + pad_w) {
      wend -= dilation_w;
    }
    Dtype aveval = 0;
    __global const Dtype* bottom_data_ptr = bottom_data;
    bottom_data_ptr += (n * channels + c) * height * width;
    for (int_tp h = hstart; h < hend; h += dilation_h) {
      for (int_tp w = wstart; w < wend; w += dilation_w) {
        if (h >= 0 && h < height && w >= 0 && w < width) {
          aveval += bottom_data_ptr[h * width + w];
        }
        // Padded (out-of-image) taps still count toward the divisor.
        ++pool_size;
      }
    }
    top_data[index] = aveval / pool_size;
  }
}

// Strided-kernel (dilated) average pooling backward: redistributes each
// window's gradient evenly over the dilation-aligned taps it covered.
__kernel void TEMPLATE(ave_pool_backward_sk,Dtype)(const int_tp nthreads,
    __global const Dtype* top_diff, const int_tp num, const int_tp channels,
    const int_tp height, const int_tp width, const int_tp pooled_height,
    const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w,
    const int_tp ext_kernel_h, const int_tp ext_kernel_w,
    const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h,
    const int_tp dilation_w, const int_tp pad_h, const int_tp pad_w,
    __global Dtype* bottom_diff) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    // find out the local index
    // find out the local offset
    const int_tp w = index % width;
    const int_tp h = (index / width) % height;
    const int_tp c = (index / width / height) % channels;
    const int_tp n = index / width / height / channels;
    int_tp phstart = (h + pad_h < ext_kernel_h) ?
        0 : (h + pad_h - ext_kernel_h) / stride_h + 1;
    int_tp phend = min(((h + pad_h) / stride_h + 1), pooled_height);
    int_tp pwstart = (w + pad_w < ext_kernel_w) ?
        0 : (w + pad_w - ext_kernel_w) / stride_w + 1;
    int_tp pwend = min(((w + pad_w) / stride_w + 1), pooled_width);
    Dtype gradient = 0.0;
    __global const Dtype* const top_diff_slice = top_diff
        + (n * channels + c) * pooled_height * pooled_width;
    for (int_tp ph = phstart; ph < phend; ++ph) {
      for (int_tp pw = pwstart; pw < pwend; ++pw) {
        // figure out the pooling size
        int_tp hstart = ph * stride_h - pad_h;
        int_tp wstart = pw * stride_w - pad_w;
        int_tp hend = min(hstart + ext_kernel_h, height + pad_h);
        int_tp wend = min(wstart + ext_kernel_w, width + pad_w);
        // Number of dilation-spaced taps inside the window.
        int_tp pool_size = ((hend - hstart - 1) / dilation_h + 1)
            * ((wend - wstart - 1) / dilation_w + 1);
        // Only positions lying exactly on the dilation grid of this
        // window receive a share of its gradient.
        if (h >= hstart && h < hend && (h - hstart) % dilation_h == 0
            && w >= wstart && w < wend && (w - wstart) % dilation_w == 0) {
          gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
        }
      }
    }
    bottom_diff[index] = gradient;
  }
}

// Strided-kernel stochastic pooling (training): sample a dilation-grid
// position proportionally to its value, driven by rand_idx in [0,1).
__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)(
    const int_tp nthreads, __global const Dtype* bottom_data,
    const int_tp num, const int_tp channels, const int_tp height,
    const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
    const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
    const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
    const int_tp dilation_h, const int_tp dilation_w, __global Dtype* rand_idx,
    __global Dtype* top_data) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    int_tp pw = index % pooled_width;
    int_tp ph = (index / pooled_width) % pooled_height;
    int_tp c = (index / pooled_width / pooled_height) % channels;
    int_tp n = index / pooled_width / pooled_height / channels;
    int_tp hstart = ph * stride_h;
    int_tp hend = min(hstart + ext_kernel_h, height);
    int_tp wstart = pw * stride_w;
    int_tp wend = min(wstart + ext_kernel_w, width);
    Dtype cumsum = 0.;
    __global const Dtype* bottom_data_ptr = bottom_data;
    bottom_data_ptr += (n * channels + c) * height * width;
    // First pass: get sum
    for (int_tp h = hstart; h < hend; h += dilation_h) {
      for (int_tp w = wstart; w < wend; w += dilation_w) {
        cumsum += bottom_data_ptr[h * width + w];
      }
    }
    float thres = rand_idx[index] * cumsum;
    // Second pass: get value, and set index.
    cumsum = 0;
    for (int_tp h = hstart; h < hend; h += dilation_h) {
      for (int_tp w = wstart; w < wend; w += dilation_w) {
        cumsum += bottom_data_ptr[h * width + w];
        if (cumsum >= thres) {
          // Record the chosen flat index for the backward pass.
          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
          top_data[index] = bottom_data_ptr[h * width + w];
          // Break out of both loops by exhausting the counters.
          h = hend;
          w = wend;
        }
      }
    }
  }
}

// Strided-kernel stochastic pooling (testing): probability-weighted
// average, sum(x^2)/sum(x).
__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)(
    const int_tp nthreads, __global const Dtype* bottom_data,
    const int_tp num, const int_tp channels, const int_tp height,
    const int_tp width, const int_tp pooled_height, const int_tp pooled_width,
    const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h,
    const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w,
    const int_tp dilation_h, const int_tp dilation_w,
    __global Dtype* top_data) {
  for (int_tp index = get_global_id(0); index < nthreads;
      index += get_global_size(0)) {
    int_tp pw = index % pooled_width;
    int_tp ph = (index / pooled_width) % pooled_height;
    int_tp c = (index / pooled_width / pooled_height) % channels;
    int_tp n = index / pooled_width / pooled_height / channels;
    int_tp hstart = ph * stride_h;
    int_tp hend = min(hstart + ext_kernel_h, height);
    int_tp wstart = pw * stride_w;
    int_tp wend = min(wstart + ext_kernel_w, width);
    // Seed cumsum with FLT_MIN to avoid divide-by-zero on all-zero windows.
    Dtype cumsum = FLT_MIN;
    Dtype cumvalues = 0.;
    __global const Dtype* bottom_data_ptr = bottom_data;
    bottom_data_ptr += (n * channels + c) * height * width;
    // First pass: get sum
    for (int_tp h = hstart; h < hend; h += dilation_h) {
      for (int_tp w = wstart; w < wend; w += dilation_w) {
        cumsum += bottom_data_ptr[h * width + w];
        cumvalues += bottom_data_ptr[h * width + w]
            * bottom_data_ptr[h * width + w];
      }
    }
    top_data[index] = cumvalues / cumsum;
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Slice layer: copies a slice of the bottom blob along the slice axis into the
// top blob (forward == 1), or scatters top gradients back (forward != 1).
__kernel void TEMPLATE(slice,Dtype)(const int_tp nthreads,
    __global const Dtype* in_data,
    const int forward,
    const int_tp num_slices,
    const int_tp slice_size,
    const int_tp bottom_slice_axis,
    const int_tp top_slice_axis,
    const int_tp offset_slice_axis,
    __global Dtype* out_data) {
  for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) {
    const int_tp total_slice_size = slice_size * top_slice_axis;
    const int_tp slice_num = index / total_slice_size;
    const int_tp slice_index = index % total_slice_size;
    const int_tp bottom_index = slice_index + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
    if (forward == 1) {
      out_data[index] = in_data[bottom_index];
    } else {
      out_data[bottom_index] = in_data[index];
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable

// Softmax forward using Intel subgroup reductions and shared-local-memory
// scratch buffers (out_tmp/scale_tmp/group_tmp). One work-group handles the
// sample n = get_global_id(1); phases are separated by local barriers:
// per-channel max -> exp(x - max) -> per-channel sum -> normalize.
__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int_tp num,
    const int_tp channels,
    const int_tp spatial_dim,
    __global Dtype* scale,
    __global const Dtype* data,
    __global Dtype* out,
    __local Dtype *out_tmp,
    __local Dtype *scale_tmp,
    __local Dtype *group_tmp) {
  int_tp n = get_global_id(1);
  for (int_tp index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) {
    // NOTE(review): maxval is float even in the double instantiation — confirm
    // this intentional precision narrowing for the running maximum.
    float maxval = -FLT_MAX;
    for (int_tp c = get_global_id(0); c < channels; c += get_global_size(0)) {
      Dtype tmp = data[(n * channels + c) * spatial_dim + s];
      maxval = max((Dtype)tmp, (Dtype)maxval);
    }
    maxval = sub_group_reduce_max(maxval);
    //if (get_sub_group_local_id() == 0)
    group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Reduce per-subgroup maxima into a single max per spatial position.
  for (int_tp index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += get_global_size(0)) {
    int_tp s = index / get_max_sub_group_size();
    Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
    //if (get_sub_group_local_id() == 0)
    scale_tmp[s] = maxval;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Exponentiate with the max subtracted for numerical stability.
  for (int_tp index = get_global_id(0); index < channels * spatial_dim; index += get_global_size(0)) {
    int_tp s = index % spatial_dim;
    out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]);
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Per-subgroup partial sums of the exponentials.
  for (int_tp index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) {
    Dtype sum = 0;
    for (int_tp c = get_global_id(0); c < channels; c += get_global_size(0)) {
      sum += out_tmp[c * spatial_dim + s];
    }
    sum = sub_group_reduce_add(sum);
    group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  for (int_tp index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += get_global_size(0)) {
    int_tp s = index / get_max_sub_group_size();
    Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
    //if (get_sub_group_local_id() == 0)
    scale_tmp[s] = sum;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Normalize.
  for (int_tp index = get_global_id(0); index < channels * spatial_dim; index += get_global_size(0)) {
    int_tp s = index % spatial_dim;
    out[n * channels * spatial_dim + index] = out_tmp[index] / scale_tmp[s];
  }
}

// Same algorithm as softmax_forward_slm but scratch lives in global memory:
// group_tmp aliases the tail of the caller-provided scale buffer.
// NOTE(review): barrier() synchronizes only work-items of one work-group, even
// with CLK_GLOBAL_MEM_FENCE — correctness here relies on the host launching
// this kernel so each sample n is covered by a single work-group; verify the
// enqueue configuration on the host side.
__kernel void TEMPLATE(softmax_forward,Dtype)(const int_tp num,
    const int_tp channels,
    const int_tp spatial_dim,
    __global Dtype* scale,
    __global const Dtype* data,
    __global Dtype* out) {
  int_tp n = get_global_id(1);
  __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
  for (int_tp index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) {
    // NOTE(review): float accumulator in the double instantiation — see above.
    float maxval = -FLT_MAX;
    for (int_tp c = get_global_id(0); c < channels; c += get_global_size(0)) {
      Dtype tmp = data[(n * channels + c) * spatial_dim + s];
      maxval = max((Dtype)tmp, (Dtype)maxval);
    }
    maxval = sub_group_reduce_max(maxval);
    //if (get_sub_group_local_id() == 0)
    group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
  }
  barrier(CLK_GLOBAL_MEM_FENCE);
  for (int_tp index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += get_global_size(0)) {
    int_tp s = index / get_max_sub_group_size();
    Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
    //if (get_sub_group_local_id() == 0)
    scale[n * spatial_dim + s] = maxval;
  }
  barrier(CLK_GLOBAL_MEM_FENCE);
  for (int_tp index = get_global_id(0); index < channels * spatial_dim; index += get_global_size(0)) {
    int_tp s = index % spatial_dim;
    out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]);
  }
  barrier(CLK_GLOBAL_MEM_FENCE);
  for (int_tp index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) {
    Dtype sum = 0;
    for (int_tp c = get_global_id(0); c < channels; c += get_global_size(0)) {
      sum += out[n * channels * spatial_dim + c * spatial_dim + s];
    }
    sum = sub_group_reduce_add(sum);
    group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
  }
  barrier(CLK_GLOBAL_MEM_FENCE);
  for (int_tp index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += get_global_size(0)) {
    int_tp s = index / get_max_sub_group_size();
    Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
    //if (get_sub_group_local_id() == 0)
    scale[n * spatial_dim + s] = sum;
  }
  barrier(CLK_GLOBAL_MEM_FENCE);
  for (int_tp index = get_global_id(0); index < channels * spatial_dim; index += get_global_size(0)) {
    int_tp s = index % spatial_dim;
    out[n * channels * spatial_dim + index] /= scale[n * spatial_dim + s];
  }
}

// Copied from caffe.pb.h, must keep consistent with the original definition
#ifndef __SOFTMAX_LOSS_CL__
#define __SOFTMAX_LOSS_CL__
enum LossParameter_NormalizationMode {
  LossParameter_NormalizationMode_FULL = 0,
LossParameter_NormalizationMode_VALID = 1,
  LossParameter_NormalizationMode_BATCH_SIZE = 2,
  LossParameter_NormalizationMode_NONE = 3
};
#endif

// Copied from softmax_loss_layer.cpp, must keep consistent with the original implementation
// Device helper (not a kernel): loss normalization denominator for the chosen
// mode; valid_count == -1 means "count unknown", falling back to full size.
Dtype TEMPLATE(get_normalizer, Dtype)(
    enum LossParameter_NormalizationMode normalization_mode, int_tp valid_count,
    int_tp outer_num_, int_tp inner_num_) {
  Dtype normalizer;
  switch (normalization_mode) {
    case LossParameter_NormalizationMode_FULL:
      normalizer = (Dtype)(outer_num_ * inner_num_);
      break;
    case LossParameter_NormalizationMode_VALID:
      if (valid_count == -1) {
        normalizer = (Dtype)(outer_num_ * inner_num_);
      } else {
        normalizer = (Dtype)(valid_count);
      }
      break;
    case LossParameter_NormalizationMode_BATCH_SIZE:
      normalizer = (Dtype)(outer_num_);
      break;
    case LossParameter_NormalizationMode_NONE:
      normalizer = (Dtype)(1);
      break;
    default:
      normalizer = (Dtype)(0);
  }
  // Some users will have no labels for some examples in order to 'turn off' a
  // particular loss in a multi-task setup. The max prevents NaNs in that case.
  return fmax((Dtype)(1.0), normalizer);
}

// Device helper: work-group-wide sum of data[0..n) via subgroup reduction.
// sum_tmp holds one partial per subgroup; the caller provides 16 slots, so at
// most 16 subgroups per work-group are supported. Only subgroup 0 returns the
// complete total.
Dtype TEMPLATE(asum, Dtype)(int_tp n, __global const Dtype *data,
    __local Dtype *sum_tmp) {
  Dtype sum = 0;
  for(int_tp i = get_global_id(0); i < n; i += get_global_size(0)) {
    sum += data[i];
  }
  sum = sub_group_reduce_add(sum);
  sum_tmp[get_sub_group_id()] = sum;
  barrier(CLK_LOCAL_MEM_FENCE);
  if (get_sub_group_id() == 0)
    sum = sub_group_reduce_add(sum_tmp[get_sub_group_local_id()]);
  return sum;
}

// Single-kernel reduction of the per-element losses (and optionally the valid
// counts), then writes the normalized scalar loss to out[0].
__kernel void TEMPLATE(softmax_loss_forward_asum, Dtype)(
    int_tp n, int_tp outer_num_, int_tp inner_num_, int_tp compute_count_sum,
    int_tp normalization_type, __global const Dtype *loss,
    __global const Dtype *counts, __global Dtype *out) {
  __local Dtype sum_tmp[16];
  Dtype loss_sum = TEMPLATE(asum, Dtype)(n, loss, sum_tmp);
  Dtype counts_sum = -1;
  if (compute_count_sum)
    counts_sum = TEMPLATE(asum, Dtype)(n, counts, sum_tmp);
  if (get_global_id(0) == 0)
    out[0] = loss_sum / TEMPLATE(get_normalizer, Dtype)(normalization_type, counts_sum, outer_num_, inner_num_);
}
#endif

// Softmax-with-loss forward: per-position negative log-likelihood of the
// labeled class, clamped at FLT_MIN to avoid log(0). counts marks positions
// that contribute (0 for ignored labels).
__kernel void TEMPLATE(softmax_loss_forward,Dtype)(
    int_tp n, __global const Dtype* prob_data, __global const Dtype* label,
    __global Dtype* loss,
    const int_tp num, const int_tp dim, const int_tp spatial_dim,
    const int has_ignore_label_, const int_tp ignore_label_,
    __global Dtype* counts) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    // NOTE(review): this inner n shadows the kernel parameter n (the element
    // count); here it is the sample index.
    const int_tp n = index / spatial_dim;
    const int_tp s = index % spatial_dim;
    const int_tp label_value = (int_tp) (label[n * spatial_dim + s]);
    if (has_ignore_label_ == 1 && label_value == ignore_label_) {
      loss[index] = 0;
      counts[index] = 0;
    } else {
      loss[index] = -log((Dtype)(
          max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]),
              (Dtype) FLT_MIN)));
      counts[index] = 1;
    }
  }
}

// Softmax-with-loss backward: bottom_diff is expected to be pre-filled with
// the softmax probabilities; this subtracts 1 at the labeled class, or zeroes
// the whole channel column for ignored labels.
__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int_tp nthreads,
    __global const Dtype* top,
    __global const Dtype* label,
    __global Dtype* bottom_diff,
    const int_tp num,
    const int_tp dim,
    const int_tp spatial_dim,
    const int
has_ignore_label_, const int_tp ignore_label_,
    __global Dtype* counts) {
  const int_tp channels = dim / spatial_dim;
  for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) {
    const int_tp n = index / spatial_dim;
    const int_tp s = index % spatial_dim;
    const int_tp label_value = (int_tp) (label[n * spatial_dim + s]);
    if (has_ignore_label_ == 1 && label_value == ignore_label_) {
      // Ignored position: zero the gradient for every channel at (n, s).
      for (int_tp c = 0; c < channels; ++c) {
        bottom_diff[n * dim + c * spatial_dim + s] = 0;
      }
      counts[index] = 0;
    } else {
      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
      counts[index] = 1;
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// AdaDelta update: h = RMS of gradients, h2 = RMS of updates; scales the raw
// gradient g in place by sqrt((h2+delta)/(h+delta)) and the learning rate.
__kernel void TEMPLATE(ada_delta_update,Dtype)(int_tp N, __global Dtype* g,
    __global Dtype* h, __global Dtype* h2, Dtype momentum, Dtype delta,
    Dtype local_rate) {
  for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
    Dtype gi = g[i];
    Dtype hi = h[i] = momentum * h[i] + (1.0 - momentum) * gi * gi;
    gi = gi * sqrt((h2[i] + delta) / (hi + delta));
    h2[i] = momentum * h2[i] + (1.0 - momentum) * gi * gi;
    g[i] = local_rate * gi;
  }
}

// AdaGrad update: accumulates squared gradients in h, divides by their root.
__kernel void TEMPLATE(ada_grad_update,Dtype)(int_tp N, __global Dtype* g,
    __global Dtype* h, Dtype delta, Dtype local_rate) {
  for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
    Dtype gi = g[i];
    Dtype hi = h[i] = h[i] + gi * gi;
    g[i] = local_rate * gi / (sqrt(hi) + delta);
  }
}

// Adam update: m/v are the first/second moment running averages; the bias
// correction is folded into corrected_local_rate by the host.
__kernel void TEMPLATE(adam_update,Dtype)(int_tp N, __global Dtype* g,
    __global Dtype* m, __global Dtype* v, Dtype beta1, Dtype beta2,
    Dtype eps_hat, Dtype corrected_local_rate) {
  for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
    Dtype gi = g[i];
    Dtype mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
    Dtype vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
    g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat);
  }
}

// Nesterov momentum update: h is the momentum buffer; g receives the
// look-ahead step (1+momentum)*h_new - momentum*h_old.
__kernel void TEMPLATE(nesterov_update,Dtype)(int_tp N, __global Dtype* g,
    __global Dtype* h, Dtype momentum, Dtype local_rate) {
  for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
    Dtype hi = h[i];
    Dtype hi_new = h[i] = momentum * hi + local_rate * g[i];
    g[i] = (1 + momentum) * hi_new - momentum * hi;
  }
}

// RMSProp update: h is the decayed average of squared gradients.
__kernel void TEMPLATE(rms_prop_update,Dtype)(int_tp N, __global Dtype* g,
    __global Dtype* h, Dtype rms_decay, Dtype delta, Dtype local_rate) {
  for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
    Dtype gi = g[i];
    Dtype hi = h[i] = rms_decay * h[i] + (1 - rms_decay) * gi * gi;
    g[i] = local_rate * g[i] / (sqrt(hi) + delta);
  }
}

// Plain SGD with momentum: h is the velocity, copied back into g.
__kernel void TEMPLATE(sgd_update,Dtype)(int_tp N, __global Dtype* g,
    __global Dtype* h, Dtype momentum, Dtype local_rate) {
  for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) {
    g[i] = h[i] = momentum * h[i] + local_rate * g[i];
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Tile layer forward: repeats each bottom slice num_tiles times along the
// tile axis.
__kernel void TEMPLATE(tile,Dtype)(const int_tp nthreads,
    __global const Dtype* bottom_data, const int_tp tile_size,
    const int_tp num_tiles, const int_tp bottom_tile_axis,
    __global Dtype* top_data) {
  for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) {
    const int_tp d = index % tile_size;
    const int_tp b = (index / tile_size / num_tiles) % bottom_tile_axis;
    const int_tp n = index / tile_size / num_tiles / bottom_tile_axis;
    const int_tp bottom_index = (n * bottom_tile_axis + b) * tile_size + d;
    top_data[index] = bottom_data[bottom_index];
  }
}

// Tile layer backward: sums the gradients of all num_tiles copies back into
// each bottom element.
__kernel void TEMPLATE(tile_backward,Dtype)(const int_tp nthreads,
    __global const Dtype* top_diff, const int_tp tile_size,
    const int_tp num_tiles, const int_tp bottom_tile_axis,
    __global Dtype* bottom_diff) {
  for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) {
    const int_tp d = index % tile_size;
    const int_tp b = (index / tile_size) % bottom_tile_axis;
    const int_tp n = index / tile_size / bottom_tile_axis;
    bottom_diff[index] = 0;
    int_tp top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d;
    for (int_tp t = 0; t < num_tiles; ++t) {
      bottom_diff[index] +=
top_diff[top_index];
      top_index += bottom_tile_axis * tile_size;
    }
  }
}
#ifdef DOUBLE_SUPPORT_AVAILABLE
// Second instantiation of the templated kernels with Dtype rebound to double.
// Every TEMPLATE(name,Dtype) below therefore expands to name_double.
#undef Dtype
#undef Dtype2
#undef Dtype4
#undef Dtype8
#undef Dtype16
#define Dtype double
#define Dtype2 double2
#define Dtype4 double4
#define Dtype8 double8
#define Dtype16 double16
#undef TYPE
#define TYPE TYPE_DOUBLE
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// ReLU forward: identity for positives, leaky slope for negatives.
__kernel void TEMPLATE(relu_forward,Dtype)(const int_tp n,
    __global const Dtype* in, __global Dtype* out, Dtype negative_slope) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
  }
}

// ReLU backward: passes the gradient where the input was positive, scales it
// by negative_slope elsewhere.
__kernel void TEMPLATE(relu_backward,Dtype)(const int_tp n,
    __global const Dtype* in_diff, __global const Dtype* in_data,
    __global Dtype* out_diff, Dtype negative_slope) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out_diff[index] = in_diff[index] * ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * negative_slope);
  }
}

// TanH forward.
__kernel void TEMPLATE(tanh_forward,Dtype)(const int_tp n,
    __global const Dtype* in, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = tanh(in[index]);
  }
}

// TanH backward, using the forward output: d/dx tanh = 1 - tanh^2.
__kernel void TEMPLATE(tanh_backward,Dtype)(const int_tp n,
    __global const Dtype* in_diff, __global const Dtype* out_data,
    __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    Dtype tanhx = out_data[index];
    out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
  }
}

// Sigmoid forward.
__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int_tp n,
    __global const Dtype* in, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = 1.0 / (1.0 + exp(-in[index]));
  }
}

// Sigmoid backward, using the forward output: d/dx sig = sig*(1-sig).
__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int_tp n,
    __global const Dtype* in_diff, __global const Dtype* out_data,
    __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    const Dtype sigmoid_x = out_data[index];
    out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
  }
}

// Threshold layer: binary indicator in > threshold.
__kernel void TEMPLATE(threshold,Dtype)(const int_tp n, const Dtype threshold,
    __global const Dtype* in, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = in[index] > threshold ? 1.0 : 0.0;
  }
}

// PReLU forward: per-channel learned slope (div_factor == channels collapses
// to a single shared slope).
__kernel void TEMPLATE(prelu_forward,Dtype)(const int_tp n,
    const int_tp channels, const int_tp dim, __global const Dtype* in,
    __global Dtype* out, __global const Dtype* slope_data,
    const int_tp div_factor) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    int_tp c = (index / dim) % channels / div_factor;
    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
  }
}

// PReLU backward w.r.t. the input.
__kernel void TEMPLATE(prelu_backward,Dtype)(const int_tp n,
    const int_tp channels, const int_tp dim, __global const Dtype* in_diff,
    __global const Dtype* in_data, __global Dtype* out_diff,
    __global const Dtype* slope_data, const int_tp div_factor) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    int_tp c = (index / dim) % channels / div_factor;
    out_diff[index] = in_diff[index] * ((Dtype)(in_data[index] > 0?1.0:0.0) + (Dtype)(in_data[index] <= 0?1.0:0.0) * slope_data[c]);
  }
}

// PReLU backward w.r.t. the slope parameter: sums diff*data over the rows
// where the input was non-positive.
__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int_tp n,
    const int_tp rows, const int_tp rowPitch, __global const Dtype* in_diff,
    __global const Dtype* in_data, __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0?1.0:0.0);
    for (int k = 1; k < rows; k++) {
      out_diff[index] += in_diff[index + k * rowPitch] * in_data[index + k * rowPitch] * (in_data[index + k * rowPitch] <= 0?1.0:0.0);
    }
  }
}

// Sigmoid cross-entropy loss forward (numerically stable formulation).
__kernel void TEMPLATE(sce_loss_forward,Dtype)(const int_tp nthreads,
    __global const Dtype* input_data,
    __global
const Dtype* target,
    __global Dtype* loss,
    const int_tp has_ignore_label_,
    const int_tp ignore_label_,
    __global Dtype* counts) {
  for (int_tp i = get_global_id(0); i < nthreads; i += get_global_size(0)) {
    const int_tp target_value = (int_tp)(target[i]);
    if (has_ignore_label_ == 1 && target_value == ignore_label_) {
      loss[i] = 0.0;
      counts[i] = 0.0;
    } else {
      // Stable form of x*t - log(1 + exp(x)); (x >= 0) evaluates to 0 or 1.
      loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0.0)) -
          log((Dtype)1.0 + exp(input_data[i] - (Dtype)2.0 * input_data[i] * (input_data[i] >= 0.0)));
      counts[i] = 1.0;
    }
  }
}

// Zeroes diff entries whose target equals the ignore label.
__kernel void TEMPLATE(sce_loss_ignore_diff,Dtype)(const int_tp count,
    const int_tp ignore_label, __global const Dtype* target,
    __global Dtype* diff) {
  for (int_tp i = get_global_id(0); i < count; i += get_global_size(0)) {
    const int_tp target_value = (int_tp)(target[i]);
    if (target_value == ignore_label) {
      diff[i] = 0.0;
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Fills y[0..n) with the constant alpha.
__kernel void TEMPLATE(gpu_set,Dtype)(const int_tp n, const Dtype alpha,
    __global Dtype* y) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    y[index] = alpha;
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// BatchNorm inference (in-place): top = (top - scale*mean) / sqrt(scale*var + eps)
// using a 3D NDRange (num, channels, spatial). native_powr(x, -0.5) is the
// fast reciprocal square root.
__kernel void TEMPLATE(batch_norm_use_global_stats_in_place,Dtype)(
    const int_tp num, const int_tp channels, const int_tp spatial_dim,
    const Dtype scale, const Dtype eps, __global const Dtype* mean,
    __global const Dtype* variance, __global Dtype* top) {
  const int_tp idx_num = get_global_id(0);
  const int_tp idx_chans = get_global_id(1);
  const int_tp idx_spatial_dim = get_global_id(2);
  Dtype m = mean[idx_chans];
  Dtype v = variance[idx_chans];
  m = -scale * m;
  v = (Dtype)native_powr((float)mad(scale, v, eps), (float)-0.5);
  const int_tp out_off = (idx_num * channels + idx_chans) * spatial_dim + idx_spatial_dim;
  top[out_off] = v * (top[out_off] + m);
}

// BatchNorm inference, out-of-place variant of the kernel above.
__kernel void TEMPLATE(batch_norm_use_global_stats,Dtype)(const int_tp num,
    const int_tp channels, const int_tp spatial_dim, const Dtype scale,
    const Dtype eps, __global
    const Dtype* mean,
    __global const Dtype* variance,
    __global const Dtype* bottom,
    __global Dtype* top) {
  const int_tp idx_num = get_global_id(0);
  const int_tp idx_chans = get_global_id(1);
  const int_tp idx_spatial_dim = get_global_id(2);
  Dtype m = mean[idx_chans];
  Dtype v = variance[idx_chans];
  m = -scale * m;
  v = (Dtype)native_powr((float)mad(scale, v, eps), (float)-0.5);
  const int_tp out_off = (idx_num * channels + idx_chans) * spatial_dim + idx_spatial_dim;
  top[out_off] = v * (bottom[out_off] + m);
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Batch-reindex forward: gathers rows of in according to the permutation
// stored (as Dtype values) in permut.
__kernel void TEMPLATE(br_forward,Dtype)(const int_tp count,
    const int_tp inner_dim, __global const Dtype* in,
    __global const Dtype* permut, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < count; index += get_global_size(0)) {
    int_tp n = index / (inner_dim);
    int_tp in_n = (int_tp) (permut[n]);
    out[index] = in[in_n * (inner_dim) + index % (inner_dim)];
  }
}

// Batch-reindex backward: each output row n sums gradients of the top rows
// listed in top_indexes[begins[n] .. begins[n]+counts[n]).
__kernel void TEMPLATE(br_backward,Dtype)(const int_tp count,
    const int_tp inner_dim, __global const Dtype* in,
    __global const Dtype* top_indexes, __global const Dtype* begins,
    __global const Dtype* counts, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < count; index += get_global_size(0)) {
    int_tp n = index / (inner_dim);
    out[index] = 0;
    int_tp lower = (int_tp) (begins[n]);
    int_tp upper = lower + (int_tp) (counts[n]);
    for (int_tp i = lower; i < upper; ++i) {
      int_tp in_n = (int_tp) (top_indexes[i]);
      out[index] += in[in_n * (inner_dim) + index % (inner_dim)];
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Deliberate no-op kernel (keeps the program object non-empty per type).
__kernel void TEMPLATE(null_kernel,Dtype)(Dtype arg) {
  Dtype out = arg;
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Bias layer forward: broadcast-adds a bias vector over inner_dim elements.
__kernel void TEMPLATE(bias_forward,Dtype)(const int_tp n,
    __global const Dtype* in,
    __global const Dtype* bias,
    const int_tp bias_dim,
    const int_tp inner_dim,
    __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    const int_tp
bias_index = (index / inner_dim) % bias_dim;
    out[index] = in[index] + bias[bias_index];
  }
}

// Scale layer forward: broadcast-multiplies by a scale vector.
__kernel void TEMPLATE(scale_forward,Dtype)(const int_tp n,
    __global const Dtype* in, __global const Dtype* scale,
    const int_tp scale_dim, const int_tp inner_dim, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    const int_tp scale_index = (index / inner_dim) % scale_dim;
    out[index] = in[index] * scale[scale_index];
  }
}

// Scale layer forward with fused bias: out = in * scale + bias.
__kernel void TEMPLATE(scale_bias_forward,Dtype)(const int_tp n,
    __global const Dtype* in, __global const Dtype* scale,
    __global const Dtype* bias, const int_tp scale_dim,
    const int_tp inner_dim, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    const int_tp scale_index = (index / inner_dim) % scale_dim;
    out[index] = in[index] * scale[scale_index] + bias[scale_index];
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// BNLL forward: log(1 + exp(x)), split on the sign of x for stability.
__kernel void TEMPLATE(bnll_forward,Dtype)(const int_tp n,
    __global const Dtype* in, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    if (in[index] > 0.0f) {
      out[index] = in[index] + log((Dtype) (1.0 + exp(-in[index])));
    } else {
      out[index] = log((Dtype) (1.0 + exp(in[index])));
    }
  }
}

// BNLL backward: sigmoid(x) * in_diff, with the exponent clamped at 50 to
// avoid overflow.
__kernel void TEMPLATE(bnll_backward,Dtype)(const int_tp n,
    __global const Dtype* in_diff, __global const Dtype* in_data,
    __global Dtype* out_diff) {
  Dtype kBNLL_THRESHOLD = 50.;
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    Dtype expval = exp(min(in_data[index], kBNLL_THRESHOLD));
    out_diff[index] = in_diff[index] * expval / (expval + 1.);
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Softmax helper: per-(sample, spatial) maximum across channels.
__kernel void TEMPLATE(kernel_channel_max,Dtype)(const int_tp num,
    const int_tp channels, const int_tp spatial_dim,
    __global const Dtype* data, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < num * spatial_dim; index += get_global_size(0)) {
    int_tp n = index / spatial_dim;
    int_tp s = index % spatial_dim;
    // NOTE(review): maxval is float even in the double instantiation of this
    // file — values below -FLT_MAX would be mishandled; confirm intent.
    float maxval = -FLT_MAX;
    for (int_tp c = 0; c < channels; ++c) {
      maxval = max((Dtype)(data[(n * channels + c) * spatial_dim + s]), (Dtype)maxval);
    }
    out[index] = maxval;
  }
}

// Softmax helper: subtracts the per-position channel max from every element.
__kernel void TEMPLATE(kernel_channel_subtract,Dtype)(const int_tp count,
    const int_tp num, const int_tp channels, const int_tp spatial_dim,
    __global const Dtype* channel_max, __global Dtype* data) {
  for (int_tp index = get_global_id(0); index < count; index += get_global_size(0)) {
    int_tp n = index / channels / spatial_dim;
    int_tp s = index % spatial_dim;
    data[index] -= channel_max[n * spatial_dim + s];
  }
}

// Element-wise exponential.
__kernel void TEMPLATE(kernel_exp,Dtype)(const int_tp count,
    __global const Dtype* data, __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < count; index += get_global_size(0)) {
    out[index] = exp(data[index]);
  }
}

// Softmax helper: per-(sample, spatial) sum across channels.
__kernel void TEMPLATE(kernel_channel_sum,Dtype)(const int_tp num,
    const int_tp channels, const int_tp spatial_dim,
    __global const Dtype* data, __global Dtype* channel_sum) {
  for (int_tp index = get_global_id(0); index < num * spatial_dim; index += get_global_size(0)) {
    int_tp n = index / spatial_dim;
    int_tp s = index % spatial_dim;
    Dtype sum = 0;
    for (int_tp c = 0; c < channels; ++c) {
      sum += data[(n * channels + c) * spatial_dim + s];
    }
    channel_sum[index] = sum;
  }
}

// Softmax helper: divides every element by its position's channel sum.
__kernel void TEMPLATE(kernel_channel_div,Dtype)(const int_tp count,
    const int_tp num, const int_tp channels, const int_tp spatial_dim,
    __global const Dtype* channel_sum, __global Dtype* data) {
  for (int_tp index = get_global_id(0); index < count; index += get_global_size(0)) {
    int_tp n = index / channels / spatial_dim;
    int_tp s = index % spatial_dim;
    data[index] /= channel_sum[n * spatial_dim + s];
  }
}

// Softmax backward helper: per-position dot product of two blobs across
// channels.
__kernel void TEMPLATE(kernel_channel_dot,Dtype)(const int_tp num,
    const int_tp channels, const int_tp spatial_dim,
    __global const Dtype* data_1, __global const Dtype* data_2,
    __global Dtype* channel_dot) {
  for (int_tp index = get_global_id(0);
index < num * spatial_dim; index += get_global_size(0)) {
    int_tp n = index / spatial_dim;
    int_tp s = index % spatial_dim;
    Dtype dot = 0;
    for (int_tp c = 0; c < channels; ++c) {
      dot += (data_1[(n * channels + c) * spatial_dim + s] * data_2[(n * channels + c) * spatial_dim + s]);
    }
    channel_dot[index] = dot;
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Concat layer: copies one bottom blob into its offset slot of the top blob
// along the concat axis (forward == 1), or scatters gradients back.
__kernel void TEMPLATE(concat,Dtype)(const int_tp nthreads,
    __global const Dtype* in_data, const int forward,
    const int_tp num_concats, const int_tp concat_size,
    const int_tp top_concat_axis, const int_tp bottom_concat_axis,
    const int_tp offset_concat_axis, __global Dtype* out_data) {
  for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) {
    const int_tp total_concat_size = concat_size * bottom_concat_axis;
    const int_tp concat_num = index / total_concat_size;
    const int_tp concat_index = index % total_concat_size;
    const int_tp top_index = concat_index + (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
    if (forward == 1) {
      out_data[top_index] = in_data[index];
    } else {
      out_data[index] = in_data[top_index];
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Contrastive loss backward (margin on distance).
__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count,
    const int_tp channels, const Dtype margin, const Dtype alpha,
    __global const Dtype* y, __global const Dtype* diff,
    __global const Dtype* dist_sq, __global Dtype *bottom_diff) {
  for (int_tp i = get_global_id(0); i < count; i += get_global_size(0)) {
    int_tp n = i / channels;  // the num index, to access y and dist_sq
    if (trunc(y[n]) != 0.) {  // similar pairs
      bottom_diff[i] = alpha * diff[i];
    } else {  // dissimilar pairs
      Dtype mdist = 0.;
      Dtype beta = 0.;
      Dtype dist = sqrt(dist_sq[n]);
      mdist = (margin - dist);
      // 1e-4 guards against division by zero when the pair coincides.
      beta = -alpha * mdist / (dist + 1e-4) * diff[i];
      if (mdist > 0.) {
        bottom_diff[i] = beta;
      } else {
        bottom_diff[i] = 0;
      }
    }
  }
}

// Legacy contrastive loss backward (margin on squared distance).
__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count,
    const int channels, const Dtype margin, const Dtype alpha,
    __global Dtype* y, __global Dtype* diff, __global Dtype* dist_sq,
    __global Dtype* bottom_diff) {
  for (int_tp i = get_global_id(0); i < count; i += get_global_size(0)) {
    int n = i / channels;  // the num index, to access y and dist_sq
    if (trunc(y[n]) != 0.) {  // similar pairs
      bottom_diff[i] = alpha * diff[i];
    } else {  // dissimilar pairs
      Dtype mdist = 0.;
      Dtype beta = 0.;
      mdist = (margin - dist_sq[n]);
      beta = -alpha;
      if (mdist > 0.) {
        bottom_diff[i] = beta;
      } else {
        bottom_diff[i] = 0;
      }
    }
  }
}
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif

// Deliberate no-op kernel for the spatial convolution program.
__kernel void TEMPLATE(conv_layer_spatial_phony,Dtype)(Dtype arg) {
  Dtype out = arg;
}
// Store helper and compile-time loop-unrolling macros used by the convolution
// kernels below: LOOP(N, VAR, STMT) expands STMT exactly N times (N <= 16).
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_) do { (_dst_)[(_offset_)] = (_data_);} while(0)
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define LOOP0(VAR, STMT)
#define LOOP1(VAR, STMT) (STMT); (VAR)++;
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#ifdef MULTI
// Direct spatial convolution, ZPAR output maps per work-item; one work-item
// per output pixel. KERNEL_H/W, STRIDE_H/W, DILATION_X/Y, CHANNELS, ZPAR,
// OUTPUT_Z and APPLY_BIAS are compile-time defines injected by the host.
__kernel void CFMultiNoPadding(
    __global Dtype* image_data, int_tp image_offset,
    __global Dtype* kernel_data, int_tp kernel_offset,
    __global Dtype* bias, const int_tp bias_offset,
    __global Dtype* convolved_image, const int_tp convolved_image_offset,
    const ushort input_width, const ushort input_height,
    const ushort output_width, const ushort output_height,
    const ushort pad_w, const ushort pad_h) {
  const int_tp outputX = get_global_id(0);
  const int_tp outputY = get_global_id(1);
  const int_tp kernelNum = get_global_id(2)*ZPAR;
  if(outputX < output_width && outputY < output_height) {
    Dtype sum[ZPAR];
    for(int_tp kern =0; kern < ZPAR; kern++) {
      sum[kern] = 0.0f;
    }
    const int_tp org_y = outputY * STRIDE_H - pad_h;
    const int_tp org_x = outputX * STRIDE_W - pad_w;
    const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNEL_H*KERNEL_W*CHANNELS;
    const int_tp biasIndex=bias_offset + kernelNum;
    const int_tp local_image_offset = org_y*input_width + org_x;
    const int_tp imageSize = input_width*input_height;
    __global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset));
    __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
    for(int_tp c = 0; c < CHANNELS; c++) {
      for(int_tp y = 0; y < KERNEL_H; y++) {
        for(int_tp x = 0; x < KERNEL_W; x++) {
          // Skip taps that fall outside the (implicitly padded) input.
          if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width)) {
            continue;
          }
          for(int_tp kern =0; kern < ZPAR; kern++) {
            sum[kern] += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS + x];
          }
        }
        image_dataPtrFloat += input_width * DILATION_Y;
        kernel_dataPtrFloat += KERNEL_W;
      }
      image_dataPtrFloat += imageSize - input_width*KERNEL_H*DILATION_Y;
    }
    if(APPLY_BIAS == 1) {
      for(int_tp kern = 0; kern < ZPAR; kern++) {
        if(kernelNum+kern < OUTPUT_Z) {
          int_tp offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
          ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex +kern]);
        }
      }
    } else {
      for(int_tp kern = 0; kern < ZPAR; kern++) {
        if(kernelNum+kern < OUTPUT_Z) {
          int_tp offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
          ACTIVATION_FUNCTION(convolved_image, offset, sum[kern]);
        }
      }
    }
  }
}
#endif
//Begin IDLF kernels below here
#ifdef IDLF
#define activation_function(x) (x)
#define OUT_BLOCK_SIZE (OUT_BLOCK_WIDTH*OUT_BLOCK_HEIGHT)
// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16/8 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
// Direct convolution kernel using Intel sub-group (SIMD) block reads and
// broadcasts. Each work-item accumulates an OUT_BLOCK_WIDTH x OUT_BLOCK_HEIGHT
// tile of one output feature map; the work-group's third dimension is the
// SIMD lane id (SIMD_SIZE lanes per sub-group).
//
// Parameters:
//   inputs_base  - input feature maps (global)
//   weights_base - filter weights, pre-swizzled by the host into the SIMD
//                  block-read layout (see host code)
//   biases_base  - one bias per output feature map
//   outputs_base - output feature maps (global)
//   input_/output_width/height - spatial dimensions in elements
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) kernel void convolve_simd(
    // __global float *inputs, __global float* weights, __global float* outputs
    __global float* inputs_base,
    filter_qualifier float* weights_base,
    __global float* biases_base,
    __global float* outputs_base,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height)
{
  __global float* outputs = outputs_base;
  __global float* inputs = inputs_base;
  filter_qualifier float* weights = weights_base;
  __global float* biases = biases_base;

  uint_tp oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
  uint_tp or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
  uint_tp fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth
  uint_tp fmg = get_group_id(2);
  uint_tp lid = get_local_id(2);                    // SIMD lane within the sub-group

  float out[OUT_BLOCK_SIZE];                        // per-work-item output accumulator tile
  int_tp in_addr;

  // find weights adress of given neuron (lid is index)
  uint_tp weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;

  // NOTE(review): the text between the loop header below and the padding test
  // appears to have been lost when this chunk was extracted (the declarations
  // of num_in_batch, curr_x/curr_y, saved_y, in_offset, reg and the in_buf
  // union are referenced later but are not visible here). Tokens are kept
  // exactly as found -- TODO: restore this region from the original source.
  for(int_tp i=0;i= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H
      && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) {
    // Row intersects the valid input: zero the components that fall inside
    // the left/right padding and read the remaining ones.
    if (curr_x < INPUT_PAD_W) {
      in_buf.in_vec[reg].s0 = 0;
      if (curr_x + 1 >= INPUT_PAD_W)
        in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1);
      else
        in_buf.in_vec[reg].s1 = 0;
      if (curr_x + 2 >= INPUT_PAD_W)
        in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
      else
        in_buf.in_vec[reg].s2 = 0;
      in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
    } else {
      in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset); // read SIMD_SIZE elements
      if (curr_x + 1 >= input_width + INPUT_PAD_W) in_buf.in_vec[reg].s1 = 0;
      if (curr_x + 2 >= input_width + INPUT_PAD_W) in_buf.in_vec[reg].s2 = 0;
      if (curr_x + 3 >= input_width + INPUT_PAD_W) in_buf.in_vec[reg].s3 = 0;
    }
  } else {
    // The whole float4 lies in the padding region.
    in_buf.in_vec[reg] = 0;
  }
  curr_y += TILE_Y_STRIDE;
#else
  in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset); // read SIMD_SIZE elements
#endif
  in_offset += input_width * TILE_Y_STRIDE;
  });
  in_addr += input_height * input_width;
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
  curr_y = saved_y;
#endif

  // Prefetch weights eight at a time unless the filter is 1x1.
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
#define WEIGHT_PREF 8
#else
#define WEIGHT_PREF 1
#endif
  union {
    float w[WEIGHT_PREF];
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
    uint8 ui8;
#endif
  } weight_buf;
  int_tp w_idx=0;
  uint_tp orig_weight_addr = weight_addr;
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
  weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
  weight_addr += SIMD_SIZE * WEIGHT_PREF;
#else
  weight_buf.w[0] = as_float(intel_sub_group_block_read((__global uint *)&weights[weight_addr]));
  weight_addr += SIMD_SIZE * 1;
#endif

  // Broadcast input element n of the cached tile from the lane that holds it.
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))

  int_tp kr = 0; // kr = Kernel Row
  LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
  {
    int_tp kc = 0; // kc = Kernel Column
    LOOP(KERNEL_WIDTH, kc,
    {
      for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) {
        for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
          float input = BLOCK_IN((br * STRIDEY + kr * DILATION_Y) * TILE_X + bc * STRIDEX + kc * DILATION_X);
          out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
        }
      }
#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF
      // We assume KERNEL_W is equal to KERNEL_H here.
      if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0
          && ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))
#endif
          ) {
        weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
        weight_addr += SIMD_SIZE * WEIGHT_PREF;
        // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
      }
#if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0
      // need to do nothing
#else
      // Tail refill: fewer than WEIGHT_PREF weights remain, pick the
      // narrowest block read that covers them.
      else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)))
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1
        weight_buf.w[0] = weights[weight_addr];
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2
        weight_buf.ui8.s01 = intel_sub_group_block_read2((__global uint *)&weights[weight_addr]);
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4
        weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]);
#else
        weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]);
#endif
#endif
#endif
      ++w_idx;
    });
  });
  weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE;
  }

  // dead code to work around possible compiler bug.
  if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) {
    outputs[0] = BLOCK_IN(fm % SIMD_SIZE);
  }

  fm = fm % ALIGNED_NUM_FILTERS;

  if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
    uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
    out_addr += or * output_width + oc;
    float bias = biases[fm];

    for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) {
      if (r + or >= output_height) break;   // clip rows past the output edge
      for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) {
        if (c + oc >= output_width) break;  // clip columns past the output edge
        // this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
        outputs[out_addr + r * output_width + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]);
      }
    }
  }
}
#endif

/*******************************************************************************
Copyright © 2016, Intel Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/

#ifdef Conv_Interleaved
// Struct emulations of the odd-sized float "vector" types that OpenCL C does
// not provide natively (only float2/3/4/8/16 are built in). Member names
// s0, s1, ... mirror the built-in vector component names so that
// CAT(float, KERNEL_WIDTH) resolves to a usable row type for any
// KERNEL_WIDTH value.
typedef struct float1 { float s0; } float1;
typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;
typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;
typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;
typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;
typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9;} float10;
typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa;} float11;
typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; } float12;
typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;
typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;
typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy.

// Row pitches (in elements) of the output and input images.
#define OUT_PITCH_X output_width
#define ROW_PITCH input_width

// Common parameter list shared by every GEMM-like Conv_Interleaved kernel
// below; the fused conv+eltwise build prepends the eltwise operand buffer.
#ifdef FUSED_CONV_ELTWISE
#define GEMM_LIKE_KERNEL_ARGS __global Dtype* eltwise_data, const __global Dtype *src0, const __global Dtype *src1, const __global Dtype *biases, __global Dtype *dst, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height, const int_tp out_pitch_y, const int_tp out_pitch_z, const int_tp aligned_input_size, const int_tp slice_pitch
#else
#define GEMM_LIKE_KERNEL_ARGS const __global Dtype *src0, const __global Dtype *src1, const __global Dtype *biases, __global Dtype *dst, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height, const int_tp out_pitch_y, const int_tp out_pitch_z, const int_tp aligned_input_size, const int_tp slice_pitch
#endif
#endif

#ifdef GEMM_LIKE_CONV_32_1
//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_1_flex
//
// Convolution: each workitem computes 1 patch x 32 filters worth of output
// data.  Kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row.  Filter
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
// by rows and then by slices.  Relies on sub_group extension for block
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
// GEMM-like convolution, SIMD8 flavor: one work-item computes 1 output pixel
// (TILE_M = 1) across 32 output channels (TILE_N = 32), with a flexible
// tail path for WIDTH1 not divisible by 32.
#define TILE_M 1
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

#ifdef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(8)))
#endif
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

    // 8-wide FMA: multiply _rowA by colB broadcast from each of the 8
    // sub-group lanes and accumulate into _result.s0..s7.
#define DOT_PRODUCT_8( _result, _rowA, colB ) { _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); }
    typedef CAT( float, KERNEL_WIDTH ) float_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        float8 blockC00 = 0.f;
        float8 blockC10 = 0.f;
        float8 blockC20 = 0.f;
        float8 blockC30 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y = curr_y;
#endif
        const __global float *src0_read = src0
          + aligned_input_size * global_z        // batch offset
          + (curr_y - INPUT_PAD_H) * ROW_PITCH   // y offset
          + (curr_x - INPUT_PAD_W);              // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global float *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y = saved_y;
#endif
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at float8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
                // interleaved row is padded with zero to ensure same size as interleaved rows.  This
                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...    (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...    ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
                float_t blockA00 = ( (const __global float_t*)src0_read )[ 0 ];
                float* pblockA00 = (float*)(&blockA00);
#else
                // Padded/dilated path: gather the row element by element,
                // substituting zero outside the valid input region.
                float_t blockA00;
                float* pblockA00 = (float*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                  if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                    pblockA00[pos] = src0_read[pos * DILATION_X];
                  else
                    pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);

                float blockB00[KERNEL_WIDTH*4];
                float8* p8BlockB00 = (float8*)blockB00;
                float4* p4BlockB00 = (float4*)blockB00;
                float*  pBlockB00 =  (float* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p4BlockB00[KERNEL_WIDTH - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                kernel_y = interleaved_y * 2;
                // Trailing non-interleaved filter row for odd KERNEL_WIDTH.
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }
            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        __global float *out = dst
          + global_z * out_pitch_z                                                     // batch offset
          + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
          + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
          + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset

        float bias[4];
        float4 *bias_vec;
        bias_vec = (float4*)bias;
        *bias_vec = as_float4(intel_sub_group_block_read4((__global uint *)biases + group_x * TILE_N));

        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
                out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);
                out[( 8+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i);
                out[(16+i) * out_pitch_y] = blockC20[i] + intel_sub_group_shuffle(bias[2], i);
                out[(24+i) * out_pitch_y] = blockC30[i] + intel_sub_group_shuffle(bias[3], i);
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {
        // Tail path for the right-most column of threads: TILE_N_LAST
        // (8, 16 or 24) output channels instead of a full 32.
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        float8 blockC[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y = curr_y;
#endif
        const __global float *src0_read = src0
          + aligned_input_size * global_z        // batch offset
          + (curr_y - INPUT_PAD_H) * ROW_PITCH   // y offset
          + (curr_x - INPUT_PAD_W);              // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global float *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y = saved_y;
#endif
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
                float_t blockA00 = ( (const __global float_t*)src0_read )[ 0 ];
                float* pblockA00 = (float*)(&blockA00);
#else
                float_t blockA00;
                float* pblockA00 = (float*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                  if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                    pblockA00[pos] = src0_read[pos * DILATION_X];
                  else
                    pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);

                float blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    float2* p2BlockB = (float2* )blockB;
                    p2BlockB[interleaved_y] = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    float4* p4BlockB = (float4* )blockB;
                    p4BlockB[interleaved_y] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken. No block_read6
                    float6* p6BlockB = (float6* )blockB;
                    (*((float8*)(&p6BlockB[interleaved_y]))).s0123 = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) );
                    (*((float8*)(&p6BlockB[interleaved_y]))).s45 = as_float2( intel_sub_group_block_read2( (const __global uint*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    float* pBlockB = (float* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_float( intel_sub_group_block_read( (const __global uint*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    float2* p2BlockB = (float2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    float3* p3BlockB = (float3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) );
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_float( intel_sub_group_block_read( (const __global uint*) (src1_read + 2 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                float* pBlockB = (float*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }
            //while( ++patch_row < 1 );  //debug
            while( ++patch_row < KERNEL_HEIGHT );

            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
        }
        //while ( ++patch_depth < 1 );  //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        __global float *out = dst
          + global_z * out_pitch_z                                                     // batch offset
          + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
          + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
          + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset

        float bias[4];
        float4 *bias_vec;
        bias_vec = (float4*)bias;
        *bias_vec = as_float4(intel_sub_group_block_read4((__global uint *)biases + group_x * TILE_N));

        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
                if ( TILE_N_LAST_DIV8 > 0 ) out[( 0+i) * out_pitch_y] = blockC[0][i] + intel_sub_group_shuffle(bias[0], i);
                if ( TILE_N_LAST_DIV8 > 1 ) out[( 8+i) * out_pitch_y] = blockC[1][i] + intel_sub_group_shuffle(bias[1], i);
                if ( TILE_N_LAST_DIV8 > 2 ) out[(16+i) * out_pitch_y] = blockC[2][i] + intel_sub_group_shuffle(bias[2], i);
                if ( TILE_N_LAST_DIV8 > 3 ) out[(24+i) * out_pitch_y] = blockC[3][i] + intel_sub_group_shuffle(bias[3], i);
            }
        }
    }
#endif
}
#endif

#ifdef GEMM_LIKE_CONV_32_1_SIMD16
// GEMM-like convolution, SIMD16 flavor: 1 output pixel (TILE_M = 1) across
// 32 output channels (TILE_N = 32) using 16-lane sub-groups.
#define TILE_M 1
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

#ifndef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(16)))
#endif
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

    // Result ctile (*dst) is M rows x N columns
    // LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile.
    Dtype16 blockC00 = 0.f;
    Dtype16 blockC10 = 0.f;

    // Src0 (patch input) is directly used as atile.
    // Each work item points to the start of a different patch.
    // atile is M rows x K columns.
    int curr_x = ( global_y % output_width ) * STRIDE_X;
    int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
    int saved_y = curr_y;
#endif
    const __global Dtype *src0_read = src0
      + aligned_input_size * global_z        // batch offset
      + (curr_y - INPUT_PAD_H) * ROW_PITCH   // y offset
      + curr_x - INPUT_PAD_W;                // x offset
    const __global Dtype *src0_read_orig = src0_read;

    // Src1 (filter) is directly used as btile.
    // It starts at the top of src1 and walks down.
    // btile is K rows x N columns.
    const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );

    // 16-wide FMA: multiply _rowA by colB broadcast from each of the 16
    // sub-group lanes and accumulate into _result.s0..sf.
#define DOT_PRODUCT_16( _result, _rowA, colB ) { _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
    // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
    // and KERNEL_WIDTH/2 rows of interleaved filter.
    int patch_depth = 0;
#ifndef __BEIGNET__
    __attribute__((opencl_unroll_hint(1)))
#endif
    do
    {
        int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0
        curr_y = saved_y;
#endif
#ifndef __BEIGNET__
        __attribute__((opencl_unroll_hint(1)))
#endif
        do
        {
            // Load atile and btile.
            // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype16 granularity.
            // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
            // interleaved row is padded with zero to ensure same size as interleaved rows.  This
            // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
            // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
            // (0, 0) (16, 0) (32, 0) (48, 0) ...    (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...
            // (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ...
            // (0, 2) (16, 2) (32, 2) (48, 2) ...    ...
            // ...
            const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
            Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
            Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
            Dtype_t blockA00;
            Dtype* pblockA00 = (Dtype*)(&blockA00);
            int pos = 0;
            LOOP(KERNEL_WIDTH, pos,
            {
              if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                pblockA00[pos] = src0_read[pos * DILATION_X];
              else
                pblockA00[pos] = 0;
            })
            curr_y += DILATION_Y;
#endif
            // NOTE(review): this advances the row pointer by DILATION_X,
            // whereas both SIMD8 variants of this kernel advance by
            // DILATION_Y -- verify against the original source history.
            src0_read += ROW_PITCH * DILATION_X;

            uint blockB00[KERNEL_WIDTH * 2];
            uint4* p4BlockB00 = (uint4*)blockB00;
            uint2* p2BlockB00 = (uint2*)blockB00;
            Dtype* pBlockB00 = (Dtype*)blockB00;

            interleaved_y = 0;
            LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
            {
                p4BlockB00[interleaved_y] = intel_sub_group_block_read4( (const __global uint*)src1_read );
                src1_read += WIDTH1 * 2;
            } )
            if ( kernel_width_is_odd )
            {
                p2BlockB00[KERNEL_WIDTH - 1] = intel_sub_group_block_read2( (const __global uint*)src1_read );
                src1_read += WIDTH1 * 2;
            }

            // Perform MADs
            kernel_idx = 0;
            interleaved_y = 0;
            LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
            {
                kernel_y = interleaved_y * 2;
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
            } )
            if ( kernel_width_is_odd )
            {
                kernel_y = interleaved_y * 2;
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
            }
        }
        //while( ++patch_row < 1 );  //debug
        while( ++patch_row < KERNEL_HEIGHT );

        src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
    }
    //while ( ++patch_depth < 1 );  //debug
    while ( ++patch_depth < INPUT_DEPTH );

    // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
    // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
    __global Dtype *out = dst
      + global_z * out_pitch_z                                                     // batch offset
      + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
      + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
      + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset

    Dtype bias[2];
    Dtype2 *bias_vec;
    bias_vec = (Dtype2*)bias;
    *bias_vec = as_float2(intel_sub_group_block_read2((__global uint *)biases + group_x * TILE_N));
    // Work around a potential compiler bug.
    if (group_x > 0xFFFFFFFEul)
      out[0] = bias[0] + bias[1];

    if (global_y * TILE_M < output_width * output_height )
    {
        // Write-out paths: full TILE_N, 16-aligned tail, or arbitrary tail.
#if ( ( OUT_DEPTH % TILE_N ) == 0 )
        for (int i = 0; i < 16; i++)
        {
            out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);
            out[(16+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i);;
        }
#elif ( ( OUT_DEPTH % 16 ) == 0 )
        if ( ( global_x + 1 ) < get_global_size(0) )
        {
            for ( int i = 0; i < 16; i++ )
            {
                out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);;
                out[(16+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i);;
            }
        }
        else
        {
            for (int i = 0; i < 16; i++)
            {
                out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);;
            }
        }
#else
        if ( ( global_x + 1 ) < get_global_size(0) )
        {
            for ( int i = 0; i < 16; i++ )
            {
                out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);;
                out[(16+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i);;
            }
        }
        else
        {
#if ( (OUT_DEPTH % TILE_N) > 16 )
            {
                for (int i = 0; i < 16 ; i++)
                {
                    out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);;
                }
                for (int i = 0; i < OUT_DEPTH % 16 ; i++)
                {
                    out[(16+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i);;
                }
            }
#else
            {
                for (int i = 0; i < OUT_DEPTH % 16 ; i++)
                {
                    out[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i);;
                }
            }
#endif
        }
#endif
    }
}
#endif

#ifdef GEMM_LIKE_CONV_32_2
//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_flex
//
// Convolution: each workitem computes 1 patch x 32 filters worth of output
// data.  Kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row.  Filter
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
// by rows and then by slices.  Relies on sub_group extension for block
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
// NOTE(review): this kernel's definition continues beyond this chunk of the
// file; only its preamble is visible here.
#define TILE_M 2
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

#ifdef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(8)))
#endif
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

    // 8-wide FMA: multiply _rowA by colB broadcast from each of the 8
    // sub-group lanes and accumulate into _result.s0..s7.
#define DOT_PRODUCT_8( _result, _rowA, colB ) { _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); }
    typedef CAT( float, KERNEL_WIDTH ) float_t;
// True for all threads if filter_width is multiple of TILE_N // else, true for all but right-most column of threads. if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N ) { // Result ctile (*dst) is M rows x N columns // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. float8 blockC00 = 0.f; float8 blockC10 = 0.f; float8 blockC20 = 0.f; float8 blockC30 = 0.f; float8 blockC01 = 0.f; float8 blockC11 = 0.f; float8 blockC21 = 0.f; float8 blockC31 = 0.f; // Src0 (patch input) is directly used as atile. // Each work item points to the start of a different patch. // atile is M rows x K columns. int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X; int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 int saved_y0 = curr_y0; int saved_y1 = curr_y1; #endif const __global float *src0_read0 = src0 + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global float *src0_read1 = src0 + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset // Src1 (filter) is directly used as btile. // It starts at the top of src1 and walks down. // btile is K rows x N columns. const __global float *src1_read = src1 + ( global_x * TILE_N * 2); // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch // and KERNEL_WIDTH/2 rows of interleaved filter. int patch_depth = 0; do { int patch_row = 0; do { // Load atile and btile. // Kernel data is partially interleaved. Every 2 rows are interleaved at float8 granularity. 
// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non // interleaved row is padded with zero to ensure same size as interleaved rows. This // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3. // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) .. // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ... // (0, 2) (8, 2) (16, 2) (24, 2) ... ... // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 float_t blockA00 = ( (const __global float_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; float_t blockA01 = ( (const __global float_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; float* pblockA00 = (float*)(&blockA00); float* pblockA01 = (float*)(&blockA01); #else float_t blockA00; float* pblockA00 = (float*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; }) curr_y0 += DILATION_Y; float_t blockA01; float* pblockA01 = (float*)(&blockA01); pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; }) curr_y1 += DILATION_Y; src0_read0 += ROW_PITCH * DILATION_Y; src0_read1 += ROW_PITCH * DILATION_Y; #endif float blockB00[KERNEL_WIDTH*4]; float8* p8BlockB00 = (float8*)blockB00; float4* p4BlockB00 = (float4*)blockB00; float* pBlockB00 = (float* )blockB00; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { p8BlockB00[interleaved_y] 
= as_float8( intel_sub_group_block_read8( (const __global uint*)src1_read ) ); src1_read += WIDTH1 * 2; } ) if ( kernel_width_is_odd ) { p4BlockB00[KERNEL_WIDTH - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); src1_read += WIDTH1 * 2; } // Perform MADs kernel_idx = 0; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { kernel_y = interleaved_y * 2; DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; } ) if ( kernel_width_is_odd ) { kernel_y = interleaved_y * 2; DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC11, 
pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; } } //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 curr_y0 = saved_y0; curr_y1 = saved_y1; #endif src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
__global float *out0 = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset __global float *out1 = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset float bias[4]; float4 *bias_vec; bias_vec = (float4*)bias; *bias_vec = as_float4(intel_sub_group_block_read4((__global uint *)biases + group_x * TILE_N)); if( global_y * TILE_M < output_width * output_height ) { for( int i = 0; i < 8; i++ ) { out0[( 0+i) * out_pitch_y] = blockC00[i] + intel_sub_group_shuffle(bias[0], i); out0[( 8+i) * out_pitch_y] = blockC10[i] + intel_sub_group_shuffle(bias[1], i); out0[(16+i) * out_pitch_y] = blockC20[i] + intel_sub_group_shuffle(bias[2], i); out0[(24+i) * out_pitch_y] = blockC30[i] + intel_sub_group_shuffle(bias[3], i); } } if( global_y * TILE_M + 1 < output_width * output_height ) { for( int i = 0; i < 8; i++ ) { out1[( 0+i) * out_pitch_y] = blockC01[i] + intel_sub_group_shuffle(bias[0], i); out1[( 8+i) * out_pitch_y] = blockC11[i] + intel_sub_group_shuffle(bias[1], i); out1[(16+i) * out_pitch_y] = blockC21[i] + intel_sub_group_shuffle(bias[2], i); out1[(24+i) * out_pitch_y] = blockC31[i] + intel_sub_group_shuffle(bias[3], i); } } } #if TILE_N_LAST > 0 else { // Result ctile (*dst) is M rows x N columns // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. int i = 0; float8 blockC0[TILE_N_LAST_DIV8]; float8 blockC1[TILE_N_LAST_DIV8]; LOOP(TILE_N_LAST_DIV8, i, { blockC0[i] = 0.f; blockC1[i] = 0.f; } ) // Src0 (patch input) is directly used as atile. // Each work item points to the start of a different patch. 
// atile is M rows x K columns. int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X; int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 int saved_y0 = curr_y0; int saved_y1 = curr_y1; #endif const __global float *src0_read0 = src0 + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global float *src0_read1 = src0 + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset // Src1 (filter) is directly used as btile. // It starts at the top of src1 and walks down. // btile is K rows x N columns. const __global float *src1_read = src1 + ( global_x * TILE_N * 2); // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch // and KERNEL_WIDTH/2 rows of interleaved filter. int patch_depth = 0; do { int patch_row = 0; do { // Load atile and interleaved btile. 
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 float_t blockA00 = ( (const __global float_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; float_t blockA01 = ( (const __global float_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; float* pblockA00 = (float*)(&blockA00); float* pblockA01 = (float*)(&blockA01); #else float_t blockA00; float* pblockA00 = (float*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; }) curr_y0 += DILATION_Y; float_t blockA01; float* pblockA01 = (float*)(&blockA01); pos = 0; LOOP(KERNEL_WIDTH, pos, { if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; }) curr_y1 += DILATION_Y; src0_read0 += (ROW_PITCH * DILATION_Y); src0_read1 += (ROW_PITCH * DILATION_Y); #endif float blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8]; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { #if TILE_N_LAST_DIV8 == 1 float2* p2BlockB = (float2* )blockB; p2BlockB[interleaved_y] = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 2 float4* p4BlockB = (float4* )blockB; p4BlockB[interleaved_y] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 3 //TODO: broken. 
No block_read6 float6* p6BlockB = (float6* )blockB; (*((float8*)(&p6BlockB[interleaved_y]))).s0123 = as_float4( intel_sub_group_block_read4( (const __global uint*)src1_read ) ); (*((float8*)(&p6BlockB[interleaved_y]))).s45 = as_float2( intel_sub_group_block_read2( (const __global uint*)(src1_read + 4 * 8) ) ); #endif src1_read += WIDTH1 * 2; } ) if ( kernel_width_is_odd ) { #if TILE_N_LAST_DIV8 == 1 float* pBlockB = (float* )blockB; pBlockB[KERNEL_WIDTH - 1] = as_float( intel_sub_group_block_read( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 2 float2* p2BlockB = (float2* )blockB; p2BlockB[KERNEL_WIDTH - 1] = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); #elif TILE_N_LAST_DIV8 == 3 float3* p3BlockB = (float3* )blockB; p3BlockB[KERNEL_WIDTH - 1].s01 = as_float2( intel_sub_group_block_read2( (const __global uint*)src1_read ) ); p3BlockB[KERNEL_WIDTH - 1].s2 = as_float( intel_sub_group_block_read( (const __global uint*) (src1_read + 8) ) ); #endif src1_read += WIDTH1 * 2; } // Perform MADs float* pBlockB = (float*)blockB; kernel_idx = 0; interleaved_y = 0; LOOP(KERNEL_WIDTH_DIV2, interleaved_y, { kernel_y = interleaved_y * 2; DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y ], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 2 DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y ], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 3 DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y ], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[2], 
pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++; DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++; #endif #endif } ) kernel_y = interleaved_y * 2; if ( kernel_width_is_odd ) { DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 2 DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++; #if TILE_N_LAST_DIV8 >= 3 DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++; #endif #endif } } //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 curr_y0 = saved_y0; curr_y1 = saved_y1; #endif src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
__global float *out0 = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset __global float *out1 = dst + global_z * out_pitch_z // batch offset + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset float bias[4]; float4 *bias_vec; bias_vec = (float4*)bias; *bias_vec = as_float4(intel_sub_group_block_read4((__global uint *)biases + group_x * TILE_N)); if( global_y * TILE_M < output_width * output_height ) { for( int i = 0; i < 8; i++ ) { if ( TILE_N_LAST_DIV8 > 0 ) out0[( 0+i) * out_pitch_y] = blockC0[0][i] + intel_sub_group_shuffle(bias[0], i); if ( TILE_N_LAST_DIV8 > 1 ) out0[( 8+i) * out_pitch_y] = blockC0[1][i] + intel_sub_group_shuffle(bias[1], i); if ( TILE_N_LAST_DIV8 > 2 ) out0[(16+i) * out_pitch_y] = blockC0[2][i] + intel_sub_group_shuffle(bias[2], i); if ( TILE_N_LAST_DIV8 > 3 ) out0[(24+i) * out_pitch_y] = blockC0[3][i] + intel_sub_group_shuffle(bias[3], i); } } if( global_y * TILE_M + 1 < output_width * output_height ) { for( int i = 0; i < 8; i++ ) { if ( TILE_N_LAST_DIV8 > 0 ) out1[( 0+i) * out_pitch_y] = blockC1[0][i] + intel_sub_group_shuffle(bias[0], i); if ( TILE_N_LAST_DIV8 > 1 ) out1[( 8+i) * out_pitch_y] = blockC1[1][i] + intel_sub_group_shuffle(bias[1], i); if ( TILE_N_LAST_DIV8 > 2 ) out1[(16+i) * out_pitch_y] = blockC1[2][i] + intel_sub_group_shuffle(bias[2], i); if ( TILE_N_LAST_DIV8 > 3 ) out1[(24+i) * out_pitch_y] = blockC1[3][i] + intel_sub_group_shuffle(bias[3], i); } } } #endif } #endif #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(copyImage, Dtype) (__global Dtype* image_data, int_tp image_offset, const int_tp 
channels, const int_tp height, const int_tp width, const int_tp adjustedHeight, const int_tp adjustedWidth, const int_tp pad_h, const int_tp pad_w, __global Dtype* output_image, const int_tp output_offset, const int_tp batch_size) { uint_tp sX = get_global_id(0); uint_tp sY = get_global_id(1); uint_tp sZ = get_global_id(2); int_tp in_y = sY - pad_h; int_tp in_x = sX - pad_w; int_tp batch_offset = 0; int_tp adjusted_batch_offset = 0; for(uint_tp batch_idx = 0; batch_idx < batch_size; batch_idx++) { int_tp dst_offset = adjusted_batch_offset + output_offset + sZ*adjustedHeight*adjustedWidth + sY*adjustedWidth +sX; int_tp src_offset = batch_offset + image_offset + sZ*height*width + in_y*width + in_x; if((in_y >= 0 && in_y < height && in_x >= 0 && in_x < width)) output_image[dst_offset] = image_data[src_offset]; else output_image[dst_offset] = 0; batch_offset += height * width * channels; adjusted_batch_offset += adjustedHeight * adjustedWidth * channels; } } __kernel void TEMPLATE(copyWeightsSwizzled, Dtype) (__global Dtype* weightIn, __global Dtype* weightOut, const int_tp kernel_w, const int_tp kernel_h, const int_tp channels, const int_tp outputs, const int_tp swizzleFactor) { uint_tp sX = get_global_id(0); //Original location //Output location int_tp outputSublayer = channels / swizzleFactor; int_tp outputSublayerIndex = channels % swizzleFactor; int_tp filter = sX / (kernel_w*kernel_h*channels); int_tp kernel_X = sX % kernel_w; int_tp kernel_Y = (sX / kernel_w) % kernel_h; int_tp kernel_C = (sX / (kernel_w * kernel_h)) % channels; int_tp FP = filter / swizzleFactor; int_tp F1 = filter % swizzleFactor; weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] = weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif inline int_tp 
TEMPLATE(compute_uncropped_index,Dtype)( int_tp index, const int_tp ndims, __global const int_tp* src_strides, __global const int_tp* dst_strides, __global const int_tp* offsets) { int_tp dest_index = index; int_tp src_index = 0; for (int_tp i = 0; i < ndims; ++i) { int_tp coord = dest_index / dst_strides[i]; dest_index -= coord * dst_strides[i]; src_index += src_strides[i] * (coord + offsets[i]); } return src_index; } __kernel void TEMPLATE(crop_forward,Dtype)(const int_tp nthreads, const int_tp ndims, __global const int_tp* src_strides, __global const int_tp* dst_strides, __global const int_tp* offsets, __global const Dtype* src, const int_tp src_off, __global Dtype* dst, const int_tp dst_off) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp src_index = TEMPLATE(compute_uncropped_index,Dtype)( index, ndims, src_strides, dst_strides, offsets); dst[dst_off + index] = src[src_off + src_index]; } } __kernel void TEMPLATE(crop_backward,Dtype)(const int_tp nthreads, const int_tp ndims, __global const int_tp* src_strides, __global const int_tp* dst_strides, __global const int_tp* offsets, __global Dtype* src, const int_tp src_off, __global const Dtype* dst, const int_tp dst_off) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp src_index = TEMPLATE(compute_uncropped_index,Dtype)( index, ndims, src_strides, dst_strides, offsets); src[src_off + src_index] = dst[dst_off + index]; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(dropout_forward,Dtype)(const int_tp n, __global const Dtype* in, __global const uint_tp* mask, const uint_tp threshold, const Dtype scale, __global Dtype* out) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { out[index] = in[index] * ((mask[index] > threshold)?1.0:0.0) * scale; } } __kernel void TEMPLATE(dropout_backward,Dtype)( const int_tp n, __global const Dtype* in_diff, __global const 
uint_tp* mask, const uint_tp threshold, const Dtype scale, __global Dtype* out_diff) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { out_diff[index] = in_diff[index] * ((mask[index] > threshold)?1.0:0.0) * scale; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(eltwise_max_forward,Dtype)( const int_tp nthreads, __global const Dtype* bottom_data_a, __global const Dtype* bottom_data_b, const int_tp blob_idx, __global Dtype* top_data, __global int_tp* mask) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { Dtype maxval = -FLT_MAX; int_tp maxidx = -1; if (bottom_data_a[index] > bottom_data_b[index]) { // only update for very first bottom_data blob (blob_idx == 0) if (blob_idx == 0) { maxval = bottom_data_a[index]; top_data[index] = maxval; maxidx = blob_idx; mask[index] = maxidx; } } else { maxval = bottom_data_b[index]; top_data[index] = maxval; maxidx = blob_idx + 1; mask[index] = maxidx; } } } __kernel void TEMPLATE(eltwise_max_backward,Dtype)(const int_tp nthreads, __global const Dtype* top_diff, const int_tp blob_idx, __global const int_tp* mask, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { Dtype gradient = 0; if (mask[index] == blob_idx) { gradient += top_diff[index]; } bottom_diff[index] = gradient; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(elu_forward,Dtype)(const int n, __global const Dtype* in, __global Dtype* out, Dtype alpha) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { out[index] = in[index] > 0 ? 
in[index] : alpha * (exp(in[index]) - 1.0); } } __kernel void TEMPLATE(elu_backward,Dtype)(const int n, __global const Dtype* in_diff, __global const Dtype* out_data, __global const Dtype* in_data, __global Dtype* out_diff, Dtype alpha) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { out_diff[index] = in_data[index] > 0 ? in_diff[index] : in_diff[index] * (out_data[index] + alpha); } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, __global const Dtype* weight, const int_tp M, const int_tp N, const int_tp K, __global Dtype* top_data) { for (int_tp top_index = get_global_id(0); top_index < nthreads; top_index += get_global_size(0)) { const int_tp n = top_index / N; const int_tp d = top_index % N; const int_tp index = (int_tp)(bottom_data[n]); const int_tp weight_index = index * N + d; top_data[top_index] = weight[weight_index]; } } // atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html #if (TYPE == TYPE_FLOAT) #ifdef ATOMICS_32_AVAILABLE inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { union { uint_tp intVal; Dtype floatVal; } newVal; union { uint_tp intVal; Dtype floatVal; } prevVal; do { prevVal.floatVal = *source; newVal.floatVal = prevVal.floatVal + operand; } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); } __kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, __global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, __global Dtype* weight_diff) { for (int_tp top_index = get_global_id(0); top_index < nthreads; top_index += get_global_size(0)) { const int_tp n = top_index / N; const int_tp d = top_index % N; const int_tp index = (int_tp)(bottom_data[n]); const int_tp weight_index = index * N 
+ d; TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); } } #endif #endif #if (TYPE == TYPE_DOUBLE) #ifdef ATOMICS_64_AVAILABLE inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { union { unsigned long intVal; Dtype floatVal; } newVal; union { unsigned long intVal; Dtype floatVal; } prevVal; do { prevVal.floatVal = *source; newVal.floatVal = prevVal.floatVal + operand; } while (atom_cmpxchg((volatile __global unsigned long *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); } __kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, __global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, __global Dtype* weight_diff) { for (int_tp top_index = get_global_id(0); top_index < nthreads; top_index += get_global_size(0)) { const int_tp n = top_index / N; const int_tp d = top_index % N; const int_tp index = (int_tp)(bottom_data[n]); const int_tp weight_index = index * N + d; TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); } } #endif #endif #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(fillbuffer,Dtype)(const int_tp n, const char alpha, __global char* x, const int_tp offx) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { x[index + offx] = alpha; } } __kernel void TEMPLATE(fill,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* x, const int_tp offx) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { x[index + offx] = alpha; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(im2col,Dtype)(const int_tp n, __global const Dtype* data_im, const int_tp data_im_off, const int_tp height, const int_tp width, const int_tp kernel_h, const int_tp kernel_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const 
int_tp dilation_w, const int_tp height_col, const int_tp width_col, __global Dtype* data_col, const int_tp data_col_off) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { const int_tp h_index = index / width_col; const int_tp h_col = h_index % height_col; const int_tp w_col = index % width_col; const int_tp c_im = h_index / height_col; const int_tp c_col = c_im * kernel_h * kernel_w; const int_tp h_offset = h_col * stride_h - pad_h; const int_tp w_offset = w_col * stride_w - pad_w; __global Dtype* data_col_ptr = data_col + data_col_off; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; __global const Dtype* data_im_ptr = data_im + data_im_off; data_im_ptr += (c_im * height + h_offset) * width + w_offset; for (int_tp i = 0; i < kernel_h; ++i) { for (int_tp j = 0; j < kernel_w; ++j) { int_tp h_im = h_offset + i * dilation_h; int_tp w_im = w_offset + j * dilation_w; *data_col_ptr = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; data_col_ptr += height_col * width_col; } } } } __kernel void TEMPLATE(col2im,Dtype)(const int_tp n, __global const Dtype* data_col, const int_tp data_col_off, const int_tp height, const int_tp width, const int_tp channels, const int_tp kernel_h, const int_tp kernel_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp height_col, const int_tp width_col, __global Dtype* data_im, const int_tp data_im_off) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { Dtype val = 0; const int_tp w_im = index % width + pad_w; const int_tp h_im = (index / width) % height + pad_h; const int_tp c_im = index / (width * height); int_tp kernel_extent_w = (kernel_w - 1) * dilation_w + 1; int_tp kernel_extent_h = (kernel_h - 1) * dilation_h + 1; // compute the start and end of the output const int_tp w_col_start = (w_im < 
kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; const int_tp w_col_end = min(w_im / stride_w + 1, width_col); const int_tp h_col_start = (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; const int_tp h_col_end = min(h_im / stride_h + 1, height_col); // TODO: use LCM of stride and dilation to avoid unnecessary loops for (int_tp h_col = h_col_start; h_col < h_col_end; h_col += 1) { for (int_tp w_col = w_col_start; w_col < w_col_end; w_col += 1) { int_tp h_k = (h_im - h_col * stride_h); int_tp w_k = (w_im - w_col * stride_w); if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { h_k /= dilation_h; w_k /= dilation_w; int_tp data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * height_col + h_col) * width_col + w_col; val += data_col[data_col_off + data_col_index]; } } } data_im[data_im_off + index] = val; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(im2col_nd, Dtype)(const int_tp n, const int_tp num_axes, const int_tp channel_axis, __global const Dtype* data_im, const int_tp data_im_off, __global const int_tp* im_shape, __global const int_tp* col_shape, __global const int_tp* kernel_shape, __global const int_tp* pad, __global const int_tp* stride, __global const int_tp* dilation, __global Dtype* data_col, const int_tp data_col_off) { int_tp d_temp[6]; int_tp d_iter[6]; int_tp i; __global const int_tp* im_shape_ptr = im_shape + channel_axis; __global const int_tp* col_shape_ptr = col_shape + channel_axis; __local int_tp shared_dilation[6]; __local int_tp shared_kernel_shape[6]; __local int_tp shared_pad[6]; __local int_tp shared_stride[6]; __local int_tp shared_col_shape[6 + 1]; __local int_tp shared_im_shape[6 + 1]; for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { shared_dilation[li] = dilation[li]; shared_kernel_shape[li] = kernel_shape[li]; shared_pad[li] = pad[li]; shared_stride[li] = stride[li]; } for (int li = get_local_id(0); li < num_axes + 1; li += 
get_local_size(0)) { shared_col_shape[li] = col_shape_ptr[li]; shared_im_shape[li] = im_shape_ptr[li]; } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { // Initialize channel_in, computed in the loop below, with intermediate // computations used to compute the spatial indices. int_tp channel_in = index; int_tp channel_out = 1; for (i = num_axes - 1; i >= 0; --i) { d_temp[i] = channel_in % shared_col_shape[i + 1]; channel_in /= shared_col_shape[i + 1]; channel_out *= shared_kernel_shape[i]; } channel_out *= channel_in; int_tp data_col_inc = 1; for (i = 0; i < num_axes; ++i) { channel_out *= shared_col_shape[i + 1]; channel_out += d_temp[i]; d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i]; channel_in *= shared_im_shape[i + 1]; channel_in += d_temp[i]; data_col_inc *= shared_col_shape[i + 1]; d_iter[i] = 0; } __global Dtype* data_col_ptr = data_col + data_col_off + channel_out; __global const Dtype* data_im_ptr = data_im + data_im_off + channel_in; bool incremented; do { bool in_range = true; for (i = 0; i < num_axes; ++i) { const int_tp d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i]; in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1]; if (!in_range) { break; } } if (in_range) { int_tp data_im_offset = d_iter[0] * shared_dilation[0]; for (i = 1; i < num_axes; ++i) { data_im_offset *= shared_im_shape[i + 1]; data_im_offset += d_iter[i] * shared_dilation[i]; } *data_col_ptr = data_im_ptr[data_im_offset]; } else { *data_col_ptr = 0; } data_col_ptr += data_col_inc; incremented = false; for (i = num_axes - 1; i >= 0; --i) { const int_tp d_max = shared_kernel_shape[i]; if (d_iter[i] == d_max - 1) { d_iter[i] = 0; } else { // d_iter[i] < d_max - 1 ++d_iter[i]; incremented = true; break; } } // for (int_tp i = num_axes - 1; i >= 0; --i) } while (incremented); // do } } __kernel void TEMPLATE(col2im_nd, Dtype)(const int_tp n, const int_tp num_axes, const int_tp channel_axis, __global 
const Dtype* data_col, const int_tp data_col_off, __global const int_tp* im_shape, __global const int_tp* col_shape, __global const int_tp* kernel_shape, __global const int_tp* pad, __global const int_tp* stride, __global const int_tp* dilation, __global Dtype* data_im, const int_tp data_im_off) { int_tp d_im[6]; int_tp d_col_iter[6]; int_tp d_col_start[6]; int_tp d_col_end[6]; __global const int_tp* im_shape_ptr = im_shape + channel_axis; __global const int_tp* col_shape_ptr = col_shape + channel_axis; __local int_tp shared_dilation[6]; __local int_tp shared_kernel_shape[6]; __local int_tp shared_pad[6]; __local int_tp shared_stride[6]; __local int_tp shared_col_shape[6 + 1]; __local int_tp shared_im_shape[6 + 1]; for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { shared_dilation[li] = dilation[li]; shared_kernel_shape[li] = kernel_shape[li]; shared_pad[li] = pad[li]; shared_stride[li] = stride[li]; } for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { shared_col_shape[li] = col_shape_ptr[li]; shared_im_shape[li] = im_shape_ptr[li]; } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { // Initialize channel_in, computed in the loop below, with intermediate // computations used to compute the spatial indices. int_tp c_im = index; // Calculate d_im (image dimensions). for (int_tp i = num_axes - 1; i >= 0; --i) { d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i]; c_im /= shared_im_shape[i + 1]; } // Calculate col start/end indices. bool done = false; for (int_tp i = 0; i < num_axes; ++i) { const int_tp kernel_extent = shared_dilation[i] * (shared_kernel_shape[i] - 1) + 1; d_col_start[i] = d_col_iter[i] = (d_im[i] < kernel_extent) ? 
0 : (d_im[i] - kernel_extent) / shared_stride[i] + 1; d_col_end[i] = min(d_im[i] / shared_stride[i] + 1, shared_col_shape[i + 1]); if (d_col_start[i] >= d_col_end[i]) { // Skip computation if the dimension is 0 at any spatial axis -- // final val will be 0. data_im[index] = (Dtype)0.0; done = true; break; // for (int_tp i = 0; i < num_axes; ++i) } } if (!done) { // Loop over the col to compute the output val. Dtype val = (Dtype)0.0; bool incremented = true; bool skip = false; do { // Compute the final offset. int_tp final_offset = 0; int_tp kernel_shape_prod = 1; int_tp kernel_index; for (int_tp i = num_axes - 1; i >= 0; --i) { kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i]; if (kernel_index % shared_dilation[i]) { skip = true; break; } else { kernel_index /= shared_dilation[i]; final_offset += kernel_index * kernel_shape_prod; kernel_shape_prod *= shared_kernel_shape[i]; } } if (!skip) { final_offset += kernel_shape_prod * c_im; for (int_tp i = 0; i < num_axes; ++i) { final_offset *= shared_col_shape[i + 1]; final_offset += d_col_iter[i]; } val += data_col[data_col_off + final_offset]; } skip = false; incremented = false; for (int_tp i = num_axes - 1; i >= 0; --i) { const int_tp d_max = d_col_end[i]; if (d_col_iter[i] == d_max - 1) { d_col_iter[i] = d_col_start[i]; } else { // d_col_iter[i] < d_max - 1 ++d_col_iter[i]; incremented = true; break; // for (int_tp i = num_axes - 1; i >= 0; --i) } } // for (int_tp i = num_axes - 1; i >= 0; --i) } while (incremented); data_im[data_im_off + index] = val; } } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(lrn_compute_output,Dtype)(const int_tp nthreads, __global const Dtype* in, __global const Dtype* scale, const Dtype negative_beta, __global Dtype* out) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { out[index] = in[index] * pow(scale[index], negative_beta); } } __kernel void TEMPLATE(lrn_fill_scale,Dtype)(const int_tp nthreads, __global 
const Dtype* in, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp size, const Dtype alpha_over_size, const Dtype k, __global Dtype* const scale) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp n = index / width / height; const int_tp offset = (n * channels * height + h) * width + w; const int_tp step = height * width; __global const Dtype* in_off = in + offset; __global Dtype* scale_off = scale + offset; int_tp head = 0; const int_tp pre_pad = (size - 1) / 2; const int_tp post_pad = size - pre_pad - 1; Dtype accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; ++head; } // both add and subtract while (head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; ++head; } // subtract only while (head < channels + post_pad) { if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; ++head; } } } __kernel void TEMPLATE(lrn_compute_diff,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, __global const Dtype* top_data, __global const Dtype* scale, __global const Dtype* top_diff, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp size, const Dtype negative_beta, const Dtype cache_ratio, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local offset const int_tp w = index % width; const int_tp h = (index / 
width) % height; const int_tp n = index / width / height; const int_tp offset = (n * channels * height + h) * width + w; const int_tp step = height * width; __global const Dtype* bottom_off = bottom_data + offset; __global const Dtype* top_off = top_data + offset; __global const Dtype* scale_off = scale + offset; __global const Dtype* top_diff_off = top_diff + offset; __global Dtype* bottom_diff_off = bottom_diff + offset; int_tp head = 0; const int_tp pre_pad = size - (size + 1) / 2; const int_tp post_pad = size - pre_pad - 1; Dtype accum_ratio = 0; // accumulate values while (head < post_pad && head < channels) { accum_ratio += top_diff_off[head * step] * top_off[head * step] / scale_off[head * step]; ++head; } // both add and subtract while (head < channels) { accum_ratio += top_diff_off[head * step] * top_off[head * step] / scale_off[head * step]; if (head - size >= 0) { accum_ratio -= top_diff_off[(head - size) * step] * top_off[(head - size) * step] / scale_off[(head - size) * step]; } bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) * step] * pow(scale_off[(head - post_pad) * step], negative_beta) - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; ++head; } // subtract only while (head < channels + post_pad) { if (head - size >= 0) { accum_ratio -= top_diff_off[(head - size) * step] * top_off[(head - size) * step] / scale_off[(head - size) * step]; } bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) * step] * pow(scale_off[(head - post_pad) * step], negative_beta) - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; ++head; } } } __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int_tp nthreads, __global const Dtype* in, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp size, const Dtype alpha_over_size, const Dtype k, __global Dtype* const out, const Dtype negative_beta) { for (int_tp index = get_global_id(0); index < nthreads; 
index += get_global_size(0)) { // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp n = index / width / height; const int_tp offset = (n * channels * height + h) * width + w; const int_tp step = height * width; __global const Dtype* in_off = in + offset; __global Dtype* out_off = out + offset; Dtype scale_val; int_tp head = 0; const int_tp pre_pad = (size - 1) / 2; const int_tp post_pad = size - pre_pad - 1; Dtype accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; ++head; } // both add and subtract while (head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_val = k + accum_scale * alpha_over_size; out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); ++head; } // subtract only while (head < channels + post_pad) { if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_val = k + accum_scale * alpha_over_size; out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); ++head; } } } __kernel void TEMPLATE(lrn_full,Dtype)(const int_tp nthreads, __global const Dtype* in, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp size, const Dtype alpha_over_size, const Dtype k, __global Dtype* const scale, __global Dtype* const out, const Dtype negative_beta) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp n = index / width / height; const int_tp offset = (n * channels 
* height + h) * width + w; const int_tp step = height * width; __global const Dtype* in_off = in + offset; __global Dtype* out_off = out + offset; __global Dtype* scale_off = scale + offset; Dtype scale_val; int_tp head = 0; const int_tp pre_pad = (size - 1) / 2; const int_tp post_pad = size - pre_pad - 1; Dtype accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; ++head; } // both add and subtract while (head < channels) { accum_scale += in_off[head * step] * in_off[head * step]; if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_val = k + accum_scale * alpha_over_size; scale_off[(head - post_pad) * step] = scale_val; out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); ++head; } // subtract only while (head < channels + post_pad) { if (head - size >= 0) { accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } scale_val = k + accum_scale * alpha_over_size; scale_off[(head - post_pad) * step] = scale_val; out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); ++head; } } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif inline Dtype TEMPLATE(lstm_sigmoid,Dtype)(const Dtype x) { return (Dtype)1 / ((Dtype)1 + exp(-x)); } inline Dtype TEMPLATE(lstm_tanh,Dtype)(const Dtype x) { return (Dtype)2 * TEMPLATE(lstm_sigmoid,Dtype)((Dtype)2 * x) - (Dtype)1; } __kernel void TEMPLATE(lstm_acts_forward,Dtype)(const int_tp nthreads, const int_tp dim, __global const Dtype* X, __global Dtype* X_acts) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp x_dim = 4 * dim; const int_tp d = index % x_dim; if (d < 3 * dim) { X_acts[index] = TEMPLATE(lstm_sigmoid,Dtype)(X[index]); } 
else { X_acts[index] = TEMPLATE(lstm_tanh,Dtype)(X[index]); } } } __kernel void TEMPLATE(lstm_unit_forward,Dtype)(const int_tp nthreads, const int_tp dim, __global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* cont, __global Dtype* C, __global Dtype* H) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp n = index / dim; const int_tp d = index % dim; __global const Dtype* X_offset = X + 4 * dim * n; const Dtype i = X_offset[d]; const Dtype f = X_offset[1 * dim + d]; const Dtype o = X_offset[2 * dim + d]; const Dtype g = X_offset[3 * dim + d]; const Dtype c_prev = C_prev[index]; const Dtype c = cont[n] * f * c_prev + i * g; C[index] = c; const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); H[index] = o * tanh_c; } } __kernel void TEMPLATE(lstm_unit_backward,Dtype)(const int_tp nthreads, const int_tp dim, __global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* C, __global const Dtype* H, __global const Dtype* cont, __global const Dtype* C_diff, __global const Dtype* H_diff, __global Dtype* C_prev_diff, __global Dtype* X_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp n = index / dim; const int_tp d = index % dim; __global const Dtype* X_offset = X + 4 * dim * n; const Dtype i = X_offset[d]; const Dtype f = X_offset[1 * dim + d]; const Dtype o = X_offset[2 * dim + d]; const Dtype g = X_offset[3 * dim + d]; const Dtype c_prev = C_prev[index]; const Dtype c = C[index]; const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); __global Dtype* c_prev_diff = C_prev_diff + index; __global Dtype* X_diff_offset = X_diff + 4 * dim * n; __global Dtype* i_diff = X_diff_offset + d; __global Dtype* f_diff = X_diff_offset + 1 * dim + d; __global Dtype* o_diff = X_diff_offset + 2 * dim + d; __global Dtype* g_diff = X_diff_offset + 3 * dim + d; const Dtype c_term_diff = C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); const Dtype 
cont_n = cont[n]; *c_prev_diff = cont_n * c_term_diff * f; *i_diff = c_term_diff * g; *f_diff = cont_n * c_term_diff * c_prev; *o_diff = H_diff[index] * tanh_c; *g_diff = c_term_diff * i; } } __kernel void TEMPLATE(lstm_acts_backward,Dtype)(const int_tp nthreads, const int_tp dim, __global const Dtype* X_acts, __global const Dtype* X_acts_diff, __global Dtype* X_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp x_dim = 4 * dim; const int_tp d = index % x_dim; const Dtype X_act = X_acts[index]; if (d < 3 * dim) { X_diff[index] = X_acts_diff[index] * X_act * ((Dtype)1 - X_act); } else { X_diff[index] = X_acts_diff[index] * ((Dtype)1 - X_act * X_act); } } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(mul,Dtype)(const int_tp n, __global const Dtype* a, const int_tp offa, __global Dtype* b, const int_tp offb, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[index + offy] = a[index + offa] * b[index + offb]; } } __kernel void TEMPLATE(div,Dtype)(const int_tp n, __global const Dtype* a, const int_tp offa, __global Dtype* b, const int_tp offb, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[index + offy] = a[index + offa] / b[index + offb]; } } __kernel void TEMPLATE(add_scalar,Dtype)(const int_tp N, const Dtype alpha, __global Dtype* Y, const int_tp offY) { for (int_tp index = get_global_id(0); index < N; index += get_global_size(0)) { Y[offY + index] += alpha; } } __kernel void TEMPLATE(add,Dtype)(const int_tp n, __global const Dtype* a, const int_tp offa, __global const Dtype* b, const int_tp offb, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[offy + index] = a[offa + index] + b[offb + index]; } } __kernel void TEMPLATE(sub,Dtype)(const int_tp n, __global const 
Dtype* a, const int_tp offa, __global const Dtype* b, const int_tp offb, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[offy + index] = a[offa + index] - b[offb + index]; } } __kernel void TEMPLATE(abs,Dtype)(const int_tp n, __global const Dtype* a, const int_tp offa, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[offy + index] = fabs((Dtype)(a[offa + index])); } } __kernel void TEMPLATE(exp,Dtype)(const int_tp n, __global const Dtype* a, const int_tp offa, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[offy + index] = exp(a[offa + index]); } } __kernel void TEMPLATE(log,Dtype)(const int_tp n, __global const Dtype* a, const int_tp offa, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[offy + index] = log((Dtype)(a[offa + index])); } } __kernel void TEMPLATE(sqrt,Dtype)(const int_tp n, __global const Dtype* a, const int_tp offa, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[offy + index] = sqrt((Dtype)a[offa + index]); } } __kernel void TEMPLATE(powx,Dtype)(const int_tp n, __global const Dtype* a, const int_tp offa, Dtype alpha, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { if(alpha == 2.0) { y[offy + index] = pow((Dtype)fabs(a[offa + index]), (Dtype)alpha); } else { y[offy + index] = pow((Dtype)a[offa + index], (Dtype)alpha); } } } __kernel void TEMPLATE(sign,Dtype)(const int_tp n, __global const Dtype* x, const int_tp offx, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[index + offy] = (0.0 < x[index + offx]) - (x[index + offx] < 0.0); } } __kernel 
void TEMPLATE(sgnbit,Dtype)(const int_tp n, __global const Dtype* x, const int_tp offx, __global Dtype* y, const int_tp offy) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { y[index + offy] = signbit(x[index + offx]); } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(merge_copy_forward_stack, Dtype)(const int_tp nthreads, const int_tp dims, __global const Dtype* bottom_a, const int_tp forward_a, __global const Dtype* bottom_b, const int_tp forward_b, __global Dtype* top, const int_tp num, const int_tp channels_a, const int_tp channels_b, __global const int_tp* shape_a, __global const int_tp* shape_b) { int_tp pad[6]; int_tp tmp_idx[6]; int_tp size_a = 1; int_tp size_b = 1; for (int_tp i = 0; i < dims; ++i) { pad[i] = (shape_b[i] - shape_a[i]) / 2; size_a *= shape_a[i]; size_b *= shape_b[i]; } for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp batch_id = index / ((channels_a + channels_b) * size_a); int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) / (channels_a * size_a)) % 2; int_tp counter = index; for (int_tp i = dims - 1; i >= 0; --i) { tmp_idx[i] = counter % shape_a[i]; counter /= shape_a[i]; } if (bottom_id == 0) { int_tp channel_id = (index / size_a) % channels_a; int_tp aidx = batch_id * channels_a + channel_id; for (int_tp i = 0; i < dims; ++i) { aidx *= shape_a[i]; aidx += tmp_idx[i]; } top[index] = (forward_a == 1) ? bottom_a[aidx] : 0; } else { int_tp channel_id = (index / size_a) % channels_b; int_tp bidx = (batch_id * channels_b + channel_id) * size_b; int_tp btemp = 1; for (int_tp i = dims - 1; i >= 0; --i) { bidx += btemp * (tmp_idx[i] + pad[i]); btemp *= shape_b[i]; } top[index] = (forward_b == 1) ? 
bottom_b[bidx] : 0; } } } __kernel void TEMPLATE(merge_copy_backward_stack,Dtype)(const int_tp nthreads, const int_tp dims, __global Dtype* bottom_a, const int_tp backward_a, __global Dtype* bottom_b, const int_tp backward_b, __global const Dtype* top, const int_tp num, const int_tp channels_a, const int_tp channels_b, __global const int_tp* shape_a, __global const int_tp* shape_b) { int_tp pad[6]; int_tp tmp_idx[6]; int_tp size_a = 1; int_tp size_b = 1; for (int_tp i = 0; i < dims; ++i) { pad[i] = (shape_b[i] - shape_a[i]) / 2; size_a *= shape_a[i]; size_b *= shape_b[i]; } for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp batch_id = index / ((channels_a + channels_b) * size_a); int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) / (channels_a * size_a)) % 2; int_tp counter = index; for (int_tp i = dims - 1; i >= 0; --i) { tmp_idx[i] = counter % shape_a[i]; counter /= shape_a[i]; } if (bottom_id == 0) { int_tp channel_id = (index / size_a) % channels_a; int_tp aidx = batch_id * channels_a + channel_id; for (int_tp i = 0; i < dims; ++i) { aidx *= shape_a[i]; aidx += tmp_idx[i]; } bottom_a[aidx] = (backward_a == 1) ? top[index] : 0; } else { int_tp channel_id = (index / size_a) % channels_b; int_tp bidx = (batch_id * channels_b + channel_id) * size_b; int_tp btemp = 1; for (int_tp i = dims - 1; i >= 0; --i) { bidx += btemp * (tmp_idx[i] + pad[i]); btemp *= shape_b[i]; } bottom_b[bidx] = (backward_b == 1) ? 
top[index] : 0; } } } __kernel void TEMPLATE(merge_copy_forward_add, Dtype)(const int_tp nthreads, const int_tp dims, __global const Dtype* bottom_a, const int_tp forward_a, __global const Dtype* bottom_b, const int_tp forward_b, __global Dtype* top, const int_tp num, const int_tp channels, __global const int_tp* shape_a, __global const int_tp* shape_b) { int_tp pad[6]; int_tp tmp_idx[6]; int_tp size_a = 1; int_tp size_b = 1; for (int_tp i = 0; i < dims; ++i) { pad[i] = (shape_b[i] - shape_a[i]) / 2; size_a *= shape_a[i]; size_b *= shape_b[i]; } for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp batch_id = index / (channels * size_a); int_tp counter = index; for (int_tp i = dims - 1; i >= 0; --i) { tmp_idx[i] = counter % shape_a[i]; counter /= shape_a[i]; } top[index] = 0; int_tp channel_id = (index / size_a) % channels; int_tp aidx = batch_id * channels + channel_id; for (int_tp i = 0; i < dims; ++i) { aidx *= shape_a[i]; aidx += tmp_idx[i]; } top[index] = forward_a ? top[index] + bottom_a[aidx] : top[index]; int_tp bidx = (batch_id * channels + channel_id) * size_b; int_tp btemp = 1; for (int_tp i = dims - 1; i >= 0; --i) { bidx += btemp * (tmp_idx[i] + pad[i]); btemp *= shape_b[i]; } top[index] = forward_b ? 
top[index] + bottom_b[bidx] : top[index]; } } __kernel void TEMPLATE(merge_copy_backward_add,Dtype)(const int_tp nthreads, const int_tp dims, __global Dtype* bottom_a, const int_tp backward_a, __global Dtype* bottom_b, const int_tp backward_b, __global const Dtype* top, const int_tp num, const int_tp channels, __global const int_tp* shape_a, __global const int_tp* shape_b) { int_tp pad[6]; int_tp tmp_idx[6]; int_tp size_a = 1; int_tp size_b = 1; for (int_tp i = 0; i < dims; ++i) { pad[i] = (shape_b[i] - shape_a[i]) / 2; size_a *= shape_a[i]; size_b *= shape_b[i]; } for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp batch_id = index / (channels * size_a); int_tp counter = index; for (int_tp i = dims - 1; i >= 0; --i) { tmp_idx[i] = counter % shape_a[i]; counter /= shape_a[i]; } int_tp channel_id = (index / size_a) % channels; int_tp aidx = batch_id * channels + channel_id; for (int_tp i = 0; i < dims; ++i) { aidx *= shape_a[i]; aidx += tmp_idx[i]; } bottom_a[aidx] = backward_a ? top[index] : 0; int_tp bidx = (batch_id * channels + channel_id) * size_b; int_tp btemp = 1; for (int_tp i = dims - 1; i >= 0; --i) { bidx += btemp * (tmp_idx[i] + pad[i]); btemp *= shape_b[i]; } bottom_b[bidx] = backward_b ? 
top[index] : 0; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(max_pool_forward,Dtype)( const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* top_data, const int use_mask, __global int_tp* mask, __global Dtype* top_mask) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp pw = index % pooled_width; const int_tp ph = (index / pooled_width) % pooled_height; const int_tp c = (index / pooled_width / pooled_height) % channels; const int_tp n = index / pooled_width / pooled_height / channels; int_tp hstart = ph * stride_h - pad_h; int_tp wstart = pw * stride_w - pad_w; const int_tp hend = min(hstart + kernel_h, height); const int_tp wend = min(wstart + kernel_w, width); hstart = max(hstart, (int_tp)0); wstart = max(wstart, (int_tp)0); Dtype maxval = -FLT_MAX; int_tp maxidx = -1; __global const Dtype* bottom_slice = bottom_data + (n * channels + c) * height * width; for (int_tp h = hstart; h < hend; ++h) { for (int_tp w = wstart; w < wend; ++w) { if (bottom_slice[h * width + w] > maxval) { maxidx = h * width + w; maxval = bottom_slice[maxidx]; } } } top_data[index] = maxval; if (use_mask == 1) { mask[index] = maxidx; } else { top_mask[index] = maxidx; } } } __kernel void TEMPLATE(ave_pool_forward,Dtype)( const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* top_data) { for (int_tp index = get_global_id(0); index < 
nthreads; index += get_global_size(0)) { { const int_tp pw = index % pooled_width; const int_tp ph = (index / pooled_width) % pooled_height; const int_tp c = (index / pooled_width / pooled_height) % channels; const int_tp n = index / pooled_width / pooled_height / channels; int_tp hstart = ph * stride_h - pad_h; int_tp wstart = pw * stride_w - pad_w; int_tp hend = min(hstart + kernel_h, height + pad_h); int_tp wend = min(wstart + kernel_w, width + pad_w); const int_tp pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, (int_tp)0); wstart = max(wstart, (int_tp)0); hend = min(hend, height); wend = min(wend, width); Dtype aveval = 0; __global const Dtype* bottom_slice = bottom_data + (n * channels + c) * height * width; for (int_tp h = hstart; h < hend; ++h) { for (int_tp w = wstart; w < wend; ++w) { aveval += bottom_slice[h * width + w]; } } top_data[index] = aveval / pool_size; } } } __kernel void TEMPLATE(sto_pool_forward_train,Dtype)( const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, __global Dtype* rand_idx, __global Dtype* top_data) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp pw = index % pooled_width; const int_tp ph = (index / pooled_width) % pooled_height; const int_tp c = (index / pooled_width / pooled_height) % channels; const int_tp n = index / pooled_width / pooled_height / channels; const int_tp hstart = ph * stride_h; const int_tp hend = min(hstart + kernel_h, height); const int_tp wstart = pw * stride_w; const int_tp wend = min(wstart + kernel_w, width); Dtype cumsum = 0.; __global const Dtype* bottom_slice = bottom_data + (n * channels + c) * height * width; // First pass: get sum for (int_tp h = hstart; h < hend; ++h) { for (int_tp w = wstart; w 
< wend; ++w) { cumsum += bottom_slice[h * width + w]; } } const float thres = rand_idx[index] * cumsum; // Second pass: get value, and set index. cumsum = 0; for (int_tp h = hstart; h < hend; ++h) { for (int_tp w = wstart; w < wend; ++w) { cumsum += bottom_slice[h * width + w]; if (cumsum >= thres) { rand_idx[index] = ((n * channels + c) * height + h) * width + w; top_data[index] = bottom_slice[h * width + w]; h = hend; w = wend; } } } } } __kernel void TEMPLATE(sto_pool_forward_test,Dtype)( const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, __global Dtype* top_data) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp pw = index % pooled_width; const int_tp ph = (index / pooled_width) % pooled_height; const int_tp c = (index / pooled_width / pooled_height) % channels; const int_tp n = index / pooled_width / pooled_height / channels; const int_tp hstart = ph * stride_h; const int_tp hend = min(hstart + kernel_h, height); const int_tp wstart = pw * stride_w; const int_tp wend = min(wstart + kernel_w, width); // We set cumsum to be 0 to avoid divide-by-zero problems Dtype cumsum = FLT_MIN; Dtype cumvalues = 0.; __global const Dtype* bottom_slice = bottom_data + (n * channels + c) * height * width; // First pass: get sum for (int_tp h = hstart; h < hend; ++h) { for (int_tp w = wstart; w < wend; ++w) { cumsum += bottom_slice[h * width + w]; cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; } } top_data[index] = cumvalues / cumsum; } } __kernel void TEMPLATE(max_pool_backward,Dtype)(const int_tp nthreads, __global const Dtype* top_diff, const int use_mask, __global const int_tp* mask, __global const Dtype* top_mask, const int_tp num, const int_tp channels, const 
int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local index // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp c = (index / width / height) % channels; const int_tp n = index / width / height / channels; const int_tp phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; const int_tp phend = min((h + pad_h) / stride_h + 1, pooled_height); const int_tp pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; const int_tp pwend = min((w + pad_w) / stride_w + 1, pooled_width); Dtype gradient = 0; const int_tp offset = (n * channels + c) * pooled_height * pooled_width; __global const Dtype* top_diff_slice = top_diff + offset; if (use_mask == 1) { __global const int_tp* mask_slice = mask + offset; for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { if (mask_slice[ph * pooled_width + pw] == h * width + w) { gradient += top_diff_slice[ph * pooled_width + pw]; } } } } else { __global const Dtype* top_mask_slice = top_mask + offset; for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { gradient += top_diff_slice[ph * pooled_width + pw]; } } } } bottom_diff[index] = gradient; } } __kernel void TEMPLATE(ave_pool_backward,Dtype)(const int_tp nthreads, __global const Dtype* top_diff, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, 
const int_tp pad_w, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local index // find out the local offset const int_tp w = index % width + pad_w; const int_tp h = (index / width) % height + pad_h; const int_tp c = (index / width / height) % channels; const int_tp n = index / width / height / channels; const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; const int_tp phend = min(h / stride_h + 1, pooled_height); const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; const int_tp pwend = min(w / stride_w + 1, pooled_width); Dtype gradient = 0.0; __global const Dtype* const top_diff_slice = top_diff + (n * channels + c) * pooled_height * pooled_width; for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size int_tp hstart = ph * stride_h - pad_h; int_tp wstart = pw * stride_w - pad_w; int_tp hend = min(hstart + kernel_h, height + pad_h); int_tp wend = min(wstart + kernel_w, width + pad_w); int_tp pool_size = (hend - hstart) * (wend - wstart); gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; } } bottom_diff[index] = gradient; } } __kernel void TEMPLATE(sto_pool_backward,Dtype)( const int_tp nthreads, __global const Dtype* rand_idx, __global const Dtype* const top_diff, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local index // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp c = (index / width / height) % channels; const int_tp n = index / width / height / channels; const int_tp 
phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; const int_tp phend = min(h / stride_h + 1, pooled_height); const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; const int_tp pwend = min(w / stride_w + 1, pooled_width); Dtype gradient = 0.0; __global const Dtype* rand_idx_slice = rand_idx + (n * channels + c) * pooled_height * pooled_width; __global const Dtype* top_diff_slice = top_diff + (n * channels + c) * pooled_height * pooled_width; for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { gradient += top_diff_slice[ph * pooled_width + pw] * (index == (int_tp) (rand_idx_slice[ph * pooled_width + pw])?1.0:0.0); } } bottom_diff[index] = gradient; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(max_pool_forward_nd, Dtype)(const int_tp n, const int_tp num_axes, __global const Dtype* bottom_data, const int_tp channels, __global const int_tp* size, __global const int_tp* pooled_size, __global const int_tp* kernel_size, __global const int_tp* ext_kernel_size, __global const int_tp* stride, __global const int_tp* dilation, __global const int_tp* pad, __global Dtype* top_data, const int use_mask, __global int_tp* mask, __global Dtype* top_mask) { int_tp d_idx[6]; int_tp d_start[6]; int_tp d_end[6]; int_tp d_iter[6]; int_tp i; for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { int_tp offset = 1; int_tp num = index; bool do_continue = false; for (i = num_axes - 1; i >= 0; --i) { d_idx[i] = num % pooled_size[i]; d_start[i] = d_idx[i] * stride[i] - pad[i]; d_end[i] = min(d_start[i] + ext_kernel_size[i], size[i]); while (d_start[i] < 0) { d_start[i] += dilation[i]; } num /= pooled_size[i]; offset *= size[i]; d_iter[i] = d_start[i]; if (d_start[i] >= d_end[i]) { top_data[index] = -FLT_MAX; if (use_mask) { mask[index] = -1; } else { top_mask[index] = -1; } do_continue = true; } } if(do_continue) { continue; } int_tp chan = num % channels; num /= 
channels; offset *= (num * channels + chan); Dtype maxval = -FLT_MAX; int_tp maxidx = -1; int_tp final_offset = 0; bool incremented; do { final_offset = 0; int_tp size_prod = 1; for (i = num_axes - 1; i >= 0; --i) { final_offset += d_iter[i] * size_prod; size_prod *= size[i]; } if (bottom_data[final_offset + offset] > maxval) { maxidx = final_offset; maxval = bottom_data[offset + final_offset]; } incremented = false; for (i = num_axes - 1; i >= 0; --i) { if (d_iter[i] >= d_end[i] - dilation[i]) { d_iter[i] = d_start[i]; } else { d_iter[i] += dilation[i]; incremented = true; break; } } } while (incremented); top_data[index] = maxval; if (use_mask == 1) { mask[index] = maxidx; } else { top_mask[index] = maxidx; } } } __kernel void TEMPLATE(max_pool_backward_nd, Dtype)(const int_tp n, const int_tp num_axes, __global const Dtype* top_diff, const int use_mask, __global const int_tp* mask, __global const Dtype* top_mask, const int_tp channels, __global const int_tp* size, __global const int_tp* pooled_size, __global const int_tp* kernel_size, __global const int_tp* ext_kernel_size, __global const int_tp* stride, __global const int_tp* dilation, __global const int_tp* pad, __global Dtype* bottom_diff) { int_tp d_idx[6]; int_tp d_start[6]; int_tp d_end[6]; int_tp d_iter[6]; int_tp i; for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { // find out the local index // find out the local offset int_tp offset = 1; int_tp num = index; bool do_continue = false; for (i = num_axes - 1; i >= 0; --i) { d_idx[i] = num % size[i]; d_start[i] = (d_idx[i] + pad[i] < ext_kernel_size[i]) ? 
0 : (d_idx[i] + pad[i] - ext_kernel_size[i]) / stride[i] + 1; d_end[i] = min((int_tp) ((d_idx[i] + pad[i]) / stride[i]), (int_tp) (pooled_size[i] - 1)); num /= size[i]; offset *= pooled_size[i]; d_iter[i] = d_start[i]; if (d_start[i] > d_end[i]) { bottom_diff[index] = 0; do_continue = true; } } if (do_continue) { continue; } int_tp chan = num % channels; num /= channels; offset *= (num * channels + chan); Dtype gradient = 0.0; int_tp final_offset = 0; int_tp im_offset = 0; bool incremented; do { final_offset = offset; im_offset = 0; int_tp size_prod = 1; int_tp pooled_size_prod = 1; for (i = num_axes - 1; i >= 0; --i) { final_offset += d_iter[i] * pooled_size_prod; im_offset += d_idx[i] * size_prod; size_prod *= size[i]; pooled_size_prod *= pooled_size[i]; } if (use_mask) { if (mask[final_offset] == im_offset) { gradient += top_diff[final_offset]; } } else { if (top_mask[final_offset] == im_offset) { gradient += top_diff[final_offset]; } } incremented = false; for (i = num_axes - 1; i >= 0; --i) { if (d_iter[i] >= d_end[i]) { d_iter[i] = d_start[i]; } else { ++d_iter[i]; incremented = true; break; } } } while (incremented); bottom_diff[index] = gradient; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int_tp nthreads, __global Dtype* bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* top_data, const int use_mask, __global int_tp* mask, __global Dtype* top_mask) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp pw = index % pooled_width; int_tp ph = (index / pooled_width) % pooled_height; int_tp c = (index / 
pooled_width / pooled_height) % channels; int_tp n = index / pooled_width / pooled_height / channels; int_tp hstart = ph * stride_h - pad_h; int_tp wstart = pw * stride_w - pad_w; int_tp hend = min(hstart + ext_kernel_h, height); int_tp wend = min(wstart + ext_kernel_w, width); while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } Dtype maxval = -FLT_MAX; int_tp maxidx = -1; __global Dtype* bottom_data_ptr = bottom_data + (n * channels + c) * height * width; for (int_tp h = hstart; h < hend; h += dilation_h) { for (int_tp w = wstart; w < wend; w += dilation_w) { if (bottom_data_ptr[h * width + w] > maxval) { maxidx = h * width + w; maxval = bottom_data_ptr[maxidx]; } } } top_data[index] = maxval; if (use_mask == 1) { mask[index] = maxidx; } else { top_mask[index] = maxidx; } } } __kernel void TEMPLATE(max_pool_backward_sk,Dtype)( const int_tp nthreads, __global const Dtype* top_diff, const int use_mask, __global const int_tp* mask, __global const Dtype* top_mask, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { __global const int_tp* mask_ptr = mask; __global const Dtype* top_diff_ptr = top_diff; // find out the local index // find out the local offset int_tp w = index % width; int_tp h = (index / width) % height; int_tp c = (index / width / height) % channels; int_tp n = index / width / height / channels; int_tp phstart = (h + pad_h < ext_kernel_h) ? 
0 : (h + pad_h - ext_kernel_h) / stride_h + 1; int_tp phend = min(((h + pad_h) / stride_h + 1), pooled_height); int_tp pwstart = (w + pad_w < ext_kernel_w) ? 0 : (w + pad_w - ext_kernel_w) / stride_w + 1; int_tp pwend = min(((w + pad_w) / stride_w + 1), pooled_width); Dtype gradient = 0.0; int_tp offset = (n * channels + c) * pooled_height * pooled_width; top_diff_ptr += offset; if (use_mask == 1) { mask_ptr += offset; for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { if (mask_ptr[ph * pooled_width + pw] == h * width + w) { gradient += top_diff_ptr[ph * pooled_width + pw]; } } } } else { for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { if (top_mask[ph * pooled_width + pw] == h * width + w) { gradient += top_diff_ptr[ph * pooled_width + pw]; } } } } bottom_diff[index] = gradient; } } __kernel void TEMPLATE(ave_pool_forward_sk,Dtype)( const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* top_data) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp pool_size = 0; int_tp pw = index % pooled_width; int_tp ph = (index / pooled_width) % pooled_height; int_tp c = (index / pooled_width / pooled_height) % channels; int_tp n = index / pooled_width / pooled_height / channels; int_tp hstart = ph * stride_h - pad_h; int_tp wstart = pw * stride_w - pad_w; int_tp hend = hstart + ext_kernel_h; int_tp wend = wstart + ext_kernel_w; // Overspill over the image + pad does // not contribute to pool size while (hend > height + pad_h) { hend -= dilation_h; } while (wend > 
width + pad_w) { wend -= dilation_w; } Dtype aveval = 0; __global const Dtype* bottom_data_ptr = bottom_data; bottom_data_ptr += (n * channels + c) * height * width; for (int_tp h = hstart; h < hend; h += dilation_h) { for (int_tp w = wstart; w < wend; w += dilation_w) { if (h >= 0 && h < height && w >= 0 && w < width) { aveval += bottom_data_ptr[h * width + w]; } ++pool_size; } } top_data[index] = aveval / pool_size; } } __kernel void TEMPLATE(ave_pool_backward_sk,Dtype)(const int_tp nthreads, __global const Dtype* top_diff, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, const int_tp pad_w, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { // find out the local index // find out the local offset const int_tp w = index % width; const int_tp h = (index / width) % height; const int_tp c = (index / width / height) % channels; const int_tp n = index / width / height / channels; int_tp phstart = (h + pad_h < ext_kernel_h) ? 0 : (h + pad_h - ext_kernel_h) / stride_h + 1; int_tp phend = min(((h + pad_h) / stride_h + 1), pooled_height); int_tp pwstart = (w + pad_w < ext_kernel_w) ? 
0 : (w + pad_w - ext_kernel_w) / stride_w + 1; int_tp pwend = min(((w + pad_w) / stride_w + 1), pooled_width); Dtype gradient = 0.0; __global const Dtype* const top_diff_slice = top_diff + (n * channels + c) * pooled_height * pooled_width; for (int_tp ph = phstart; ph < phend; ++ph) { for (int_tp pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size int_tp hstart = ph * stride_h - pad_h; int_tp wstart = pw * stride_w - pad_w; int_tp hend = min(hstart + ext_kernel_h, height + pad_h); int_tp wend = min(wstart + ext_kernel_w, width + pad_w); int_tp pool_size = ((hend - hstart - 1) / dilation_h + 1) * ((wend - wstart - 1) / dilation_w + 1); if (h >= hstart && h < hend && (h - hstart) % dilation_h == 0 && w >= wstart && w < wend && (w - wstart) % dilation_w == 0) { gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; } } } bottom_diff[index] = gradient; } } __kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)( const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, __global Dtype* rand_idx, __global Dtype* top_data) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp pw = index % pooled_width; int_tp ph = (index / pooled_width) % pooled_height; int_tp c = (index / pooled_width / pooled_height) % channels; int_tp n = index / pooled_width / pooled_height / channels; int_tp hstart = ph * stride_h; int_tp hend = min(hstart + ext_kernel_h, height); int_tp wstart = pw * stride_w; int_tp wend = min(wstart + ext_kernel_w, width); Dtype cumsum = 0.; __global const Dtype* bottom_data_ptr = bottom_data; bottom_data_ptr += (n * channels + c) * height * width; // First pass: get sum for 
(int_tp h = hstart; h < hend; h += dilation_h) { for (int_tp w = wstart; w < wend; w += dilation_w) { cumsum += bottom_data_ptr[h * width + w]; } } float thres = rand_idx[index] * cumsum; // Second pass: get value, and set index. cumsum = 0; for (int_tp h = hstart; h < hend; h += dilation_h) { for (int_tp w = wstart; w < wend; w += dilation_w) { cumsum += bottom_data_ptr[h * width + w]; if (cumsum >= thres) { rand_idx[index] = ((n * channels + c) * height + h) * width + w; top_data[index] = bottom_data_ptr[h * width + w]; h = hend; w = wend; } } } } } __kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)( const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, const int_tp channels, const int_tp height, const int_tp width, const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, __global Dtype* top_data) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { int_tp pw = index % pooled_width; int_tp ph = (index / pooled_width) % pooled_height; int_tp c = (index / pooled_width / pooled_height) % channels; int_tp n = index / pooled_width / pooled_height / channels; int_tp hstart = ph * stride_h; int_tp hend = min(hstart + ext_kernel_h, height); int_tp wstart = pw * stride_w; int_tp wend = min(wstart + ext_kernel_w, width); // We set cumsum to be 0 to avoid divide-by-zero problems Dtype cumsum = FLT_MIN; Dtype cumvalues = 0.; __global const Dtype* bottom_data_ptr = bottom_data; bottom_data_ptr += (n * channels + c) * height * width; // First pass: get sum for (int_tp h = hstart; h < hend; h += dilation_h) { for (int_tp w = wstart; w < wend; w += dilation_w) { cumsum += bottom_data_ptr[h * width + w]; cumvalues += bottom_data_ptr[h * width + w] * bottom_data_ptr[h * width + w]; } } top_data[index] = cumvalues / cumsum; } } 
#ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(slice,Dtype)(const int_tp nthreads, __global const Dtype* in_data, const int forward, const int_tp num_slices, const int_tp slice_size, const int_tp bottom_slice_axis, const int_tp top_slice_axis, const int_tp offset_slice_axis, __global Dtype* out_data) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp total_slice_size = slice_size * top_slice_axis; const int_tp slice_num = index / total_slice_size; const int_tp slice_index = index % total_slice_size; const int_tp bottom_index = slice_index + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; if (forward == 1) { out_data[index] = in_data[bottom_index]; } else { out_data[bottom_index] = in_data[index]; } } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif #ifndef __OPENCL_VERSION__ #include "header.cl" #endif #if defined(cl_intel_subgroups) #pragma OPENCL EXTENSION cl_intel_subgroups : enable __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int_tp num, const int_tp channels, const int_tp spatial_dim, __global Dtype* scale, __global const Dtype* data, __global Dtype* out, __local Dtype *out_tmp, __local Dtype *scale_tmp, __local Dtype *group_tmp) { int_tp n = get_global_id(1); for (int_tp index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) { float maxval = -FLT_MAX; for (int_tp c = get_global_id(0); c < channels; c += get_global_size(0)) { Dtype tmp = data[(n * channels + c) * spatial_dim + s]; maxval = max((Dtype)tmp, (Dtype)maxval); } maxval = sub_group_reduce_max(maxval); //if (get_sub_group_local_id() == 0) group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += get_global_size(0)) { int_tp s = index / get_max_sub_group_size(); Dtype maxval = 
sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); //if (get_sub_group_local_id() == 0) scale_tmp[s] = maxval; } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < channels * spatial_dim; index += get_global_size(0)) { int_tp s = index % spatial_dim; out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]); } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) { Dtype sum = 0; for (int_tp c = get_global_id(0); c < channels; c += get_global_size(0)) { sum += out_tmp[c * spatial_dim + s]; } sum = sub_group_reduce_add(sum); group_tmp[get_sub_group_id() * spatial_dim + s] = sum; } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += get_global_size(0)) { int_tp s = index / get_max_sub_group_size(); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); //if (get_sub_group_local_id() == 0) scale_tmp[s] = sum; } barrier(CLK_LOCAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < channels * spatial_dim; index += get_global_size(0)) { int_tp s = index % spatial_dim; out[n * channels * spatial_dim + index] = out_tmp[index] / scale_tmp[s]; } } __kernel void TEMPLATE(softmax_forward,Dtype)(const int_tp num, const int_tp channels, const int_tp spatial_dim, __global Dtype* scale, __global const Dtype* data, __global Dtype* out) { int_tp n = get_global_id(1); __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim; for (int_tp index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) { float maxval = -FLT_MAX; for (int_tp c = get_global_id(0); c < channels; c += get_global_size(0)) { Dtype tmp = data[(n * channels + c) * spatial_dim + s]; maxval = max((Dtype)tmp, (Dtype)maxval); } maxval = 
sub_group_reduce_max(maxval); //if (get_sub_group_local_id() == 0) group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; } barrier(CLK_GLOBAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += get_global_size(0)) { int_tp s = index / get_max_sub_group_size(); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); //if (get_sub_group_local_id() == 0) scale[n * spatial_dim + s] = maxval; } barrier(CLK_GLOBAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < channels * spatial_dim; index += get_global_size(0)) { int_tp s = index % spatial_dim; out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]); } barrier(CLK_GLOBAL_MEM_FENCE); for (int_tp index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += get_global_size(0), ++s) { Dtype sum = 0; for (int_tp c = get_global_id(0); c < channels; c += get_global_size(0)) { sum += out[n * channels * spatial_dim + c * spatial_dim + s]; } sum = sub_group_reduce_add(sum); group_tmp[get_sub_group_id() * spatial_dim + s] = sum; } barrier(CLK_GLOBAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += get_global_size(0)) { int_tp s = index / get_max_sub_group_size(); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); //if (get_sub_group_local_id() == 0) scale[n * spatial_dim + s] = sum; } barrier(CLK_GLOBAL_MEM_FENCE); for (int_tp index = get_global_id(0); index < channels * spatial_dim; index += get_global_size(0)) { int_tp s = index % spatial_dim; out[n * channels * spatial_dim + index] /= scale[n * spatial_dim + s]; } } // Copied from caffe.pb.h, must keep consistent with the original definition #ifndef __SOFTMAX_LOSS_CL__ #define __SOFTMAX_LOSS_CL__ enum LossParameter_NormalizationMode { LossParameter_NormalizationMode_FULL = 0, 
LossParameter_NormalizationMode_VALID = 1, LossParameter_NormalizationMode_BATCH_SIZE = 2, LossParameter_NormalizationMode_NONE = 3 }; #endif // Copied from softmax_loss_layer.cpp, must keep consistent with the original implementation Dtype TEMPLATE(get_normalizer, Dtype)( enum LossParameter_NormalizationMode normalization_mode, int_tp valid_count, int_tp outer_num_, int_tp inner_num_) { Dtype normalizer; switch (normalization_mode) { case LossParameter_NormalizationMode_FULL: normalizer = (Dtype)(outer_num_ * inner_num_); break; case LossParameter_NormalizationMode_VALID: if (valid_count == -1) { normalizer = (Dtype)(outer_num_ * inner_num_); } else { normalizer = (Dtype)(valid_count); } break; case LossParameter_NormalizationMode_BATCH_SIZE: normalizer = (Dtype)(outer_num_); break; case LossParameter_NormalizationMode_NONE: normalizer = (Dtype)(1); break; default: normalizer = (Dtype)(0); } // Some users will have no labels for some examples in order to 'turn off' a // particular loss in a multi-task setup. The max prevents NaNs in that case. 
return fmax((Dtype)(1.0), normalizer); } Dtype TEMPLATE(asum, Dtype)(int_tp n, __global const Dtype *data, __local Dtype *sum_tmp) { Dtype sum = 0; for(int_tp i = get_global_id(0); i < n; i += get_global_size(0)) { sum += data[i]; } sum = sub_group_reduce_add(sum); sum_tmp[get_sub_group_id()] = sum; barrier(CLK_LOCAL_MEM_FENCE); if (get_sub_group_id() == 0) sum = sub_group_reduce_add(sum_tmp[get_sub_group_local_id()]); return sum; } __kernel void TEMPLATE(softmax_loss_forward_asum, Dtype)( int_tp n, int_tp outer_num_, int_tp inner_num_, int_tp compute_count_sum, int_tp normalization_type, __global const Dtype *loss, __global const Dtype *counts, __global Dtype *out) { __local Dtype sum_tmp[16]; Dtype loss_sum = TEMPLATE(asum, Dtype)(n, loss, sum_tmp); Dtype counts_sum = -1; if (compute_count_sum) counts_sum = TEMPLATE(asum, Dtype)(n, counts, sum_tmp); if (get_global_id(0) == 0) out[0] = loss_sum / TEMPLATE(get_normalizer, Dtype)(normalization_type, counts_sum, outer_num_, inner_num_); } #endif __kernel void TEMPLATE(softmax_loss_forward,Dtype)( int_tp n, __global const Dtype* prob_data, __global const Dtype* label, __global Dtype* loss, const int_tp num, const int_tp dim, const int_tp spatial_dim, const int has_ignore_label_, const int_tp ignore_label_, __global Dtype* counts) { for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { const int_tp n = index / spatial_dim; const int_tp s = index % spatial_dim; const int_tp label_value = (int_tp) (label[n * spatial_dim + s]); if (has_ignore_label_ == 1 && label_value == ignore_label_) { loss[index] = 0; counts[index] = 0; } else { loss[index] = -log((Dtype)( max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]), (Dtype) FLT_MIN))); counts[index] = 1; } } } __kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int_tp nthreads, __global const Dtype* top, __global const Dtype* label, __global Dtype* bottom_diff, const int_tp num, const int_tp dim, const int_tp spatial_dim, const int 
has_ignore_label_, const int_tp ignore_label_, __global Dtype* counts) { const int_tp channels = dim / spatial_dim; for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp n = index / spatial_dim; const int_tp s = index % spatial_dim; const int_tp label_value = (int_tp) (label[n * spatial_dim + s]); if (has_ignore_label_ == 1 && label_value == ignore_label_) { for (int_tp c = 0; c < channels; ++c) { bottom_diff[n * dim + c * spatial_dim + s] = 0; } counts[index] = 0; } else { bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; counts[index] = 1; } } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(ada_delta_update,Dtype)(int_tp N, __global Dtype* g, __global Dtype* h, __global Dtype* h2, Dtype momentum, Dtype delta, Dtype local_rate) { for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { Dtype gi = g[i]; Dtype hi = h[i] = momentum * h[i] + (1.0 - momentum) * gi * gi; gi = gi * sqrt((h2[i] + delta) / (hi + delta)); h2[i] = momentum * h2[i] + (1.0 - momentum) * gi * gi; g[i] = local_rate * gi; } } __kernel void TEMPLATE(ada_grad_update,Dtype)(int_tp N, __global Dtype* g, __global Dtype* h, Dtype delta, Dtype local_rate) { for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { Dtype gi = g[i]; Dtype hi = h[i] = h[i] + gi * gi; g[i] = local_rate * gi / (sqrt(hi) + delta); } } __kernel void TEMPLATE(adam_update,Dtype)(int_tp N, __global Dtype* g, __global Dtype* m, __global Dtype* v, Dtype beta1, Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate) { for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { Dtype gi = g[i]; Dtype mi = m[i] = m[i] * beta1 + gi * (1 - beta1); Dtype vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat); } } __kernel void TEMPLATE(nesterov_update,Dtype)(int_tp N, __global Dtype* g, __global Dtype* h, Dtype momentum, Dtype local_rate) { for (int_tp i = get_global_id(0); 
i < N; i += get_global_size(0)) { Dtype hi = h[i]; Dtype hi_new = h[i] = momentum * hi + local_rate * g[i]; g[i] = (1 + momentum) * hi_new - momentum * hi; } } __kernel void TEMPLATE(rms_prop_update,Dtype)(int_tp N, __global Dtype* g, __global Dtype* h, Dtype rms_decay, Dtype delta, Dtype local_rate) { for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { Dtype gi = g[i]; Dtype hi = h[i] = rms_decay * h[i] + (1 - rms_decay) * gi * gi; g[i] = local_rate * g[i] / (sqrt(hi) + delta); } } __kernel void TEMPLATE(sgd_update,Dtype)(int_tp N, __global Dtype* g, __global Dtype* h, Dtype momentum, Dtype local_rate) { for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { g[i] = h[i] = momentum * h[i] + local_rate * g[i]; } } #ifndef __OPENCL_VERSION__ #include "header.cl" #endif __kernel void TEMPLATE(tile,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, const int_tp tile_size, const int_tp num_tiles, const int_tp bottom_tile_axis, __global Dtype* top_data) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp d = index % tile_size; const int_tp b = (index / tile_size / num_tiles) % bottom_tile_axis; const int_tp n = index / tile_size / num_tiles / bottom_tile_axis; const int_tp bottom_index = (n * bottom_tile_axis + b) * tile_size + d; top_data[index] = bottom_data[bottom_index]; } } __kernel void TEMPLATE(tile_backward,Dtype)(const int_tp nthreads, __global const Dtype* top_diff, const int_tp tile_size, const int_tp num_tiles, const int_tp bottom_tile_axis, __global Dtype* bottom_diff) { for (int_tp index = get_global_id(0); index < nthreads; index += get_global_size(0)) { const int_tp d = index % tile_size; const int_tp b = (index / tile_size) % bottom_tile_axis; const int_tp n = index / tile_size / bottom_tile_axis; bottom_diff[index] = 0; int_tp top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d; for (int_tp t = 0; t < num_tiles; ++t) { bottom_diff[index] += 
top_diff[top_index]; top_index += bottom_tile_axis * tile_size; } } } #endif // DOUBLE_SUPPORT_AVAILABLE I0216 16:14:28.957569 2475 common.cpp:542] OpenCL platform: Intel(R) Corporation: OpenCL 2.0 does not work correctly. I0216 16:14:28.963583 2475 net.cpp:57] Initializing net from parameters: name: "AlexNet" state { phase: TRAIN level: 0 stage: "" } layer { name: "data" type: "Input" top: "data" input_param { shape { dim: 10 dim: 3 dim: 227 dim: 227 } } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 11 stride: 4 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0001 beta: 0.75 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 2 kernel_size: 5 group: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0001 beta: 0.75 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 384 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 384 pad: 1 kernel_size: 3 group: 2 } } layer { name: "relu4" type: 
"ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 group: 2 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } layer { name: "pool5" type: "Pooling" bottom: "conv5" top: "pool5" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc8" type: "InnerProduct" bottom: "fc7" top: "fc8" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 1000 } } layer { name: "prob" type: "Softmax" bottom: "fc8" top: "prob" } I0216 16:14:28.964151 2475 layer_factory.cpp:67] Creating layer data I0216 16:14:28.964184 2475 net.cpp:96] Creating Layer data I0216 16:14:28.964231 2475 net.cpp:413] data -> data I0216 16:14:28.964299 2475 net.cpp:134] Setting up data I0216 16:14:28.964323 2475 net.cpp:142] Top shape: 10 3 227 227 (1545870) I0216 16:14:28.964341 2475 layer_factory.cpp:67] Creating layer conv1 I0216 16:14:28.964385 2475 net.cpp:96] Creating Layer conv1 I0216 16:14:28.964396 2475 net.cpp:444] conv1 <- data I0216 16:14:28.964412 2475 net.cpp:413] conv1 -> conv1 I0216 16:14:28.964947 2475 net.cpp:134] Setting up conv1 I0216 16:14:28.964972 
2475 net.cpp:142] Top shape: 10 96 55 55 (2904000) I0216 16:14:28.965003 2475 layer_factory.cpp:67] Creating layer relu1 I0216 16:14:28.965018 2475 net.cpp:96] Creating Layer relu1 I0216 16:14:28.965026 2475 net.cpp:444] relu1 <- conv1 I0216 16:14:28.965044 2475 net.cpp:400] relu1 -> conv1 (in-place) I0216 16:14:28.965067 2475 net.cpp:134] Setting up relu1 I0216 16:14:28.965080 2475 net.cpp:142] Top shape: 10 96 55 55 (2904000) I0216 16:14:28.965090 2475 layer_factory.cpp:67] Creating layer norm1 I0216 16:14:28.965102 2475 net.cpp:96] Creating Layer norm1 I0216 16:14:28.965108 2475 net.cpp:444] norm1 <- conv1 I0216 16:14:28.965118 2475 net.cpp:413] norm1 -> norm1 I0216 16:14:28.965152 2475 net.cpp:134] Setting up norm1 I0216 16:14:28.965167 2475 net.cpp:142] Top shape: 10 96 55 55 (2904000) I0216 16:14:28.965180 2475 layer_factory.cpp:67] Creating layer pool1 I0216 16:14:28.965198 2475 net.cpp:96] Creating Layer pool1 I0216 16:14:28.965209 2475 net.cpp:444] pool1 <- norm1 I0216 16:14:28.965219 2475 net.cpp:413] pool1 -> pool1 I0216 16:14:29.129166 2475 net.cpp:134] Setting up pool1 I0216 16:14:29.129186 2475 net.cpp:142] Top shape: 10 96 27 27 (699840) I0216 16:14:29.129197 2475 layer_factory.cpp:67] Creating layer conv2 I0216 16:14:29.129207 2475 net.cpp:96] Creating Layer conv2 I0216 16:14:29.129210 2475 net.cpp:444] conv2 <- pool1 I0216 16:14:29.129216 2475 net.cpp:413] conv2 -> conv2 I0216 16:14:29.129384 2475 net.cpp:134] Setting up conv2 I0216 16:14:29.129390 2475 net.cpp:142] Top shape: 10 256 27 27 (1866240) I0216 16:14:29.129397 2475 layer_factory.cpp:67] Creating layer relu2 I0216 16:14:29.129402 2475 net.cpp:96] Creating Layer relu2 I0216 16:14:29.129405 2475 net.cpp:444] relu2 <- conv2 I0216 16:14:29.129407 2475 net.cpp:400] relu2 -> conv2 (in-place) I0216 16:14:29.129411 2475 net.cpp:134] Setting up relu2 I0216 16:14:29.129415 2475 net.cpp:142] Top shape: 10 256 27 27 (1866240) I0216 16:14:29.129418 2475 layer_factory.cpp:67] Creating layer norm2 I0216 
16:14:29.129422 2475 net.cpp:96] Creating Layer norm2 I0216 16:14:29.129425 2475 net.cpp:444] norm2 <- conv2 I0216 16:14:29.129427 2475 net.cpp:413] norm2 -> norm2 I0216 16:14:29.129433 2475 net.cpp:134] Setting up norm2 I0216 16:14:29.129437 2475 net.cpp:142] Top shape: 10 256 27 27 (1866240) I0216 16:14:29.129442 2475 layer_factory.cpp:67] Creating layer pool2 I0216 16:14:29.129447 2475 net.cpp:96] Creating Layer pool2 I0216 16:14:29.129451 2475 net.cpp:444] pool2 <- norm2 I0216 16:14:29.129454 2475 net.cpp:413] pool2 -> pool2 I0216 16:14:29.275784 2475 net.cpp:134] Setting up pool2 I0216 16:14:29.275802 2475 net.cpp:142] Top shape: 10 256 13 13 (432640) I0216 16:14:29.275813 2475 layer_factory.cpp:67] Creating layer conv3 I0216 16:14:29.275823 2475 net.cpp:96] Creating Layer conv3 I0216 16:14:29.275826 2475 net.cpp:444] conv3 <- pool2 I0216 16:14:29.275832 2475 net.cpp:413] conv3 -> conv3 I0216 16:14:29.276307 2475 net.cpp:134] Setting up conv3 I0216 16:14:29.276314 2475 net.cpp:142] Top shape: 10 384 13 13 (648960) I0216 16:14:29.276324 2475 layer_factory.cpp:67] Creating layer relu3 I0216 16:14:29.276329 2475 net.cpp:96] Creating Layer relu3 I0216 16:14:29.276331 2475 net.cpp:444] relu3 <- conv3 I0216 16:14:29.276335 2475 net.cpp:400] relu3 -> conv3 (in-place) I0216 16:14:29.276340 2475 net.cpp:134] Setting up relu3 I0216 16:14:29.276342 2475 net.cpp:142] Top shape: 10 384 13 13 (648960) I0216 16:14:29.276346 2475 layer_factory.cpp:67] Creating layer conv4 I0216 16:14:29.276351 2475 net.cpp:96] Creating Layer conv4 I0216 16:14:29.276371 2475 net.cpp:444] conv4 <- conv3 I0216 16:14:29.276374 2475 net.cpp:413] conv4 -> conv4 I0216 16:14:29.276732 2475 net.cpp:134] Setting up conv4 I0216 16:14:29.276739 2475 net.cpp:142] Top shape: 10 384 13 13 (648960) I0216 16:14:29.276744 2475 layer_factory.cpp:67] Creating layer relu4 I0216 16:14:29.276748 2475 net.cpp:96] Creating Layer relu4 I0216 16:14:29.276751 2475 net.cpp:444] relu4 <- conv4 I0216 16:14:29.276754 2475 
net.cpp:400] relu4 -> conv4 (in-place) I0216 16:14:29.276757 2475 net.cpp:134] Setting up relu4 I0216 16:14:29.276760 2475 net.cpp:142] Top shape: 10 384 13 13 (648960) I0216 16:14:29.276764 2475 layer_factory.cpp:67] Creating layer conv5 I0216 16:14:29.276770 2475 net.cpp:96] Creating Layer conv5 I0216 16:14:29.276772 2475 net.cpp:444] conv5 <- conv4 I0216 16:14:29.276775 2475 net.cpp:413] conv5 -> conv5 I0216 16:14:29.277026 2475 net.cpp:134] Setting up conv5 I0216 16:14:29.277031 2475 net.cpp:142] Top shape: 10 256 13 13 (432640) I0216 16:14:29.277038 2475 layer_factory.cpp:67] Creating layer relu5 I0216 16:14:29.277041 2475 net.cpp:96] Creating Layer relu5 I0216 16:14:29.277045 2475 net.cpp:444] relu5 <- conv5 I0216 16:14:29.277047 2475 net.cpp:400] relu5 -> conv5 (in-place) I0216 16:14:29.277051 2475 net.cpp:134] Setting up relu5 I0216 16:14:29.277055 2475 net.cpp:142] Top shape: 10 256 13 13 (432640) I0216 16:14:29.277058 2475 layer_factory.cpp:67] Creating layer pool5 I0216 16:14:29.277062 2475 net.cpp:96] Creating Layer pool5 I0216 16:14:29.277065 2475 net.cpp:444] pool5 <- conv5 I0216 16:14:29.277068 2475 net.cpp:413] pool5 -> pool5 I0216 16:14:29.425851 2475 net.cpp:134] Setting up pool5 I0216 16:14:29.425871 2475 net.cpp:142] Top shape: 10 256 6 6 (92160) I0216 16:14:29.425881 2475 layer_factory.cpp:67] Creating layer fc6 I0216 16:14:29.425889 2475 net.cpp:96] Creating Layer fc6 I0216 16:14:29.425894 2475 net.cpp:444] fc6 <- pool5 I0216 16:14:29.425899 2475 net.cpp:413] fc6 -> fc6 I0216 16:14:29.445935 2475 net.cpp:134] Setting up fc6 I0216 16:14:29.445955 2475 net.cpp:142] Top shape: 10 4096 (40960) I0216 16:14:29.445967 2475 layer_factory.cpp:67] Creating layer relu6 I0216 16:14:29.445974 2475 net.cpp:96] Creating Layer relu6 I0216 16:14:29.445978 2475 net.cpp:444] relu6 <- fc6 I0216 16:14:29.445982 2475 net.cpp:400] relu6 -> fc6 (in-place) I0216 16:14:29.445988 2475 net.cpp:134] Setting up relu6 I0216 16:14:29.445991 2475 net.cpp:142] Top shape: 10 
4096 (40960) I0216 16:14:29.445996 2475 layer_factory.cpp:67] Creating layer drop6 I0216 16:14:29.446005 2475 net.cpp:96] Creating Layer drop6 I0216 16:14:29.446007 2475 net.cpp:444] drop6 <- fc6 I0216 16:14:29.446010 2475 net.cpp:400] drop6 -> fc6 (in-place) I0216 16:14:29.446025 2475 net.cpp:134] Setting up drop6 I0216 16:14:29.446029 2475 net.cpp:142] Top shape: 10 4096 (40960) I0216 16:14:29.446033 2475 layer_factory.cpp:67] Creating layer fc7 I0216 16:14:29.446038 2475 net.cpp:96] Creating Layer fc7 I0216 16:14:29.446039 2475 net.cpp:444] fc7 <- fc6 I0216 16:14:29.446043 2475 net.cpp:413] fc7 -> fc7 I0216 16:14:29.453935 2475 net.cpp:134] Setting up fc7 I0216 16:14:29.453948 2475 net.cpp:142] Top shape: 10 4096 (40960) I0216 16:14:29.453958 2475 layer_factory.cpp:67] Creating layer relu7 I0216 16:14:29.453961 2475 net.cpp:96] Creating Layer relu7 I0216 16:14:29.453964 2475 net.cpp:444] relu7 <- fc7 I0216 16:14:29.453969 2475 net.cpp:400] relu7 -> fc7 (in-place) I0216 16:14:29.453972 2475 net.cpp:134] Setting up relu7 I0216 16:14:29.453975 2475 net.cpp:142] Top shape: 10 4096 (40960) I0216 16:14:29.453979 2475 layer_factory.cpp:67] Creating layer drop7 I0216 16:14:29.453982 2475 net.cpp:96] Creating Layer drop7 I0216 16:14:29.453984 2475 net.cpp:444] drop7 <- fc7 I0216 16:14:29.453987 2475 net.cpp:400] drop7 -> fc7 (in-place) I0216 16:14:29.453991 2475 net.cpp:134] Setting up drop7 I0216 16:14:29.453994 2475 net.cpp:142] Top shape: 10 4096 (40960) I0216 16:14:29.453999 2475 layer_factory.cpp:67] Creating layer fc8 I0216 16:14:29.454006 2475 net.cpp:96] Creating Layer fc8 I0216 16:14:29.454022 2475 net.cpp:444] fc8 <- fc7 I0216 16:14:29.454026 2475 net.cpp:413] fc8 -> fc8 I0216 16:14:29.456043 2475 net.cpp:134] Setting up fc8 I0216 16:14:29.456055 2475 net.cpp:142] Top shape: 10 1000 (10000) I0216 16:14:29.456063 2475 layer_factory.cpp:67] Creating layer prob I0216 16:14:29.456069 2475 net.cpp:96] Creating Layer prob I0216 16:14:29.456073 2475 net.cpp:444] prob 
<- fc8 I0216 16:14:29.456076 2475 net.cpp:413] prob -> prob I0216 16:14:29.456089 2475 net.cpp:134] Setting up prob I0216 16:14:29.456092 2475 net.cpp:142] Top shape: 10 1000 (10000) I0216 16:14:29.456095 2475 net.cpp:223] prob does not need backward computation. I0216 16:14:29.456097 2475 net.cpp:223] fc8 does not need backward computation. I0216 16:14:29.456100 2475 net.cpp:223] drop7 does not need backward computation. I0216 16:14:29.456109 2475 net.cpp:223] relu7 does not need backward computation. I0216 16:14:29.456110 2475 net.cpp:223] fc7 does not need backward computation. I0216 16:14:29.456112 2475 net.cpp:223] drop6 does not need backward computation. I0216 16:14:29.456115 2475 net.cpp:223] relu6 does not need backward computation. I0216 16:14:29.456117 2475 net.cpp:223] fc6 does not need backward computation. I0216 16:14:29.456120 2475 net.cpp:223] pool5 does not need backward computation. I0216 16:14:29.456123 2475 net.cpp:223] relu5 does not need backward computation. I0216 16:14:29.456125 2475 net.cpp:223] conv5 does not need backward computation. I0216 16:14:29.456128 2475 net.cpp:223] relu4 does not need backward computation. I0216 16:14:29.456131 2475 net.cpp:223] conv4 does not need backward computation. I0216 16:14:29.456133 2475 net.cpp:223] relu3 does not need backward computation. I0216 16:14:29.456136 2475 net.cpp:223] conv3 does not need backward computation. I0216 16:14:29.456140 2475 net.cpp:223] pool2 does not need backward computation. I0216 16:14:29.456142 2475 net.cpp:223] norm2 does not need backward computation. I0216 16:14:29.456145 2475 net.cpp:223] relu2 does not need backward computation. I0216 16:14:29.456146 2475 net.cpp:223] conv2 does not need backward computation. I0216 16:14:29.456149 2475 net.cpp:223] pool1 does not need backward computation. I0216 16:14:29.456151 2475 net.cpp:223] norm1 does not need backward computation. I0216 16:14:29.456154 2475 net.cpp:223] relu1 does not need backward computation. 
I0216 16:14:29.456156 2475 net.cpp:223] conv1 does not need backward computation. I0216 16:14:29.456159 2475 net.cpp:223] data does not need backward computation. I0216 16:14:29.456161 2475 net.cpp:266] This network produces output prob I0216 16:14:29.456177 2475 net.cpp:280] Network initialization done. I0216 16:14:29.456179 2475 net.cpp:281] Memory required for data: 83232440 I0216 16:14:29.456226 2475 caffe.cpp:406] Performing Forward ViennaCL: FATAL ERROR: Could not find kernel 'im2col_float' from program '' Number of kernels in program: 0 terminate called after throwing an instance of 'viennacl::ocl::kernel_not_found' what(): Kernel not found *** Aborted at 1518794069 (unix time) try "date -d @1518794069" if you are using GNU date *** PC: @ 0x7f168fd3c1f7 __GI_raise *** SIGABRT (@0x1452000009ab) received by PID 2475 (TID 0x7f1694d57a40) from PID 2475; stack trace: *** @ 0x7f16934715e0 (unknown) @ 0x7f168fd3c1f7 __GI_raise @ 0x7f168fd3d8e8 __GI_abort @ 0x7f1690642ac5 (unknown) @ 0x7f1690640a36 (unknown) @ 0x7f1690640a63 (unknown) @ 0x7f1690640c83 (unknown) @ 0x7f16945ca1d2 caffe::greentea_im2col_gpu<>() @ 0x7f1694669ebc caffe::BaseConvolutionLayer<>::greentea_conv_im2col_gpu() @ 0x7f169466a04e caffe::BaseConvolutionLayer<>::forward_gpu_gemm() @ 0x7f1694690e17 caffe::ConvolutionLayerSpatial<>::Forward_gpu() @ 0x7f1694770d3c caffe::Net<>::ForwardFromTo() @ 0x7f1694771137 caffe::Net<>::Forward() @ 0x41305a time() @ 0x40fcfc main @ 0x7f168fd28c05 __libc_start_main @ 0x410639 (unknown)