xvec = np.meshgrid(*[dxi * np.arange(si, si + ni)
                     for dxi, si, ni in zip(dx, start, rank_shape)],
                   indexing='ij')
phases = sum(ki * xi for ki, xi in zip(kvec, xvec))
if h > 0:
    fx_h[h:-h, h:-h, h:-h] = np.sin(phases)
else:
    fx_h[:] = np.sin(phases)
fx_cos = np.cos(phases)
fx = cla.empty(queue, pencil_shape, dtype)
fx.set(fx_h)
lap = cla.empty(queue, rank_shape, dtype)
grd = cla.empty(queue, (3,)+rank_shape, dtype)
derivs(queue, fx=fx, lap=lap, grd=grd)
eff_kmag_sq = sum(get_evals_2(kvec_i, dxi) for dxi, kvec_i in zip(dx, kvec))
lap_true = cla.to_device(queue, eff_kmag_sq * np.sin(phases))
diff = clm.fabs(lap - lap_true)
max_err = cla.max(diff) / cla.max(clm.fabs(lap_true))
avg_err = cla.sum(diff) / cla.sum(clm.fabs(lap_true))
max_rtol = 1.e-11 if dtype == np.float64 else 3.e-4
avg_rtol = 1.e-12 if dtype == np.float64 else 5.e-5
assert max_err < max_rtol and avg_err < avg_rtol, \
    "lap inaccurate for halo_shape=%s, grid_shape=%s, proc_shape=%s" \
    % (h, grid_shape, proc_shape)
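
# A minimal, self-contained sketch of the device-side relative-error check used
# above (cla / clm stand for pyopencl.array / pyopencl.clmath, as in the snippet).
# The context, data, and tolerances below are illustrative, not the project's own.
import numpy as np
import pyopencl as cl
import pyopencl.array as cla
import pyopencl.clmath as clm

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Pretend "result" came from a kernel and "reference" is the analytic answer.
result = cla.to_device(queue, np.sin(np.linspace(0., 1., 1024)))
reference = cla.to_device(queue, np.sin(np.linspace(0., 1., 1024)) + 1.e-13)

diff = clm.fabs(result - reference)
max_err = cla.max(diff).get() / cla.max(clm.fabs(reference)).get()
avg_err = cla.sum(diff).get() / cla.sum(clm.fabs(reference)).get()
assert max_err < 1.e-11 and avg_err < 1.e-12
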
def test_gradient(self):
    """
    tests the gradient kernel (norm and orientation)
    """
    border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, scale, nb_keypoints, width, height, DOGS, g = local_maxmin_setup()
    self.mat = numpy.ascontiguousarray(g[1])
    self.height, self.width = numpy.int32(self.mat.shape)
    self.gpu_mat = pyopencl.array.to_device(queue, self.mat)
    self.gpu_grad = pyopencl.array.empty(queue, self.mat.shape, dtype=numpy.float32, order="C")
    self.gpu_ori = pyopencl.array.empty(queue, self.mat.shape, dtype=numpy.float32, order="C")
    self.shape = calc_size((self.width, self.height), self.wg)
    t0 = time.time()
    k1 = self.program.compute_gradient_orientation(queue, self.shape, self.wg, self.gpu_mat.data, self.gpu_grad.data, self.gpu_ori.data, self.width, self.height)
    res_norm = self.gpu_grad.get()
    res_ori = self.gpu_ori.get()
    t1 = time.time()
    ref_norm, ref_ori = my_gradient(self.mat)
    t2 = time.time()
    delta_norm = abs(ref_norm - res_norm).max()
    delta_ori = abs(ref_ori - res_ori).max()
    if (PRINT_KEYPOINTS):
        rmin, cmin = 0, 0
        rmax, cmax = rmin + 6, cmin + 6
def test_local_maxmin(self):
    """
    tests the local maximum/minimum detection kernel
    """
    # local_maxmin_setup:
    border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, s, nb_keypoints, width, height, DOGS, g = local_maxmin_setup()
    self.s = numpy.int32(s)  # 1, 2, 3 ... not 4 nor 0.
    self.gpu_dogs = pyopencl.array.to_device(queue, DOGS)
    self.output = pyopencl.array.empty(queue, (nb_keypoints, 4), dtype=numpy.float32, order="C")
    self.output.fill(-1.0, queue)  # memset for invalid keypoints
    self.counter = pyopencl.array.zeros(queue, (1,), dtype=numpy.int32, order="C")
    nb_keypoints = numpy.int32(nb_keypoints)
    self.shape = calc_size((DOGS.shape[1], DOGS.shape[0] * DOGS.shape[2]), self.wg)  # it's a 3D vector!
    t0 = time.time()
    k1 = self.program.local_maxmin(queue, self.shape, self.wg,
                                   self.gpu_dogs.data, self.output.data,
                                   border_dist, peakthresh, octsize, EdgeThresh0, EdgeThresh,
                                   self.counter.data, nb_keypoints, self.s, width, height)
    res = self.output.get()
    self.keypoints1 = self.output  # for further use
    self.actual_nb_keypoints = self.counter.get()[0]  # for further use
    t1 = time.time()
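
# A minimal sketch of the output-buffer/counter idiom used in the test above,
# assuming any available OpenCL device; the detection kernel itself is omitted
# and nb_keypoints is an arbitrary illustrative value.
import numpy
import pyopencl
import pyopencl.array

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

nb_keypoints = 1000
output = pyopencl.array.empty(queue, (nb_keypoints, 4), dtype=numpy.float32, order="C")
output.fill(-1.0, queue)  # -1 marks slots the kernel never wrote
counter = pyopencl.array.zeros(queue, (1,), dtype=numpy.int32, order="C")

# ... a detection kernel would write keypoints and atomically increment counter ...

actual_nb_keypoints = int(counter.get()[0])   # number of slots actually filled
keypoints = output.get()[:actual_nb_keypoints]
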
# For now, at least, do not create retval or chunk buffer here.
# Iterate through this entire mess once per depth level
row_list = np.array([x for x in xrange(xlen)])
negfound = False
for row_chunk in chunks(row_list, best_rows):
    # Do not prepend buffer rows for the first row.
    realfirst = row_chunk[0]
    if (row_chunk[0] != row_list[0]):
        realfirst -= maxdepth
    # Do not append buffer rows for the last row.
    reallast = row_chunk[-1]
    if (row_chunk[-1] != row_list[-1]):
        reallast += maxdepth
    # Create retvals and chunk buffer here instead of above.
    chunk = np.copy(workingarr[realfirst*ylen:reallast*ylen])
    outchunk_buf = cla.empty(self.queue, chunk.shape, chunk.dtype)
    inchunk_buf = cla.to_device(self.queue, chunk)
    newxlen = reallast - realfirst
    newxlen_arg = np.uint32(newxlen)
    lenchunk = newxlen*ylen
    lenchunk_arg = np.uint32(lenchunk)
    currdepth = 0
    while (currdepth <= maxdepth):
        currdepth_arg = np.uint32(currdepth)
        if (currdepth % 2 == 0):
            event = self.program.bathy(self.queue, self.global_size[self.devices[best_device]], self.local_size[self.devices[best_device]], outchunk_buf.data, inchunk_buf.data, newxlen_arg, ylen_arg, currdepth_arg, maxdepth_arg)
        else:
            event = self.program.bathy(self.queue, self.global_size[self.devices[best_device]], self.local_size[self.devices[best_device]], inchunk_buf.data, outchunk_buf.data, newxlen_arg, ylen_arg, currdepth_arg, maxdepth_arg)
        event.wait()
        currdepth += 1
    # cl.enqueue_copy(self.queue, retvals_arr, retvals_buf)
    # copy important parts of chunk_buf.data back
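
# A minimal, self-contained sketch of the even/odd ping-pong buffering used in
# the depth loop above: each pass reads from one device array and writes to the
# other, then the roles swap. The kernel, sizes, and depth count are illustrative.
import numpy as np
import pyopencl as cl
import pyopencl.array as cla

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

prg = cl.Program(ctx, """
__kernel void add_one(__global const float *src, __global float *dst, uint n)
{
    uint i = get_global_id(0);
    if (i < n)
        dst[i] = src[i] + 1.0f;
}
""").build()

n = 1024
a_buf = cla.to_device(queue, np.zeros(n, dtype=np.float32))
b_buf = cla.empty(queue, (n,), np.float32)

maxdepth = 4
for depth in range(maxdepth + 1):
    # Even depths read a_buf and write b_buf; odd depths swap the roles.
    src, dst = (a_buf, b_buf) if depth % 2 == 0 else (b_buf, a_buf)
    evt = prg.add_one(queue, (n,), None, src.data, dst.data, np.uint32(n))
    evt.wait()

print(dst.get()[:4])  # result of the final pass
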
level_start_target_or_target_parent_box_nrs, evt_tp = \
    extract_level_start_box_nrs(
        target_or_target_parent_boxes, wait_for=wait_for)
wait_for = [evt_s, evt_sp, evt_t, evt_tp]

# }}}

# {{{ box extents

fin_debug("finding box extents")

box_source_bounding_box_min = cl.array.empty(
    queue, (tree.dimensions, tree.aligned_nboxes),
    dtype=tree.coord_dtype)
box_source_bounding_box_max = cl.array.empty(
    queue, (tree.dimensions, tree.aligned_nboxes),
    dtype=tree.coord_dtype)

if tree.sources_are_targets:
    box_target_bounding_box_min = box_source_bounding_box_min
    box_target_bounding_box_max = box_source_bounding_box_max
else:
    box_target_bounding_box_min = cl.array.empty(
        queue, (tree.dimensions, tree.aligned_nboxes),
        dtype=tree.coord_dtype)
    box_target_bounding_box_max = cl.array.empty(
        queue, (tree.dimensions, tree.aligned_nboxes),
        dtype=tree.coord_dtype)

bogus_radii_array = cl.array.empty(queue, 1, dtype=tree.coord_dtype)
:return: The decompressed image as a pyopencl array.
:rtype: pyopencl.array
"""
assert self.dec_size is not None, \
    "dec_size is a mandatory ByteOffset init argument for decompression"
events = []
with self.sem:
    len_raw = numpy.int32(len(raw))
    if len_raw > self.padded_raw_size:
        wg = self.block_size
        self.raw_size = int(len(raw))
        self.padded_raw_size = (self.raw_size + wg - 1) & ~(wg - 1)
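        # (the line above rounds raw_size up to the next multiple of wg;
        #  this assumes wg is a power of two)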
        logger.info("increase raw buffer size to %s", self.padded_raw_size)
        buffers = {
            "raw": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int8),
            "mask": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
            "exceptions": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
            "values": pyopencl.array.empty(self.queue, self.padded_raw_size, dtype=numpy.int32),
        }
        self.cl_mem.update(buffers)
    else:
        wg = self.block_size
    evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data,
                                raw,
                                is_blocking=False)
    events.append(EventDescription("copy raw H -> D", evt))
    evt = self.kernels.fill_int_mem(self.queue, (self.padded_raw_size,), (wg,),
                                    self.cl_mem["mask"].data,
                                    numpy.int32(self.padded_raw_size),
                                    numpy.int32(0),
program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult.data, d_minmax.data)
min0 = pos[:, :, 0].min()
max0 = pos[:, :, 0].max()
min1 = pos[:, :, 1].min()
max1 = pos[:, :, 1].max()
minmax = (min0, max0, min1, max1)
print(minmax)
print(d_minmax)
memset_size = (bins + workgroup_size - 1) & ~(workgroup_size - 1),
d_outMax = cl.array.empty(queue, (bins,), dtype=numpy.int32)
program.memset_out_int(queue, memset_size, (workgroup_size,), d_outMax.data)
global_size = (size + workgroup_size - 1) & ~(workgroup_size - 1),
program.lut1(queue, global_size, (workgroup_size,), d_pos.data, d_minmax.data, numpy.uint32(size), d_outMax.data)
outMax_1 = numpy.copy(d_outMax)
d_idx_ptr = cl.array.empty(queue, (bins + 1,), dtype=numpy.int32)
d_lutsize = cl.array.empty(queue, (1,), dtype=numpy.int32)
program.lut2(queue, (1,), (1,), d_outMax.data, d_idx_ptr.data, d_lutsize.data)
lutsize = numpy.ndarray(1, dtype=numpy.int32)
cl.enqueue_copy(queue, lutsize, d_lutsize.data)
print(lutsize)
lut_size = int(lutsize[0])
d_indices = cl.array.empty(queue, (lut_size,), dtype=numpy.int32)
d_data = cl.array.empty(queue, (lut_size,), dtype=numpy.float32)
# d_check_atomics = cl.Buffer(ctx, mf.READ_WRITE, 4*lut_size)
program.memset_out_int(queue, memset_size, (workgroup_size,), d_outMax.data)
d_outData = cl.array.empty(queue, (bins,), dtype=numpy.float32)
d_outCount = cl.array.empty(queue, (bins,), dtype=numpy.float32)
d_outMerge = cl.array.empty(queue, (bins,), dtype=numpy.float32)
program.lut3(queue, global_size, (workgroup_size,), d_pos.data, d_minmax.data, numpy.uint32(size), d_outMax.data, d_idx_ptr.data, d_indices.data, d_data.data)
outMax_2 = numpy.copy(d_outMax)
# check_atomics = numpy.ndarray(lut_size, dtype=numpy.int32)
# cl.enqueue_copy(queue, check_atomics, d_check_atomics)
program.memset_out(queue, memset_size, (workgroup_size,), d_outData.data, d_outCount.data, d_outMerge.data)
d_image = cl.array.to_device(queue, data)
d_image_float = cl.array.empty(queue, (size,), dtype=numpy.float32)
# program.s32_to_float(queue, global_size, (workgroup_size,), d_image.data, d_image_float) # Pilatus1M
program.u16_to_float(queue, global_size, (workgroup_size,), d_image.data, d_image_float.data) # halfccd
program.csr_integrate(queue, (bins * workgroup_size,), (workgroup_size,), d_image_float.data, d_data.data, d_indices.data, d_idx_ptr.data, d_outData.data, d_outCount.data, d_outMerge.data)
# outData = numpy.ndarray(bins, dtype=numpy.float32)
# outCount = numpy.ndarray(bins, dtype=numpy.float32)
outMerge = numpy.ndarray(bins, dtype=numpy.float32)
# cl.enqueue_copy(queue,outData, d_outData)
# cl.enqueue_copy(queue,outCount, d_outCount)
cl.enqueue_copy(queue, outMerge, d_outMerge.data)
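
# Worked example of the size-padding idiom used above, e.g. for memset_size and
# global_size: (n + wg - 1) & ~(wg - 1) rounds n up to the next multiple of wg,
# assuming wg is a power of two. The numbers below are illustrative.
workgroup_size = 64
for n in (1, 64, 1000):
    padded = (n + workgroup_size - 1) & ~(workgroup_size - 1)
    print(n, "->", padded)   # 1 -> 64, 64 -> 64, 1000 -> 1024
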
if input_array.shape != self.input_shape:
    raise ValueError('Input array does not have the shape prepared for')
if input_array.dtype != np.float32:
    raise ValueError('Input array does not have float32 dtype')
if workspace.size < self.workspace_size:
    raise ValueError('Workspace is too small ({0} vs {1})'.format(
        workspace.size, self.workspace_size))

# Create workspace and output array views on the workspace buffer
workspace_array = cla.Array(self.queue,
                            input_array.shape, cla.vec.float2,
                            data=workspace, offset=0)
output_array = cla.Array(self.queue,
                         input_array.shape, cla.vec.float4,
                         data=workspace, offset=(workspace_array.size+1) >> 1)
output_array = cla.empty(self.queue, self.input_shape, cla.vec.float4)
# output_array, workspace_array = output, workspace

# C-style ordering where the first index is horizontal
input_array_strides = (1, input_array.shape[1],
                       input_array.shape[1]*input_array.shape[0])
# C-style ordering where the first index is horizontal
workspace_array_strides = (1, workspace_array.shape[1],
                           workspace_array.shape[1] * workspace_array.shape[0])

# Perform column convolution
input_region = Region(input_array.base_data,
                      input_array.offset,
                      input_array.shape[::-1],
                      (0, 0), (1, 1), input_array_strides)