Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# data_big = numpy.zeros((self.npt_height, self.npt_width), dtype=numpy.float32) + dummy
# data_big[:data.shape[0], :] = data
# self.cl_mem["input_data"].set(data_big)
# else:
if isinstance(data, pyopencl.array.Array):
evt = pyopencl.enqueue(data.queue, self.cl_mem["input_data"].data, data.data)
events.append(EventDescription("copy", evt))
else:
self.cl_mem["input_data"].set(data)
ws = self.npt_width // 8
if self.block_size < ws:
raise RuntimeError("Requested a workgoup size of %s, maximum is %s" % (ws, self.block_size))
kargs = self.cl_kernel_args["bsort_horizontal"]
local_mem = kargs["l_data"]
if not local_mem or local_mem.size < ws * 32:
kargs["l_data"] = pyopencl.LocalMemory(ws * 32) # 2float4 = 2*4*4 bytes per workgroup size
evt = self.kernels.bsort_horizontal(self.queue, (self.npt_height, ws), (1, ws), *kargs.values())
events.append(EventDescription("bsort_horizontal", evt))
if self.profile:
with self.sem:
self.events += events
return self.cl_mem["input_data"]
@timings("Radix: naive scan")
def naive_scan(self, num):
    """Run the ``scanNaive`` kernel and block until it has finished.

    The kernel is launched as a single work-group: global and local size
    are both ``nhist = num // 2 // cta_size * 16``.

    Args:
        num: Element count used to derive ``nhist``; presumably the number
            of keys being sorted -- confirm against the caller.
    """
    # Floor division keeps every derived size an int.  With true division
    # (Python 3 ``/``) these would be floats, which are not valid NDRange
    # sizes nor a valid byte count for cl.LocalMemory.
    nhist = num // 2 // self.cta_size * 16
    global_size = (nhist,)
    local_size = (nhist,)
    extra_space = nhist // 16  # NUM_BANKS defined as 16 in RadixSort.cpp
    shared_mem_size = self.uintsz * (nhist + extra_space)
    scan_args = (
        self.mCountersSum,
        self.mCounters,
        np.uint32(nhist),
        cl.LocalMemory(2 * shared_mem_size),
    )
    # .wait() makes this call synchronous with respect to the kernel.
    self.radix_prg.scanNaive(self.queue, global_size, local_size, *scan_args).wait()
else:
argsort = 1
run_queue = self.sort_b_prepare_wl(
argsort,
arr.dtype,
idx.dtype if idx is not None else None, arr.shape,
axis)
knl, nt, wg, aux = run_queue[0]
if idx is not None:
if aux:
last_evt = knl(
queue, (nt,), wg, arr.data, idx.data,
cl.LocalMemory(
_tmpl.LOCAL_MEM_FACTOR*wg[0]*arr.dtype.itemsize),
cl.LocalMemory(
_tmpl.LOCAL_MEM_FACTOR*wg[0]*idx.dtype.itemsize),
wait_for=[last_evt])
for knl, nt, wg, _ in run_queue[1:]:
last_evt = knl(
queue, (nt,), wg, arr.data, idx.data,
wait_for=[last_evt])
else:
if aux:
last_evt = knl(
queue, (nt,), wg, arr.data,
cl.LocalMemory(
_tmpl.LOCAL_MEM_FACTOR*wg[0]*4*arr.dtype.itemsize),
wait_for=[last_evt])
def _calculate_statistics(self, range_start, range_end):
    """Enqueue the ``mean_and_max`` and ``logsum_variance`` kernels.

    One work-group is launched per problem in ``[range_start, range_end)``:
    the global size is ``(range_end - range_start) * workgroup_size`` and the
    global offset is ``range_start * workgroup_size``.  The work-group size
    is the smaller of the two kernels' reported ``WORK_GROUP_SIZE`` on the
    environment's device.

    Args:
        range_start: Index of the first problem to process.
        range_end: Index one past the last problem to process.
    """
    program = self._statistic_kernels
    device = self._cl_environment.device
    size_query = cl.kernel_work_group_info.WORK_GROUP_SIZE

    # Both kernels are launched with the same geometry, so use a size that
    # each of them supports.
    workgroup_size = min(
        cl.Kernel(program, kernel_name).get_work_group_info(size_query, device)
        for kernel_name in ('mean_and_max', 'logsum_variance'))

    # Two separate local scratch areas, each holding one double per work-item.
    scratch_bytes = workgroup_size * np.dtype('double').itemsize
    lse_scratch = cl.LocalMemory(scratch_bytes)
    var_scratch = cl.LocalMemory(scratch_bytes)
    kernel_args = [self._ll_buffer, self._lse_buffer, self._variances_buffer,
                   lse_scratch, var_scratch]

    problem_count = range_end - range_start
    global_size = (int(problem_count * workgroup_size),)
    local_size = (int(workgroup_size),)
    offset = (int(range_start * workgroup_size),)

    program.mean_and_max(
        self._cl_queue, global_size, local_size,
        *kernel_args, global_offset=offset)
    program.logsum_variance(
        self._cl_queue, global_size, local_size,
        *kernel_args, global_offset=offset)
def hough_lineseg_kernel(img, rhos, thetas, rhores=1, max_gap=0):
    """Run the ``hough_lineseg`` kernel and return its ``segments`` output.

    The kernel is launched with one work-item per entry of ``rhos``
    (``global_size=(len(rhos),)``, ``local_size=(1,)``).

    Args:
        img: Device image passed straight to the kernel; only ``img.shape``
            is read on the host (``shape[1]`` and ``shape[0]`` are passed as
            the first and second int32 arguments).
        rhos: Array of rho values; converted to int32 and uploaded.
        thetas: Array of angles; their cosines and sines are uploaded as
            float32.  NOTE(review): the kernel's pairing of ``rhos`` with
            ``thetas`` is not visible here -- confirm they must be the same
            length.
        rhores: Rho resolution, passed as int32.
        max_gap: Passed as int32; presumably the largest gap tolerated inside
            one segment -- confirm against the kernel source.

    Returns:
        The int32 device array of shape ``(len(rhos), 2, 2)`` that was
        zero-filled before the launch and handed to the kernel as its last
        argument.
    """
    device_rhos = thr.to_device(rhos.astype(np.int32))
    cos_thetas = thr.to_device(np.cos(thetas).astype(np.float32))
    sin_thetas = thr.to_device(np.sin(thetas).astype(np.float32))
    segments = thr.empty_like(Type(np.int32, (len(rhos), 2, 2)))
    segments.fill(0)

    # Byte count of the bit-packed scratch area.  Floor division keeps it an
    # int: with true division (Python 3 ``/``) this would be a float, which is
    # not a valid size for pyopencl.LocalMemory or for ``local_mem``.
    local_bytes = img.shape[0] + img.shape[1] // 8
    temp = pyopencl.LocalMemory(local_bytes)

    kernel_args = [img,
                   np.int32(img.shape[1]),
                   np.int32(img.shape[0]),
                   device_rhos,
                   np.int32(rhores),
                   cos_thetas, sin_thetas]
    # The explicit local-memory argument is only passed when the API module
    # name contains 'ocl'.
    if 'ocl' in api.__name__:
        kernel_args.append(temp)
    kernel_args += [np.int32(max_gap), segments]

    prg.hough_lineseg(*kernel_args,
                      global_size=(len(rhos),),
                      local_size=(1,),
                      local_mem=local_bytes)
    return segments
def hough_line_kernel(img, rhores, numrho, thetas, num_workers=1):
    """Run the ``hough_line`` kernel and return its ``bins`` output.

    The launch grid is ``(int(numrho * num_workers), len(thetas))`` with
    work-groups of ``(num_workers, 1)``.

    Args:
        img: Device image passed straight to the kernel; only ``img.shape``
            is read on the host.
        rhores: Rho resolution, passed as int32.
        numrho: Number of rho bins (second dimension of the result).
        thetas: Array of angles; their cosines and sines are uploaded as
            float32.
        num_workers: Work-items per work-group along the first dimension.

    Returns:
        The float32 device array of shape ``(len(thetas), numrho)`` that was
        zero-filled before the launch and handed to the kernel as its last
        argument.
    """
    cos_thetas = thr.to_device(np.cos(thetas).astype(np.float32))
    sin_thetas = thr.to_device(np.sin(thetas).astype(np.float32))

    bins = thr.empty_like(Type(np.float32, (len(thetas), numrho)))
    bins.fill(0)

    # 4 bytes of local memory per worker
    # (presumably one 32-bit value each -- confirm against the kernel).
    local_bytes = 4 * num_workers
    temp = pyopencl.LocalMemory(local_bytes)

    # The explicit local-memory argument is only passed when the API module
    # name contains 'ocl'.
    local_arg = [temp] if 'ocl' in api.__name__ else []
    kernel_args = [
        img,
        np.int32(img.shape[1]),
        np.int32(img.shape[0]),
        np.int32(rhores),
        cos_thetas,
        sin_thetas,
    ] + local_arg + [bins]

    grid = (int(numrho * num_workers), len(thetas))
    group = (num_workers, 1)
    prg.hough_line(*kernel_args,
                   global_size=grid,
                   local_size=group,
                   local_mem=local_bytes)
    return bins
def reorder(self, startbit, num):
    """Enqueue the ``reorderDataKeysValues`` kernel.

    The kernel runs over ``cta_size * total_blocks`` work-items in groups of
    ``cta_size``, where ``total_blocks = num // 2 // cta_size``.  The launch
    is only enqueued; this method does not wait on the resulting event.

    Args:
        startbit: Passed to the kernel as uint32; presumably the lowest bit
            of the radix digit handled in this pass -- confirm against the
            kernel source.
        num: Element count; passed to the kernel as uint32 and used to derive
            the number of blocks.
    """
    # Floor division keeps the block count an int.  With true division
    # (Python 3 ``/``) it would be a float, which is not a valid NDRange size.
    total_blocks = num // 2 // self.cta_size
    global_size = (self.cta_size * total_blocks,)
    local_size = (self.cta_size,)
    reorder_args = (
        self.keys,
        self.values,
        self.d_tempKeys,
        self.d_tempValues,
        self.mBlockOffsets,
        self.mCountersSum,
        self.mCounters,
        np.uint32(startbit),
        np.uint32(num),
        np.uint32(total_blocks),
        # Two local scratch areas of 2 * cta_size uints each.
        cl.LocalMemory(2 * self.cta_size * self.uintsz),
        cl.LocalMemory(2 * self.cta_size * self.uintsz),
    )
    self.radix_prg.reorderDataKeysValues(self.queue, global_size, local_size, *reorder_args)
    # Keys-only alternative kept for reference:
    # self.radix_prg.reorderDataKeysOnly(self.queue, global_size, local_size, *(reorder_args))
def create_local_bytearray(size):
    """Build a ``cl.LocalMemory`` kernel argument of ``size`` bytes.

    Args:
        size: Byte count handed unchanged to ``cl.LocalMemory``.

    Returns:
        The newly created ``cl.LocalMemory`` object.
    """
    local_buffer = cl.LocalMemory(size)
    return local_buffer
def scan_local2(self, dst, src, n, size):
    """Enqueue the ``scanExclusiveLocal2`` kernel over ``n * size`` elements.

    The global size is ``n * size`` rounded up to the next multiple of
    ``self.WORKGROUP_SIZE``; the local size is ``self.WORKGROUP_SIZE``.

    Args:
        dst: Passed to the kernel as its second argument.
        src: Passed to the kernel as its third argument.
        n: Multiplied with ``size`` to give the element count.
        size: Also passed to the kernel on its own, as uint32.
    """
    group = self.WORKGROUP_SIZE
    elements = n * size

    # Pad the launch so the global size is a whole number of work-groups.
    leftover = elements % group
    if leftover:
        padded = elements - leftover + group
    else:
        padded = elements

    kernel_args = (
        self.scan_buffer,
        dst,
        src,
        cl.LocalMemory(2 * group * self.dtype_size),
        np.uint32(elements),
        np.uint32(size),
    )
    self.scan_prg.scanExclusiveLocal2(self.queue, (padded,), (group,), *kernel_args)