How to use the pyopencl.LocalMemory class in pyopencl

To help you get started, we’ve selected a few pyopencl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github silx-kit / pyFAI / pyFAI / opencl / sort.py View on Github external
#                 data_big = numpy.zeros((self.npt_height, self.npt_width), dtype=numpy.float32) + dummy
#                 data_big[:data.shape[0], :] = data
#                 self.cl_mem["input_data"].set(data_big)
#         else:
        if isinstance(data, pyopencl.array.Array):
            evt = pyopencl.enqueue(data.queue, self.cl_mem["input_data"].data, data.data)
            events.append(EventDescription("copy", evt))
        else:
            self.cl_mem["input_data"].set(data)
        ws = self.npt_width // 8
        if self.block_size < ws:
            raise RuntimeError("Requested a workgoup size of %s, maximum is %s" % (ws, self.block_size))
        kargs = self.cl_kernel_args["bsort_horizontal"]
        local_mem = kargs["l_data"]
        if not local_mem or local_mem.size < ws * 32:
            kargs["l_data"] = pyopencl.LocalMemory(ws * 32)  # 2float4 = 2*4*4 bytes per workgroup size
        evt = self.kernels.bsort_horizontal(self.queue, (self.npt_height, ws), (1, ws), *kargs.values())
        events.append(EventDescription("bsort_horizontal", evt))

        if self.profile:
            with self.sem:
                self.events += events
        return self.cl_mem["input_data"]
github enjalot / adventures_in_opencl / experiments / radix / nv / radix.py View on Github external
    @timings("Radix: naive scan")
    def naive_scan(self, num):
        """Launch the ``scanNaive`` kernel and block until it has finished.

        The kernel is enqueued with identical global and local sizes of
        ``nhist = num // 2 // self.cta_size * 16`` and is given
        ``self.mCountersSum``, ``self.mCounters``, ``nhist`` and a local-memory
        scratch argument.

        :param num: integer count used to derive the launch size
            (presumably the number of keys being sorted -- confirm with caller).
        """
        # Floor division keeps every derived size an integer on Python 3 as
        # well as Python 2; true division would produce floats, which are not
        # valid work sizes or local-memory byte counts.
        nhist = num // 2 // self.cta_size * 16
        global_size = (nhist,)
        local_size = (nhist,)
        extra_space = nhist // 16  # NUM_BANKS defined as 16 in RadixSort.cpp
        shared_mem_size = self.uintsz * (nhist + extra_space)
        scan_args = (
            self.mCountersSum,
            self.mCounters,
            np.uint32(nhist),
            cl.LocalMemory(2 * shared_mem_size),
        )
        self.radix_prg.scanNaive(self.queue, global_size, local_size, *scan_args).wait()
github inducer / pyopencl / pyopencl / bitonic_sort.py View on Github external
else:
            argsort = 1

        run_queue = self.sort_b_prepare_wl(
                argsort,
                arr.dtype,
                idx.dtype if idx is not None else None, arr.shape,
                axis)

        knl, nt, wg, aux = run_queue[0]

        if idx is not None:
            if aux:
                last_evt = knl(
                        queue, (nt,), wg, arr.data, idx.data,
                        cl.LocalMemory(
                            _tmpl.LOCAL_MEM_FACTOR*wg[0]*arr.dtype.itemsize),
                        cl.LocalMemory(
                            _tmpl.LOCAL_MEM_FACTOR*wg[0]*idx.dtype.itemsize),
                        wait_for=[last_evt])
            for knl, nt, wg, _ in run_queue[1:]:
                last_evt = knl(
                        queue, (nt,), wg, arr.data, idx.data,
                        wait_for=[last_evt])

        else:
            if aux:
                last_evt = knl(
                        queue, (nt,), wg, arr.data,
                        cl.LocalMemory(
                            _tmpl.LOCAL_MEM_FACTOR*wg[0]*4*arr.dtype.itemsize),
                        wait_for=[last_evt])
github robbert-harms / MOT / mot / cl_routines / mapping / waic_calculator.py View on Github external
def _calculate_statistics(self, range_start, range_end):
        """Enqueue the ``mean_and_max`` and ``logsum_variance`` kernels for a problem range.

        Both kernels are launched with the same work-group size (the smaller of
        the two kernels' ``WORK_GROUP_SIZE`` limits on the configured device),
        the same buffer arguments and the same global offset.

        :param range_start: index of the first problem; scaled by the
            work-group size to form the global offset.
        :param range_end: end index of the range; ``range_end - range_start``
            problems are processed.
        """
        problem_count = range_end - range_start

        # Use a work-group size that both kernels accept on this device.
        workgroup_size = min(
            cl.Kernel(self._statistic_kernels, kernel_name).get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE, self._cl_environment.device)
            for kernel_name in ('mean_and_max', 'logsum_variance'))

        # Two separate local scratch arguments, each sized for one double per work item.
        lse_scratch = cl.LocalMemory(workgroup_size * np.dtype('double').itemsize)
        var_scratch = cl.LocalMemory(workgroup_size * np.dtype('double').itemsize)

        kernel_args = [self._ll_buffer, self._lse_buffer, self._variances_buffer,
                       lse_scratch, var_scratch]

        global_size = (int(problem_count * workgroup_size),)
        local_size = (int(workgroup_size),)
        offset = (int(range_start * workgroup_size),)

        for kernel_name in ('mean_and_max', 'logsum_variance'):
            getattr(self._statistic_kernels, kernel_name)(
                self._cl_queue, global_size, local_size,
                *kernel_args, global_offset=offset)
github ringw / homer / metaomr / hough.py View on Github external
def hough_lineseg_kernel(img, rhos, thetas, rhores=1, max_gap=0):
    """Run the ``hough_lineseg`` kernel and return the resulting segments array.

    One work item (local size 1) is launched per entry of ``rhos``. The
    output is an int32 device array of shape ``(len(rhos), 2, 2)`` that is
    zero-filled before the launch.

    :param img: device image passed straight to the kernel; ``img.shape[0]``
        and ``img.shape[1]`` are forwarded as the height/width arguments.
    :param rhos: rho values; converted to int32 and uploaded.
    :param thetas: theta values; their cosines and sines are uploaded as float32.
    :param rhores: rho resolution forwarded to the kernel as int32.
    :param max_gap: forwarded to the kernel as int32.
    :return: the ``segments`` device array.
    """
    device_rhos = thr.to_device(rhos.astype(np.int32))
    cos_thetas = thr.to_device(np.cos(thetas).astype(np.float32))
    sin_thetas = thr.to_device(np.sin(thetas).astype(np.float32))
    segments = thr.empty_like(Type(np.int32, (len(rhos), 2, 2)))
    segments.fill(0)

    # Scratch size in bytes (bit-packed). Floor division keeps the size an
    # integer on Python 3; true division would hand a float byte count to
    # LocalMemory / local_mem.
    local_bytes = img.shape[0] + img.shape[1] // 8
    temp = pyopencl.LocalMemory(local_bytes)

    kernel_args = [img,
                   np.int32(img.shape[1]),
                   np.int32(img.shape[0]),
                   device_rhos,
                   np.int32(rhores),
                   cos_thetas, sin_thetas]
    # The explicit local-memory argument is only passed when the API module
    # name contains 'ocl'.
    if 'ocl' in api.__name__:
        kernel_args.append(temp)
    kernel_args += [np.int32(max_gap), segments]

    prg.hough_lineseg(*kernel_args,
                      global_size=(len(rhos),),
                      local_size=(1,),
                      local_mem=local_bytes)
    return segments
github ringw / homer / metaomr / hough.py View on Github external
def hough_line_kernel(img, rhores, numrho, thetas, num_workers=1):
    """Run the ``hough_line`` kernel and return the accumulator bins.

    The output is a float32 device array of shape ``(len(thetas), numrho)``
    that is zero-filled before the launch. The launch grid is
    ``(numrho * num_workers, len(thetas))`` with work-groups of
    ``(num_workers, 1)``.

    :param img: device image passed straight to the kernel.
    :param rhores: rho resolution forwarded to the kernel as int32.
    :param numrho: number of rho bins.
    :param thetas: theta values; their cosines and sines are uploaded as float32.
    :param num_workers: work items per work-group along the first axis.
    :return: the ``bins`` device array.
    """
    cos_thetas = thr.to_device(np.cos(thetas).astype(np.float32))
    sin_thetas = thr.to_device(np.sin(thetas).astype(np.float32))

    bins = thr.empty_like(Type(np.float32, (len(thetas), numrho)))
    bins.fill(0)

    temp = pyopencl.LocalMemory(4 * num_workers)

    kernel_args = [
        img,
        np.int32(img.shape[1]),
        np.int32(img.shape[0]),
        np.int32(rhores),
        cos_thetas,
        sin_thetas,
    ]
    # The explicit local-memory argument is only passed when the API module
    # name contains 'ocl'.
    if 'ocl' in api.__name__:
        kernel_args.append(temp)
    kernel_args.append(bins)

    grid = (int(numrho * num_workers), len(thetas))
    group = (num_workers, 1)
    prg.hough_line(*kernel_args,
                   global_size=grid,
                   local_size=group,
                   local_mem=4 * num_workers)
    return bins
github enjalot / adventures_in_opencl / experiments / radix / nv / radix.py View on Github external
def reorder(self, startbit, num):
        """Enqueue the ``reorderDataKeysValues`` kernel.

        The launch uses ``num // 2 // self.cta_size`` work-groups of
        ``self.cta_size`` work items each and passes two local-memory scratch
        arguments of ``2 * cta_size * uintsz`` bytes. The call does not wait
        for the kernel to finish.

        :param startbit: forwarded to the kernel as uint32.
        :param num: integer element count; forwarded as uint32 and used to
            derive the number of work-groups.
        """
        # Floor division keeps the block count (and therefore the global
        # size) an integer on Python 3; true division would yield a float.
        total_blocks = num // 2 // self.cta_size
        global_size = (self.cta_size * total_blocks,)
        local_size = (self.cta_size,)
        reorder_args = (
            self.keys,
            self.values,
            self.d_tempKeys,
            self.d_tempValues,
            self.mBlockOffsets,
            self.mCountersSum,
            self.mCounters,
            np.uint32(startbit),
            np.uint32(num),
            np.uint32(total_blocks),
            cl.LocalMemory(2 * self.cta_size * self.uintsz),
            cl.LocalMemory(2 * self.cta_size * self.uintsz),
        )
        self.radix_prg.reorderDataKeysValues(self.queue, global_size, local_size, *reorder_args)
        #self.radix_prg.reorderDataKeysOnly(self.queue, global_size, local_size, *(reorder_args))
github PyOCL / OpenCLGA / OpenCLGA / evaluation / memory_usage / main.py View on Github external
def create_local_bytearray(size):
    """Build and return a ``cl.LocalMemory`` object constructed with ``size``."""
    local_buffer = cl.LocalMemory(size)
    return local_buffer
github benma / pysph / src / sph / radix_sort / radix_sort.py View on Github external
def scan_local2(self, dst, src, n, size):
        """Enqueue the ``scanExclusiveLocal2`` kernel over ``n * size`` elements.

        The global size is ``n * size`` rounded up to the next multiple of
        ``self.WORKGROUP_SIZE``; the local size is ``self.WORKGROUP_SIZE``.

        :param dst: destination buffer forwarded to the kernel.
        :param src: source buffer forwarded to the kernel.
        :param n: multiplied by ``size`` to give the element count.
        :param size: forwarded to the kernel as uint32.
        """
        elements = n * size
        group = self.WORKGROUP_SIZE

        # Pad the launch size up to a whole number of work-groups.
        leftover = elements % group
        padded = elements if leftover == 0 else elements - leftover + group
        global_size = (padded,)
        local_size = (self.WORKGROUP_SIZE,)

        scratch = cl.LocalMemory(2 * self.WORKGROUP_SIZE * self.dtype_size)
        scan_args = (
            self.scan_buffer,
            dst,
            src,
            scratch,
            np.uint32(elements),
            np.uint32(size),
        )
        self.scan_prg.scanExclusiveLocal2(self.queue, global_size, local_size, *scan_args)