Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
numba / cuda / simulator / kernelapi.py
Size: Mime:
'''
Implements the cuda module as called from within an executing kernel
(@cuda.jit-decorated function).
'''

from contextlib import contextmanager
import sys
import threading
import traceback

import numpy as np

from numba import numpy_support


class Dim3(object):
    '''
    Used to implement thread/block indices/dimensions
    '''
    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z

    def __str__(self):
        return '(%s, %s, %s)' % (self.x, self.y, self.z)

    def __repr__(self):
        return 'Dim3(%s, %s, %s)' % (self.x, self.y, self.z)

    def __iter__(self):
        yield self.x
        yield self.y
        yield self.z


class FakeCUDALocal(object):
    '''
    CUDA Local arrays
    '''
    def array(self, shape, dtype):
        dtype = numpy_support.as_dtype(dtype)
        return np.empty(shape, dtype)


class FakeCUDAConst(object):
    '''
    CUDA Const arrays
    '''
    def array_like(self, ary):
        return ary


class FakeCUDAShared(object):
    '''
    CUDA Shared arrays.

    Limitations: assumes that only one call to cuda.shared.array is on a line,
    and that that line is only executed once per thread. i.e.::

        a = cuda.shared.array(...); b = cuda.shared.array(...)

    will erroneously alias a and b, and::

        for i in range(10):
            sharedarrs[i] = cuda.shared.array(...)

    will alias all arrays created at that point (though it is not certain that
    this would be supported by Numba anyway).
    '''

    def __init__(self, dynshared_size):
        self._allocations = {}
        self._dynshared_size = dynshared_size
        self._dynshared = np.zeros(dynshared_size, dtype=np.byte)

    def array(self, shape, dtype):
        dtype = numpy_support.as_dtype(dtype)
        # Dynamic shared memory is requested with size 0 - this all shares the
        # same underlying memory
        if shape == 0:
            # Count must be the maximum number of whole elements that fit in the
            # buffer (Numpy complains if the buffer is not a multiple of the
            # element size)
            count = self._dynshared_size // dtype.itemsize
            return np.frombuffer(self._dynshared.data, dtype=dtype, count=count)

        # Otherwise, identify allocations by source file and line number
        # We pass the reference frame explicitly to work around
        # http://bugs.python.org/issue25108
        stack = traceback.extract_stack(sys._getframe())
        caller = stack[-2][0:2]
        res = self._allocations.get(caller)
        if res is None:
            res = np.empty(shape, dtype)
            self._allocations[caller] = res
        return res

addlock = threading.Lock()
maxlock = threading.Lock()
minlock = threading.Lock()
caslock = threading.Lock()


class FakeCUDAAtomic(object):
    def add(self, array, index, val):
        with addlock:
            array[index] += val

    def max(self, array, index, val):
        with maxlock:
            # CUDA Python's semantics for max differ from Numpy's Python's,
            # so we have special handling here (CUDA Python treats NaN as
            # missing data).
            if np.isnan(array[index]):
                array[index] = val
            elif np.isnan(val):
                return
            array[index] = max(array[index], val)

    def min(self, array, index, val):
        with minlock:
            # CUDA Python's semantics for min differ from Numpy's Python's,
            # so we have special handling here (CUDA Python treats NaN as
            # missing data).
            if np.isnan(array[index]):
                array[index] = val
            elif np.isnan(val):
                return
            array[index] = min(array[index], val)

    def compare_and_swap(self, array, old, val):
        with caslock:
            index = (0,) * array.ndim
            loaded = array[index]
            if loaded == old:
                array[index] = val
            return loaded


class FakeCUDAModule(object):
    '''
    An instance of this class will be injected into the __globals__ for an
    executing function in order to implement calls to cuda.*. This will fail to
    work correctly if the user code does::

        from numba import cuda as something_else

    In other words, the CUDA module must be called cuda.
    '''

    def __init__(self, grid_dim, block_dim, dynshared_size):
        self.gridDim = Dim3(*grid_dim)
        self.blockDim = Dim3(*block_dim)
        self._local = FakeCUDALocal()
        self._shared = FakeCUDAShared(dynshared_size)
        self._const = FakeCUDAConst()
        self._atomic = FakeCUDAAtomic()

    @property
    def local(self):
        return self._local

    @property
    def shared(self):
        return self._shared

    @property
    def const(self):
        return self._const

    @property
    def atomic(self):
        return self._atomic

    @property
    def threadIdx(self):
        return threading.current_thread().threadIdx

    @property
    def blockIdx(self):
        return threading.current_thread().blockIdx

    def syncthreads(self):
        threading.current_thread().syncthreads()

    def threadfence(self):
        # No-op
        pass

    def threadfence_block(self):
        # No-op
        pass

    def threadfence_system(self):
        # No-op
        pass

    def grid(self, n):
        bdim = self.blockDim
        bid = self.blockIdx
        tid = self.threadIdx
        x = bid.x * bdim.x + tid.x
        if n == 1:
            return x
        y = bid.y * bdim.y + tid.y
        if n == 2:
            return (x, y)
        z = bid.z * bdim.z + tid.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global ID has 1-3 dimensions. %d requested" % n)

    def gridsize(self, n):
        bdim = self.blockDim
        gdim = self.gridDim
        x = bdim.x * gdim.x
        if n == 1:
            return x
        y = bdim.y * gdim.y
        if n == 2:
            return (x, y)
        z = bdim.z * gdim.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global grid has 1-3 dimensions. %d requested" % n)


@contextmanager
def swapped_cuda_module(fn, fake_cuda_module):
    from numba import cuda

    fn_globs = fn.__globals__
    # get all globals that is the "cuda" module
    orig = dict((k, v) for k, v in fn_globs.items() if v is cuda)
    # build replacement dict
    repl = dict((k, fake_cuda_module) for k, v in orig.items())
    # replace
    fn_globs.update(repl)
    try:
        yield
    finally:
        # revert
        fn_globs.update(orig)