# pycuda 2020.1 -- driver.py

from __future__ import absolute_import, print_function

import os
import sys

import six

import numpy as np


# {{{ add cuda lib dir to Python DLL path

def _search_on_path(filenames):
    """Find file on system path."""
    # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224

    from os.path import exists, abspath, join
    from os import pathsep, environ

    search_path = environ["PATH"]

    paths = search_path.split(pathsep)
    for path in paths:
        for filename in filenames:
            if exists(join(path, filename)):
                return abspath(join(path, filename))


def _add_cuda_libdir_to_dll_path():
    from os.path import join, dirname

    cuda_path = os.environ.get("CUDA_PATH")

    if cuda_path is not None:
        os.add_dll_directory(join(cuda_path, 'bin'))
        return

    nvcc_path = _search_on_path(["nvcc.exe"])
    if nvcc_path is not None:
        os.add_dll_directory(dirname(nvcc_path))
        return

    from warnings import warn
    warn("Unable to discover CUDA installation directory "
            "while attempting to add it to Python's DLL path. "
            "Either set the 'CUDA_PATH' environment variable "
            "or ensure that 'nvcc.exe' is on the path.")


try:
    os.add_dll_directory
except AttributeError:
    # os.add_dll_directory is only available on Python 3.8+ on Windows;
    # elsewhere there is no DLL path to fix up.
    # https://github.com/inducer/pycuda/issues/213
    pass
else:
    _add_cuda_libdir_to_dll_path()

# }}}
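
# Illustrative sketch (comments only, not executed here): on Windows with
# Python 3.8+, the CUDA DLL directory is located via CUDA_PATH, which can be
# set before importing pycuda (the install path below is hypothetical):
#
#   import os
#   os.environ["CUDA_PATH"] = r"C:\CUDA\v11.0"
#   import pycuda.driver as drv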


try:
    from pycuda._driver import *  # noqa
except ImportError as e:
    if "_v2" in str(e):
        from warnings import warn
        warn("Failed to import the CUDA driver interface, with an error "
                "message indicating that the version of your CUDA header "
                "does not match the version of your CUDA driver.")
    raise


if sys.version_info >= (3,):
    _memoryview = memoryview
    _my_bytes = bytes
else:
    _memoryview = buffer
    _my_bytes = str


try:
    ManagedAllocationOrStub = ManagedAllocation
except NameError:
    # Provide ManagedAllocationOrStub if not on CUDA 6.
    # This avoids having to do a version check in a high-traffic code path below.

    class ManagedAllocationOrStub(object):
        pass


CUDA_DEBUGGING = False


def set_debugging(flag=True):
    global CUDA_DEBUGGING
    CUDA_DEBUGGING = flag
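
# Illustrative sketch: enabling debugging before compiling modules lets the
# compilation machinery (see pycuda.compiler, which consults CUDA_DEBUGGING)
# emit debuggable device code:
#
#   import pycuda.driver as drv
#   drv.set_debugging()   # CUDA_DEBUGGING is now True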


class CompileError(Error):
    def __init__(self, msg, command_line, stdout=None, stderr=None):
        self.msg = msg
        self.command_line = command_line
        self.stdout = stdout
        self.stderr = stderr

    def __str__(self):
        result = self.msg
        if self.command_line:
            try:
                result += "\n[command: %s]" % (" ".join(self.command_line))
            except Exception as e:
                print(e)
        if self.stdout:
            result += "\n[stdout:\n%s]" % self.stdout
        if self.stderr:
            result += "\n[stderr:\n%s]" % self.stderr

        return result
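
# Illustrative sketch: a failed nvcc invocation surfaces as CompileError, with
# the captured command line and output included in str(err):
#
#   from pycuda.compiler import SourceModule
#   try:
#       mod = SourceModule("__global__ void f() { not valid CUDA }")
#   except CompileError as err:
#       print(err)   # message plus [command: ...] [stdout: ...] [stderr: ...]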


class ArgumentHandler(object):
    def __init__(self, ary):
        self.array = ary
        self.dev_alloc = None

    def get_device_alloc(self):
        if self.dev_alloc is None:
            try:
                self.dev_alloc = mem_alloc_like(self.array)
            except AttributeError:
                raise TypeError("could not determine array length of '%s': unsupported array type or not an array" % type(self.array))
        return self.dev_alloc

    def pre_call(self, stream):
        pass


class In(ArgumentHandler):
    def pre_call(self, stream):
        # The host-to-device copy is synchronous regardless of whether a
        # stream is supplied.
        memcpy_htod(self.get_device_alloc(), self.array)


class Out(ArgumentHandler):
    def post_call(self, stream):
        # The copy back to the host is synchronous regardless of whether a
        # stream is supplied.
        memcpy_dtoh(self.array, self.get_device_alloc())


class InOut(In, Out):
    pass
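
# Illustrative usage sketch for the argument handlers above: wrapping a numpy
# array in In/Out/InOut makes the kernel call stage the host-to-device and
# device-to-host copies automatically (the kernel name is hypothetical):
#
#   import numpy as np
#   import pycuda.driver as drv
#   a = np.random.randn(4, 4).astype(np.float32)
#   doublify(drv.InOut(a), block=(4, 4, 1), grid=(1, 1))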


def _add_functionality():

    def device_get_attributes(dev):
        result = {}

        for att_name in dir(device_attribute):
            if not att_name[0].isupper():
                continue

            att_id = getattr(device_attribute, att_name)

            try:
                att_value = dev.get_attribute(att_id)
            except LogicError as e:
                from warnings import warn
                warn("CUDA driver raised '%s' when querying '%s' on '%s'"
                        % (e, att_name, dev))
            else:
                result[att_id] = att_value

        return result

    def device___getattr__(dev, name):
        return dev.get_attribute(getattr(device_attribute, name.upper()))
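
    # Illustrative sketch: these two helpers are intended to be attached to
    # Device, allowing bulk or attribute-style queries, e.g.
    #
    #   dev = Device(0)
    #   attrs = dev.get_attributes()     # dict keyed by device_attribute
    #   n = dev.max_threads_per_block    # name is upper-cased to
    #                                    # device_attribute.MAX_THREADS_PER_BLOCK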

    def _build_arg_buf(args):
        handlers = []

        arg_data = []
        format = ""
        for i, arg in enumerate(args):
            if isinstance(arg, np.number):
                arg_data.append(arg)
                format += arg.dtype.char
            elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
                arg_data.append(int(arg))
                format += "P"
            elif isinstance(arg, ArgumentHandler):
                handlers.append(arg)
                arg_data.append(int(arg.get_device_alloc()))
                format += "P"
            elif isinstance(arg, np.ndarray):
                if isinstance(arg.base, ManagedAllocationOrStub):
                    arg_data.append(int(arg.base))
                    format += "P"
                else:
                    arg_data.append(arg)
                    format += "%ds" % arg.nbytes
            elif isinstance(arg, np.void):
                arg_data.append(_my_bytes(_memoryview(arg)))
                format += "%ds" % arg.itemsize
            else:
                try:
                    gpudata = np.uintp(arg.gpudata)
                except AttributeError:
                    raise TypeError("invalid type on parameter #%d (0-based)" % i)
                else:
                    # for gpuarrays
                    arg_data.append(int(gpudata))
                    format += "P"

        from pycuda._pvt_struct import pack
        return handlers, pack(format, *arg_data)
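
    # Illustrative sketch of the packing performed above (names hypothetical):
    #
    #   handlers, buf = _build_arg_buf([np.float32(1.5), dev_alloc, np.int32(10)])
    #   # with dev_alloc a DeviceAllocation, format becomes "fPi": the float's
    #   # dtype char, a pointer-sized integer, and the int's dtype char.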

    # {{{ pre-CUDA 4 call interface (stateful)

    def function_param_set_pre_v4(func, *args):
        handlers, buf = _build_arg_buf(args)

        func._param_setv(0, buf)
        func._param_set_size(len(buf))

        return handlers

    def function_call_pre_v4(func, *args, **kwargs):
        grid = kwargs.pop("grid", (1, 1))
        stream = kwargs.pop("stream", None)
        block = kwargs.pop("block", None)
        shared = kwargs.pop("shared", None)
        texrefs = kwargs.pop("texrefs", [])
        time_kernel = kwargs.pop("time_kernel", False)

        if kwargs:
            raise ValueError(
                    "extra keyword arguments: %s"
                    % (",".join(six.iterkeys(kwargs))))

        if block is None:
            raise ValueError("must specify block size")

        func._set_block_shape(*block)
        handlers = func._param_set(*args)
        if shared is not None:
            func._set_shared_size(shared)

        for handler in handlers:
            handler.pre_call(stream)

        for texref in texrefs:
            func.param_set_texref(texref)

        post_handlers = [handler
                for handler in handlers
                if hasattr(handler, "post_call")]

        if stream is None:
            if time_kernel:
                Context.synchronize()

                from time import time
                start_time = time()
            func._launch_grid(*grid)
            if post_handlers or time_kernel:
                Context.synchronize()

                if time_kernel:
                    run_time = time()-start_time

                for handler in post_handlers:
                    handler.post_call(stream)

                if time_kernel:
                    return run_time
        else:
            assert not time_kernel, \
                    "Can't time the kernel on an asynchronous invocation"
            func._launch_grid_async(grid[0], grid[1], stream)

            if post_handlers:
                for handler in post_handlers:
                    handler.post_call(stream)
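
    # Illustrative sketch: bound as Function.__call__ on pre-CUDA-4 drivers, a
    # launch looks like (names hypothetical)
    #
    #   func(a_gpu, np.int32(n), block=(256, 1, 1), grid=(16, 1))
    #
    # Passing stream=... launches asynchronously; time_kernel=True is only
    # valid for synchronous launches and makes the call return the run time.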

    def function_prepare_pre_v4(func, arg_types, block=None,
            shared=None, texrefs=[]):
        from warnings import warn
        if block is not None:
            warn("setting the block size in Function.prepare is deprecated",
                    DeprecationWarning, stacklevel=2)
            func._set_block_shape(*block)

        if shared is not None:
            warn("setting the shared memory size in Function.prepare is deprecated",
                    DeprecationWarning, stacklevel=2)
            func._set_shared_size(shared)

        func.texrefs = texrefs

        func.arg_format = ""

        for i, arg_type in enumerate(arg_types):
            if (isinstance(arg_type, type)
                    and np is not None and np.number in arg_type.__mro__):
                func.arg_format += np.dtype(arg_type).char
            elif isinstance(arg_type, str):
                func.arg_format += arg_type
            else:
                func.arg_format += np.dtype(np.uintp).char

        from pycuda._pvt_struct import calcsize
        func._param_set_size(calcsize(func.arg_format))

        return func

    def function_prepared_call_pre_v4(func, grid, block, *args, **kwargs):
        if isinstance(block, tuple):
            func._set_block_shape(*block)
        else:
            from warnings import warn
            warn("Not passing the block size to prepared_call is deprecated as of "
                    "version 2011.1.", DeprecationWarning, stacklevel=2)
            args = (block,) + args

        shared_size = kwargs.pop("shared_size", None)
        if shared_size is not None:
            func._set_shared_size(shared_size)

        if kwargs:
            raise TypeError("unknown keyword arguments: "
                    + ", ".join(six.iterkeys(kwargs)))

        from pycuda._pvt_struct import pack
        func._param_setv(0, pack(func.arg_format, *args))

        for texref in func.texrefs:
            func.param_set_texref(texref)

        func._launch_grid(*grid)
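
    # Illustrative sketch: prepare() fixes the argument format once so that
    # prepared_call() can skip per-launch introspection (names hypothetical):
    #
    #   func.prepare("iP")                            # an int32 and a pointer
    #   func.prepared_call((16, 1), (256, 1, 1), n, a_gpu)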