from __future__ import absolute_import, print_function
import os
import sys
import six
import numpy as np
# {{{ add cuda lib dir to Python DLL path
def _search_on_path(filenames):
"""Find file on system path."""
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224
from os.path import exists, abspath, join
from os import pathsep, environ
search_path = environ["PATH"]
paths = search_path.split(pathsep)
for path in paths:
for filename in filenames:
if exists(join(path, filename)):
return abspath(join(path, filename))
def _add_cuda_libdir_to_dll_path():
from os.path import join, dirname
cuda_path = os.environ.get("CUDA_PATH")
if cuda_path is not None:
os.add_dll_directory(join(cuda_path, 'bin'))
return
    nvcc_path = _search_on_path(["nvcc.exe"])
    if nvcc_path is not None:
        os.add_dll_directory(dirname(nvcc_path))
        return

    from warnings import warn
    warn("Unable to discover CUDA installation directory "
            "while attempting to add it to Python's DLL path. "
            "Either set the 'CUDA_PATH' environment variable "
            "or ensure that 'nvcc.exe' is on the path.")
try:
os.add_dll_directory
except AttributeError:
    # os.add_dll_directory exists only on Windows with Python >= 3.8
# https://github.com/inducer/pycuda/issues/213
pass
else:
_add_cuda_libdir_to_dll_path()
# }}}
try:
from pycuda._driver import * # noqa
except ImportError as e:
if "_v2" in str(e):
from warnings import warn
warn("Failed to import the CUDA driver interface, with an error "
"message indicating that the version of your CUDA header "
"does not match the version of your CUDA driver.")
raise
if sys.version_info >= (3,):
_memoryview = memoryview
_my_bytes = bytes
else:
_memoryview = buffer
_my_bytes = str
try:
ManagedAllocationOrStub = ManagedAllocation
except NameError:
# Provide ManagedAllocationOrStub if not on CUDA 6.
# This avoids having to do a version check in a high-traffic code path below.
class ManagedAllocationOrStub(object):
pass
CUDA_DEBUGGING = False
def set_debugging(flag=True):
global CUDA_DEBUGGING
CUDA_DEBUGGING = flag
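
# Hedged usage note: the flag above is consumed elsewhere in pycuda (by the
# compiler module, which is not shown here); calling set_debugging() before
# any kernels are compiled requests device debug information, roughly
# nvcc's -g/-G, so kernels can be stepped through with cuda-gdb.
#
#   import pycuda.driver as drv
#   drv.set_debugging()   # call before building any modules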
class CompileError(Error):
def __init__(self, msg, command_line, stdout=None, stderr=None):
self.msg = msg
self.command_line = command_line
self.stdout = stdout
self.stderr = stderr
def __str__(self):
result = self.msg
if self.command_line:
try:
result += "\n[command: %s]" % (" ".join(self.command_line))
            except Exception as e:
                # a formatting failure must not mask the compile error itself
                result += "\n[command: <unprintable: %s>]" % e
if self.stdout:
result += "\n[stdout:\n%s]" % self.stdout
if self.stderr:
result += "\n[stderr:\n%s]" % self.stderr
return result
class ArgumentHandler(object):
def __init__(self, ary):
self.array = ary
self.dev_alloc = None
def get_device_alloc(self):
if self.dev_alloc is None:
try:
self.dev_alloc = mem_alloc_like(self.array)
except AttributeError:
raise TypeError("could not determine array length of '%s': unsupported array type or not an array" % type(self.array))
return self.dev_alloc
def pre_call(self, stream):
pass
class In(ArgumentHandler):
    def pre_call(self, stream):
        # host-to-device copy; memcpy_htod is synchronous, so the stream
        # (if any) is not used here
        memcpy_htod(self.get_device_alloc(), self.array)
class Out(ArgumentHandler):
    def post_call(self, stream):
        # device-to-host copy; memcpy_dtoh is synchronous, so the stream
        # (if any) is not used here
        memcpy_dtoh(self.array, self.get_device_alloc())
class InOut(In, Out):
pass
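
# Usage sketch for the handlers above (mirrors the pycuda tutorial; the
# kernel name 'doublify' is a stand-in, not something defined here):
#
#   import numpy
#   a = numpy.random.randn(4, 4).astype(numpy.float32)
#   doublify(InOut(a), block=(4, 4, 1))   # 'a' is copied in and back out
#
# Wrapping an argument in In/Out/InOut makes the kernel call perform the
# host/device transfers itself, at the cost of one copy per direction.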
def _add_functionality():
def device_get_attributes(dev):
result = {}
for att_name in dir(device_attribute):
if not att_name[0].isupper():
continue
att_id = getattr(device_attribute, att_name)
try:
att_value = dev.get_attribute(att_id)
except LogicError as e:
from warnings import warn
warn("CUDA driver raised '%s' when querying '%s' on '%s'"
% (e, att_name, dev))
else:
result[att_id] = att_value
return result
def device___getattr__(dev, name):
return dev.get_attribute(getattr(device_attribute, name.upper()))
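
    # The two helpers above get attached to Device later in
    # _add_functionality, which makes these spellings equivalent (attribute
    # names are the lower-cased forms of the device_attribute constants):
    #
    #   dev = Device(0)
    #   dev.get_attributes()[device_attribute.MAX_THREADS_PER_BLOCK]
    #   dev.max_threads_per_block   # resolved via device___getattr__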
def _build_arg_buf(args):
handlers = []
arg_data = []
format = ""
for i, arg in enumerate(args):
if isinstance(arg, np.number):
arg_data.append(arg)
format += arg.dtype.char
elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
arg_data.append(int(arg))
format += "P"
elif isinstance(arg, ArgumentHandler):
handlers.append(arg)
arg_data.append(int(arg.get_device_alloc()))
format += "P"
elif isinstance(arg, np.ndarray):
if isinstance(arg.base, ManagedAllocationOrStub):
arg_data.append(int(arg.base))
format += "P"
else:
arg_data.append(arg)
format += "%ds" % arg.nbytes
elif isinstance(arg, np.void):
arg_data.append(_my_bytes(_memoryview(arg)))
format += "%ds" % arg.itemsize
else:
try:
gpudata = np.uintp(arg.gpudata)
except AttributeError:
raise TypeError("invalid type on parameter #%d (0-based)" % i)
else:
# for gpuarrays
arg_data.append(int(gpudata))
format += "P"
from pycuda._pvt_struct import pack
return handlers, pack(format, *arg_data)
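
    # A sketch of what _build_arg_buf produces (format characters are numpy
    # dtype chars, interpreted like the stdlib struct module): for
    #
    #   args = (numpy.int32(5), dev_alloc, numpy.float32(2.0))
    #
    # the format string comes out as "iPf", and the returned buffer packs
    # the scalar, the pointer value of dev_alloc, and the float back to
    # back, ready for the parameter-setting calls below.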
# {{{ pre-CUDA 4 call interface (stateful)
    def function_param_set_pre_v4(func, *args):
        handlers, buf = _build_arg_buf(args)
func._param_setv(0, buf)
func._param_set_size(len(buf))
return handlers
def function_call_pre_v4(func, *args, **kwargs):
grid = kwargs.pop("grid", (1, 1))
stream = kwargs.pop("stream", None)
block = kwargs.pop("block", None)
shared = kwargs.pop("shared", None)
texrefs = kwargs.pop("texrefs", [])
time_kernel = kwargs.pop("time_kernel", False)
if kwargs:
raise ValueError(
"extra keyword arguments: %s"
% (",".join(six.iterkeys(kwargs))))
if block is None:
raise ValueError("must specify block size")
func._set_block_shape(*block)
handlers = func._param_set(*args)
if shared is not None:
func._set_shared_size(shared)
for handler in handlers:
handler.pre_call(stream)
for texref in texrefs:
func.param_set_texref(texref)
post_handlers = [handler
for handler in handlers
if hasattr(handler, "post_call")]
if stream is None:
if time_kernel:
Context.synchronize()
from time import time
start_time = time()
func._launch_grid(*grid)
if post_handlers or time_kernel:
Context.synchronize()
if time_kernel:
run_time = time()-start_time
for handler in post_handlers:
handler.post_call(stream)
if time_kernel:
return run_time
else:
assert not time_kernel, \
"Can't time the kernel on an asynchronous invocation"
func._launch_grid_async(grid[0], grid[1], stream)
if post_handlers:
for handler in post_handlers:
handler.post_call(stream)
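
    # Launch sketch for the stateful interface above ('func', 'a', and 'b'
    # are assumed to exist): a synchronous, timed call returns the
    # wall-clock duration of the kernel in seconds.
    #
    #   run_time = func(In(a), Out(b), block=(256, 1, 1), grid=(64, 1),
    #                   time_kernel=True)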
def function_prepare_pre_v4(func, arg_types, block=None,
shared=None, texrefs=[]):
from warnings import warn
if block is not None:
warn("setting the block size in Function.prepare is deprecated",
DeprecationWarning, stacklevel=2)
func._set_block_shape(*block)
if shared is not None:
warn("setting the shared memory size in Function.prepare is deprecated",
DeprecationWarning, stacklevel=2)
func._set_shared_size(shared)
func.texrefs = texrefs
func.arg_format = ""
        for arg_type in arg_types:
            if isinstance(arg_type, type) and np.number in arg_type.__mro__:
                func.arg_format += np.dtype(arg_type).char
            elif isinstance(arg_type, str):
                func.arg_format += arg_type
            else:
                func.arg_format += np.dtype(np.uintp).char
from pycuda._pvt_struct import calcsize
func._param_set_size(calcsize(func.arg_format))
return func
def function_prepared_call_pre_v4(func, grid, block, *args, **kwargs):
if isinstance(block, tuple):
func._set_block_shape(*block)
else:
from warnings import warn
warn("Not passing the block size to prepared_call is deprecated as of "
"version 2011.1.", DeprecationWarning, stacklevel=2)
args = (block,) + args
shared_size = kwargs.pop("shared_size", None)
if shared_size is not None:
func._set_shared_size(shared_size)
if kwargs:
raise TypeError("unknown keyword arguments: "
+ ", ".join(six.iterkeys(kwargs)))
from pycuda._pvt_struct import pack
func._param_setv(0, pack(func.arg_format, *args))
for texref in func.texrefs:
func.param_set_texref(texref)
func._launch_grid(*grid)
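
    # Prepared-call sketch for this interface: prepare() fixes the argument
    # format once, so repeated launches skip the per-call packing above.
    # (The "iP" signature and the names below are illustrative.)
    #
    #   func.prepare("iP")       # an int32 followed by a device pointer
    #   func.prepared_call(grid, block, n, int(a_gpu))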