from __future__ import division, absolute_import
import numpy as np
import pycuda.elementwise as elementwise
from pytools import memoize, memoize_method
import pycuda.driver as drv
from pycuda.compyte.array import (
as_strided as _as_strided,
f_contiguous_strides as _f_contiguous_strides,
c_contiguous_strides as _c_contiguous_strides,
ArrayFlags as _ArrayFlags,
get_common_dtype as _get_common_dtype_base)
from pycuda.characterize import has_double_support
import six
from six.moves import range, zip, reduce
import numbers
def _get_common_dtype(obj1, obj2):
    """Call the compyte ``get_common_dtype`` helper for *obj1* and *obj2*.

    The result of :func:`pycuda.characterize.has_double_support` is forwarded
    as the helper's third argument.
    """
    double_support = has_double_support()
    return _get_common_dtype_base(obj1, obj2, double_support)
# {{{ vector types
class vec:  # noqa
    """Empty namespace class.

    ``_create_vector_types`` attaches the vector dtypes (e.g. ``vec.float3``)
    and their ``make_*`` factories to it via :func:`setattr`.
    """
def _create_vector_types():
    """Define the vector dtypes and attach them to :class:`vec`.

    For every base type and component count in the table below, a structured
    :class:`numpy.dtype` whose fields are the first ``count`` names out of
    ``x, y, z, w`` is built, handed to ``get_or_register_dtype`` under a name
    such as ``float3``, and stored as ``vec.<name>``.  A static factory
    ``vec.make_<name>`` is stored alongside; it passes its arguments to
    :func:`numpy.array` with that dtype.
    """
    from pycuda.characterize import platform_bits

    # 'long'/'ulong' are 32 bits wide only when platform_bits() reports 32.
    is_32bit = platform_bits() == 32
    long_dtype = np.int32 if is_32bit else np.int64
    ulong_dtype = np.uint32 if is_32bit else np.uint64

    field_names = ["x", "y", "z", "w"]

    from pycuda.tools import get_or_register_dtype

    one_to_four = [1, 2, 3, 4]
    one_to_two = [1, 2]

    # (base name, scalar numpy type, component counts to generate)
    vector_specs = [
        ('char', np.int8, one_to_four),
        ('uchar', np.uint8, one_to_four),
        ('short', np.int16, one_to_four),
        ('ushort', np.uint16, one_to_four),
        ('int', np.int32, one_to_four),
        ('uint', np.uint32, one_to_four),
        ('long', long_dtype, one_to_four),
        ('ulong', ulong_dtype, one_to_four),
        ('longlong', np.int64, one_to_two),
        ('ulonglong', np.uint64, one_to_two),
        ('float', np.float32, one_to_four),
        ('double', np.float64, one_to_two),
    ]

    for base_name, base_type, counts in vector_specs:
        for count in counts:
            name = "%s%d" % (base_name, count)
            components = field_names[:count]

            dtype = np.dtype([(component, base_type) for component in components])
            get_or_register_dtype(name, dtype)
            setattr(vec, name, dtype)

            # eval() yields a lambda whose parameters are literally named
            # x, y, ... (one per component).
            arg_list = ",".join(components)
            factory_source = (
                "lambda %s: array((%s), dtype=my_dtype)" % (arg_list, arg_list))
            factory = eval(factory_source, {"array": np.array, "my_dtype": dtype})

            setattr(vec, "make_" + name, staticmethod(factory))
# Runs once at import time so that the attributes of ``vec`` exist as soon as
# this module has been loaded.
_create_vector_types()
# }}}
# {{{ helper functionality
@memoize
def _splay_backend(n, dev):
    """Compute a ``(grid, block)`` pair for *n* items on device *dev*.

    :returns: ``((block_count, 1), (threads_per_block, 1, 1))``.  Depending on
        how *n* compares with the limits derived from the device, either the
        block count, the threads per block, or both are capped.

    Results are cached per ``(n, dev)`` by the ``memoize`` decorator.
    """
    # heavily modified from cublas
    from pycuda.tools import DeviceData

    devdata = DeviceData(dev)
    warp = devdata.warp_size  # lower bound for threads per block
    thread_cap = 128  # upper bound for threads per block
    block_cap = 4 * devdata.thread_blocks_per_mp \
        * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)

    def ceil_div(numerator, denominator):
        return (numerator + denominator - 1) // denominator

    def launch_config(block_count, threads_per_block):
        return (block_count, 1), (threads_per_block, 1, 1)

    if n < warp:
        # Everything fits in one minimal block.
        return launch_config(1, warp)

    if n < (block_cap * warp):
        # Minimal blocks, as many as are needed to cover n.
        return launch_config(ceil_div(n, warp), warp)

    if n < (block_cap * thread_cap):
        # Block count is capped; grow the blocks in multiples of `warp`.
        warps_total = ceil_div(n, warp)
        return launch_config(block_cap, ceil_div(warps_total, block_cap) * warp)

    # Both limits reached.
    return launch_config(block_cap, thread_cap)
def splay(n, dev=None):
    """Return ``_splay_backend(n, dev)``.

    When *dev* is ``None``, the device returned by
    ``drv.Context.get_device()`` is used.
    """
    device = drv.Context.get_device() if dev is None else dev
    return _splay_backend(n, device)
# }}}
# {{{ main GPUArray class
def _make_binary_op(operator):
    """Build a two-argument method body applying *operator* elementwise.

    The returned function takes ``(self, other)``.  If *other* is a
    :class:`GPUArray`, the kernel from ``elementwise.get_binary_op_kernel`` is
    launched on both arrays; otherwise *other* is passed as-is to the kernel
    from ``elementwise.get_scalar_op_kernel``.  In both cases the result is a
    fresh array obtained from ``self._new_like_me()``.

    :raises RuntimeError: if ``self`` (or an array *other*) does not report
        ``flags.forc``.
    """
    def func(self, other):
        not_contiguous = ("only contiguous arrays may "
                "be used as arguments to this operation")

        if not self.flags.forc:
            raise RuntimeError(not_contiguous)

        other_is_array = isinstance(other, GPUArray)
        if other_is_array:
            assert self.shape == other.shape
            if not other.flags.forc:
                raise RuntimeError(not_contiguous)

        result = self._new_like_me()

        if other_is_array:
            kernel = elementwise.get_binary_op_kernel(
                    self.dtype, other.dtype, result.dtype,
                    operator)
            second_operand = other.gpudata
        else:
            # scalar operator
            kernel = elementwise.get_scalar_op_kernel(
                    self.dtype, result.dtype, operator)
            second_operand = other

        kernel.prepared_async_call(self._grid, self._block, None,
                self.gpudata, second_operand, result.gpudata,
                self.mem_size)
        return result

    return func
class GPUArray(object):
"""A GPUArray is used to do array-based calculation on the GPU.
This is mostly supposed to be a numpy-workalike. Operators
work on an element-by-element basis, just like numpy.ndarray.
"""
__array_priority__ = 100
def __init__(self, shape, dtype, allocator=drv.mem_alloc,
        base=None, gpudata=None, strides=None, order="C"):
    """Set up the array's metadata and, unless *gpudata* is given, its storage.

    :arg shape: an integer, a 0-d integer :class:`numpy.ndarray`, or an
        iterable of dimension lengths.
    :arg dtype: anything accepted by :class:`numpy.dtype`.
    :arg allocator: callable invoked with a byte count; only called when
        *gpudata* is ``None`` and the array has a nonzero number of elements.
    :arg base: stored as ``self.base``; must be ``None`` when *gpudata* is
        ``None``.
    :arg gpudata: existing storage to use instead of calling *allocator*.
    :arg strides: explicit strides; when ``None`` they are derived from
        *order*.
    :arg order: ``"C"`` or ``"F"``; only consulted when *strides* is ``None``.
    :raises ValueError: if *strides* is ``None`` and *order* is neither
        ``"C"`` nor ``"F"``.
    """
    dtype = np.dtype(dtype)

    try:
        s = 1
        for dim in shape:
            s *= dim
    except TypeError:
        # 'shape' is not iterable, i.e. a single length was given.
        # handle dim-0 ndarrays:
        if isinstance(shape, np.ndarray):
            # .item() replaces np.asscalar(), which NumPy has removed.
            shape = shape.item()

        assert isinstance(shape, numbers.Integral)
        s = shape
        shape = (shape,)
    else:
        # handle shapes that are ndarrays
        shape = tuple(shape)

    if isinstance(s, np.integer):
        # Store the element count as a plain Python int.
        s = s.item()

    if strides is None:
        if order == "F":
            strides = _f_contiguous_strides(
                    dtype.itemsize, shape)
        elif order == "C":
            strides = _c_contiguous_strides(
                    dtype.itemsize, shape)
        else:
            raise ValueError("invalid order: %s" % order)
    else:
        # FIXME: We should possibly perform some plausibility
        # checking on 'strides' here.

        strides = tuple(strides)

    self.shape = tuple(shape)
    self.dtype = dtype
    self.strides = strides
    self.mem_size = self.size = s
    self.nbytes = self.dtype.itemsize * self.size
    self.itemsize = self.dtype.itemsize

    self.allocator = allocator
    if gpudata is None:
        if self.size:
            self.gpudata = self.allocator(self.size * self.dtype.itemsize)
        else:
            # Zero-element arrays get no allocation at all.
            self.gpudata = None

        assert base is None
    else:
        self.gpudata = gpudata

    self.base = base

    # Launch configuration reused by the elementwise kernel wrappers.
    self._grid, self._block = splay(self.mem_size)
@property
def ndim(self):
return len(self.shape)
@property
@memoize_method
def flags(self):
    """The compyte ``ArrayFlags`` object constructed from this array.

    The underlying method is wrapped in ``memoize_method``.
    """
    array_flags = _ArrayFlags(self)
    return array_flags
def set(self, ary, async_=False, stream=None, **kwargs):
# {{{ handle 'async' deprecation
async_arg = kwargs.pop("async", None)
if async_arg is not None:
if async_ is not None:
raise TypeError("may not specify both 'async' and 'async_'")
async_ = async_arg
if async_ is None:
async_ = False
if kwargs:
raise TypeError("extra keyword arguments specified: %s"
% ", ".join(kwargs))
# }}}
if ary.size != self.size:
raise ValueError("ary and self must be the same size")
if ary.shape != self.shape:
from warnings import warn
warn("Setting array from one with different shape.",
stacklevel=2)
ary = ary.reshape(self.shape)
if ary.dtype != self.dtype:
raise ValueError("ary and self must have the same dtype")
if self.size:
_memcpy_discontig(self, ary, async_=async_, stream=stream)
def set_async(self, ary, stream=None):
return self.set(ary, async_=True, stream=stream)
def get(self, ary=None, pagelocked=False, async_=False, stream=None, **kwargs):
# {{{ handle 'async' deprecation
async_arg = kwargs.pop("async", None)
if async_arg is not None:
if async_ is not None:
raise TypeError("may not specify both 'async' and 'async_'")
async_ = async_arg
if async_ is None:
async_ = False
if kwargs:
raise TypeError("extra keyword arguments specified: %s"
% ", ".join(kwargs))
# }}}
if ary is None:
if pagelocked:
ary = drv.pagelocked_empty(self.shape, self.dtype)
else:
ary = np.empty(self.shape, self.dtype)
strides = _compact_strides(self)
ary = _as_strided(ary, strides=strides)
else:
if self.size != ary.size:
raise ValueError("self and ary must be the same size")
if self.shape != ary.shape:
from warnings import warn
warn("get() between arrays of different shape is deprecated "
"and will be removed in PyCUDA 2017.x",
DeprecationWarning, stacklevel=2)
ary = ary.reshape(self.shape)
if self.dtype != ary.dtype:
raise TypeError("self and ary must have the same dtype")
if self.size:
_memcpy_discontig(ary, self, async_=async_, stream=stream)
return ary
def get_async(self, stream=None, ary=None):
return self.get(ary=ary, async_=True, stream=stream)
def copy(self):
    """Return a new :class:`GPUArray` filled from this one.

    The new array is created from this array's ``shape``, ``dtype`` and
    ``allocator`` and then passed, together with ``self``, to
    ``_memcpy_discontig``.
    """
    duplicate = GPUArray(
            shape=self.shape, dtype=self.dtype, allocator=self.allocator)
    _memcpy_discontig(duplicate, self)
    return duplicate
def __str__(self):
return str(self.get())
def __repr__(self):
return repr(self.get())
def __hash__(self):
raise TypeError("GPUArrays are not hashable.")
@property
def ptr(self):
return self.gpudata.__int__()
# kernel invocation wrappers ----------------------------------------------
def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None):
"""Compute ``out = selffac * self + otherfac*other``,
where `other` is a vector.."""
assert self.shape == other.shape
if not self.flags.forc or not other.flags.forc:
raise RuntimeError("only contiguous arrays may "
"be used as arguments to this operation")
func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype)
if add_timer is not None:
add_timer(3*self.size, func.prepared_timed_call(self._grid,
selffac, self.gpudata, otherfac, other.gpudata,
out.gpudata, self.mem_size))
else:
func.prepared_async_call(self._grid, self._block, stream,
Loading ...