# edgify / pycuda / tools.py (version 2020.1)

"""Miscallenous helper functionality."""

from __future__ import division, print_function
from __future__ import absolute_import
import six
from six.moves import range
from six.moves import input

__copyright__ = "Copyright (C) 2008 Andreas Kloeckner"

__license__ = """
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
"""

import pycuda.driver as cuda
from decorator import decorator
import pycuda._driver as _drv
import numpy as np


bitlog2 = _drv.bitlog2
DeviceMemoryPool = _drv.DeviceMemoryPool
PageLockedMemoryPool = _drv.PageLockedMemoryPool
PageLockedAllocator = _drv.PageLockedAllocator
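
# Illustrative note (comment only, not part of the original module): the pool
# classes re-exported above come from the _driver extension. A minimal usage
# sketch, assuming an active context (e.g. via pycuda.autoinit):
#
#   pool = DeviceMemoryPool()
#   buf = pool.allocate(1 << 20)   # pooled device allocation of 1 MiB
#   del buf                        # returns the block to the pool, not the driver
#   pool.free_held()               # hand held blocks back to the CUDA driver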

from pycuda.compyte.dtypes import (
        register_dtype, get_or_register_dtype, _fill_dtype_registry,
        dtype_to_ctype as base_dtype_to_ctype)

_fill_dtype_registry(respect_windows=True)
get_or_register_dtype("pycuda::complex<float>", np.complex64)
get_or_register_dtype("pycuda::complex<double>", np.complex128)


# {{{ debug memory pool

class DebugMemoryPool(DeviceMemoryPool):
    def __init__(self, interactive=True, logfile=None):
        DeviceMemoryPool.__init__(self)
        self.last_free, _ = cuda.mem_get_info()
        self.interactive = interactive

        if logfile is None:
            import sys
            logfile = sys.stdout

        self.logfile = logfile

        from weakref import WeakKeyDictionary
        self.blocks = WeakKeyDictionary()

        if interactive:
            from pytools.diskdict import DiskDict
            self.stacktrace_mnemonics = DiskDict("pycuda-stacktrace-mnemonics")

    def allocate(self, size):
        from traceback import extract_stack
        stack = tuple(frm[2] for frm in extract_stack())
        description = self.describe(stack, size)

        histogram = {}
        for bsize, descr in six.itervalues(self.blocks):
            histogram[bsize, descr] = histogram.get((bsize, descr), 0) + 1

        from pytools import common_prefix
        cpfx = common_prefix(descr for bsize, descr in histogram)

        print(
                "\n  Allocation of size %d occurring "
                "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):"
                "\n      at: %s" % (
                    (size, self.last_free) + cuda.mem_get_info()
                    + (self.held_blocks, self.active_blocks,
                    description)),
                file=self.logfile)

        hist_items = sorted(list(six.iteritems(histogram)))
        for (bsize, descr), count in hist_items:
            print("  %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count),
                    file=self.logfile)

        if self.interactive:
            input("  [Enter]")

        result = DeviceMemoryPool.allocate(self, size)
        self.blocks[result] = size, description
        self.last_free, _ = cuda.mem_get_info()
        return result

    def describe(self, stack, size):
        if not self.interactive:
            return "|".join(stack)
        else:
            try:
                return self.stacktrace_mnemonics[stack, size]
            except KeyError:
                print(size, stack)
                while True:
                    mnemonic = input("Enter mnemonic or [Enter] for more info:")
                    if mnemonic == '':
                        from traceback import print_stack
                        print_stack()
                    else:
                        break
                self.stacktrace_mnemonics[stack, size] = mnemonic
                return mnemonic

# }}}
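
# Illustrative usage sketch (comment only, not executed; "a" is a placeholder
# array): DebugMemoryPool can stand in wherever a pooled allocator is accepted,
# e.g. as a gpuarray allocator, logging every allocation it serves.
#
#   import pycuda.autoinit
#   import pycuda.gpuarray as gpuarray
#   import numpy as np
#   from pycuda.tools import DebugMemoryPool
#
#   pool = DebugMemoryPool(interactive=False)
#   a = gpuarray.to_gpu(np.arange(1000, dtype=np.float32),
#                       allocator=pool.allocate)
#
# With interactive=True (the default), each allocation additionally prompts on
# stdin for a mnemonic, persisted via pytools' DiskDict.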


# {{{ default device/context

def get_default_device(default=0):
    from warnings import warn
    warn("get_default_device() is deprecated; "
            "use make_default_context() instead", DeprecationWarning)

    from pycuda.driver import Device
    import os
    dev = os.environ.get("CUDA_DEVICE")

    if dev is None:
        try:
            dev = (open(os.path.join(os.path.expanduser("~"), ".cuda_device"))
                    .read().strip())
        except IOError:
            pass

    if dev is None:
        dev = default

    try:
        dev = int(dev)
    except (TypeError, ValueError):
        raise TypeError("CUDA device number (CUDA_DEVICE or ~/.cuda_device) "
                "must be an integer")

    return Device(dev)


def make_default_context(ctx_maker=None):
    if ctx_maker is None:
        def ctx_maker(dev):
            return dev.make_context()

    ndevices = cuda.Device.count()
    if ndevices == 0:
        raise RuntimeError("No CUDA enabled device found. "
                "Please check your installation.")

    # Is CUDA_DEVICE set?
    import os
    devn = os.environ.get("CUDA_DEVICE")

    # Is $HOME/.cuda_device set?
    if devn is None:
        homedir = os.environ.get("HOME")
        if homedir is not None:
            try:
                devn = (open(os.path.join(homedir, ".cuda_device"))
                        .read().strip())
            except IOError:
                pass

    # If either CUDA_DEVICE or $HOME/.cuda_device is set, try to use it
    if devn is not None:
        try:
            devn = int(devn)
        except (TypeError, ValueError):
            raise TypeError("CUDA device number (CUDA_DEVICE or ~/.cuda_device)"
                    " must be an integer")

        dev = cuda.Device(devn)
        return ctx_maker(dev)

    # Otherwise, try to use any available device
    else:
        for devn in range(ndevices):
            dev = cuda.Device(devn)
            try:
                return ctx_maker(dev)
            except cuda.Error:
                pass

        raise RuntimeError("make_default_context() wasn't able to create a context "
                "on any of the %d detected devices" % ndevices)

# }}}
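
# Illustrative usage sketch (comment only, not executed): in a standalone script,
# make_default_context() is typically paired with an explicit pop() so the context
# is released on exit; pycuda.autoinit performs essentially these steps.
#
#   import pycuda.driver as cuda
#   from pycuda.tools import make_default_context
#
#   cuda.init()
#   ctx = make_default_context()
#   try:
#       pass  # compile modules, launch kernels, ...
#   finally:
#       ctx.pop()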


# {{{ rounding helpers

def _exact_div(dividend, divisor):
    quot, rem = divmod(dividend, divisor)
    assert rem == 0
    return quot


def _int_ceiling(value, multiple_of=1):
    """Round C{value} up to be a C{multiple_of} something."""
    # Mimics the Excel "ceiling" function (for code stolen from occupancy calculator)

    from math import ceil
    return int(ceil(value/multiple_of))*multiple_of


def _int_floor(value, multiple_of=1):
    """Round C{value} down to be a C{multiple_of} something."""
    # Mimics the Excel "floor" function (for code stolen from occupancy calculator)

    from math import floor
    return int(floor(value/multiple_of))*multiple_of

# }}}
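
# Worked examples (comment only): the helpers above round to a multiple of their
# second argument.
#
#   _int_ceiling(13, 4)   # -> 16
#   _int_floor(13, 4)     # -> 12
#   _exact_div(128, 32)   # -> 4   (asserts that the division is exact)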


# {{{ device data

class DeviceData:
    def __init__(self, dev=None):
        import pycuda.driver as drv

        if dev is None:
            dev = cuda.Context.get_device()

        self.max_threads = dev.get_attribute(
                drv.device_attribute.MAX_THREADS_PER_BLOCK)
        self.warp_size = dev.get_attribute(drv.device_attribute.WARP_SIZE)

        if dev.compute_capability() >= (3, 0):
            self.warps_per_mp = 64
        elif dev.compute_capability() >= (2, 0):
            self.warps_per_mp = 48
        elif dev.compute_capability() >= (1, 2):
            self.warps_per_mp = 32
        else:
            self.warps_per_mp = 24

        self.thread_blocks_per_mp = 8
        self.registers = dev.get_attribute(
                drv.device_attribute.MAX_REGISTERS_PER_BLOCK)
        self.shared_memory = dev.get_attribute(
                drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)

        if dev.compute_capability() >= (2, 0):
            self.smem_alloc_granularity = 128
            self.smem_granularity = 32
        else:
            self.smem_alloc_granularity = 512
            self.smem_granularity = 16

        if dev.compute_capability() >= (2, 0):
            self.register_allocation_unit = "warp"
        else:
            self.register_allocation_unit = "block"

    def align(self, bytes, word_size=4):
        return _int_ceiling(bytes, self.align_bytes(word_size))

    def align_dtype(self, elements, dtype_size):
        return _int_ceiling(elements,
                self.align_words(dtype_size))

    def align_words(self, word_size):
        return _exact_div(self.align_bytes(word_size), word_size)

    def align_bytes(self, word_size=4):
        if word_size == 4:
            return 64
        elif word_size == 8:
            return 128
        elif word_size == 16:
            return 128
        else:
            raise ValueError("no alignment possible for fetches of size %d" % word_size)

    def coalesce(self, thread_count):
        return _int_ceiling(thread_count, 16)

    @staticmethod
    def make_valid_tex_channel_count(size):
        valid_sizes = [1,2,4]
        for vs in valid_sizes:
            if size <= vs:
                return vs

        raise ValueError("could not enlarge argument to valid channel count")

# }}}
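
# Illustrative usage sketch (comment only, not executed; the values shown depend on
# the device at hand): DeviceData bundles the per-device limits that the occupancy
# calculation below consumes.
#
#   import pycuda.autoinit
#   from pycuda.tools import DeviceData, OccupancyRecord
#
#   devdata = DeviceData()        # queries the current context's device
#   devdata.warp_size             # typically 32
#   devdata.align_bytes(4)        # -> 64, per align_bytes() above
#
#   occ = OccupancyRecord(devdata, threads=256, registers=32)
#   occ.occupancy                 # fraction of the multiprocessor's warp slots used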

# {{{ occupancy

class OccupancyRecord:
    def __init__(self, devdata, threads, shared_mem=0, registers=0):
        if threads > devdata.max_threads:
            raise ValueError("too many threads")

        # copied literally from occupancy calculator
        alloc_warps = _int_ceiling(threads/devdata.warp_size)
        alloc_smem = _int_ceiling(shared_mem, devdata.smem_alloc_granularity)
        if devdata.register_allocation_unit == "warp":
            alloc_regs = alloc_warps*32*registers
        elif devdata.register_allocation_unit == "block":
            alloc_regs = _int_ceiling(alloc_warps*2, 4)*16*registers
        else:
            raise ValueError("Improper register allocation unit:"+devdata.register_allocation_unit)

        if alloc_regs > devdata.registers:
            raise ValueError("too many registers")

        if alloc_smem > devdata.shared_memory:
            raise ValueError("too much smem")

        self.tb_per_mp_limits = [(devdata.thread_blocks_per_mp, "device"),
                (_int_floor(devdata.warps_per_mp/alloc_warps), "warps")
                ]
        if registers > 0:
            self.tb_per_mp_limits.append(
                    (_int_floor(devdata.registers/alloc_regs), "regs"))
        if shared_mem > 0:
            self.tb_per_mp_limits.append(
                    (_int_floor(devdata.shared_memory/alloc_smem), "smem"))

        self.tb_per_mp, self.limited_by = min(self.tb_per_mp_limits)

        self.warps_per_mp = self.tb_per_mp * alloc_warps
        self.occupancy = self.warps_per_mp / devdata.warps_per_mp