# pycuda/sparse/coordinate.py
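"""Sparse matrix-vector multiplication for matrices in COO (coordinate)
format on the GPU via PyCUDA.

A "flat" kernel lets each warp process a contiguous interval of nonzeros
with a segmented reduction in shared memory, while a single-threaded
"serial" kernel handles the tail of nonzeros left over after the work is
partitioned into warp-sized units.
"""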

from __future__ import division
from __future__ import absolute_import
from pytools import memoize_method
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np




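# Flat COO kernel: each warp walks a contiguous interval of nonzeros
# WARP_SIZE entries at a time, reading the input vector x through the tex_x
# texture.  Products belonging to the same row are combined by a segmented
# reduction in shared memory; partial sums that may span warp or iteration
# boundaries are carried forward and finally committed to y with atomicAdd.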
COO_FLAT_KERNEL_TEMPLATE = """
#include <pycuda-helpers.hpp>

#define BLOCK_SIZE %(block_size)d
#define WARP_SIZE %(warp_size)d

typedef %(value_type)s value_type;
typedef %(index_type)s index_type;

texture<%(tex_value_type)s, 1, cudaReadModeElementType> tex_x;

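// atomicAdd built on atomicCAS, so partial row sums can be accumulated
// into y for floating-point value types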
static __inline__ __device__ float atomicAdd(float *addr, float val)
{
    float old=*addr, assumed;

    do {
        assumed = old;
        old = int_as_float( atomicCAS((int*)addr,
                                        float_as_int(assumed),
                                        float_as_int(val+assumed)));
    } while( assumed!=old );

    return old;
}

#ifndef CUDA_NO_SM_13_DOUBLE_INTRINSICS
static __attribute__ ((unused)) __inline__ __device__ double atomicAdd(double *addr, double val)
{
    double old=*addr, assumed;

    do {
        assumed = old;
        old = __longlong_as_double( atomicCAS((unsigned long long int*)addr,
                                        __double_as_longlong(assumed),
                                        __double_as_longlong(val+assumed)));
    } while( assumed!=old );

    return old;
}
#endif

__global__ void
spmv_coo_flat_kernel(const index_type num_nonzeros,
                     const index_type interval_size,
                     const index_type *I,
                     const index_type *J,
                     const value_type *V,
                           value_type *y)
{
  __shared__ index_type idx[BLOCK_SIZE];
  __shared__ value_type val[BLOCK_SIZE];
  __shared__ index_type carry_idx[BLOCK_SIZE / 32];
  __shared__ value_type carry_val[BLOCK_SIZE / 32];

  const index_type thread_id   = BLOCK_SIZE * blockIdx.x + threadIdx.x;     // global thread index
  const index_type thread_lane = threadIdx.x & (WARP_SIZE-1);               // thread index within the warp
  const index_type warp_id     = thread_id   / WARP_SIZE;                   // global warp index
  const index_type warp_lane   = threadIdx.x / WARP_SIZE;                   // warp index within the CTA

  const index_type begin = warp_id * interval_size + thread_lane;           // thread's offset into I,J,V
  const index_type end   = min(begin + interval_size, num_nonzeros);        // end of thread's work

  if(begin >= end) return;                                                 // warp has no work to do

  const index_type first_idx = I[warp_id * interval_size];                  // first row of this warp's interval

  if (thread_lane == 0)
  {
    carry_idx[warp_lane] = first_idx;
    carry_val[warp_lane] = 0;
  }

  for(index_type n = begin; n < end; n += WARP_SIZE)
  {
    idx[threadIdx.x] = I[n];                                             // row index
    val[threadIdx.x] = V[n] * fp_tex1Dfetch(tex_x, J[n]);                // val = A[row,col] * x[col]

    if (thread_lane == 0){
      if(idx[threadIdx.x] == carry_idx[warp_lane])
          val[threadIdx.x] += carry_val[warp_lane];                    // row continues into this warp's span
      else if(carry_idx[warp_lane] != first_idx)
          y[carry_idx[warp_lane]] += carry_val[warp_lane];             // row terminated, does not span boundary
      else
          atomicAdd(y + carry_idx[warp_lane], carry_val[warp_lane]);   // row terminated, but spans inter-warp boundary
    }

    // segmented reduction in shared memory
    if( thread_lane >=  1 && idx[threadIdx.x] == idx[threadIdx.x - 1] ) { val[threadIdx.x] += val[threadIdx.x -  1]; }
    if( thread_lane >=  2 && idx[threadIdx.x] == idx[threadIdx.x - 2] ) { val[threadIdx.x] += val[threadIdx.x -  2]; }
    if( thread_lane >=  4 && idx[threadIdx.x] == idx[threadIdx.x - 4] ) { val[threadIdx.x] += val[threadIdx.x -  4]; }
    if( thread_lane >=  8 && idx[threadIdx.x] == idx[threadIdx.x - 8] ) { val[threadIdx.x] += val[threadIdx.x -  8]; }
    if( thread_lane >= 16 && idx[threadIdx.x] == idx[threadIdx.x -16] ) { val[threadIdx.x] += val[threadIdx.x - 16]; }

    if( thread_lane == 31 ) {
      carry_idx[warp_lane] = idx[threadIdx.x];                         // last thread in warp saves its results
      carry_val[warp_lane] = val[threadIdx.x];
    }
    else if ( idx[threadIdx.x] != idx[threadIdx.x+1] ) {                 // row terminates here
      if(idx[threadIdx.x] != first_idx)
          y[idx[threadIdx.x]] += val[threadIdx.x];                     // row terminated, does not span inter-warp boundary
      else
          atomicAdd(y + idx[threadIdx.x], val[threadIdx.x]);           // row terminated, but spans inter-warp boundary
    }
  }

  // final carry
  if(thread_lane == 31){
    atomicAdd(y + carry_idx[warp_lane], carry_val[warp_lane]);
  }
}
"""



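# Serial fallback kernel: a single thread accumulates the products for the
# tail of nonzeros that does not fill a complete warp-sized unit.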
COO_SERIAL_KERNEL_TEMPLATE = """
typedef %(value_type)s value_type;
typedef %(index_type)s index_type;

__global__ void
spmv_coo_serial_kernel(const index_type num_nonzeros,
                       const index_type *I,
                       const index_type *J,
                       const value_type *V,
                       const value_type *x,
                             value_type *y)
{
  for (index_type n = 0; n < num_nonzeros; n++)
    y[I[n]] += V[n] * x[J[n]];
}
"""




class CoordinateSpMV:
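    """Sparse matrix-vector product for a matrix stored in COO format.

    The bulk of the nonzeros is handled by the flat kernel, one interval per
    warp; the leftover tail is handled by the single-threaded serial kernel.
    """
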
    def __init__(self, mat, dtype):
        self.dtype = np.dtype(dtype)
        self.index_dtype = np.dtype(np.int32)
        self.shape = mat.shape

        self.block_size = 128

        from scipy.sparse import coo_matrix
        coo_mat = coo_matrix(mat, dtype=self.dtype)

        self.row_gpu = gpuarray.to_gpu(coo_mat.row.astype(self.index_dtype))
        self.col_gpu = gpuarray.to_gpu(coo_mat.col.astype(self.index_dtype))
        self.data_gpu = gpuarray.to_gpu(coo_mat.data)
        self.nnz = coo_mat.nnz

        from pycuda.tools import DeviceData
        dev = drv.Context.get_device()
        devdata = DeviceData()
        max_threads = (devdata.warps_per_mp*devdata.warp_size*
                dev.multiprocessor_count)
        max_blocks = 4*max_threads // self.block_size
        warps_per_block = self.block_size // dev.warp_size

        if self.nnz:
            def divide_into(x, y):
                return (x+y-1)//y

            # split the nonzeros into warp-size units; whatever does not fill
            # a complete unit (the "tail") is left to the serial kernel
            num_units  = self.nnz // dev.warp_size

            if num_units:
                num_warps  = min(num_units, warps_per_block * max_blocks)
                self.num_blocks = divide_into(num_warps, warps_per_block)
                num_iters  = divide_into(num_units, num_warps)

                self.interval_size = dev.warp_size * num_iters
                self.tail = num_units * dev.warp_size
            else:
                # fewer nonzeros than one warp: the flat kernel has no work,
                # and divide_into(0, 0) would divide by zero
                self.num_blocks = 0
                self.interval_size = 0
                self.tail = 0


    @memoize_method
    def get_flat_kernel(self):
        from pycuda.tools import dtype_to_ctype

        mod = SourceModule(
                COO_FLAT_KERNEL_TEMPLATE % {
                    "value_type": dtype_to_ctype(self.dtype),
                    "tex_value_type": dtype_to_ctype(
                        self.dtype, with_fp_tex_hack=True),
                    "index_type": dtype_to_ctype(self.index_dtype),
                    "block_size": self.block_size,
                    "warp_size": drv.Context.get_device().warp_size,
                    })
        func = mod.get_function("spmv_coo_flat_kernel")
        x_texref = mod.get_texref("tex_x")
        func.prepare(self.index_dtype.char*2 + "PPPP",
            (self.block_size, 1, 1), texrefs=[x_texref])
        return func, x_texref

    @memoize_method
    def get_serial_kernel(self):
        from pycuda.tools import dtype_to_ctype

        mod = SourceModule(
                COO_SERIAL_KERNEL_TEMPLATE % {
                    "value_type": dtype_to_ctype(self.dtype),
                    "index_type": dtype_to_ctype(self.index_dtype),
                    })
        func = mod.get_function("spmv_coo_serial_kernel")
        func.prepare(self.index_dtype.char + "PPPPP", (1, 1, 1))
        return func

    def __call__(self, x, y=None):
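        """Add the product of this matrix with *x* into *y* (freshly
        allocated as a zero vector if not given) and return *y*."""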
        if y is None:
            y = gpuarray.zeros(self.shape[0], dtype=self.dtype,
                    allocator=x.allocator)

        if self.nnz == 0:
            return y

        if self.num_blocks:
            # bulk of the nonzeros: one interval per warp, x read via texture
            flat_func, x_texref = self.get_flat_kernel()
            x.bind_to_texref_ext(x_texref, allow_double_hack=True)
            flat_func.prepared_call((self.num_blocks, 1),
                    self.tail, self.interval_size,
                    self.row_gpu.gpudata,
                    self.col_gpu.gpudata,
                    self.data_gpu.gpudata,
                    y.gpudata)

        if self.nnz > self.tail:
            # remaining tail of nonzeros, accumulated by a single thread
            self.get_serial_kernel().prepared_call(
                    (1, 1),
                    self.nnz - self.tail,
                    self.row_gpu[self.tail:].gpudata,
                    self.col_gpu[self.tail:].gpudata,
                    self.data_gpu[self.tail:].gpudata,
                    x.gpudata, y.gpudata)

        return y
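

# A minimal usage sketch (not part of the original module): multiply a random
# scipy.sparse matrix by a vector on the GPU and compare against the CPU
# result.  It assumes a CUDA context created via pycuda.autoinit and a scipy
# version whose sparse ``rand`` accepts a ``dtype`` argument.
if __name__ == "__main__":
    import pycuda.autoinit  # noqa: F401 (creates the CUDA context)
    import scipy.sparse

    a = scipy.sparse.rand(1000, 1000, density=0.01, format="coo",
            dtype=np.float32)
    x = np.random.rand(1000).astype(np.float32)

    spmv = CoordinateSpMV(a, np.float32)
    y = spmv(gpuarray.to_gpu(x))

    assert np.allclose(y.get(), a.dot(x), atol=1e-4)
    print("GPU and CPU results agree")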