首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >PyCuda:通过Cuda内核中的指针删除数组元素

PyCuda:通过Cuda内核中的指针删除数组元素
EN

Stack Overflow用户
提问于 2016-05-04 02:48:37
回答 1查看 361关注 0票数 0

我使用PyCuda通过指针将数组对传递给库达内核。数组是一个不同内核的输出,所以数据已经在GPU上了。

在内核中,我试图访问每个数组中的元素来执行向量减法。数组中元素的值是不正确的(下面的代码中h&p是错误的)。

有人能帮我看看我做错了什么吗?

我的代码:

代码语言:javascript
复制
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import time
import cv2
from pycuda.tools import DeviceMemoryPool as DMP
from scipy.spatial import distance
import os
import glob

def get_cuda_hist_kernel():
        #Make the kernel
    histogram_kernel = """
    __global__ void kernel_getHist(unsigned int* array,unsigned int size, unsigned int* histo, float bucket_size, unsigned int num_bins, unsigned int* out_max)
    {
        unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
        if(x<size){
            unsigned int value = array[x];

            unsigned int bin = floor(float(value) * bucket_size) - 1;


            //Faster Modulo 3 for channel assignment
            unsigned int offset = x;
            offset = (offset >> 16) + (offset & 0xFFFF); 
            offset = (offset >>  8) + (offset & 0xFF);   
            offset = (offset >>  4) + (offset & 0xF);    
            offset = (offset >>  2) + (offset & 0x3);    
            offset = (offset >>  2) + (offset & 0x3);    
            offset = (offset >>  2) + (offset & 0x3);    
            if (offset > 2) offset = offset - 3;
            offset = offset * num_bins;

            bin += offset;

            atomicAdd(&histo[bin + offset],1);
        }
    }


    __global__ void kernel_chebyshev(unsigned int* histo, unsigned int* prev_histo, unsigned int number, int* output)
    {

        const unsigned int size = 12;
        //Get all of the differences
        __shared__ int temp_diffs[size];
        unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;

        if (i < size){
            unsigned int diff = 0;
            unsigned int h = histo[i];
            unsigned int p = prev_histo[i];

            if (h > p)
            {
                diff = h - p;
            }
            else
            {
                diff = p - h;
            }
            temp_diffs[i] = (int)diff;
        }

        __syncthreads();

        output[number] = 0;
        atomicMax(&output[number], temp_diffs[i]);
    }
    """

    mod = SourceModule(histogram_kernel)
    return mod


def cuda_histogram(ims, block_size, kernel):

    start = time.time()
    max_val = 4
    num_bins = np.uint32(4)
    num_channels = np.uint32(3)
    bin_size = np.float32(1 / np.uint32(max_val / num_bins))

    #Memory Pool
    pool = DMP()
    print 'Pool Held Blocks: ', pool.held_blocks

    #Compute block & Grid dimensions

    bdim = (block_size, 1, 1)
    cols = ims[0].size
    rows = 1
    channels = 1

    dx, mx = divmod(cols, bdim[0])
    dy, my = divmod(rows, bdim[1])
    dz, mz = divmod(channels, bdim[2])
    g_x = (dx + (mx>0)) * bdim[0]
    g_y = (dy + (my>0)) * bdim[1]
    g_z = (dz + (mz>0)) * bdim[2]
    gdim = (g_x, g_y, g_z)

    #get the function
    func = kernel.get_function('kernel_getHist')
    func2 = kernel.get_function('kernel_chebyshev')

    #build list of histograms
    #send the histogram to the gpu
    hists = []
    device_hists = []
    for im in range(len(ims)):
        hists.append(np.zeros([num_channels * num_bins]).astype(np.uint32))

    end = time.time()
    dur = end - start
    print(' '.join(['Prep Time: ', str(dur)]))

    start = time.time()


    #Copy all of the image data to GPU
    device_images = []
    for im in range(len(ims)):
        #print('Allocating data for image :', im)
        #convert the image to 1D array of uint32s
        a = ims[im].astype(np.uint32)
        a = a.flatten('C')
        a_size = np.uint32(a.size)

        #allocate & send im data to gpu
        device_images.append(pool.allocate(a.nbytes))
        cuda.memcpy_htod(device_images[im], a)

        d_hist = pool.allocate(hists[im].nbytes)
        device_hists.append(d_hist)
        cuda.memcpy_htod(d_hist, hists[im])


    differences = np.zeros(len(ims)).astype(np.uint32)
    device_diffs = pool.allocate(differences.nbytes)
    cuda.memcpy_htod(device_diffs, differences)


    for im in range(len(ims)):
        #run histogram function
        func(device_images[im], a_size, device_hists[im], bin_size, num_bins, block=(block_size,1,1), grid=gdim)

    cuda.Context.synchronize()
    device_hist_size = np.uint32(len(device_hists[im]))
    for im in range(1, len(ims)):
        number = np.uint32(im - 1)
        func2(device_hists[im], device_hists[im - 1], number, device_diffs, block=(32,1,1))

    cuda.memcpy_dtoh(differences, device_diffs)
    print(differences)

    for im in range(len(ims)):
        #get histogram back
        cuda.memcpy_dtoh(hists[im], device_hists[im])
        device_hists[im] = 0


    end = time.time()
    dur = end - start
    print(' '.join(['Load, Compute, & Gather Time: ', str(dur)]))
    pool.free_held()
    return differences

def get_all_files(directory):
    pattern = os.path.join(directory, '*.jpg')
    files = [f for f in glob.glob(pattern)]
    return files
if __name__ == "__main__":
    RESOURCES_PATH = "../data/ims/"
    MAX_IMS = 1000
    direc = os.path.join(RESOURCES_PATH, '21JumpStreet', 'source_video_frames')
    files = get_all_files(direc)
    a = cv2.imread('t.png')
    ims = [cv2.imread(f) for f in files]
    print 'Shape of my image: ', ims[0].shape
    print 'Number of images to histogram: ', len(ims)
    block_size = 128
    kernel = get_cuda_hist_kernel()
    start = time.time()

    num_diffs = len(ims) // MAX_IMS + 1
    cuda_diffs = []

    for i in range(num_diffs):

        first = i * MAX_IMS
        last = (i + 1) * MAX_IMS
        print(first)
        small_set = ims[first:last]
        print 'Small set size: ', str(len(small_set))
        cuda_diffs.extend(cuda_histogram(small_set, block_size, kernel))

    end = time.time()
    dur = end - start
    print(' '.join(['CUDA version took:', str(dur)]))

    start = time.time()
    cv_hists = []
    for i in range(len(ims)):
        im = ims[i % len(ims)]
        h = []
        for j in range(3):
            hist = cv2.calcHist([im], [j], None, [4], [0, 100])
            h.extend(hist)
        cv_hists.append(h)

    #run Chebyshev on CPU:
    color_hist_diffs = np.array([distance.chebyshev(cv_hists[i-1], cv_hists[i]) \
                                 for i in range(len(cv_hists)) if i != 0])
    print(color_hist_diffs)
    end = time.time()
    dur = end - start
    print(' '.join(['CPU & cv2 version took:', str(dur)]))
EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2016-05-07 02:13:45

这是一个糟糕的问题,因为错误在我的代码中的其他地方。很抱歉给你造成了混乱。

票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/37017562

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档