skip to Main Content

For the project I’m working on, I have defined a certain base class and several derived classes that inherit it. The base class has several virtual methods that are overriden by methods in the derived classes.
At a certain point I need to pass a (derived) custom class instantiation to a kernel and use one of the custom class’ methods inside said kernel. Using the method (hopefully) does not modify the object. I have tried to adapt the following C++ minimum working example so that it works on arrays:

#include<iostream>

class basefoo{
    public:
    virtual int getiden(int i){return 0;}
};

class derivefoo : public basefoo{
    public:
    int getiden(int i){return i;}
};

int getsq(basefoo *foo, int q){
    int result = (foo->getiden(q))*(foo->getiden(q));
    return result;
}

int main(){
    derivefoo foo;
    basefoo foo2;
    int q=3;
    std::cout<<getsq(&foo, q)<<std::endl;
    std::cout<<getsq(&foo2, q)<<std::endl;
    return 0;
}

Output is:

9
0

as expected.

My minimum CUDA example is the following:

#include <iostream>

class basefoo{
    public:
    basefoo(){}
    __host__ __device__ virtual int getiden(int i){return 0;}
};

class derivefoo : public basefoo{
    public:
    derivefoo(){}
    __host__ __device__ int getiden(int i){return i;}
};

__global__ void get_squares(int* squares, basefoo *foo, int sroot, int nsq){
    const int tid = threadIdx.x + blockDim.x*blockIdx.x;
    if(tid < nsq){
        squares[tid] = 0;
        int result = (foo->getiden(sroot))*(foo->getiden(sroot));
        squares[tid] += result;
    }
}

int main(){
    int sroot = 4;
    int nsquares = 2;
    int *hsquares, *dsquares;
    hsquares = (int *)malloc(nsquares*sizeof(int));
    cudaMalloc(&dsquares, nsquares*sizeof(int));

    basefoo foo, *dfoo;
    derivefoo foo2, *dfoo2;

    cudaMalloc(&dfoo, sizeof(basefoo));
    cudaMemcpy(dfoo, &foo, sizeof(basefoo), cudaMemcpyHostToDevice);
    cudaMalloc(&dfoo2, sizeof(derivefoo));
    cudaMemcpy(dfoo2, &foo2, sizeof(derivefoo), cudaMemcpyHostToDevice);

    get_squares<<<1, 2>>>(dsquares, dfoo, sroot, nsquares);
    cudaMemcpy(hsquares, dsquares, nsquares*sizeof(int), cudaMemcpyDeviceToHost);

    printf("%d %d n", hsquares[0], hsquares[1]);

    get_squares<<<1, 2>>>(dsquares, dfoo2, sroot, nsquares);
    cudaMemcpy(hsquares, dsquares, nsquares*sizeof(int), cudaMemcpyDeviceToHost);

    printf("%d %d n", hsquares[0], hsquares[1]);


    return 0;

}

When executing it with compute-sanitizer –tool memcheck I get several out of bounds errors (5 for this minimum example). Here is the first one:

========= Invalid __global__ read of size 4 bytes
=========     at 0xc8 in get_squares(int *, basefoo *, int, int)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x555f56399068 is out of bounds
=========     and is 45,900,972,322,712 bytes before the nearest allocation at 0x7f1e7d600000 of size 8 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x32e130]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x1091e]
=========                in /home/arin/Desktop/cuda codes/ASMD/./inhtest
=========     Host Frame:cudaLaunchKernel [0x70b2e]
=========                in /home/arin/Desktop/cuda codes/ASMD/./inhtest
=========     Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xb1ba]
=========                in /home/arin/Desktop/cuda codes/ASMD/./inhtest
=========     Host Frame:__device_stub__Z11get_squaresPiP7basefooii(int*, basefoo*, int, int) [0xafec]
=========                in /home/arin/Desktop/cuda codes/ASMD/./inhtest
=========     Host Frame:get_squares(int*, basefoo*, int, int) [0xb036]
=========                in /home/arin/Desktop/cuda codes/ASMD/./inhtest
=========     Host Frame:main [0xad1d]
=========                in /home/arin/Desktop/cuda codes/ASMD/./inhtest
=========     Host Frame:../sysdeps/nptl/libc_start_call_main.h:58:__libc_start_call_main [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:../csu/libc-start.c:379:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xaae5]
=========                in /home/arin/Desktop/cuda codes/ASMD/./inhtest

It’s worth noting that there are no memcheck errors at all if I delete the "virtual" keyword in basefoo’s getiden method, but then the override isn’t successful, and I need it for my project.Does anyone have any idea of what is causing this?
This is CUDA 12.2 in Ubuntu 22.04.3.

2

Answers


  1. The answer could be found in the CUDA documentation about virtual functions:

    If an object is created in host code, invoking a virtual function for that object in device code has undefined behavior.

    The following code creating object on the device will work

    #include <iostream>
    
    class basefoo
    {
    public:
        __host__ __device__ basefoo() {}
        __host__ __device__ virtual int getiden(int i) { return 0; }
    };
    
    class derivefoo : public basefoo
    {
    public:
        __host__ __device__ derivefoo() {}
        __host__ __device__ int getiden(int i) override { return i; }
    };
    
    __global__ void get_squares(int* squares, int sroot, int nsq)
    {
        const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    
        derivefoo dfoo;
        basefoo* bfoo = &dfoo;
        if (tid < nsq)
        {
            squares[tid] = 0;
            int result = (bfoo->getiden(sroot)) * (bfoo->getiden(sroot));
            squares[tid] += result;
        }
    }
    
    int main()
    {
        int sroot = 4;
        int nsquares = 2;
        int *hsquares, *dsquares;
        hsquares = (int *)malloc(nsquares*sizeof(int));
        cudaMalloc(&dsquares, nsquares*sizeof(int));
    
        get_squares<<<1, 2>>>(dsquares, sroot, nsquares);
        cudaMemcpy(hsquares, dsquares, nsquares*sizeof(int), cudaMemcpyDeviceToHost);
    
        printf("%d %d n", hsquares[0], hsquares[1]);
    
        cudaFree(dsquares);
        free(hsquares);
    
        return 0;
    }
    
    Login or Signup to reply.
  2. cudaMemcpy* will transfer the vtable between host and device. In general, host pointers are invalid on the device, and vice-versa. To be usable on the device, objects with virtual functions must be constructed on the device to set up valid function pointers.

    To make a copied object usable on the destination (cpu or gpu), one can copy-construct the object in-place, using placement-new.

    #include <iostream>
    
    struct Foo{
        __host__ __device__
        virtual int compute() = 0;
    };
    
    struct Bar : public Foo{
        int x = 1;
        __host__ __device__
        int compute() override{
            return x + 2;
        }
    };
    
    __global__
    void computeKernel(Foo* object){
        int result = object->compute();
        printf("result %dn", result);
    }
    
    __global__
    void fixBarObject(Bar* b){
        new (b) Bar(*b);
    }
    
    int main(){
        Bar b;
        b.x = 3;
    
        Bar* d_b;
        cudaMalloc(&d_b, sizeof(Bar));
        cudaMemcpy(d_b, &b, sizeof(Bar), cudaMemcpyHostToDevice);
        fixBarObject<<<1,1>>>(d_b);
        computeKernel<<<1,1>>>(d_b);
        cudaDeviceSynchronize();
    }
    

    Output: result 5

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search