For the project I’m working on, I have defined a base class and several derived classes that inherit from it. The base class has several virtual methods that are overridden by methods in the derived classes.
At a certain point I need to pass an instance of one of the derived classes to a kernel and call one of its methods inside that kernel. Calling the method (hopefully) does not modify the object. I have tried to adapt the following C++ minimum working example so that it operates on arrays:
#include <iostream>

class basefoo{
public:
    virtual int getiden(int i){return 0;}
};

class derivefoo : public basefoo{
public:
    int getiden(int i){return i;}
};

int getsq(basefoo *foo, int q){
    int result = (foo->getiden(q))*(foo->getiden(q));
    return result;
}

int main(){
    derivefoo foo;
    basefoo foo2;
    int q=3;
    std::cout<<getsq(&foo, q)<<std::endl;
    std::cout<<getsq(&foo2, q)<<std::endl;
    return 0;
}
Output is:
9
0
as expected.
My minimum CUDA example is the following:
#include <iostream>
#include <cstdio>
#include <cstdlib>

class basefoo{
public:
    basefoo(){}
    __host__ __device__ virtual int getiden(int i){return 0;}
};

class derivefoo : public basefoo{
public:
    derivefoo(){}
    __host__ __device__ int getiden(int i){return i;}
};

__global__ void get_squares(int* squares, basefoo *foo, int sroot, int nsq){
    const int tid = threadIdx.x + blockDim.x*blockIdx.x;
    if(tid < nsq){
        squares[tid] = 0;
        int result = (foo->getiden(sroot))*(foo->getiden(sroot));
        squares[tid] += result;
    }
}

int main(){
    int sroot = 4;
    int nsquares = 2;
    int *hsquares, *dsquares;
    hsquares = (int *)malloc(nsquares*sizeof(int));
    cudaMalloc(&dsquares, nsquares*sizeof(int));
    basefoo foo, *dfoo;
    derivefoo foo2, *dfoo2;
    // copy the host-constructed objects to the device
    cudaMalloc(&dfoo, sizeof(basefoo));
    cudaMemcpy(dfoo, &foo, sizeof(basefoo), cudaMemcpyHostToDevice);
    cudaMalloc(&dfoo2, sizeof(derivefoo));
    cudaMemcpy(dfoo2, &foo2, sizeof(derivefoo), cudaMemcpyHostToDevice);
    get_squares<<<1, 2>>>(dsquares, dfoo, sroot, nsquares);
    cudaMemcpy(hsquares, dsquares, nsquares*sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d %d\n", hsquares[0], hsquares[1]);
    get_squares<<<1, 2>>>(dsquares, dfoo2, sroot, nsquares);
    cudaMemcpy(hsquares, dsquares, nsquares*sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d %d\n", hsquares[0], hsquares[1]);
    return 0;
}
When executing it with compute-sanitizer --tool memcheck I get several out-of-bounds errors (5 for this minimal example). Here is the first one:
========= Invalid __global__ read of size 4 bytes
========= at 0xc8 in get_squares(int *, basefoo *, int, int)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x555f56399068 is out of bounds
========= and is 45,900,972,322,712 bytes before the nearest allocation at 0x7f1e7d600000 of size 8 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x32e130]
========= in /lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x1091e]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:cudaLaunchKernel [0x70b2e]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xb1ba]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:__device_stub__Z11get_squaresPiP7basefooii(int*, basefoo*, int, int) [0xafec]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:get_squares(int*, basefoo*, int, int) [0xb036]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:main [0xad1d]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:../sysdeps/nptl/libc_start_call_main.h:58:__libc_start_call_main [0x29d90]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:../csu/libc-start.c:379:__libc_start_main [0x29e40]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0xaae5]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
It’s worth noting that there are no memcheck errors at all if I delete the "virtual" keyword in basefoo’s getiden method, but then the override no longer happens, and I need it for my project. Does anyone have any idea of what is causing this?
This is CUDA 12.2 on Ubuntu 22.04.3.
Answers
The answer can be found in the CUDA documentation on virtual functions.
cudaMemcpy* copies the object byte-for-byte, including its vtable pointer, and that pointer refers to host memory. In general, host pointers are invalid on the device, and vice versa, so invoking a virtual method on a host-constructed object in device code follows a bogus vtable pointer. To be usable on the device, an object with virtual functions must be constructed on the device, so that its function pointers are valid there; code that creates the object on the device will work.
Alternatively, to make a copied object usable on the destination (CPU or GPU), one can copy-construct the object in place, using placement new.
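For example, a minimal sketch along these lines, which constructs the object in device code, should work (the kernel names construct_foo and use_foo and the call getiden(5) are illustrative choices; note that the constructors are marked __host__ __device__ here, since derivefoo() now has to run on the device):

#include <cstdio>
#include <new>          // for placement new

class basefoo{
public:
    __host__ __device__ basefoo(){}
    __host__ __device__ virtual int getiden(int i){return 0;}
};

class derivefoo : public basefoo{
public:
    __host__ __device__ derivefoo(){}
    __host__ __device__ int getiden(int i){return i;}
};

// Build a derivefoo in pre-allocated device memory with placement new,
// so its vtable pointer is set up with device addresses.
__global__ void construct_foo(derivefoo *buf){
    new (buf) derivefoo();
}

// Call the virtual method through a base-class pointer in device code.
__global__ void use_foo(basefoo *foo, int *out, int i){
    *out = foo->getiden(i);
}

int main(){
    derivefoo *dfoo;
    int *dresult, hresult;
    cudaMalloc(&dfoo, sizeof(derivefoo));
    cudaMalloc(&dresult, sizeof(int));
    construct_foo<<<1, 1>>>(dfoo);        // object is created on the device
    use_foo<<<1, 1>>>(dfoo, dresult, 5);  // virtual dispatch resolves to derivefoo
    cudaMemcpy(&hresult, dresult, sizeof(int), cudaMemcpyDeviceToHost);
    printf("result %d\n", hresult);
    cudaFree(dresult);
    cudaFree(dfoo);
    return 0;
}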
Output:
result 5
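Applied to the get_squares program from the question, the copy-construct-in-place variant would amount to launching a small fixup kernel after each cudaMemcpy of an object. This is only a sketch: the fixup_vtable name is illustrative, and it assumes the implicitly-generated copy constructor of derivefoo is usable in device code (which it should be here, since the class has no data members and the copy constructor is only invoked from device code):

#include <new>   // for placement new

// Rebuild the vtable pointer of an object that was byte-copied to the device:
// copy-construct it in place, so the constructor writes device-side function
// pointers. Only the implicitly-generated copy constructor runs here, so the
// host-only default constructors from the question are not a problem.
__global__ void fixup_vtable(derivefoo *d){
    derivefoo tmp(*d);       // copies the data members (none here); the stale
                             // vtable pointer itself is never dereferenced
    new (d) derivefoo(tmp);  // placement new installs a valid device-side vptr
}

// In main(), after the existing cudaMemcpy of foo2 into dfoo2:
//   fixup_vtable<<<1, 1>>>(dfoo2);
//   get_squares<<<1, 2>>>(dsquares, dfoo2, sroot, nsquares);  // should now print 16 16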