For the project I’m working on, I have defined a base class and several derived classes that inherit from it. The base class has several virtual methods that are overridden by methods in the derived classes.
At a certain point I need to pass an instance of one of the derived classes to a kernel and call one of its methods inside that kernel. Calling the method (hopefully) does not modify the object. I have tried to adapt the following C++ minimum working example so that it operates on arrays:
#include <iostream>

class basefoo{
public:
    virtual int getiden(int i){return 0;}
};

class derivefoo : public basefoo{
public:
    int getiden(int i){return i;}
};

int getsq(basefoo *foo, int q){
    int result = (foo->getiden(q))*(foo->getiden(q));
    return result;
}

int main(){
    derivefoo foo;
    basefoo foo2;
    int q=3;
    std::cout<<getsq(&foo, q)<<std::endl;
    std::cout<<getsq(&foo2, q)<<std::endl;
    return 0;
}
Output is:
9
0
as expected.
My minimum CUDA example is the following:
#include <iostream>
#include <cstdio>
#include <cstdlib>

class basefoo{
public:
    basefoo(){}
    __host__ __device__ virtual int getiden(int i){return 0;}
};

class derivefoo : public basefoo{
public:
    derivefoo(){}
    __host__ __device__ int getiden(int i){return i;}
};

__global__ void get_squares(int* squares, basefoo *foo, int sroot, int nsq){
    const int tid = threadIdx.x + blockDim.x*blockIdx.x;
    if(tid < nsq){
        squares[tid] = 0;
        int result = (foo->getiden(sroot))*(foo->getiden(sroot));
        squares[tid] += result;
    }
}

int main(){
    int sroot = 4;
    int nsquares = 2;
    int *hsquares, *dsquares;
    hsquares = (int *)malloc(nsquares*sizeof(int));
    cudaMalloc(&dsquares, nsquares*sizeof(int));
    basefoo foo, *dfoo;
    derivefoo foo2, *dfoo2;
    // copy the host-constructed objects to the device
    cudaMalloc(&dfoo, sizeof(basefoo));
    cudaMemcpy(dfoo, &foo, sizeof(basefoo), cudaMemcpyHostToDevice);
    cudaMalloc(&dfoo2, sizeof(derivefoo));
    cudaMemcpy(dfoo2, &foo2, sizeof(derivefoo), cudaMemcpyHostToDevice);
    get_squares<<<1, 2>>>(dsquares, dfoo, sroot, nsquares);
    cudaMemcpy(hsquares, dsquares, nsquares*sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d %d\n", hsquares[0], hsquares[1]);
    get_squares<<<1, 2>>>(dsquares, dfoo2, sroot, nsquares);
    cudaMemcpy(hsquares, dsquares, nsquares*sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d %d\n", hsquares[0], hsquares[1]);
    return 0;
}
When executing it with compute-sanitizer --tool memcheck I get several out-of-bounds errors (5 for this minimal example). Here is the first one:
========= Invalid __global__ read of size 4 bytes
========= at 0xc8 in get_squares(int *, basefoo *, int, int)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x555f56399068 is out of bounds
========= and is 45,900,972,322,712 bytes before the nearest allocation at 0x7f1e7d600000 of size 8 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x32e130]
========= in /lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x1091e]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:cudaLaunchKernel [0x70b2e]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xb1ba]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:__device_stub__Z11get_squaresPiP7basefooii(int*, basefoo*, int, int) [0xafec]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:get_squares(int*, basefoo*, int, int) [0xb036]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:main [0xad1d]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
========= Host Frame:../sysdeps/nptl/libc_start_call_main.h:58:__libc_start_call_main [0x29d90]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:../csu/libc-start.c:379:__libc_start_main [0x29e40]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0xaae5]
========= in /home/arin/Desktop/cuda codes/ASMD/./inhtest
It’s worth noting that there are no memcheck errors at all if I delete the "virtual" keyword in basefoo’s getiden method, but then the override no longer happens, and I need it for my project. Does anyone have any idea of what is causing this?
This is CUDA 12.2 on Ubuntu 22.04.3.
Answers
The answer can be found in the CUDA documentation on virtual functions.
cudaMemcpy* copies the object byte-for-byte, including its vtable pointer, and that pointer refers to host memory. In general, host pointers are invalid on the device, and vice versa, so invoking a virtual method on a host-constructed object in device code follows a bogus vtable pointer. To be usable on the device, an object with virtual functions must be constructed on the device, so that its function pointers are valid there; code that creates the object on the device will work.
Alternatively, to make a copied object usable on the destination (CPU or GPU), one can copy-construct the object in place, using placement new.
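For example, a minimal sketch along these lines, which constructs the object in device code, should work (the kernel names construct_foo and use_foo and the call getiden(5) are illustrative choices; note that the constructors are marked __host__ __device__ here, since derivefoo() now has to run on the device):

#include <cstdio>
#include <new>          // for placement new

class basefoo{
public:
    __host__ __device__ basefoo(){}
    __host__ __device__ virtual int getiden(int i){return 0;}
};

class derivefoo : public basefoo{
public:
    __host__ __device__ derivefoo(){}
    __host__ __device__ int getiden(int i){return i;}
};

// Build a derivefoo in pre-allocated device memory with placement new,
// so its vtable pointer is set up with device addresses.
__global__ void construct_foo(derivefoo *buf){
    new (buf) derivefoo();
}

// Call the virtual method through a base-class pointer in device code.
__global__ void use_foo(basefoo *foo, int *out, int i){
    *out = foo->getiden(i);
}

int main(){
    derivefoo *dfoo;
    int *dresult, hresult;
    cudaMalloc(&dfoo, sizeof(derivefoo));
    cudaMalloc(&dresult, sizeof(int));
    construct_foo<<<1, 1>>>(dfoo);        // object is created on the device
    use_foo<<<1, 1>>>(dfoo, dresult, 5);  // virtual dispatch resolves to derivefoo
    cudaMemcpy(&hresult, dresult, sizeof(int), cudaMemcpyDeviceToHost);
    printf("result %d\n", hresult);
    cudaFree(dresult);
    cudaFree(dfoo);
    return 0;
}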
Output:
result 5
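Applied to the get_squares program from the question, the copy-construct-in-place variant would amount to launching a small fixup kernel after each cudaMemcpy of an object. This is only a sketch: the fixup_vtable name is illustrative, and it assumes the implicitly-generated copy constructor of derivefoo is usable in device code (which it should be here, since the class has no data members and the copy constructor is only invoked from device code):

#include <new>   // for placement new

// Rebuild the vtable pointer of an object that was byte-copied to the device:
// copy-construct it in place, so the constructor writes device-side function
// pointers. Only the implicitly-generated copy constructor runs here, so the
// host-only default constructors from the question are not a problem.
__global__ void fixup_vtable(derivefoo *d){
    derivefoo tmp(*d);       // copies the data members (none here); the stale
                             // vtable pointer itself is never dereferenced
    new (d) derivefoo(tmp);  // placement new installs a valid device-side vptr
}

// In main(), after the existing cudaMemcpy of foo2 into dfoo2:
//   fixup_vtable<<<1, 1>>>(dfoo2);
//   get_squares<<<1, 2>>>(dsquares, dfoo2, sroot, nsquares);  // should now print 16 16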