I want to write a small library, which has to do some calculations. The library should be written in C++ with a wrapper in C to allow external calls.
As an example, the C++ code would like this (just a very simple example)
complex<double> lorentz(double x, double A, double E0, double Br){
return A * E0 * Br / (E0*E0 - x*x - I * Br * x);
}
// wrapper for the c-call
int call_fkt(double *energy, int N, double *parameter, int N_para, double *chi_real, double *chi_imag){
for(int idx_ener = 0; idx_ener < N; idx_ener++){
chi_real[idx_ener] = 0.0;
chi_imag[idx_ener] = 0.0;
complex<double> tmp;
tmp = lorentz(energy[idx_ener], parameter[0], parameter[1], parameter[2]);
chi_real[idx_ener] += real(tmp);
chi_imag[idx_ener] += imag(tmp);
}
}
A pure C version would look like:
double complex lorentz(double x, double A, double E0, double Br){
return A * E0 * Br / (E0*E0 - x*x - I * Br * x);
}
// wrapper for the c-call
int call_fkt(double *energy, int N, double *parameter, int N_para, double *chi_real, double *chi_imag){
for(int idx_ener = 0; idx_ener < N; idx_ener++){
chi_real[idx_ener] = 0.0;
chi_imag[idx_ener] = 0.0;
double complex tmp;
tmp = lorentz(energy[idx_ener], parameter[0], parameter[1], parameter[2]);
chi_real[idx_ener] += creal(tmp);
chi_imag[idx_ener] += cimag(tmp);
}
}
which is about a factor 2 faster than the C++ version. Where does this difference come from? Are the math libraries in C faster than in C++?
Note, I know, that this example does not require C++ but I have a more complicated problem, which is much easier to solve in C++.
Update
Here are the two versions I compiled. The C – code is
#include <complex.h>
#include <math.h>
double complex lorentz(double x, double A, double E0, double Br){
return A * E0 * Br / (E0*E0 - x*x - I * Br * x);
}
int c_call(double *energy, int N, double *parameter, int N_para, double *chi_real, double *chi_imag){
for(int idx_ener = 0; idx_ener < N; idx_ener++){
int start_index = 0;
chi_real[idx_ener] = 0.0;
chi_imag[idx_ener] = 0.0;
double complex tmp;
tmp = lorentz(energy[idx_ener], parameter[0], parameter[1], parameter[2]);
// if (idx_ener == 1){
// printf("Hallo, idx ener = %i, start index = %in", idx_ener, start_index);
// }
chi_real[idx_ener] += creal(tmp);
chi_imag[idx_ener] += cimag(tmp);
}
return 1;
}
which is compiled by using
gcc -fPIC -shared -O3 -ffast-math -o pure_c.so pure_c.c
The C++ version is
#include <complex>
#include <cmath>
using namespace std;
const complex<double> I(0.0,1.0);
complex<double> lorentz(double x, double A, double E0, double Br){
return A * E0 * Br / (E0*E0 - x*x - I * Br * x);
}
extern "C" void c_call(double *energy, int N, double *parameter, int N_para, double *chi_real, double *chi_imag){
for(int idx_ener = 0; idx_ener < N; idx_ener++){
int start_index = 0;
chi_real[idx_ener] = 0.0;
chi_imag[idx_ener] = 0.0;
complex<double> tmp;
tmp = lorentz(energy[idx_ener], parameter[0], parameter[1], parameter[2]);
// if (idx_ener == 1){
// printf("Hallo, idx ener = %i, start index = %in", idx_ener, start_index);
// }
chi_real[idx_ener] += real(tmp);
chi_imag[idx_ener] += imag(tmp);
}
return;
}
and is compiled by using
g++ -fPIC -shared -O3 -ffast-math -o mixed_c.so mixed_c.cpp
For measuring the execution time, I use a Python script. (The code should also be callable by means of another program. The measured time will be roughly the same.)
import numpy as np
# import ctypes
import time
from ctypes import CDLL, c_double, c_int, POINTER
N = 20088
N_time = 10000
ener = np.linspace(0.62, 6.2, N)
ener_c = (ener.astype(np.double)).ctypes.data_as(POINTER(c_double))
chi_real = (N * c_double)()
chi_imag = (N * c_double)()
p = np.array([2, 3, .01])
se_lib = CDLL("pure_c.so")
se_lib.argtypes = [POINTER(c_double), c_int, POINTER(c_double), c_int, POINTER(c_double), POINTER(c_double)]
times = np.zeros(N_time)
for idx in range(N_time):
t1 = time.time()
p_ctype = (p.astype(np.double)).ctypes.data_as(POINTER(c_double))
se_lib.c_call(ener_c, N, p_ctype, 3, chi_real, chi_imag)
df = 1 + np.array(chi_real) + 1j * np.array(chi_imag)
t2 = time.time()
times[idx] = (t2 - t1)*1e3
print("time (pure): mean / median ", np.round(np.mean(times), 3) , " / ", np.round(np.median(times), 3), "s")
se_lib2 = CDLL("mixed_c.so")
se_lib2.argtypes = [POINTER(c_double), c_int, POINTER(c_double), c_int, POINTER(c_double), POINTER(c_double)]
for idx in range(N_time):
t1 = time.time()
p_ctype = (p.astype(np.double)).ctypes.data_as(POINTER(c_double))
se_lib2.c_call(ener_c, N, p_ctype, 3, chi_real, chi_imag)
df = 1 + np.array(chi_real) + 1j * np.array(chi_imag)
t2 = time.time()
times[idx] = (t2 - t1)*1e3
print("time (mixed c): mean / median ", np.round(np.mean(times), 3) , " / ", np.round(np.median(times), 3), "s")
I already recognize that the fast-math flag speed up the C version a lot, whereas the enhancement in C++ is not that fast. Is there a way to achieve the same performance improvements?
Update 2:
-
I use Apple clang version 14.0.3 in order to compile this code.
-
I fixed the typo mentioned by @ComicSansMS
Update 3:
Looks like that this issue is caused by the compiler supplied by Apple as suggested by @n.m.couldbeanAI. Times are similar on a Ubuntu machine usig g++/gcc or clang.
2
Answers
The implementations are not equivalent. Your C version of the
lorentz()
function returns adouble
, when it should be returning adouble complex
:With this change, I see no measurable performance difference on gcc 13.2.
clang cannot inline complex arithmetic operators from libc++. Demo.
Perhaps this is not so unexpected, given how complex the libc++ implementation of
std::complex
operators is, compared to the libstdc++ implementation.