Source Code
use v5.32;
use Benchmark qw(cmpthese);
my $iter = 1000;
# the number of X and Y sample points
my $x_dim = 1023;
my $y_dim = 767;
# the range from which the points are taken from
my ($x_s, $x_e) = (0, 1);
my ($y_s, $y_e) = (0, 1);
# calculate the points
my $x_diff = $x_e - $x_s;
my $y_diff = $y_e - $y_s;
my $x_step = $x_diff / ($x_dim-1);
my $y_step = $y_diff / ($y_dim-1);
my @x_pts = map { $x_s + ($_*$x_step) } (0..$x_dim-1);
my @y_pts = map { $y_s + ($_*$y_step) } (0..$y_dim-1);
# the same points are fed to all these different implementations of Franke's function
my %to_cmp = (
Frankes_pure_Perl => sub { my $aref = Frankes_func_Perl(\@x_pts, \@y_pts); },
Frankes_Perl_and_C => sub { my $aref = Frankes_func_Perl_C(\@x_pts, \@y_pts); },
Frankes_C_no_AVX => sub { my $aref = Frankes_func_C_no_AVX(\@x_pts, \@y_pts); },
Frankes_func_C_AVX_no_alignment => sub { my $aref = Frankes_func_C_AVX_no_alignment(\@x_pts, \@y_pts); },
Frankes_func_C_AVX_with_alignment => sub { my $aref = Frankes_func_C_AVX_with_alignment(\@x_pts, \@y_pts); },
);
cmpthese( $iter, \%to_cmp );
exit;
sub Frankes_func_Perl {
my ($x_pts, $y_pts) = @_;
my @Frankes;
foreach my $y (@{$y_pts}) {
foreach my $x (@{x_pts}) {
my $p11 = -1.0*((9*$x-2)**2)/4;
my $p12 = ((9*$y-2)**2)/4;
my $t1 = 0.75*exp($p11-$p12);
my $p21 = -1.0*((9*$x+1)**2)/49;
my $p22 = (9*$y+1)/10;
my $t2 = 0.75*exp($p21-$p22);
my $p31 = -1.0*((9*$x-7)**2)/4;
my $p32 = ((9*$y-3)**2)/4;
my $t3 = 0.5*exp($p31-$p32);
my $p41 = -1.0*(9*$x-4)**2;
my $p42 = (9*$y-7)**2;
my $t4 = 0.2*exp($p41-$p42);
push @Frankes, $t1+$t2+$t3-$t4;
}
}
return \@Frankes;
}
sub Frankes_func_Perl_C {
my ($x_pts, $y_pts) = @_;
my @Frankes;
foreach my $y_pt (@{$y_pts}) {
foreach my $x_pt (@{x_pts}) {
push @Frankes, Frankes_func_onept($x_pt, $y_pt); # Calling C function for 1 point
}
}
return \@Frankes;
}
# C code with Inline::C
use Inline (C => Config =>
libs => '-lm -lsleef',
ccflagsex => '-O3 -mavx',
);
use Inline C => <<'END_OF_C_CODE';
#include
#include
#include
// calculate Franke's function for one (x,y) point
double Frankes_func_onept( double x, double y ) {
// compute x terms
double nine_x = 9*x;
double xt1_1 = nine_x-2;
double xt1 = (xt1_1*xt1_1)/4;
double xt2_1 = nine_x+1;
double xt2 = (xt2_1*xt2_1)/49;
double xt3_1 = nine_x-7;
double xt3 = (xt3_1*xt3_1)/4;
double xt4_1 = nine_x-4;
double xt4 = xt4_1*xt4_1;
// compute y terms
double nine_y = 9*y;
double yt1_1 = nine_y-2;
double yt1 = (yt1_1*yt1_1)/4;
double yt2 = (nine_y+1)/10;
double yt3_1 = nine_y-3;
double yt3 = (yt3_1*yt3_1)/4;
double yt4_1 = nine_y-7;
double yt4 = yt4_1*yt4_1;
// compute exponentials
double e_1 = exp(-1*xt1 - yt1);
double e_2 = exp(-1*xt2 - yt2);
double e_3 = exp(-1*xt3 - yt3);
double e_4 = exp(-1*xt4 - yt4);
// final summation
return 0.75*e_1 + 0.75*e_2 + 0.5*e_3 - 0.2*e_4;
}
// Bring the loops into the C function. Pass in array refs, return an array ref.
SV *Frankes_func_C_no_AVX(SV *x_pts, SV *y_pts) {
int i, j;
// make sure passed params are references
if (!SvROK(x_pts) || !SvROK(y_pts)) {
return &PL_sv_undef;
}
// make sure x_pts and y_pts are references to arrays
if ((SvTYPE(SvRV(x_pts)) != SVt_PVAV) || (SvTYPE(SvRV(y_pts)) != SVt_PVAV)) {
return &PL_sv_undef;
}
// dereference to get the arrays
AV *x_arr = (AV*)SvRV(x_pts);
AV *y_arr = (AV*)SvRV(y_pts);
// get the size of the arrays
int x_count = av_count(x_arr);
int y_count = av_count(y_arr);
// allocate some memory to extract the doubles from the perl arrays
double *xt1, *xt2, *xt3, *xt4;
Newx(xt1, x_count, double);
Newx(xt2, x_count, double);
Newx(xt3, x_count, double);
Newx(xt4, x_count, double);
// extract the raw values
for (i = 0; i < x_count; i++) {
double x_val = SvNV(*av_fetch(x_arr, i, 0));
double nine_x = 9*x_val;
// compute x terms
double xt1_1 = nine_x-2;
xt1[i] = -1*(xt1_1*xt1_1)/4;
double xt2_1 = nine_x+1;
xt2[i] = -1*(xt2_1*xt2_1)/49;
double xt3_1 = nine_x-7;
xt3[i] = -1*(xt3_1*xt3_1)/4;
double xt4_1 = nine_x-4;
xt4[i] = -1*xt4_1*xt4_1;
}
// and for the y vals
double *nine_y;
Newx(nine_y, y_count, double);
for (i = 0; i < y_count; i++) {
double y_val = SvNV(*av_fetch(y_arr, i, 0));
nine_y[i] = 9*y_val;
}
// Allocate some memory for the result
int total = x_count*y_count;
double *Frankes;
Newx(Frankes, total, double);
int k = 0;
for (j = 0; j < y_count; j++) {
// compute y terms
double yt1_1 = nine_y[j]-2;
double yt1 = (yt1_1*yt1_1)/4;
double yt2 = (nine_y[j]+1)/10;
double yt3_1 = nine_y[j]-3;
double yt3 = (yt3_1*yt3_1)/4;
double yt4_1 = nine_y[j]-7;
double yt4 = yt4_1*yt4_1;
for (i = 0; i < x_count; i++, k++) {
// x terms already computed
// compute exponentials
double e_1 = exp(xt1[i] - yt1);
double e_2 = exp(xt2[i] - yt2);
double e_3 = exp(xt3[i] - yt3);
double e_4 = exp(xt4[i] - yt4);
// final summation
Frankes[k] = 0.75*e_1 + 0.75*e_2 + 0.5*e_3 - 0.2*e_4;
}
}
// create a Perl array for returning the processed data
AV *av = newAV();
// fill up the array
for (int i = 0; i < total; i++) {
av_push(av, newSVnv(Frankes[i]));
}
// get a reference to that array to return
SV *sv = newRV_noinc((SV*)av);
// Before going release allocated memory
Safefree(xt1);
Safefree(xt2);
Safefree(xt3);
Safefree(xt4);
Safefree(nine_y);
Safefree(Frankes);
// return
return sv;
}
// Attempt to vectorize the inner loop
SV *Frankes_func_C_AVX_no_alignment(SV *x_pts, SV *y_pts) {
int i, j;
// make sure passed params are references
if (!SvROK(x_pts) || !SvROK(y_pts)) {
return &PL_sv_undef;
}
// make sure x_pts and y_pts are references to arrays
if ((SvTYPE(SvRV(x_pts)) != SVt_PVAV) || (SvTYPE(SvRV(y_pts)) != SVt_PVAV)) {
return &PL_sv_undef;
}
// dereference to get the arrays
AV *x_arr = (AV*)SvRV(x_pts);
AV *y_arr = (AV*)SvRV(y_pts);
// get the size of the arrays
int x_count = av_count(x_arr);
int y_count = av_count(y_arr);
// allocate some memory to extract the doubles from the perl arrays
double *xt1, *xt2, *xt3, *xt4;
Newx(xt1, x_count, double);
Newx(xt2, x_count, double);
Newx(xt3, x_count, double);
Newx(xt4, x_count, double);
// extract the raw values
for (i = 0; i < x_count; i++) {
double x_val = SvNV(*av_fetch(x_arr, i, 0));
double nine_x = 9*x_val;
// compute x terms
double xt1_1 = nine_x-2;
xt1[i] = -1*(xt1_1*xt1_1)/4;
double xt2_1 = nine_x+1;
xt2[i] = -1*(xt2_1*xt2_1)/49;
double xt3_1 = nine_x-7;
xt3[i] = -1*(xt3_1*xt3_1)/4;
double xt4_1 = nine_x-4;
xt4[i] = -1*xt4_1*xt4_1;
}
// and for the y vals
double *nine_y;
Newx(nine_y, y_count, double);
for (i = 0; i < y_count; i++) {
double y_val = SvNV(*av_fetch(y_arr, i, 0));
nine_y[i] = 9*y_val;
}
// Allocate some memory for the result
int total = x_count*y_count;
double *Frankes;
Newx(Frankes, total, double);
__m256d temp_v, n_1, n_2;
int k = 0;
for (j = 0; j < y_count; j++) {
// compute y terms
double yt1_1 = nine_y[j]-2;
double yt1 = (yt1_1*yt1_1)/4;
double yt2 = (nine_y[j]+1)/10;
double yt3_1 = nine_y[j]-3;
double yt3 = (yt3_1*yt3_1)/4;
double yt4_1 = nine_y[j]-7;
double yt4 = yt4_1*yt4_1;
// vectorize inner loop. Cannot use aligned load and store as Newx doesn't align memory (I think)
for (i = 0; i < x_count - (x_count % 4); i += 4, k += 4) {
// exp(x term 1 - y term 1)
n_2 = _mm256_loadu_pd(&xt1[i]);
temp_v = _mm256_set1_pd(yt1);
n_1 = _mm256_sub_pd(n_2, temp_v);
__m256d e_1 = Sleef_expd4_u10(n_1);
// exp(x term 2 - y term 2)
n_2 = _mm256_loadu_pd(&xt2[i]);
temp_v = _mm256_set1_pd(yt2);
n_1 = _mm256_sub_pd(n_2, temp_v);
__m256d e_2 = Sleef_expd4_u10(n_1);
// exp(x term 3 - y term 3)
n_2 = _mm256_loadu_pd(&xt3[i]);
temp_v = _mm256_set1_pd(yt3);
n_1 = _mm256_sub_pd(n_2, temp_v);
__m256d e_3 = Sleef_expd4_u10(n_1);
// exp(x term 4 - y term 4)
n_2 = _mm256_loadu_pd(&xt4[i]);
temp_v = _mm256_set1_pd(yt4);
n_1 = _mm256_sub_pd(n_2, temp_v);
__m256d e_4 = Sleef_expd4_u10(n_1);
temp_v = _mm256_set1_pd(0.75);
e_1 = _mm256_mul_pd(e_1, temp_v);
e_2 = _mm256_mul_pd(e_2, temp_v);
temp_v = _mm256_set1_pd(0.5);
e_3 = _mm256_mul_pd(e_3, temp_v);
temp_v = _mm256_set1_pd(0.2);
e_4 = _mm256_mul_pd(e_4, temp_v);
// final summation
temp_v = _mm256_add_pd(e_1, e_2);
temp_v = _mm256_add_pd(temp_v, e_3);
temp_v = _mm256_sub_pd(temp_v, e_4);
_mm256_storeu_pd(&Frankes[k], temp_v);
}
// deal with any tail elements
for (; i < x_count; i++, k++) {
// compute exponentials
double e_1 = exp(xt1[i] - yt1);
double e_2 = exp(xt2[i] - yt2);
double e_3 = exp(xt3[i] - yt3);
double e_4 = exp(xt4[i] - yt4);
// final summation
Frankes[k] = 0.75*e_1 + 0.75*e_2 + 0.5*e_3 - 0.2*e_4;
}
}
// create a Perl array for returning the processed data
AV *av = newAV();
// fill up the array
for (int i = 0; i < total; i++) {
av_push(av, newSVnv(Frankes[i]));
}
// get a reference to that array to return
SV *sv = newRV_noinc((SV*)av);
// Before going release allocated memory
Safefree(xt1);
Safefree(xt2);
Safefree(xt3);
Safefree(xt4);
Safefree(nine_y);
Safefree(Frankes);
// return
return sv;
}
// this time, use _mm_malloc to get some aligned memory
SV *Frankes_func_C_AVX_with_alignment(SV *x_pts, SV *y_pts) {
int i, j;
// make sure passed params are references
if (!SvROK(x_pts) || !SvROK(y_pts)) {
return &PL_sv_undef;
}
// make sure x_pts and y_pts are references to arrays
if ((SvTYPE(SvRV(x_pts)) != SVt_PVAV) || (SvTYPE(SvRV(y_pts)) != SVt_PVAV)) {
return &PL_sv_undef;
}
// dereference to get the arrays
AV *x_arr = (AV*)SvRV(x_pts);
AV *y_arr = (AV*)SvRV(y_pts);
// get the size of the arrays
int x_count = av_count(x_arr);
int y_count = av_count(y_arr);
// to use aligned memory for storing and loading, ensure we have a multiple of 4
int f_width = x_count;
if ( x_count % 4 != 0 ) {
f_width = x_count - (x_count % 4) + 4;
}
// allocate some memory to extract the doubles from the perl arrays
// By aligning the memory we should get faster loads and stores.
// call _mm_malloc once and use pointer offsets, for speed
double *aligned_mem = (double*) _mm_malloc(((4 * f_width)+(f_width*y_count))*sizeof(double), 32);
int x_terms_1 = 0;
int x_terms_2 = f_width;
int x_terms_3 = f_width*2;
int x_terms_4 = f_width*3;
int output = f_width*4;
// extract the raw values
for (i = 0; i < x_count; i++) {
double x_val = SvNV(*av_fetch(x_arr, i, 0));
double nine_x = 9*x_val;
// compute x terms
double xt1_1 = nine_x-2;
aligned_mem[x_terms_1+i] = -1*(xt1_1*xt1_1)/4;
double xt2_1 = nine_x+1;
aligned_mem[x_terms_2+i] = -1*(xt2_1*xt2_1)/49;
double xt3_1 = nine_x-7;
aligned_mem[x_terms_3+i] = -1*(xt3_1*xt3_1)/4;
double xt4_1 = nine_x-4;
aligned_mem[x_terms_4+i] = -1*xt4_1*xt4_1;
}
// and for the y vals. No need to align
double *nine_y;
Newx(nine_y, y_count, double);
for (i = 0; i < y_count; i++) {
double y_val = SvNV(*av_fetch(y_arr, i, 0));
nine_y[i] = 9*y_val;
}
__m256d temp_v;
int k = 0;
for (j = 0; j < y_count; j++) {
// compute y terms
double yt1_1 = nine_y[j]-2;
double yt1 = (yt1_1*yt1_1)/4;
double yt2 = (nine_y[j]+1)/10;
double yt3_1 = nine_y[j]-3;
double yt3 = (yt3_1*yt3_1)/4;
double yt4_1 = nine_y[j]-7;
double yt4 = yt4_1*yt4_1;
// vectorizing the inner loop. This time use aligned load and store
for (i = 0; i < x_count - (x_count % 4); i += 4) {
__m256d x_term_1 = _mm256_load_pd(&aligned_mem[x_terms_1+i]);
__m256d x_term_2 = _mm256_load_pd(&aligned_mem[x_terms_2+i]);
__m256d x_term_3 = _mm256_load_pd(&aligned_mem[x_terms_3+i]);
__m256d x_term_4 = _mm256_load_pd(&aligned_mem[x_terms_4+i]);
__m256d y_term_1 = _mm256_set1_pd(yt1);
__m256d y_term_2 = _mm256_set1_pd(yt2);
__m256d y_term_3 = _mm256_set1_pd(yt3);
__m256d y_term_4 = _mm256_set1_pd(yt4);
// exp(x term 1 - y term 1)
temp_v = _mm256_sub_pd(x_term_1, y_term_1);
__m256d e_1 = Sleef_expd4_u10(temp_v);
// exp(x term 2 - y term 2)
temp_v = _mm256_sub_pd(x_term_2, y_term_2);
__m256d e_2 = Sleef_expd4_u10(temp_v);
// exp(x term 3 - y term 3)
temp_v = _mm256_sub_pd(x_term_3, y_term_3);
__m256d e_3 = Sleef_expd4_u10(temp_v);
// exp(x term 4 - y term 4)
temp_v = _mm256_sub_pd(x_term_4, y_term_4);
__m256d e_4 = Sleef_expd4_u10(temp_v);
// multiply by the constant parameters
temp_v = _mm256_set1_pd(0.75);
e_1 = _mm256_mul_pd(e_1, temp_v);
e_2 = _mm256_mul_pd(e_2, temp_v);
temp_v = _mm256_set1_pd(0.5);
e_3 = _mm256_mul_pd(e_3, temp_v);
temp_v = _mm256_set1_pd(0.2);
e_4 = _mm256_mul_pd(e_4, temp_v);
// final summation
temp_v = _mm256_add_pd(e_1, e_2);
temp_v = _mm256_add_pd(temp_v, e_3);
temp_v = _mm256_sub_pd(temp_v, e_4);
_mm256_store_pd(&aligned_mem[output+k+i], temp_v);
}
// deal with any tail elements
for (; i < x_count; i++) {
// compute exponentials
double e_1 = exp(aligned_mem[x_terms_1+i] - yt1);
double e_2 = exp(aligned_mem[x_terms_2+i] - yt2);
double e_3 = exp(aligned_mem[x_terms_3+i] - yt3);
double e_4 = exp(aligned_mem[x_terms_4+i] - yt4);
// final summation
aligned_mem[output+k+i] = 0.75*e_1 + 0.75*e_2 + 0.5*e_3 - 0.2*e_4;
}
k += f_width;
}
// create a Perl array for returning the processed data
AV *av = newAV();
// fill up the array
k = 0;
for (j = 0; j < y_count; j++) {
for (int i = 0; i < x_count; i++) {
av_push(av, newSVnv(aligned_mem[output+k+i]));
}
k += f_width;
}
// get a reference to that array to return
SV *sv = newRV_noinc((SV*)av);
// Before going release allocated memory
_mm_free(aligned_mem);
Safefree(nine_y);
// return
return sv;
}
END_OF_C_CODE