Matrix rotation can be described as
, which naive baseline is shown below.
#define RIDX(i,j,n) ((i)*(n)+(j))
void naive_rotate(int dim, pixel *src, pixel *dst)
{
int i, j;
for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++)
dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
}Rotate: Version = naive_rotate: Naive baseline implementation:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.4 1.5 3.7 5.5 7.7
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 10.3 26.3 12.7 12.0 12.2 13.8
For every inner iteration,
will be computed repeatedly, which causes performance loss. We can put the computation of
outside of the inner loop.
void rotate(int dim, pixel *src, pixel *dst)
{
int i, j;
int s, t;
s = dim * (dim - 1);
for (i = 0; i < dim; i++)
{
t = i * dim;
for (j = 0; j < dim; j++)
{
dst[s - j * dim + i] = src[t + j];
}
}
}Rotate: Version = rotate: Current working version:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.3 1.5 3.6 5.3 7.6
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 11.5 27.3 13.0 12.5 12.4 14.4
For a large dimension matrix, the memory access of dst and src will take long strides. We can unroll the inner loop to fit the size of cache. For computation convenience, we can simply unroll the parentheses like
.We set the base address n^2-n to dst and 0 to src, and decrease n for dst and increase 1 for src in a inner loop iteration.
void rotate(int dim, pixel *src, pixel *dst)
{
int BLOCK = 32; // Images' dimension are multiples of 32
dst += (dim * dim - dim);
for (int i = 0; i < dim / BLOCK; ++i)
{
for (int j = 0; j < dim; ++j)
{
for (int k = 0; k < BLOCK; ++k)
{
dst[k] = src[k * dim];
}
src += 1;
dst -= dim;
}
src += BLOCK * dim - dim;
dst += BLOCK + dim * dim;
}
}Rotate: Version = rotate: Current working version:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.1 1.1 1.1 1.5 2.0
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 13.2 35.9 43.6 44.3 46.8 33.6
The smooth value of an element in matrix is defined as
, which naive baseline is shown below.
typedef struct {
unsigned short red;
unsigned short green;
unsigned short blue;
} pixel;
typedef struct {
int red;
int green;
int blue;
int num;
} pixel_sum;
static void initialize_pixel_sum(pixel_sum *sum)
{
sum->red = sum->green = sum->blue = 0;
sum->num = 0;
return;
}
static void accumulate_sum(pixel_sum *sum, pixel p)
{
sum->red += (int) p.red;
sum->green += (int) p.green;
sum->blue += (int) p.blue;
sum->num++;
return;
}
static void assign_sum_to_pixel(pixel *current_pixel, pixel_sum sum)
{
current_pixel->red = (unsigned short) (sum.red/sum.num);
current_pixel->green = (unsigned short) (sum.green/sum.num);
current_pixel->blue = (unsigned short) (sum.blue/sum.num);
return;
}
static pixel avg(int dim, int i, int j, pixel *src)
{
int ii, jj;
pixel_sum sum;
pixel current_pixel;
initialize_pixel_sum(&sum);
for (ii = max(i - 1, 0); ii <= min(i + 1, dim - 1); ii++)
for (jj = max(j - 1, 0); jj <= min(j + 1, dim - 1); jj++)
accumulate_sum(&sum, src[RIDX(ii, jj, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
return current_pixel;
}
void naive_smooth(int dim, pixel *src, pixel *dst)
{
int i, j;
for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++)
dst[RIDX(i, j, dim)] = avg(dim, i, j, src);
}Smooth: Version = naive_smooth: Naive baseline implementation:
Dim 32 64 128 256 512 Mean
Your CPEs 44.1 44.4 41.4 46.1 46.7
Baseline CPEs 695.0 698.0 702.0 717.0 722.0
Speedup 15.7 15.7 17.0 15.6 15.4 15.9
According to the definition of Smooth, we can classify pixels into three categories.
Then we can avoid using min and max in every loop iteration, which will bring the consumption of branch prediction. The code after unrolling is shown below.
void smooth(int dim, pixel *src, pixel *dst)
{
int i, j;
int ni, ci, pi;
pixel_sum sum;
pixel current_pixel;
// top-left
i = 0;
j = 0;
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[RIDX(i, j, dim)]);
accumulate_sum(&sum, src[RIDX(i, j + 1, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j + 1, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[RIDX(i, j, dim)] = current_pixel;
// top-right
i = 0;
j = dim - 1;
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[RIDX(i, j, dim)]);
accumulate_sum(&sum, src[RIDX(i, j - 1, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j - 1, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[RIDX(i, j, dim)] = current_pixel;
// bottom -left
i = dim - 1;
j = 0;
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[RIDX(i, j, dim)]);
accumulate_sum(&sum, src[RIDX(i, j + 1, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j + 1, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[RIDX(i, j, dim)] = current_pixel;
// bottom-right
i = dim - 1;
j = dim - 1;
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[RIDX(i, j, dim)]);
accumulate_sum(&sum, src[RIDX(i, j - 1, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j - 1, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[RIDX(i, j, dim)] = current_pixel;
// top
i = 0;
for (j = 1; j <= dim - 2; j++)
{
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[RIDX(i, j, dim)]);
accumulate_sum(&sum, src[RIDX(i, j - 1, dim)]);
accumulate_sum(&sum, src[RIDX(i, j + 1, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j - 1, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j + 1, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[RIDX(i, j, dim)] = current_pixel;
}
// bottom
i = dim - 1;
for (j = 1; j <= dim - 2; j++)
{
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[RIDX(i, j, dim)]);
accumulate_sum(&sum, src[RIDX(i, j - 1, dim)]);
accumulate_sum(&sum, src[RIDX(i, j + 1, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j - 1, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j + 1, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[RIDX(i, j, dim)] = current_pixel;
}
// left
j = 0;
for (i = 1; i <= dim - 2; i++)
{
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[RIDX(i, j, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i, j + 1, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j + 1, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j + 1, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[RIDX(i, j, dim)] = current_pixel;
}
// right
j = dim - 1;
for (i = 1; i <= dim - 2; i++)
{
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[RIDX(i, j, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j, dim)]);
accumulate_sum(&sum, src[RIDX(i, j - 1, dim)]);
accumulate_sum(&sum, src[RIDX(i - 1, j - 1, dim)]);
accumulate_sum(&sum, src[RIDX(i + 1, j - 1, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[RIDX(i, j, dim)] = current_pixel;
}
// center
for (i = 1; i <= dim - 2; i++)
{
ni = (i - 1) * dim;
ci = i * dim;
pi = (i + 1) * dim;
for (j = 1; j <= dim - 2; j++)
{
initialize_pixel_sum(&sum);
accumulate_sum(&sum, src[ni + j - 1]);
accumulate_sum(&sum, src[ni + j]);
accumulate_sum(&sum, src[ni + j + 1]);
accumulate_sum(&sum, src[ci + j - 1]);
accumulate_sum(&sum, src[ci + j]);
accumulate_sum(&sum, src[ci + j + 1]);
accumulate_sum(&sum, src[pi + j - 1]);
accumulate_sum(&sum, src[pi + j]);
accumulate_sum(&sum, src[pi + j + 1]);
assign_sum_to_pixel(¤t_pixel, sum);
dst[ci + j] = current_pixel;
}
}
}Smooth: Version = smooth: Current working version:
Dim 32 64 128 256 512 Mean
Your CPEs 9.8 12.4 10.9 10.5 10.6
Baseline CPEs 695.0 698.0 702.0 717.0 722.0
Speedup 71.1 56.4 64.1 68.3 68.0 65.4