Introduction
Hi this is Tecca, and in this post I will be comparing the relative performance of various algorithms on a same machine across several implementations of AArch64 and x86_64 systems.
Source code
vol.h
/* This is the number of samples to be processed */
#define SAMPLES 16
/* This is the volume scaling factor to be used */
#define VOLUME 50.0 // Percent of original volume
/* Function prototype to fill an array sample of
* length sample_count with random int16_t numbers
* to simulate an audio buffer */
void vol_createsample(int16_t* sample, int32_t sample_count);
vol.h controls the number of samples(16) to be processed and the volume level(50) to be used
In vol.h a large number of sample for the algorithms to process seems to be reasonable, because it will allow us to analyze the differences in terms of performance much easier.
vol0.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"
int16_t scale_sample(int16_t sample, int volume) {
return (int16_t) ((float) (volume/100.0) * (float) sample);
}
int main() {
int x;
int ttl=0;
// ---- Create in[] and out[] arrays
int16_t* in;
int16_t* out;
in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
// ---- Create dummy samples in in[]
vol_createsample(in, SAMPLES);
// ---- Scale the samples from in[], placing results in out[]
for (x = 0; x < SAMPLES; x++) {
out[x]=scale_sample(in[x], VOLUME);
}
// ---- This part sums the samples
for (x = 0; x < SAMPLES; x++) {
ttl=(ttl+out[x])%1000;
}
// ---- Print the sum of the samples
printf("Result: %d\n", ttl);
return 0;
}
In vol0.c, Audio samples are multiplied by the volume scaling factor, casting between signed 16-bit integers and floating-point values. This way takes up a lot of resources.
vol1.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"
int16_t scale_sample(int16_t sample, int volume) {
return ((((int32_t) sample) * ((int32_t) (32767 * volume / 100) <<1) ) >> 16);
}
int main() {
int x;
int ttl=0;
// ---- Create in[] and out[] arrays
int16_t* in;
int16_t* out;
in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
// ---- Create dummy samples in in[]
vol_createsample(in, SAMPLES);
// ---- Scale the samples from in[], placing results in out[]
for (x = 0; x < SAMPLES; x++) {
out[x]=scale_sample(in[x], VOLUME);
}
// ---- This part sums the samples
for (x = 0; x < SAMPLES; x++) {
ttl=(ttl+out[x])%1000;
}
// ---- Print the sum of the samples
printf("Result: %d\n", ttl);
return 0;
}
vol1.c utilizes a fixed-point calculation. This avoids the cost of repetitively casting between integer and floating point.
vol2.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"
int main() {
int x;
int ttl=0;
// ---- Create in[] and out[] arrays
int16_t* in;
int16_t* out;
in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
static int16_t* precalc;
// ---- Create dummy samples in in[]
vol_createsample(in, SAMPLES);
// ---- Scale the samples from in[], placing results in out[]
precalc = (int16_t*) calloc(65536,2);
if (precalc == NULL) {
printf("malloc failed!\n");
return 1;
}
for (x = -32768; x <= 32767; x++) {
precalc[(uint16_t) x] = (int16_t) ((float) x * VOLUME / 100.0);
}
for (x = 0; x < SAMPLES; x++) {
out[x]=precalc[(uint16_t) in[x]];
}
// ---- This part sums the samples
for (x = 0; x < SAMPLES; x++) {
ttl=(ttl+out[x])%1000;
}
// ---- Print the sum of the samples
printf("Result: %d\n", ttl);
return 0;
}
Unlike vol0.c and vol1.c, vol2.c pre-calculates all 65535 results, looking up answers for each input value afterward.
vol3.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"
int16_t scale_sample(int16_t sample, int volume) {
return (int16_t) 100;
}
int main() {
int x;
int ttl=0;
// ---- Create in[] and out[] arrays
int16_t* in;
int16_t* out;
in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
// ---- Create dummy samples in in[]
vol_createsample(in, SAMPLES);
// ---- Scale the samples from in[], placing results in out[]
for (x = 0; x < SAMPLES; x++) {
out[x]=scale_sample(in[x], VOLUME);
}
// ---- This part sum the samples
for (x = 0; x < SAMPLES; x++) {
ttl=(ttl+out[x])%1000;
}
// ---- Print the sum of the samples
printf("Result: %d\n", ttl);
return 0;
}
vol3.c returns an identical sample value, the purpose of this program seems to be a baseline to compare to the other scaling volume algorithm.
vol4.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include "vol.h"
int main() {
#ifndef __aarch64__
printf("Wrong architecture - written for aarch64 only.\n");
#else
// these variables will also be accessed by our assembler code
int16_t* in_cursor; // input cursor
int16_t* out_cursor; // output cursor
int16_t vol_int; // volume as int16_t
int16_t* limit; // end of input array
int x; // array interator
int ttl=0 ; // array total
// ---- Create in[] and out[] arrays
int16_t* in;
int16_t* out;
in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
// ---- Create dummy samples in in[]
vol_createsample(in, SAMPLES);
// ---- Scale the samples from in[], placing results in out[]
// set vol_int to fixed-point representation of the volume factor
vol_int = (int16_t)(VOLUME/100.0 * 32767.0);
in_cursor = in;
out_cursor = out;
limit = in + SAMPLES;
__asm__ ("dup v1.8h,%w0"::"r"(vol_int)); // duplicate vol_int into v1.8h
while ( in_cursor < limit ) {
__asm__ (
"ldr q0, [%[in_cursor]], #16 \n\t"
// load eight samples into q0 (same as v0.8h)
// from [in_cursor]
// post-increment in_cursor by 16 bytes
// ans store back into the pointer register
"sqrdmulh v0.8h, v0.8h, v1.8h \n\t"
// with 32 signed integer output,
// multiply each lane in v0 * v1 * 2
// saturate results
// store upper 16 bits of results into
// the corresponding lane in v0
"str q0, [%[out_cursor]],#16 \n\t"
// store eight samples to [out_cursor]
// post-increment out_cursor by 16 bytes
// and store back into the pointer register
: [in_cursor]"+r"(in_cursor), [out_cursor]"+r"(out_cursor)
: "r"(in_cursor),"r"(out_cursor)
: "memory"
);
}
// --------------------------------------------------------------------
for (x = 0; x < SAMPLES; x++) {
ttl=(ttl+out[x])%1000;
}
printf("Result: %d\n", ttl);
return 0;
#endif
}
vol4.c uses the SIMD (Single Input, Multiple Data) instructions accessed through inline assembly. Which is only available on AArch64 architectures.
vol5.c
#include <stdint.h>
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#include "vol.h"
int main() {
#ifndef __aarch64__
printf("Wrong architecture - written for aarch64 only.\n");
#else
register int16_t* in_cursor asm("r20"); // input cursor (pointer)
register int16_t* out_cursor asm("r21"); // output cursor (pointer)
register int16_t vol_int asm("r22"); // volume as int16_t
int16_t* limit; // end of input array
int x; // array interator
int ttl=0; // array total
// ---- Create in[] and out[] arrays
int16_t* in;
int16_t* out;
in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
// ---- Create dummy samples in in[]
vol_createsample(in, SAMPLES);
// ---- Scale the samples from in[], placing results in out[]
vol_int = (int16_t) (VOLUME/100.0 * 32767.0);
in_cursor = in;
out_cursor = out;
limit = in + SAMPLES ;
while ( in_cursor < limit ) {
vst1q_s16(out_cursor, vqrdmulhq_s16(vld1q_s16(in_cursor), vdupq_n_s16(vol_int)));
in_cursor += 8;
out_cursor += 8;
}
// --------------------------------------------------------------------
for (x = 0; x < SAMPLES; x++) {
ttl=(ttl+out[x])%1000;
}
printf("Result: %d\n", ttl);
return 0;
#endif
}
vol5.c like vol4.c also utilize SIMD instruction but with complier intrinsic built into the compiler. vol5.c is also specific to AArch64 due to usage of unique instructions of AArch64 architecture.
Above code from vol0.c to vol5.c implement the various algorithms that will be use to test the performance.
vol_createsample.c
#include <stdlib.h>
#include <stdint.h>
#include "vol.h"
void vol_createsample(int16_t* sample, int32_t sample_count) {
int i;
for (i=0; i<sample_count; i++) {
sample[i] = (rand()%65536)-32768;
}
return;
}
vol_createsample.c contains the function vol_createsample(int16_t* sample, int32_t sample_count) that will be use to create dummy samples for the algorithms to run with.
Conclusion
In this post we've examine a couple of algorithms for adjusting volume samples. We are aware of how each algorithm differed in their approach of achieving the same goal. In the next post, we will see how the performance of each program is and benchmarking it to prove the expectation.
Top comments (0)