Algorithm Selection LAB 5 in x86_64 and AArch64 - part 1

#spo600 #lab5

Introduction

Hi this is Tecca, and in this post I will be comparing the relative performance of various algorithms on a same machine across several implementations of AArch64 and x86_64 systems.

Source code
vol.h

/* This is the number of samples to be processed */
#define SAMPLES 16

/* This is the volume scaling factor to be used */
#define VOLUME 50.0 // Percent of original volume

/* Function prototype to fill an array sample of
 * length sample_count with random int16_t numbers
 * to simulate an audio buffer */
void vol_createsample(int16_t* sample, int32_t sample_count);

vol.h controls the number of samples(16) to be processed and the volume level(50) to be used

In vol.h a large number of sample for the algorithms to process seems to be reasonable, because it will allow us to analyze the differences in terms of performance much easier.

vol0.c

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"

int16_t scale_sample(int16_t sample, int volume) {
        return (int16_t) ((float) (volume/100.0) * (float) sample);
}

int main() {
        int             x;
        int             ttl=0;

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);


// ---- Scale the samples from in[], placing results in out[]
        for (x = 0; x < SAMPLES; x++) {
                out[x]=scale_sample(in[x], VOLUME);
        }

// ---- This part sums the samples
        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }

// ---- Print the sum of the samples
        printf("Result: %d\n", ttl);
        return 0;
}

In vol0.c, Audio samples are multiplied by the volume scaling factor, casting between signed 16-bit integers and floating-point values. This way takes up a lot of resources.

vol1.c

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"

int16_t scale_sample(int16_t sample, int volume) {
        return ((((int32_t) sample) * ((int32_t) (32767 * volume / 100) <<1) ) >> 16);
}

int main() {
        int             x;
        int             ttl=0;

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);

// ---- Scale the samples from in[], placing results in out[]
        for (x = 0; x < SAMPLES; x++) {
                out[x]=scale_sample(in[x], VOLUME);
        }

// ---- This part sums the samples
        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }

// ---- Print the sum of the samples
        printf("Result: %d\n", ttl);
        return 0;
}

vol1.c utilizes a fixed-point calculation. This avoids the cost of repetitively casting between integer and floating point.

vol2.c

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"

int main() {
        int             x;
        int             ttl=0;

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

        static int16_t* precalc;

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);

// ---- Scale the samples from in[], placing results in out[]

        precalc = (int16_t*) calloc(65536,2);
        if (precalc == NULL) {
                printf("malloc failed!\n");
                return 1;
        }

        for (x = -32768; x <= 32767; x++) {
                precalc[(uint16_t) x] = (int16_t) ((float) x * VOLUME / 100.0);
        }

        for (x = 0; x < SAMPLES; x++) {
                out[x]=precalc[(uint16_t) in[x]];
        }

// ---- This part sums the samples
        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }

// ---- Print the sum of the samples
        printf("Result: %d\n", ttl);
        return 0;
}

Unlike vol0.c and vol1.c, vol2.c pre-calculates all 65535 results, looking up answers for each input value afterward.

vol3.c

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"

int16_t scale_sample(int16_t sample, int volume) {
        return (int16_t) 100;
}

int main() {
        int             x;
        int             ttl=0;

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);

// ---- Scale the samples from in[], placing results in out[]
        for (x = 0; x < SAMPLES; x++) {
                out[x]=scale_sample(in[x], VOLUME);
        }

// ---- This part sum the samples
        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }

// ---- Print the sum of the samples
        printf("Result: %d\n", ttl);
        return 0;
}

vol3.c returns an identical sample value, the purpose of this program seems to be a baseline to compare to the other scaling volume algorithm.

vol4.c

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include "vol.h"

int main() {

#ifndef __aarch64__
        printf("Wrong architecture - written for aarch64 only.\n");
#else
        // these variables will also be accessed by our assembler code
        int16_t*        in_cursor;              // input cursor
        int16_t*        out_cursor;             // output cursor
        int16_t         vol_int;                // volume as int16_t

        int16_t*        limit;                  // end of input array

        int             x;                      // array interator
        int             ttl=0 ;                 // array total

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);

// ---- Scale the samples from in[], placing results in out[]
        // set vol_int to fixed-point representation of the volume factor
        vol_int = (int16_t)(VOLUME/100.0 * 32767.0);

        in_cursor = in;
        out_cursor = out;
        limit = in + SAMPLES;

        __asm__ ("dup v1.8h,%w0"::"r"(vol_int)); // duplicate vol_int into v1.8h

        while ( in_cursor < limit ) {
                __asm__ (
                        "ldr q0, [%[in_cursor]], #16    \n\t"
                        // load eight samples into q0 (same as v0.8h)
                        // from [in_cursor]
                        // post-increment in_cursor by 16 bytes
                        // ans store back into the pointer register

                        "sqrdmulh v0.8h, v0.8h, v1.8h   \n\t"
                        // with 32 signed integer output,
                        // multiply each lane in v0 * v1 * 2
                        // saturate results
                        // store upper 16 bits of results into
                        // the corresponding lane in v0

                        "str q0, [%[out_cursor]],#16            \n\t"
                        // store eight samples to [out_cursor]
                        // post-increment out_cursor by 16 bytes
                        // and store back into the pointer register

                        : [in_cursor]"+r"(in_cursor), [out_cursor]"+r"(out_cursor)
                        : "r"(in_cursor),"r"(out_cursor)
                        : "memory"
                        );
        }

// --------------------------------------------------------------------

        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }

        printf("Result: %d\n", ttl);
        return 0;

#endif
}

vol4.c uses the SIMD (Single Input, Multiple Data) instructions accessed through inline assembly. Which is only available on AArch64 architectures.

vol5.c

#include <stdint.h>
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#include "vol.h"

int main() {

#ifndef __aarch64__
        printf("Wrong architecture - written for aarch64 only.\n");
#else

        register int16_t*       in_cursor       asm("r20");     // input cursor (pointer)
        register int16_t*       out_cursor      asm("r21");     // output cursor (pointer)
        register int16_t        vol_int         asm("r22");     // volume as int16_t

        int16_t*                limit;          // end of input array

        int                     x;              // array interator
        int                     ttl=0;          // array total

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);


// ---- Scale the samples from in[], placing results in out[]
        vol_int = (int16_t) (VOLUME/100.0 * 32767.0);

        in_cursor = in;
        out_cursor = out;
        limit = in + SAMPLES ;

        while ( in_cursor < limit ) {
                vst1q_s16(out_cursor, vqrdmulhq_s16(vld1q_s16(in_cursor), vdupq_n_s16(vol_int)));


                in_cursor += 8;
                out_cursor += 8;
        }

// --------------------------------------------------------------------

        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }


        printf("Result: %d\n", ttl);
        return 0;
#endif
}

vol5.c like vol4.c also utilize SIMD instruction but with complier intrinsic built into the compiler. vol5.c is also specific to AArch64 due to usage of unique instructions of AArch64 architecture.

Above code from vol0.c to vol5.c implement the various algorithms that will be use to test the performance.

vol_createsample.c

#include <stdlib.h>
#include <stdint.h>
#include "vol.h"

void vol_createsample(int16_t* sample, int32_t sample_count) {
        int i;
        for (i=0; i<sample_count; i++) {
                sample[i] = (rand()%65536)-32768;
        }
        return;
}

vol_createsample.c contains the function vol_createsample(int16_t* sample, int32_t sample_count) that will be use to create dummy samples for the algorithms to run with.

Conclusion
In this post we've examine a couple of algorithms for adjusting volume samples. We are aware of how each algorithm differed in their approach of achieving the same goal. In the next post, we will see how the performance of each program is and benchmarking it to prove the expectation.

DEV Community

Algorithm Selection LAB 5 in x86_64 and AArch64 - part 1

Top comments (0)

Read next

Docker and Kubernetes Integration: The Ultimate Solution for Containerized Applications

Docker Autoscaling: Dynamically Adjust Containers Based on Demand

Docker with Helm: Simplifying Kubernetes Deployment and Management

A Comprehensive Guide to Grasping Quantum Computing