Graviton vs. M1 vs. x86: FFT Performance

AWS states that Graviton (ARM) is 30% faster than x86; however, I found that the FFT on Graviton is about 10% slower than on x86. Below are the setup and the results. It may be that I have not set up numpy correctly for Graviton. In any case, the multi-threaded run is about 4x faster than the single-threaded one; I think the 10240 MB Lambda has 6 vCPUs in this case.

Lambda configuration

memory 10240 MB
timeout 90 seconds
deploy via ecr image and CDK
EC2 ARM64 to build the image for the Graviton
language python
numpy 1.22.1 and numpy.fft.fft
np.fft.fft(np.random.randint(0, 1000, (4098, 600)))

I deploy the Lambda function as an ECR container image.

Dockerfile

# start from the public AWS Lambda Python 3.8 image
FROM public.ecr.aws/lambda/python:3.8

# create a "source" directory under the Lambda task root inside the container
RUN mkdir ${LAMBDA_TASK_ROOT}/source

# copy the whole build context (code and requirements.txt) into that directory
COPY . ${LAMBDA_TASK_ROOT}/source

# copy the handler module to the task root itself
COPY ./handler.py ${LAMBDA_TASK_ROOT}

# install the runtime dependencies into the task root
# NOTE(review): the relative ./source path assumes the working directory is
# ${LAMBDA_TASK_ROOT} -- confirm against the base image
RUN pip3 install -r ./source/requirements.txt --target "${LAMBDA_TASK_ROOT}"

# set the CMD to the handler: module "handler", function "lambda_handler"
CMD [ "handler.lambda_handler"]

lambda handler


import json
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import numpy as np

def single_thread_fft(sig):
    """
    Run the FFT over every array in ``sig`` sequentially and time it.

    Args:
        sig: iterable of array-likes; ``np.fft.fft`` is applied to each
            one along axis 0. The FFT results are discarded.

    Returns:
        Elapsed time in seconds as a float. The elapsed time is also
        printed in milliseconds.
    """
    # perf_counter is monotonic and high resolution, so the measurement
    # cannot be skewed (or made negative) by wall-clock adjustments the
    # way a datetime.now() difference can.
    start_time = time.perf_counter()
    for x in sig:
        np.fft.fft(x, axis=0)
    delta_time = time.perf_counter() - start_time
    print("single thread running time {0} ms".format(delta_time * 1000))
    return delta_time

def multi_thread_fft(sig, max_workers=4):
    """
    Run the FFT over every array in ``sig`` on a thread pool and time it.

    Args:
        sig: iterable of array-likes; one ``np.fft.fft`` task (along
            axis 0) is submitted to the pool per element. The FFT
            results are discarded.
        max_workers: number of worker threads in the pool (default 4).

    Returns:
        Elapsed time in seconds as a float, measured from before the pool
        is created until every task has finished. The elapsed time is
        also printed in milliseconds.

    Raises:
        Exception: whatever ``np.fft.fft`` raised inside a worker is
            re-raised here instead of being silently dropped.
    """
    # perf_counter is monotonic, unlike a datetime.now() difference.
    start_time = time.perf_counter()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(np.fft.fft, x, axis=0) for x in sig]
        # Calling result() surfaces any exception raised in a worker;
        # without it a failed FFT would still be reported as a timing.
        for future in futures:
            future.result()
    delta_time = time.perf_counter() - start_time
    print("multi thread running time {0} ms".format(delta_time * 1000))
    return delta_time


def lambda_handler(event, context):
    """
    Lambda entry point: benchmark single- vs multi-threaded FFT.

    Generates four random integer arrays of shape (4098, 600), times the
    FFT over them with ``single_thread_fft`` and ``multi_thread_fft``,
    and returns both timings.

    Args:
        event: invocation event (unused).
        context: Lambda context object (unused).

    Returns:
        A dict with ``statusCode`` 200, CORS headers, and a ``body`` that
        is a JSON object with the keys ``"single thread"`` and
        ``"multi thread"``, each holding the elapsed time in milliseconds.
    """
    # four random test signals, one FFT task each
    sig = [np.random.randint(0, 1000, (4098, 600)) for _ in range(4)]
    # single thread
    single_thread_time = single_thread_fft(sig)
    # multi thread
    multi_thread_time = multi_thread_fft(sig)
    # A real dict is serialised here. The previous ``{"...".format(...)}``
    # was a set literal, which only serialised through ``default=str`` and
    # produced the repr of a set wrapped in a JSON string.
    timings_ms = {
        "single thread": single_thread_time * 1000,
        "multi thread": multi_thread_time * 1000,
    }
    # response
    return {
        'statusCode': 200,
        'headers': {
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Allow-Headers": "Content-Type",
            "Access-Control-Allow-Methods": "OPTIONS,GET"
        },
        'body': json.dumps(timings_ms, indent=4, sort_keys=True)
    }

CDK lambda api gateway stack

class LambdaFFTArm(Stack):
    """
    Stack that deploys the FFT benchmark container image as an ARM64
    Lambda function and puts an API Gateway REST API in front of it.

    The API endpoint URL is exposed as the ``Url`` stack output and kept
    on ``self.url_output``.
    """

    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # reference the ECR repository that holds the image, by name
        image_repository = aws_ecr.Repository.from_repository_name(
            self,
            id="LambdaFFTArmImage",
            repository_name="lambda-fft-arm-image",
        )
        image_code = aws_lambda.EcrImageCode.from_ecr_image(
            repository=image_repository,
        )

        # Lambda function: handler and runtime both come from the image
        fft_function = aws_lambda.Function(
            self,
            id="LambdaFFTArm",
            code=image_code,
            architecture=aws_lambda.Architecture.ARM_64,
            handler=aws_lambda.Handler.FROM_IMAGE,
            runtime=aws_lambda.Runtime.FROM_IMAGE,
            memory_size=10240,
            timeout=Duration.seconds(90),
        )

        # REST API that forwards requests to the function
        rest_api = aws_apigateway.LambdaRestApi(
            self,
            id="ApiLambdaFFTArm",
            handler=fft_function,
        )

        # publish the API endpoint as a stack output
        self.url_output = CfnOutput(self, "Url", value=rest_api.url)