Hey,
I experimented the other day with some algorithms written in Akka and Golang (https://breaking-the-system.blogspot.com/2022/10/how-we-can-learn-parallel-computing.html), which are highly parallelised/threaded. I tested them on AMD Threadripper 2950X/3960X CPUs and on the Apple M1 and M1 Max. The results really shocked me. The Apple chips beat the Threadrippers by a long shot, even when the TRs were overclocked to 250W and 350W+. This is surprising because the Threadrippers were among the fastest CPUs money could buy in their time, they cost far more than the Apple chips (not even counting the insane cooling and electricity they need), and the algorithm is highly parallelized.
In this post I want to experiment further with the Apple processors to find out what the hell is happening here. I wanted to test "memory parallelism", which basically means how much work the processor can keep doing while it waits for memory, and how many memory accesses the CPU can have in flight in parallel.
I searched the web and found this really popular article (https://lemire.me/blog/2021/01/06/memory-access-on-the-apple-m1-processor/) claiming 26x memory parallelism (to be fair, they wrote "or more"), which did not make much sense to me. The M1 has a low frequency and high memory latency.
//
// main.c
// test-loop-branch
//
// Created by Yaron Shani on 04/10/2022.
//
#include <stdio.h> | |
#include <stdlib.h> | |
#include <time.h> | |
/* One node of a singly linked list: an integer payload plus a link to the
 * node that follows it. */
typedef struct node {
    int val;            /* payload stored in this node */
    struct node *next;  /* following node in the list */
} node_t;
int main(int argc, const char * argv[]) { | |
long long arr_si = 1000*1000*300; | |
srand(time(NULL)); | |
node_t * head = NULL; | |
head = (node_t *) malloc(sizeof(node_t)); | |
if (head == NULL) { | |
return 1; | |
} | |
head->val = -1; | |
head->next = NULL; | |
node_t * lastNode = head; | |
for(long long i = 0; i < arr_si; i++) { | |
node_t *tmpNode = (node_t *) malloc(sizeof(node_t)); | |
if (tmpNode == NULL) { | |
return 1; | |
} | |
tmpNode->val = rand(); | |
tmpNode->next = NULL; | |
lastNode->next = tmpNode; | |
lastNode = tmpNode; | |
} | |
long long oo = 0; | |
clock_t start, end; | |
double cpu_time_used; | |
start = clock(); | |
node_t * current = head; | |
long long coun = 0; | |
long long count2 = 0; | |
while (current != NULL) { | |
coun += current->val; | |
current = current->next; | |
//count2 = count2 +1; | |
} | |
end = clock(); | |
cpu_time_used = ((double) (end - start) / CLOCKS_PER_SEC); | |
printf("fun() took %f seconds to execute \n", cpu_time_used); | |
printf("%i64\n", oo); | |
return 0; | |
} |
The M1 took 0.45 seconds, while the Threadripper 2950X took 2.42 seconds.
No comments:
Post a Comment