-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_ge_zero.c
97 lines (91 loc) · 3.07 KB
/
count_ge_zero.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include <stdio.h>
#include <emmintrin.h>
#define SIZE_LIMIT 100000000000
#define TRIES 30
// Solution 1: Basic for-loop linear scan
// Twice faster than Solution 2
// By 3.5 slower than Solution 3
size_t solution_basic(int16_t* input, size_t size) {
size_t count = 0;
int16_t* end = input + size;
while(input < end) {
if (*input >= 0) {
++count;
}
++input;
}
return count;
}
// Solution 2: First solution, than I've made
// The problem here is _mm_store_si128((__m128i*)res, r_part);
// as it stores data in RAM, that ruins performance as
// access to memory is an expensive operation
size_t solution_half_simd(int16_t* input, size_t size) {
int16_t* res = malloc(sizeof(int16_t) * size);
int16_t* result = res;
__m128i r_zeros = _mm_setzero_si128(),
r_neg_ones = _mm_cmpeq_epi16(r_zeros, r_zeros),
r_part;
int16_t* end = input + size;
for (int16_t* current = input; current < end; current += 8, res += 8) {
r_part = _mm_load_si128((__m128i*)current);
r_part = _mm_cmpgt_epi16(r_part, r_neg_ones);
_mm_store_si128((__m128i*)res, r_part);
}
size_t count = 0;
for (size_t i = 0; i < size; ++i) {
count -= result[i];
}
free(result);
return count;
}
// Solution 3: My best attempt
// result is stored in register, so I don't
// need to touch RAM unless process is complete
size_t solution_max_simd(int16_t* input, size_t size) {
__m128i r_sum = _mm_setzero_si128(),
r_neg_ones = _mm_cmpeq_epi16(r_sum, r_sum),
r_part;
int16_t* end = input + size;
for (int16_t* current = input; current < end; current += 8) {
r_part = _mm_load_si128((__m128i*) current);
r_part = _mm_cmpgt_epi16(r_part, r_neg_ones);
r_sum = _mm_add_epi16(r_sum, r_part);
}
size_t count = 0;
int16_t* sum = malloc(sizeof(int16_t) * 8);
_mm_store_si128((__m128i*) sum, r_sum);
for (size_t i = 0; i < 8; ++i) {
count -= sum[i];
}
free(sum);
return count;
}
// Stress test for given solution
void check(size_t (*calc)(int16_t*, size_t), int16_t* input, size_t size) {
u_int64_t elapsed_us = 0;
for (int t = 0; t < TRIES; ++t) {
struct timeval tstart, tend;
gettimeofday(&tstart, NULL);
int32_t cnt = calc(input, size);
gettimeofday(&tend, NULL);
elapsed_us += (tend.tv_sec - tstart.tv_sec) * 1000000 + tend.tv_usec - tstart.tv_usec;
}
elapsed_us /= TRIES;
printf("t=%llu us\n", elapsed_us);
}
int main() {
for (size_t size = 8; size < SIZE_LIMIT; size *= 2) {
int16_t* input = malloc(sizeof(int16_t) * size);
for (int i = 0; i < size; ++i) {
input[i] = i % 2 == 0 ? i : -i;
printf("\n`size=%d\nfor-loop:\t", size);
check(solution_basic, input, size);
printf("simd-mistake:\t");
check(solution_half_simd, input, size);
printf("simd-better:\t");
check(solution_max_simd, input, size);
free(input);
}
return 0;
}