#include <stdlib.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
| |
// Chunk size in bytes (128 KiB) used by the --dummy code paths in main().
// Parenthesized so the macro expands safely inside larger expressions such as
// `CACHE_HIT_SIZE + 1` or `2 * CACHE_HIT_SIZE`.
#define CACHE_HIT_SIZE (1 << 17)
| |
| using namespace std; |
| |
// Benchmark configuration and state shared with main().

// Smallest buffer size to test, in bytes (--start overrides it, given in MB).
size_t size_start = 64u;
// Largest buffer size to test, in bytes; also the size of both allocated buffers.
size_t size_end = 16ull << 20;
// Number of sizes to sample between size_start and size_end (--samples).
size_t samples = 1u << 11;
// Byte budget per sampled size: the iteration count is size_per_test / size.
size_t size_per_test = 64ull << 20;
// Running total of the values returned by sum() in the --sum benchmark.
size_t tot_sum = 0u;
// Divisor from --delay; when non-zero each iteration sleeps for
// size_per_test / delay nanoseconds.
size_t delay = 0u;
// Not referenced by the code visible in this part of the file.
float speed = 0.0f;
// Set by --dummy; switches memcpy/memset to the fixed-size chunk path.
bool dummy = false;
| |
// Benchmark kernels called from main(). Only the declarations appear here;
// the definitions are not in this part of the file. The noinline attribute
// asks the compiler not to inline them into the timing loops.
__attribute__((noinline)) void memcpy_noinline(void *dst, void *src, size_t size);
__attribute__((noinline)) void memset_noinline(void *dst, int value, size_t size);
// NOTE(review): presumably reduces `size` bytes at `src` to a single value;
// confirm against the definition.
__attribute__((noinline)) uint64_t sum(volatile void *src, size_t size);
| |
// Identifies which benchmark main() runs; selected on the command line with
// --memcpy, --memset or --sum.
enum BenchType {
    MemcpyBench = 0,  // main() starts with this one selected
    MemsetBench = 1,
    SumBench = 2,
};
| |
| static void usage(char* p) { |
| printf("Usage: %s <test> <options>\n" |
| "<test> is one of the following:\n" |
| " --memcpy\n" |
| " --memset\n" |
| " --sum\n" |
| "<options> are optional and apply to all tests:\n" |
| " --dummy\n" |
| " Simulates cpu-only load of a test. Guaranteed to use L2\n" |
| " instead. Not supported on --sum test.\n" |
| " --delay DELAY_DIVISOR\n" |
| " --start START_SIZE_MB\n" |
| " --end END_SIZE_MB (requires start, optional)\n" |
| " --samples NUM_SAMPLES\n" |
| , p); |
| } |
| |
| int main(int argc, char *argv[]) |
| { |
| BenchType type = MemcpyBench; |
| if (argc <= 1) { |
| usage(argv[0]); |
| return 0; |
| } |
| for (int i = 1; i < argc; i++) { |
| if (string(argv[i]) == string("--memcpy")) { |
| type = MemcpyBench; |
| } else if (string(argv[i]) == string("--memset")) { |
| type = MemsetBench; |
| } else if (string(argv[i]) == string("--sum")) { |
| type = SumBench; |
| } else if (string(argv[i]) == string("--dummy")) { |
| dummy = true; |
| } else if (i + 1 < argc) { |
| if (string(argv[i]) == string("--delay")) { |
| delay = atoi(argv[++i]); |
| } else if (string(argv[i]) == string("--start")) { |
| size_start = atoi(argv[++i]) * (1ull << 20); |
| size_end = size_start; |
| } else if (string(argv[i]) == string("--end")) { |
| size_t end = atoi(argv[++i]) * (1ull << 20); |
| if (end > size_start && i > 3 |
| && string(argv[i-3]) == string("--start")) { |
| size_end = end; |
| } else { |
| printf("Cannot specify --end without --start.\n"); |
| return 0; |
| } |
| } else if (string(argv[i]) == string("--samples")) { |
| samples = atoi(argv[++i]); |
| } else { |
| printf("Unknown argument %s\n", argv[i]); |
| return 0; |
| } |
| } else { |
| printf("The %s option requires a single argument.\n", argv[i]); |
| return 0; |
| } |
| } |
| |
| unique_ptr<uint8_t[]> src(new uint8_t[size_end]); |
| unique_ptr<uint8_t[]> dst(new uint8_t[size_end]); |
| memset(src.get(), 1, size_end); |
| |
| double start_pow = log10(size_start); |
| double end_pow = log10(size_end); |
| double pow_inc = (end_pow - start_pow) / samples; |
| |
| //cout << "src: " << (uintptr_t)src.get() << endl; |
| //cout << "dst: " << (uintptr_t)dst.get() << endl; |
| |
| for (double cur_pow = start_pow; cur_pow <= end_pow && samples > 0; |
| cur_pow += pow_inc) { |
| chrono::time_point<chrono::high_resolution_clock> |
| copy_start, copy_end, pre_wait; |
| |
| size_t cur_size = (size_t)pow(10.0, cur_pow); |
| size_t iter_per_size = size_per_test / cur_size; |
| |
| // run benchmark |
| switch (type) { |
| case MemsetBench: { |
| memcpy_noinline(src.get(), dst.get(), cur_size); |
| memset_noinline(dst.get(), 0xdeadbeef, cur_size); |
| size_t hit_size = CACHE_HIT_SIZE; |
| copy_start = chrono::high_resolution_clock::now(); |
| for (int i = 0; i < iter_per_size; i++) { |
| if (!dummy) { |
| memset_noinline(dst.get(), 0xdeadbeef, cur_size); |
| } else { |
| while (hit_size < cur_size) { |
| memset_noinline |
| (dst.get(), 0xdeadbeef, CACHE_HIT_SIZE); |
| hit_size += 1 << 17; |
| } |
| } |
| if (delay != 0) |
| this_thread::sleep_for(chrono |
| ::nanoseconds(size_per_test / delay)); |
| } |
| copy_end = chrono::high_resolution_clock::now(); |
| break; |
| } |
| case MemcpyBench: { |
| memcpy_noinline(dst.get(), src.get(), cur_size); |
| memcpy_noinline(src.get(), dst.get(), cur_size); |
| size_t hit_size = CACHE_HIT_SIZE; |
| copy_start = chrono::high_resolution_clock::now(); |
| for (int i = 0; i < iter_per_size; i++) { |
| if (!dummy) { |
| memcpy_noinline(dst.get(), src.get(), cur_size); |
| } else { |
| while (hit_size < cur_size) { |
| memcpy_noinline |
| (dst.get(), src.get(), CACHE_HIT_SIZE); |
| hit_size += CACHE_HIT_SIZE; |
| } |
| } |
| if (delay != 0) |
| this_thread::sleep_for(chrono |
| ::nanoseconds(size_per_test / delay)); |
| } |
| copy_end = chrono::high_resolution_clock::now(); |
| break; |
| } |
| case SumBench: { |
| uint64_t s = 0; |
| s += sum(src.get(), cur_size); |
| copy_start = chrono::high_resolution_clock::now(); |
| for (int i = 0; i < iter_per_size; i++) { |
| s += sum(src.get(), cur_size); |
| if (delay != 0) |
| this_thread::sleep_for(chrono |
| ::nanoseconds(size_per_test / delay)); |
| } |
| copy_end = chrono::high_resolution_clock::now(); |
| tot_sum += s; |
| break; |
| } |
| } |
| |
| samples--; |
| double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size); |
| double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9); |
| if (type == MemcpyBench) |
| gb_per_sec *= 2.0; |
| double percent_waiting = 0; |
| if (delay != 0) { |
| percent_waiting = (size_per_test / delay) / ns_per_copy * 100; |
| } |
| cout << "size: " << cur_size << ", perf: " << gb_per_sec |
| << "GB/s, iter: " << iter_per_size << ", \% time spent waiting: " |
| << percent_waiting << endl; |
| } |
| return 0; |
| } |