| Riley Andrews | d7822e6 | 2015-08-17 16:42:26 -0700 | [diff] [blame] | 1 | #include <iostream> |
| 2 | #include <chrono> |
| 3 | #include <vector> |
| 4 | #include <algorithm> |
| 5 | #include <numeric> |
| 6 | #include <stdlib.h> |
| 7 | #include <memory> |
| 8 | #include <cmath> |
| 9 | #include <string> |
| 10 | |
| 11 | using namespace std; |
| 12 | |
// Bounds of the buffer-size sweep performed by main(), in bytes, and the
// number of log-spaced steps taken between them.
const size_t size_start = 64;
const size_t size_end = size_t(16) << 20;   // 16 MiB; main() also allocates both buffers at this size
const size_t samples = 2048;

// Byte budget per sample: main() repeats each operation size_per_test / size times.
size_t size_per_test = size_t(64) << 20;    // 64 MiB

// Running total of the values returned by sum() when main() runs in --sum mode.
size_t tot_sum = 0;
| 18 | |
// Benchmark kernels timed by main(). Only their declarations appear here; the
// definitions are not visible in this part of the file. Each one is marked
// noinline, so calls to them are never inlined into the timing loops.
void memcpy_noinline(void *dst, void *src, size_t size) __attribute__((noinline));
void memset_noinline(void *dst, int value, size_t size) __attribute__((noinline));
uint64_t sum(volatile void *src, size_t size) __attribute__((noinline));
| 22 | |
// Operation that main() measures, selected by the first command-line argument.
enum BenchType {
    MemcpyBench = 0,  // --memcpy (main() also falls back to this for unrecognized arguments)
    MemsetBench = 1,  // --memset
    SumBench = 2,     // --sum
};
| 28 | |
| 29 | int main(int argc, char *argv[]) |
| 30 | { |
| 31 | BenchType type; |
| 32 | if (argc <= 1) { |
| 33 | cerr << "memcpy_perf [--memcpy|--memset|--sum]" << endl; |
| 34 | return 0; |
| 35 | } |
| 36 | if (string(argv[1]) == string("--memcpy")) { |
| 37 | type = MemcpyBench; |
| 38 | } else if (string(argv[1]) == string("--memset")) { |
| 39 | type = MemsetBench; |
| 40 | } else if (string(argv[1]) == string("--sum")) { |
| 41 | type = SumBench; |
| 42 | } else { |
| 43 | type = MemcpyBench; |
| 44 | } |
| 45 | |
| 46 | unique_ptr<uint8_t[]> src(new uint8_t[size_end]); |
| 47 | unique_ptr<uint8_t[]> dst(new uint8_t[size_end]); |
| 48 | memset(src.get(), 1, size_end); |
| 49 | |
| 50 | double start_pow = log10(size_start); |
| 51 | double end_pow = log10(size_end); |
| 52 | double pow_inc = (end_pow - start_pow) / samples; |
| 53 | |
| 54 | //cout << "src: " << (uintptr_t)src.get() << endl; |
| 55 | //cout << "dst: " << (uintptr_t)dst.get() << endl; |
| 56 | |
| 57 | for (double cur_pow = start_pow; cur_pow <= end_pow; cur_pow += pow_inc) { |
| 58 | chrono::time_point<chrono::high_resolution_clock> copy_start, copy_end; |
| 59 | |
| 60 | size_t cur_size = (size_t)pow(10.0, cur_pow); |
| 61 | size_t iter_per_size = size_per_test / cur_size; |
| 62 | |
| 63 | // run benchmark |
| 64 | switch (type) { |
| 65 | case MemsetBench: { |
| 66 | memcpy_noinline(src.get(), dst.get(), cur_size); |
| 67 | memset_noinline(dst.get(), 0xdeadbeef, cur_size); |
| 68 | copy_start = chrono::high_resolution_clock::now(); |
| 69 | for (int i = 0; i < iter_per_size; i++) { |
| 70 | memset_noinline(dst.get(), 0xdeadbeef, cur_size); |
| 71 | } |
| 72 | copy_end = chrono::high_resolution_clock::now(); |
| 73 | break; |
| 74 | } |
| 75 | case MemcpyBench: { |
| 76 | memcpy_noinline(dst.get(), src.get(), cur_size); |
| 77 | memcpy_noinline(src.get(), dst.get(), cur_size); |
| 78 | copy_start = chrono::high_resolution_clock::now(); |
| 79 | for (int i = 0; i < iter_per_size; i++) { |
| 80 | memcpy_noinline(dst.get(), src.get(), cur_size); |
| 81 | } |
| 82 | copy_end = chrono::high_resolution_clock::now(); |
| 83 | break; |
| 84 | } |
| 85 | case SumBench: { |
| 86 | uint64_t s = 0; |
| 87 | s += sum(src.get(), cur_size); |
| 88 | copy_start = chrono::high_resolution_clock::now(); |
| 89 | for (int i = 0; i < iter_per_size; i++) { |
| 90 | s += sum(src.get(), cur_size); |
| 91 | } |
| 92 | copy_end = chrono::high_resolution_clock::now(); |
| 93 | tot_sum += s; |
| 94 | break; |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size); |
| 99 | double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9); |
| 100 | if (type == MemcpyBench) |
| 101 | gb_per_sec *= 2.0; |
| 102 | cout << "size: " << cur_size << ", perf: " << gb_per_sec << "GB/s, iter: " << iter_per_size << endl; |
| 103 | } |
| 104 | return 0; |
| 105 | } |