You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

memory.c 39 kB

Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
8 years ago
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
10 years ago
10 years ago
10 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  65. //#undef DEBUG
  66. #include "common.h"
  67. #include <errno.h>
  68. #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
  69. #define ALLOC_WINDOWS
  70. #ifndef MEM_LARGE_PAGES
  71. #define MEM_LARGE_PAGES 0x20000000
  72. #endif
  73. #else
  74. #define ALLOC_MMAP
  75. #define ALLOC_MALLOC
  76. #endif
  77. #include <stdlib.h>
  78. #include <stdio.h>
  79. #include <fcntl.h>
  80. #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
  81. #include <sys/mman.h>
  82. #ifndef NO_SYSV_IPC
  83. #include <sys/shm.h>
  84. #endif
  85. #include <sys/ipc.h>
  86. #endif
  87. #include <sys/types.h>
  88. #ifdef OS_LINUX
  89. #include <sys/sysinfo.h>
  90. #include <sched.h>
  91. #include <errno.h>
  92. #include <linux/unistd.h>
  93. #include <sys/syscall.h>
  94. #include <sys/time.h>
  95. #include <sys/resource.h>
  96. #endif
  97. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
  98. #include <sys/sysctl.h>
  99. #include <sys/resource.h>
  100. #endif
  101. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  102. #include <conio.h>
  103. #undef printf
  104. #define printf _cprintf
  105. #endif
  106. #ifdef OS_LINUX
  107. #ifndef MPOL_PREFERRED
  108. #define MPOL_PREFERRED 1
  109. #endif
  110. #endif
  111. #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  112. #define NO_WARMUP
  113. #endif
  114. #ifndef SHM_HUGETLB
  115. #define SHM_HUGETLB 04000
  116. #endif
  117. #ifndef FIXED_PAGESIZE
  118. #define FIXED_PAGESIZE 4096
  119. #endif
  /* Default for BUFFERS_PER_THREAD, applied only when nothing earlier has
     defined it: MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER when USE_OPENMP is
     defined, otherwise NUM_BUFFERS. */
  120. #ifndef BUFFERS_PER_THREAD
  121. #ifdef USE_OPENMP
  122. #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
  123. #else
  124. #define BUFFERS_PER_THREAD NUM_BUFFERS
  125. #endif
  126. #endif
  /* Shift `a` right by `b` bits, then keep only the bits selected by mask `c`. */
  #define BITMASK(a, b, c) (((a) >> (b)) & (c))
  /* Choose what the CONSTRUCTOR and DESTRUCTOR macros expand to:
       - _MSC_VER defined and __clang__ not defined: __cdecl.
       - OS_DARWIN or OS_SUNOS, together with C_GCC: the plain
         constructor / destructor attributes.
       - __GNUC__ and INIT_PRIORITY both non-zero, and GCC_VERSION >= 40300
         or CLANG_VERSION >= 20900: the attributes with priority 101.
       - Otherwise: the plain constructor / destructor attributes.
     The branches are tested in this order; the first match wins. */
  128. #if defined(_MSC_VER) && !defined(__clang__)
  129. #define CONSTRUCTOR __cdecl
  130. #define DESTRUCTOR __cdecl
  131. #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
  132. #define CONSTRUCTOR __attribute__ ((constructor))
  133. #define DESTRUCTOR __attribute__ ((destructor))
  134. #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
  135. #define CONSTRUCTOR __attribute__ ((constructor(101)))
  136. #define DESTRUCTOR __attribute__ ((destructor(101)))
  137. #else
  138. #define CONSTRUCTOR __attribute__ ((constructor))
  139. #define DESTRUCTOR __attribute__ ((destructor))
  140. #endif
  141. extern void openblas_warning(int verbose, const char * msg);
  142. #ifndef SMP
  143. #define blas_cpu_number 1
  144. #define blas_num_threads 1
  145. /* Dummy Function */
  146. int goto_get_num_procs (void) { return 1;};
  147. void goto_set_num_threads(int num_threads) {};
  148. #else
  149. #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
  150. #ifndef NO_AFFINITY
  151. int get_num_procs(void);
  152. #else
  153. int get_num_procs(void) {
  154. static int nums = 0;
  155. cpu_set_t *cpusetp;
  156. size_t size;
  157. int ret;
  158. int i,n;
  159. if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  160. #if !defined(OS_LINUX)
  161. return nums;
  162. #endif
  163. #if !defined(__GLIBC_PREREQ)
  164. return nums;
  165. #else
  166. #if !__GLIBC_PREREQ(2, 3)
  167. return nums;
  168. #endif
  169. #if !__GLIBC_PREREQ(2, 7)
  170. ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
  171. if (ret!=0) return nums;
  172. n=0;
  173. #if !__GLIBC_PREREQ(2, 6)
  174. for (i=0;i<nums;i++)
  175. if (CPU_ISSET(i,cpusetp)) n++;
  176. nums=n;
  177. #else
  178. nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
  179. #endif
  180. return nums;
  181. #else
  182. cpusetp = CPU_ALLOC(nums);
  183. if (cpusetp == NULL) return nums;
  184. size = CPU_ALLOC_SIZE(nums);
  185. ret = sched_getaffinity(0,size,cpusetp);
  186. if (ret!=0) return nums;
  187. ret = CPU_COUNT_S(size,cpusetp);
  188. if (ret > 0 && ret < nums) nums = ret;
  189. CPU_FREE(cpusetp);
  190. return nums;
  191. #endif
  192. #endif
  193. }
  194. #endif
  195. #endif
  196. #ifdef OS_ANDROID
  197. int get_num_procs(void) {
  198. static int nums = 0;
  199. if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  200. return nums;
  201. }
  202. #endif
  203. #ifdef OS_WINDOWS
  204. int get_num_procs(void) {
  205. static int nums = 0;
  206. if (nums == 0) {
  207. SYSTEM_INFO sysinfo;
  208. GetSystemInfo(&sysinfo);
  209. nums = sysinfo.dwNumberOfProcessors;
  210. }
  211. return nums;
  212. }
  213. #endif
  214. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
  215. int get_num_procs(void) {
  216. static int nums = 0;
  217. int m[2];
  218. size_t len;
  219. if (nums == 0) {
  220. m[0] = CTL_HW;
  221. m[1] = HW_NCPU;
  222. len = sizeof(int);
  223. sysctl(m, 2, &nums, &len, NULL, 0);
  224. }
  225. return nums;
  226. }
  227. #endif
  228. #if defined(OS_DARWIN)
  229. int get_num_procs(void) {
  230. static int nums = 0;
  231. size_t len;
  232. if (nums == 0){
  233. len = sizeof(int);
  234. sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
  235. }
  236. return nums;
  237. }
  238. /*
  239. void set_stack_limit(int limitMB){
  240. int result=0;
  241. struct rlimit rl;
  242. rlim_t StackSize;
  243. StackSize=limitMB*1024*1024;
  244. result=getrlimit(RLIMIT_STACK, &rl);
  245. if(result==0){
  246. if(rl.rlim_cur < StackSize){
  247. rl.rlim_cur=StackSize;
  248. result=setrlimit(RLIMIT_STACK, &rl);
  249. if(result !=0){
  250. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  251. }
  252. }
  253. }
  254. }
  255. */
  256. #endif
/*
  The number of CPU cores OpenBLAS uses for multithreading.
  It can be set with openblas_set_num_threads(int num_threads).
*/
int blas_cpu_number = 0;
/*
  The number of threads in the thread pool.
  This value is equal to or larger than blas_cpu_number, which means that
  some of the pool threads may be sleeping.
*/
int blas_num_threads = 0;
  267. int goto_get_num_procs (void) {
  268. return blas_cpu_number;
  269. }
  270. static void blas_memory_init();
  271. void openblas_fork_handler()
  272. {
  273. // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
  274. // built with "make USE_OPENMP=0".
  275. // Hanging can still happen when OpenBLAS is built against the libgomp
  276. // implementation of OpenMP. The problem is tracked at:
  277. // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
  278. // In the mean time build with USE_OPENMP=0 or link against another
  279. // implementation of OpenMP.
  280. #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  281. int err;
  282. err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
  283. if(err != 0)
  284. openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  285. #endif
  286. }
  287. extern int openblas_num_threads_env();
  288. extern int openblas_goto_num_threads_env();
  289. extern int openblas_omp_num_threads_env();
  290. int blas_get_cpu_number(void){
  291. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  292. int max_num;
  293. #endif
  294. int blas_goto_num = 0;
  295. int blas_omp_num = 0;
  296. if (blas_num_threads) return blas_num_threads;
  297. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  298. max_num = get_num_procs();
  299. #endif
  300. // blas_goto_num = 0;
  301. #ifndef USE_OPENMP
  302. blas_goto_num=openblas_num_threads_env();
  303. if (blas_goto_num < 0) blas_goto_num = 0;
  304. if (blas_goto_num == 0) {
  305. blas_goto_num=openblas_goto_num_threads_env();
  306. if (blas_goto_num < 0) blas_goto_num = 0;
  307. }
  308. #endif
  309. // blas_omp_num = 0;
  310. blas_omp_num=openblas_omp_num_threads_env();
  311. if (blas_omp_num < 0) blas_omp_num = 0;
  312. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  313. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  314. else blas_num_threads = MAX_CPU_NUMBER;
  315. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  316. if (blas_num_threads > max_num) blas_num_threads = max_num;
  317. #endif
  318. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  319. #ifdef DEBUG
  320. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  321. #endif
  322. blas_cpu_number = blas_num_threads;
  323. return blas_num_threads;
  324. }
  325. #endif
  326. int openblas_get_num_procs(void) {
  327. #ifndef SMP
  328. return 1;
  329. #else
  330. return get_num_procs();
  331. #endif
  332. }
  333. int openblas_get_num_threads(void) {
  334. #ifndef SMP
  335. return 1;
  336. #else
  337. // init blas_cpu_number if needed
  338. blas_get_cpu_number();
  339. return blas_cpu_number;
  340. #endif
  341. }
  342. int hugetlb_allocated = 0;
  343. #if defined(OS_WINDOWS)
  344. #define THREAD_LOCAL __declspec(thread)
  345. #else
  346. #define THREAD_LOCAL __thread
  347. #endif
  348. /* Stores information about the allocation and how to release it */
  349. struct alloc_t {
  350. /* Whether this allocation is being used */
  351. int used;
  352. /* Any special attributes needed when releasing this allocation */
  353. int attr;
  354. /* Function that can properly release this memory */
  355. void (*release_func)(struct alloc_t *);
  356. /* Pad to 64-byte alignment */
  357. char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
  358. };
  359. /* Convenience macros for storing release funcs */
  360. #define STORE_RELEASE_FUNC(address, func) \
  361. if (address != (void *)-1) { \
  362. struct alloc_t *alloc_info = (struct alloc_t *)address; \
  363. alloc_info->release_func = func; \
  364. }
  365. #define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
  366. if (address != (void *)-1) { \
  367. struct alloc_t *alloc_info = (struct alloc_t *)address; \
  368. alloc_info->release_func = func; \
  369. alloc_info->attr = attr; \
  370. }
  371. /* The number of bytes that will be allocated for each buffer. When allocating
  372. memory, we store an alloc_t followed by the actual buffer memory. This means
  373. that each allocation always has its associated alloc_t, without the need
  374. for an auxiliary tracking structure. */
  375. static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
  376. /* Clang supports TLS from version 2.8 */
  377. #if defined(__clang__) && __clang_major__ > 2 || \
  378. (__clang_minor__ == 2 || __clang_minor__ == 8)
  379. #define HAS_COMPILER_TLS
  380. #endif
  381. /* GCC supports TLS from version 4.1 */
  382. #if !defined(__clang__) && defined(__GNUC__) && \
  383. (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
  384. #define HAS_COMPILER_TLS
  385. #endif
  386. /* MSVC supports TLS from version 2005 */
  387. #if defined(_MSC_VER) && _MSC_VER >= 1400
  388. #define HAS_COMPILER_TLS
  389. #endif
  390. /* Versions of XCode before 8 did not properly support TLS */
  391. #if defined(__apple_build_version__) && __apple_build_version__ < 8000042
  392. #undef HAS_COMPILER_TLS
  393. #endif
  394. /* Android NDK's before version 12b did not support TLS */
  395. #if defined(__ANDROID__) && defined(__clang__)
  396. #if __has_include(<android/ndk-version.h>)
  397. #include <android/ndk-version.h>
  398. #endif
  399. #if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
  400. defined(__NDK_MINOR__) && \
  401. ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
  402. #undef HAS_COMPILER_TLS
  403. #endif
  404. #endif
  405. /* Holds pointers to allocated memory */
  406. #if defined(SMP) && !defined(USE_OPENMP)
/* This is the number of threads that can be spawned by the server, which is the
   server plus the number of threads in the thread pool */
  409. # define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2
  410. static int next_memory_table_pos = 0;
  411. # if defined(HAS_COMPILER_TLS)
  412. /* Use compiler generated thread-local-storage */
  413. static int THREAD_LOCAL local_memory_table_pos = 0;
  414. # else
  415. /* Use system-dependent thread-local-storage */
  416. # if defined(OS_WINDOWS)
  417. static DWORD local_storage_key;
  418. # else
  419. static pthread_key_t local_storage_key;
  420. # endif /* defined(OS_WINDOWS) */
  421. # endif /* defined(HAS_COMPILER_TLS) */
  422. #else
  423. /* There is only one allocating thread when in single-threaded mode and when using OpenMP */
  424. # define MAX_ALLOCATING_THREADS 1
  425. #endif /* defined(SMP) && !defined(USE_OPENMP) */
  426. static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD];
  427. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  428. static int hot_alloc = 0;
  429. #endif
  430. /* Global lock for memory allocation */
  431. #if defined(USE_PTHREAD_LOCK)
  432. static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  433. #elif defined(USE_PTHREAD_SPINLOCK)
  434. static pthread_spinlock_t alloc_lock = 0;
  435. #else
  436. static BLASULONG alloc_lock = 0UL;
  437. #endif
  438. /* Returns a pointer to the start of the per-thread memory allocation data */
  439. static __inline struct alloc_t ** get_memory_table() {
  440. #if defined(SMP) && !defined(USE_OPENMP)
  441. # if !defined(HAS_COMPILER_TLS)
  442. # if defined(OS_WINDOWS)
  443. int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
  444. # else
  445. int local_memory_table_pos = (int)pthread_getspecific(local_storage_key);
  446. # endif /* defined(OS_WINDOWS) */
  447. # endif /* !defined(HAS_COMPILER_TLS) */
  448. if (!local_memory_table_pos) {
  449. LOCK_COMMAND(&alloc_lock);
  450. local_memory_table_pos = next_memory_table_pos++;
  451. if (next_memory_table_pos > MAX_ALLOCATING_THREADS)
  452. printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n");
  453. UNLOCK_COMMAND(&alloc_lock);
  454. # if !defined(HAS_COMPILER_TLS)
  455. # if defined(OS_WINDOWS)
  456. ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos);
  457. # else
  458. pthread_setspecific(local_storage_key, (void*)local_memory_table_pos);
  459. # endif /* defined(OS_WINDOWS) */
  460. # endif /* !defined(HAS_COMPILER_TLS) */
  461. }
  462. return local_memory_table[local_memory_table_pos];
  463. #else
  464. return local_memory_table[0];
  465. #endif /* defined(SMP) && !defined(USE_OPENMP) */
  466. }
  467. #ifdef ALLOC_MMAP
  468. static void alloc_mmap_free(struct alloc_t *alloc_info){
  469. if (munmap(alloc_info, allocation_block_size)) {
  470. printf("OpenBLAS : munmap failed\n");
  471. }
  472. }
  473. #ifdef NO_WARMUP
  474. static void *alloc_mmap(void *address){
  475. void *map_address;
  476. if (address){
  477. map_address = mmap(address,
  478. allocation_block_size,
  479. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  480. } else {
  481. map_address = mmap(address,
  482. allocation_block_size,
  483. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  484. }
  485. STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
  486. #ifdef OS_LINUX
  487. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  488. #endif
  489. return map_address;
  490. }
  491. #else
  492. #define BENCH_ITERATION 4
  493. #define SCALING 2
/* Times a pointer chase through the memory at `address`.
 *
 * The caller is expected to have stored, at the start of each page, the
 * address of the next page. This routine temporarily redirects the link at
 * `address + size - PAGESIZE` back to `address`, follows size / PAGESIZE
 * links BENCH_ITERATION times, and returns the smallest rpcc() difference
 * measured for one pass. The redirected link is restored before returning.
 */
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
  BLASULONG original, *p;
  BLASULONG start, stop, min;
  int iter, i, count;

  min = (BLASULONG)-1;

  /* Save the last link and point it back at the start of the window. */
  original = *(BLASULONG *)(address + size - PAGESIZE);
  *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;

  for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
    p = (BLASULONG *)address;
    count = size / PAGESIZE;
    start = rpcc();
    for (i = 0; i < count; i ++) {
      p = (BLASULONG *)(*p);
    }
    stop = rpcc();
    if (min > stop - start) min = stop - start;
  }

  /* Restore the saved link. */
  *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
  /* Storing the final pointer presumably keeps the chase from being
     optimised away - TODO confirm. */
  *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;

  return min;
}
  515. static void *alloc_mmap(void *address){
  516. void *map_address, *best_address;
  517. BLASULONG best, start, current;
  518. BLASULONG allocsize;
  519. if (address){
  520. /* Just give up use advanced operation */
  521. map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  522. #ifdef OS_LINUX
  523. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  524. #endif
  525. } else {
  526. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  527. if (hot_alloc == 0) {
  528. map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
  529. #ifdef OS_LINUX
  530. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  531. #endif
  532. } else {
  533. #endif
  534. map_address = mmap(NULL, allocation_block_size * SCALING,
  535. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  536. if (map_address != (void *)-1) {
  537. #ifdef OS_LINUX
  538. #ifdef DEBUG
  539. int ret=0;
  540. ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  541. if(ret==-1){
  542. int errsv=errno;
  543. perror("OpenBLAS alloc_mmap:");
  544. printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
  545. }
  546. #else
  547. my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  548. #endif
  549. #endif
  550. allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
  551. start = (BLASULONG)map_address;
  552. current = (SCALING - 1) * allocation_block_size;
  553. while(current > 0) {
  554. *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
  555. start += PAGESIZE;
  556. current -= PAGESIZE;
  557. }
  558. *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
  559. start = (BLASULONG)map_address;
  560. best = (BLASULONG)-1;
  561. best_address = map_address;
  562. while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
  563. current = run_bench(start, allocsize);
  564. if (best > current) {
  565. best = current;
  566. best_address = (void *)start;
  567. }
  568. start += PAGESIZE;
  569. }
  570. if ((BLASULONG)best_address > (BLASULONG)map_address)
  571. munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
  572. munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
  573. map_address = best_address;
  574. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  575. hot_alloc = 2;
  576. #endif
  577. }
  578. }
  579. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  580. }
  581. #endif
  582. STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
  583. return map_address;
  584. }
  585. #endif
  586. #endif
  587. #ifdef ALLOC_MALLOC
  588. static void alloc_malloc_free(struct alloc_t *alloc_info){
  589. free(alloc_info);
  590. }
  591. static void *alloc_malloc(void *address){
  592. void *map_address;
  593. map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
  594. if (map_address == (void *)NULL) map_address = (void *)-1;
  595. STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
  596. return map_address;
  597. }
  598. #endif
  599. #ifdef ALLOC_QALLOC
  600. void *qalloc(int flags, size_t bytes);
  601. void *qfree (void *address);
  602. #define QNONCACHE 0x1
  603. #define QCOMMS 0x2
  604. #define QFAST 0x4
  605. static void alloc_qalloc_free(struct alloc_t *alloc_info){
  606. qfree(alloc_info);
  607. }
  608. static void *alloc_qalloc(void *address){
  609. void *map_address;
  610. map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
  611. if (map_address == (void *)NULL) map_address = (void *)-1;
  612. STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
  613. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  614. }
  615. #endif
  616. #ifdef ALLOC_WINDOWS
  617. static void alloc_windows_free(struct alloc_t *alloc_info){
  618. VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
  619. }
  620. static void *alloc_windows(void *address){
  621. void *map_address;
  622. map_address = VirtualAlloc(address,
  623. allocation_block_size,
  624. MEM_RESERVE | MEM_COMMIT,
  625. PAGE_READWRITE);
  626. if (map_address == (void *)NULL) map_address = (void *)-1;
  627. STORE_RELEASE_FUNC(map_address, alloc_windows_free);
  628. return map_address;
  629. }
  630. #endif
  631. #ifdef ALLOC_DEVICEDRIVER
  632. #ifndef DEVICEDRIVER_NAME
  633. #define DEVICEDRIVER_NAME "/dev/mapper"
  634. #endif
  635. static void alloc_devicedirver_free(struct alloc_t *alloc_info){
  636. int attr = alloc_info -> attr;
  637. if (munmap(address, allocation_block_size)) {
  638. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  639. }
  640. if (close(attr)) {
  641. printf("OpenBLAS : Bugphysarea close failed.\n");
  642. }
  643. }
  644. static void *alloc_devicedirver(void *address){
  645. int fd;
  646. void *map_address;
  647. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  648. return (void *)-1;
  649. }
  650. map_address = mmap(address, allocation_block_size,
  651. PROT_READ | PROT_WRITE,
  652. MAP_FILE | MAP_SHARED,
  653. fd, 0);
  654. STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
  655. return map_address;
  656. }
  657. #endif
  658. #ifdef ALLOC_SHM
  659. static void alloc_shm_free(struct alloc_t *alloc_info){
  660. if (shmdt(alloc_info)) {
  661. printf("OpenBLAS : Shared memory unmap failed.\n");
  662. }
  663. }
  664. static void *alloc_shm(void *address){
  665. void *map_address;
  666. int shmid;
  667. shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
  668. map_address = (void *)shmat(shmid, address, 0);
  669. if (map_address != (void *)-1){
  670. #ifdef OS_LINUX
  671. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  672. #endif
  673. shmctl(shmid, IPC_RMID, 0);
  674. struct alloc_t *alloc_info = (struct alloc_t *)map_address;
  675. alloc_info->release_func = alloc_shm_free;
  676. alloc_info->attr = shmid;
  677. }
  678. return map_address;
  679. }
  680. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  681. static void alloc_hugetlb_free(struct alloc_t *alloc_info){
  682. #if defined(OS_LINUX) || defined(OS_AIX)
  683. if (shmdt(alloc_info)) {
  684. printf("OpenBLAS : Hugepage unmap failed.\n");
  685. }
  686. #endif
  687. #ifdef __sun__
  688. munmap(alloc_info, allocation_block_size);
  689. #endif
  690. #ifdef OS_WINDOWS
  691. VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
  692. #endif
  693. }
  694. static void *alloc_hugetlb(void *address){
  695. void *map_address = (void *)-1;
  696. #if defined(OS_LINUX) || defined(OS_AIX)
  697. int shmid;
  698. shmid = shmget(IPC_PRIVATE, allocation_block_size,
  699. #ifdef OS_LINUX
  700. SHM_HUGETLB |
  701. #endif
  702. #ifdef OS_AIX
  703. SHM_LGPAGE | SHM_PIN |
  704. #endif
  705. IPC_CREAT | SHM_R | SHM_W);
  706. if (shmid != -1) {
  707. map_address = (void *)shmat(shmid, address, SHM_RND);
  708. #ifdef OS_LINUX
  709. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  710. #endif
  711. if (map_address != (void *)-1){
  712. shmctl(shmid, IPC_RMID, 0);
  713. }
  714. }
  715. #endif
  716. #ifdef __sun__
  717. struct memcntl_mha mha;
  718. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  719. mha.mha_flags = 0;
  720. mha.mha_pagesize = HUGE_PAGESIZE;
  721. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  722. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
  723. #endif
  724. #ifdef OS_WINDOWS
  725. HANDLE hToken;
  726. TOKEN_PRIVILEGES tp;
  727. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  728. tp.PrivilegeCount = 1;
  729. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  730. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
  731. CloseHandle(hToken);
  732. return (void*)-1;
  733. }
  734. if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
  735. CloseHandle(hToken);
  736. return (void*)-1;
  737. }
  738. map_address = (void *)VirtualAlloc(address,
  739. allocation_block_size,
  740. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  741. PAGE_READWRITE);
  742. tp.Privileges[0].Attributes = 0;
  743. AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
  744. if (map_address == (void *)NULL) map_address = (void *)-1;
  745. #endif
  746. STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
  747. return map_address;
  748. }
  749. #endif
  750. #endif
  751. #ifdef ALLOC_HUGETLBFILE
  752. static int hugetlb_pid = 0;
  753. static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
  754. int attr = alloc_info -> attr;
  755. if (munmap(alloc_info, allocation_block_size)) {
  756. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  757. }
  758. if (close(attr)) {
  759. printf("OpenBLAS : HugeTLBfs close failed.\n");
  760. }
  761. }
  762. static void *alloc_hugetlbfile(void *address){
  763. void *map_address = (void *)-1;
  764. int fd;
  765. char filename[64];
  766. if (!hugetlb_pid) hugetlb_pid = getpid();
  767. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  768. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  769. return (void *)-1;
  770. }
  771. unlink(filename);
  772. map_address = mmap(address, allocation_block_size,
  773. PROT_READ | PROT_WRITE,
  774. MAP_SHARED,
  775. fd, 0);
  776. STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
  777. return map_address;
  778. }
  779. #endif
  780. #ifdef SEEK_ADDRESS
  781. static BLASULONG base_address = 0UL;
  782. #else
  783. static BLASULONG base_address = BASE_ADDRESS;
  784. #endif
  785. #if __STDC_VERSION__ >= 201112L
  786. static _Atomic int memory_initialized = 0;
  787. #else
  788. static volatile int memory_initialized = 0;
  789. #endif
  790. /* Memory allocation routine */
  791. /* procpos ... indicates where it comes from */
  792. /* 0 : Level 3 functions */
  793. /* 1 : Level 2 functions */
  794. /* 2 : Thread */
  795. static void blas_memory_init(){
  796. #if defined(SMP) && !defined(USE_OPENMP)
  797. next_memory_table_pos = 0;
  798. # if !defined(HAS_COMPILER_TLS)
  799. # if defined(OS_WINDOWS)
  800. local_storage_key = ::TlsAlloc();
  801. # else
  802. pthread_key_create(&local_storage_key, NULL);
  803. # endif /* defined(OS_WINDOWS) */
  804. # endif /* defined(HAS_COMPILER_TLS) */
  805. #endif /* defined(SMP) && !defined(USE_OPENMP) */
  806. memset(local_memory_table, 0, sizeof(local_memory_table));
  807. }
  808. void *blas_memory_alloc(int procpos){
  809. int position;
  810. void *map_address;
  811. void *(*memoryalloc[])(void *address) = {
  812. #ifdef ALLOC_DEVICEDRIVER
  813. alloc_devicedirver,
  814. #endif
  815. /* Hugetlb implicitly assumes ALLOC_SHM */
  816. #ifdef ALLOC_SHM
  817. alloc_shm,
  818. #endif
  819. #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
  820. alloc_hugetlb,
  821. #endif
  822. #ifdef ALLOC_MMAP
  823. alloc_mmap,
  824. #endif
  825. #ifdef ALLOC_QALLOC
  826. alloc_qalloc,
  827. #endif
  828. #ifdef ALLOC_WINDOWS
  829. alloc_windows,
  830. #endif
  831. #ifdef ALLOC_MALLOC
  832. alloc_malloc,
  833. #endif
  834. NULL,
  835. };
  836. void *(**func)(void *address);
  837. struct alloc_t * alloc_info;
  838. struct alloc_t ** alloc_table;
  839. if (!memory_initialized) {
  840. #if defined(SMP) && !defined(USE_OPENMP)
  841. /* Only allow a single thread to initialize memory system */
  842. LOCK_COMMAND(&alloc_lock);
  843. if (!memory_initialized) {
  844. #endif
  845. blas_memory_init();
  846. #ifdef DYNAMIC_ARCH
  847. gotoblas_dynamic_init();
  848. #endif
  849. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  850. gotoblas_affinity_init();
  851. #endif
  852. #ifdef SMP
  853. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  854. #endif
  855. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
  856. #ifndef DYNAMIC_ARCH
  857. blas_set_parameter();
  858. #endif
  859. #endif
  860. memory_initialized = 1;
  861. #if defined(SMP) && !defined(USE_OPENMP)
  862. }
  863. UNLOCK_COMMAND(&alloc_lock);
  864. #endif
  865. }
  866. #ifdef DEBUG
  867. printf("Alloc Start ...\n");
  868. #endif
  869. position = 0;
  870. alloc_table = get_memory_table();
  871. do {
  872. if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
  873. position ++;
  874. } while (position < BUFFERS_PER_THREAD);
  875. goto error;
  876. allocation :
  877. #ifdef DEBUG
  878. printf(" Position -> %d\n", position);
  879. #endif
  880. alloc_info = alloc_table[position];
  881. if (!alloc_info) {
  882. do {
  883. #ifdef DEBUG
  884. printf("Allocation Start : %lx\n", base_address);
  885. #endif
  886. map_address = (void *)-1;
  887. func = &memoryalloc[0];
  888. while ((func != NULL) && (map_address == (void *) -1)) {
  889. map_address = (*func)((void *)base_address);
  890. #ifdef ALLOC_DEVICEDRIVER
  891. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  892. fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
  893. }
  894. #endif
  895. #ifdef ALLOC_HUGETLBFILE
  896. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  897. #ifndef OS_WINDOWS
  898. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
  899. #endif
  900. }
  901. #endif
  902. #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
  903. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  904. #endif
  905. func ++;
  906. }
  907. #ifdef DEBUG
  908. printf(" Success -> %08lx\n", map_address);
  909. #endif
  910. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  911. if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
  912. } while ((BLASLONG)map_address == -1);
  913. alloc_table[position] = alloc_info = map_address;
  914. #ifdef DEBUG
  915. printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
  916. #endif
  917. }
  918. #ifdef DEBUG
  919. printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
  920. #endif
  921. alloc_info->used = 1;
  922. return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
  923. error:
  924. printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
  925. return NULL;
  926. }
  927. void blas_memory_free(void *buffer){
  928. #ifdef DEBUG
  929. int position;
  930. struct alloc_t ** alloc_table;
  931. #endif
  932. /* Since we passed an offset pointer to the caller, get back to the actual allocation */
  933. struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
  934. #ifdef DEBUG
  935. printf("Unmapped Start : %p ...\n", alloc_info);
  936. #endif
  937. alloc_info->used = 0;
  938. #ifdef DEBUG
  939. printf("Unmap Succeeded.\n\n");
  940. #endif
  941. return;
  942. #ifdef DEBUG
  943. alloc_table = get_memory_table();
  944. for (position = 0; position < BUFFERS_PER_THREAD; position++){
  945. if (alloc_table[position]) {
  946. printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
  947. }
  948. }
  949. #endif
  950. return;
  951. }
  952. void *blas_memory_alloc_nolock(int unused) {
  953. void *map_address;
  954. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  955. return map_address;
  956. }
  957. void blas_memory_free_nolock(void * map_address) {
  958. free(map_address);
  959. }
  960. void blas_shutdown(void){
  961. int pos, thread;
  962. #ifdef SMP
  963. BLASFUNC(blas_thread_shutdown)();
  964. #endif
  965. for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){
  966. for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){
  967. struct alloc_t *alloc_info = local_memory_table[thread][pos];
  968. if (alloc_info) {
  969. alloc_info->release_func(alloc_info);
  970. alloc_info = (void *)0;
  971. }
  972. }
  973. }
  974. #ifdef SEEK_ADDRESS
  975. base_address = 0UL;
  976. #else
  977. base_address = BASE_ADDRESS;
  978. #endif
  979. return;
  980. }
  981. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  982. #ifdef SMP
  983. #if defined(USE_PTHREAD_LOCK)
  984. static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
  985. #elif defined(USE_PTHREAD_SPINLOCK)
  986. static pthread_spinlock_t init_lock = 0;
  987. #else
  988. static BLASULONG init_lock = 0UL;
  989. #endif
  990. #endif
  991. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  992. void *sa, void *sb, BLASLONG pos) {
  993. #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
  994. size_t size;
  995. BLASULONG buffer;
  996. size = allocation_block_size - PAGESIZE;
  997. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  998. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  999. if (hot_alloc != 2) {
  1000. #endif
  1001. #ifdef SMP
  1002. LOCK_COMMAND(&init_lock);
  1003. #endif
  1004. while (size > 0) {
  1005. *(int *)buffer = size;
  1006. buffer += PAGESIZE;
  1007. size -= PAGESIZE;
  1008. }
  1009. #ifdef SMP
  1010. UNLOCK_COMMAND(&init_lock);
  1011. #endif
  1012. size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
  1013. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  1014. while (size > 0) {
  1015. *(int *)buffer = size;
  1016. buffer += 64;
  1017. size -= 64;
  1018. }
  1019. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1020. }
  1021. #endif
  1022. #endif
  1023. }
  1024. #ifdef SMP
  1025. static void _init_thread_memory(void *buffer) {
  1026. blas_queue_t queue[MAX_CPU_NUMBER];
  1027. int num_cpu;
  1028. for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
  1029. blas_queue_init(&queue[num_cpu]);
  1030. queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
  1031. queue[num_cpu].routine = &_touch_memory;
  1032. queue[num_cpu].args = NULL;
  1033. queue[num_cpu].next = &queue[num_cpu + 1];
  1034. }
  1035. queue[num_cpu - 1].next = NULL;
  1036. queue[0].sa = buffer;
  1037. exec_blas(num_cpu, queue);
  1038. }
  1039. #endif
  1040. static void gotoblas_memory_init(void) {
  1041. void *buffer;
  1042. hot_alloc = 1;
  1043. buffer = (void *)blas_memory_alloc(0);
  1044. #ifdef SMP
  1045. if (blas_cpu_number == 0) blas_get_cpu_number();
  1046. #ifdef SMP_SERVER
  1047. if (blas_server_avail == 0) blas_thread_init();
  1048. #endif
  1049. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  1050. #else
  1051. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  1052. #endif
  1053. blas_memory_free(buffer);
  1054. }
  1055. #endif
  1056. /* Initialization for all function; this function should be called before main */
  1057. static int gotoblas_initialized = 0;
  1058. extern void openblas_read_env();
  1059. void CONSTRUCTOR gotoblas_init(void) {
  1060. if (gotoblas_initialized) return;
  1061. #ifdef SMP
  1062. openblas_fork_handler();
  1063. #endif
  1064. openblas_read_env();
  1065. #ifdef PROFILE
  1066. moncontrol (0);
  1067. #endif
  1068. #ifdef DYNAMIC_ARCH
  1069. gotoblas_dynamic_init();
  1070. #endif
  1071. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  1072. gotoblas_affinity_init();
  1073. #endif
  1074. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1075. gotoblas_memory_init();
  1076. #endif
  1077. //#if defined(OS_LINUX)
  1078. #if 0
  1079. struct rlimit curlimit;
  1080. if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
  1081. {
  1082. if ( curlimit.rlim_cur != curlimit.rlim_max )
  1083. {
  1084. curlimit.rlim_cur = curlimit.rlim_max;
  1085. setrlimit(RLIMIT_STACK, &curlimit);
  1086. }
  1087. }
  1088. #endif
  1089. #ifdef SMP
  1090. if (blas_cpu_number == 0) blas_get_cpu_number();
  1091. #ifdef SMP_SERVER
  1092. if (blas_server_avail == 0) blas_thread_init();
  1093. #endif
  1094. #endif
  1095. #ifdef FUNCTION_PROFILE
  1096. gotoblas_profile_init();
  1097. #endif
  1098. gotoblas_initialized = 1;
  1099. #ifdef PROFILE
  1100. moncontrol (1);
  1101. #endif
  1102. }
  1103. void DESTRUCTOR gotoblas_quit(void) {
  1104. if (gotoblas_initialized == 0) return;
  1105. blas_shutdown();
  1106. #ifdef PROFILE
  1107. moncontrol (0);
  1108. #endif
  1109. #ifdef FUNCTION_PROFILE
  1110. gotoblas_profile_quit();
  1111. #endif
  1112. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  1113. gotoblas_affinity_quit();
  1114. #endif
  1115. #ifdef DYNAMIC_ARCH
  1116. gotoblas_dynamic_quit();
  1117. #endif
  1118. gotoblas_initialized = 0;
  1119. #ifdef PROFILE
  1120. moncontrol (1);
  1121. #endif
  1122. }
  1123. #if defined(_MSC_VER) && !defined(__clang__)
  1124. BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
  1125. {
  1126. switch (ul_reason_for_call)
  1127. {
  1128. case DLL_PROCESS_ATTACH:
  1129. gotoblas_init();
  1130. break;
  1131. case DLL_THREAD_ATTACH:
  1132. break;
  1133. case DLL_THREAD_DETACH:
  1134. break;
  1135. case DLL_PROCESS_DETACH:
  1136. gotoblas_quit();
  1137. break;
  1138. default:
  1139. break;
  1140. }
  1141. return TRUE;
  1142. }
  1143. /*
  1144. This is to allow static linking.
  1145. Code adapted from Google performance tools:
  1146. https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
  1147. Reference:
  1148. https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
  1149. http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
  1150. */
  1151. static int on_process_term(void)
  1152. {
  1153. gotoblas_quit();
  1154. return 0;
  1155. }
  1156. #ifdef _WIN64
  1157. #pragma comment(linker, "/INCLUDE:_tls_used")
  1158. #else
  1159. #pragma comment(linker, "/INCLUDE:__tls_used")
  1160. #endif
  1161. #ifdef _WIN64
  1162. #pragma const_seg(".CRT$XLB")
  1163. #else
  1164. #pragma data_seg(".CRT$XLB")
  1165. #endif
  1166. static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
  1167. #ifdef _WIN64
  1168. #pragma const_seg()
  1169. #else
  1170. #pragma data_seg()
  1171. #endif
  1172. #ifdef _WIN64
  1173. #pragma const_seg(".CRT$XTU")
  1174. #else
  1175. #pragma data_seg(".CRT$XTU")
  1176. #endif
  1177. static int(*p_process_term)(void) = on_process_term;
  1178. #ifdef _WIN64
  1179. #pragma const_seg()
  1180. #else
  1181. #pragma data_seg()
  1182. #endif
  1183. #endif
  1184. #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
  1185. /* Don't call me; this is just work around for PGI / Sun bug */
  1186. void gotoblas_dummy_for_PGI(void) {
  1187. gotoblas_init();
  1188. gotoblas_quit();
  1189. #if 0
  1190. asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  1191. asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
  1192. #else
  1193. asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  1194. asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
  1195. #endif
  1196. }
  1197. #endif