You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

memory.c 75 kB

Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
7 years ago
8 years ago
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something.
8 years ago
10 years ago
10 years ago
10 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  65. //#undef DEBUG
  66. #include "common.h"
  /*
   * COMPILE_TLS gates the code below that is wrapped in
   * "#if defined(COMPILE_TLS)".  It ends up defined only when
   *   - both USE_TLS and SMP are defined,
   *   - USE_TLS evaluates to 1, and
   *   - __GLIBC_PREREQ is either unavailable or reports glibc >= 2.20.
   */
  #if defined(USE_TLS) && defined(SMP)
  #  define COMPILE_TLS
  #  if USE_TLS != 1
  #    undef COMPILE_TLS
  #  elif defined(__GLIBC_PREREQ)
  #    if !__GLIBC_PREREQ(2,20)
  #      undef COMPILE_TLS
  #    endif
  #  endif
  #endif
  /* Memory buffer must fit two matrix subblocks of maximal size */
  /*
   * For every precision the three pairings P*Q, P*R and R*Q are checked, each
   * multiplied by the element size in bytes (4 for S, 8 for D and C, 16 for Z)
   * and by 2.  When a check fails the build is stopped with #error and the
   * #pragma message lines spell out the sizes that were compared, using the
   * same element size as the corresponding #if test.
   */

  /* XSTR expands its argument before stringifying it; STR stringifies as-is. */
  #define XSTR(x) STR(x)
  #define STR(x) #x

  #if BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 * 2) || \
      BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_R * 4 * 2) || \
      BUFFER_SIZE < (SGEMM_DEFAULT_R * SGEMM_DEFAULT_Q * 4 * 2)
  #error BUFFER_SIZE is too small for P, Q, and R of SGEMM:
  #pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(SGEMM_DEFAULT_P*SGEMM_DEFAULT_Q*4*2)
  #pragma message " and " XSTR(SGEMM_DEFAULT_P*SGEMM_DEFAULT_R*4*2)
  #pragma message " and " XSTR(SGEMM_DEFAULT_R*SGEMM_DEFAULT_Q*4*2)
  #endif

  #if BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 * 2) || \
      BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_R * 8 * 2) || \
      BUFFER_SIZE < (DGEMM_DEFAULT_R * DGEMM_DEFAULT_Q * 8 * 2)
  #error BUFFER_SIZE is too small for P, Q, and R of DGEMM
  #pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*8*2)
  #pragma message " and " XSTR(DGEMM_DEFAULT_P*DGEMM_DEFAULT_R*8*2)
  #pragma message " and " XSTR(DGEMM_DEFAULT_R*DGEMM_DEFAULT_Q*8*2)
  #endif

  #if BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 * 2) || \
      BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_R * 8 * 2) || \
      BUFFER_SIZE < (CGEMM_DEFAULT_R * CGEMM_DEFAULT_Q * 8 * 2)
  #error BUFFER_SIZE is too small for P, Q, and R of CGEMM
  #pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(CGEMM_DEFAULT_P*CGEMM_DEFAULT_Q*8*2)
  #pragma message " and " XSTR(CGEMM_DEFAULT_P*CGEMM_DEFAULT_R*8*2)
  #pragma message " and " XSTR(CGEMM_DEFAULT_R*CGEMM_DEFAULT_Q*8*2)
  #endif

  #if BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 * 2) || \
      BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_R * 16 * 2) || \
      BUFFER_SIZE < (ZGEMM_DEFAULT_R * ZGEMM_DEFAULT_Q * 16 * 2)
  #error BUFFER_SIZE is too small for P, Q, and R of ZGEMM
  #pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(ZGEMM_DEFAULT_P*ZGEMM_DEFAULT_Q*16*2)
  #pragma message " and " XSTR(ZGEMM_DEFAULT_P*ZGEMM_DEFAULT_R*16*2)
  #pragma message " and " XSTR(ZGEMM_DEFAULT_R*ZGEMM_DEFAULT_Q*16*2)
  #endif
  113. #if defined(COMPILE_TLS)
  114. #include <errno.h>
  115. #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
  116. #define ALLOC_WINDOWS
  117. #ifndef MEM_LARGE_PAGES
  118. #define MEM_LARGE_PAGES 0x20000000
  119. #endif
  120. #else
  121. #define ALLOC_MMAP
  122. #define ALLOC_MALLOC
  123. #endif
  124. #include <stdlib.h>
  125. #include <stdio.h>
  126. #include <fcntl.h>
  127. #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
  128. #include <sys/mman.h>
  129. #ifndef NO_SYSV_IPC
  130. #include <sys/shm.h>
  131. #endif
  132. #include <sys/ipc.h>
  133. #endif
  134. #include <sys/types.h>
  135. #ifdef OS_LINUX
  136. #include <sys/sysinfo.h>
  137. #include <sched.h>
  138. #include <errno.h>
  139. #include <linux/unistd.h>
  140. #include <sys/syscall.h>
  141. #include <sys/time.h>
  142. #include <sys/resource.h>
  143. #endif
  144. #ifdef OS_HAIKU
  145. #include <unistd.h>
  146. #endif
  147. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
  148. #include <sys/sysctl.h>
  149. #include <sys/resource.h>
  150. #endif
  151. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  152. #include <conio.h>
  153. #undef printf
  154. #define printf _cprintf
  155. #endif
  /* Fallback value for MPOL_PREFERRED on Linux when no header supplied one. */
  #if defined(OS_LINUX) && !defined(MPOL_PREFERRED)
  #  define MPOL_PREFERRED 1
  #endif

  /* NO_WARMUP is forced on for PPC440, for HPL builds and for every non-Linux OS. */
  #if !defined(NO_WARMUP)
  #  if defined(PPC440) || defined(HPL) || !defined(OS_LINUX)
  #    define NO_WARMUP
  #  endif
  #endif

  /* Fallback value for SHM_HUGETLB when no header supplied one. */
  #if !defined(SHM_HUGETLB)
  #  define SHM_HUGETLB 04000
  #endif

  /* Page size assumed unless the build overrides it. */
  #if !defined(FIXED_PAGESIZE)
  #  define FIXED_PAGESIZE 4096
  #endif

  /* Shift a right by b bits, then keep only the bits selected by mask c. */
  #define BITMASK(a, b, c) (((a) >> (b)) & (c))
  /*
   * Decorations for the library's load-time / unload-time hooks.
   *   - MSVC (but not clang in MSVC mode): plain __cdecl.
   *   - GNU-compatible compilers with INIT_PRIORITY set and a new enough
   *     GCC/clang, except GCC on Darwin or SunOS: constructor/destructor
   *     attributes with priority 101.
   *   - Everything else: constructor/destructor attributes without a priority.
   */
  #if defined(_MSC_VER) && !defined(__clang__)
  #  define CONSTRUCTOR __cdecl
  #  define DESTRUCTOR __cdecl
  #elif !((defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)) && \
        __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
  #  define CONSTRUCTOR __attribute__ ((constructor(101)))
  #  define DESTRUCTOR __attribute__ ((destructor(101)))
  #else
  #  define CONSTRUCTOR __attribute__ ((constructor))
  #  define DESTRUCTOR __attribute__ ((destructor))
  #endif
  #if defined(DYNAMIC_ARCH)
  /* Global gotoblas pointer; only present in DYNAMIC_ARCH builds, initially NULL. */
  gotoblas_t *gotoblas = NULL;
  #endif

  /* Warning helper defined outside this file. */
  extern void openblas_warning(int verbose, const char *msg);
  188. #ifndef SMP
  /* Build without SMP: both thread counts are the compile-time constant 1. */
  #define blas_cpu_number 1
  #define blas_num_threads 1

  /* Dummy Function */
  /* Always reports a single processor in a non-SMP build. */
  int goto_get_num_procs (void) { return 1; }

  /* Dummy Function */
  /* Accepts and ignores the requested thread count in a non-SMP build. */
  void goto_set_num_threads(int num_threads) { (void)num_threads; }
  194. #else
  #if defined(OS_LINUX) || defined(OS_SUNOS)
  #ifndef NO_AFFINITY
  /* Only the prototype is needed here; no definition is provided in this branch. */
  int get_num_procs(void);
  #else
  /*
   * Return the number of processors available to this process.
   *
   * The configured processor count is read once with
   * sysconf(_SC_NPROCESSORS_CONF) and kept in a function-local static.  On
   * Linux with glibc >= 2.3 the count is then narrowed using the calling
   * process's affinity mask (sched_getaffinity); the narrowed value is written
   * back to the static, so later calls start from it.
   *
   * Three glibc generations are handled:
   *   - glibc <  2.6 : bits are counted by hand with CPU_ISSET.
   *   - glibc == 2.6 : CPU_COUNT on a fixed-size cpu_set_t.
   *   - glibc >= 2.7 : CPU_COUNT / CPU_COUNT_S, with a dynamically sized set
   *                    when the processor count reaches CPU_SETSIZE.
   *
   * If sched_getaffinity or CPU_ALLOC fails, the unnarrowed count is returned.
   */
  int get_num_procs(void) {
    static int nums = 0;
    cpu_set_t cpuset,*cpusetp;
    size_t size;
    int ret;

  #if defined(__GLIBC_PREREQ)
  #if !__GLIBC_PREREQ(2, 6)
    /* Loop index and bit counter, used only by the manual count for glibc < 2.6. */
    int i;
    int n;
  #endif
  #endif

    if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);

  #if !defined(OS_LINUX)
    return nums;
  #endif

  #if !defined(__GLIBC_PREREQ)
    return nums;
  #else
  #if !__GLIBC_PREREQ(2, 3)
    return nums;
  #endif

  #if !__GLIBC_PREREQ(2, 7)
    ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
    if (ret!=0) return nums;
  #if !__GLIBC_PREREQ(2, 6)
    /* n is declared only for glibc < 2.6, so it is initialized here as well. */
    n=0;
    for (i=0;i<nums;i++)
      if (CPU_ISSET(i,&cpuset)) n++;
    nums=n;
  #else
    /* CPU_COUNT takes just the set pointer, as in the glibc >= 2.7 branch. */
    nums = CPU_COUNT(&cpuset);
  #endif
    return nums;
  #else
    if (nums >= CPU_SETSIZE) {
      /* More processors than a static cpu_set_t can describe: size it dynamically. */
      cpusetp = CPU_ALLOC(nums);
      if (cpusetp == NULL) {
        return nums;
      }
      size = CPU_ALLOC_SIZE(nums);
      ret = sched_getaffinity(0,size,cpusetp);
      if (ret!=0) {
        CPU_FREE(cpusetp);
        return nums;
      }
      ret = CPU_COUNT_S(size,cpusetp);
      if (ret > 0 && ret < nums) nums = ret;
      CPU_FREE(cpusetp);
      return nums;
    } else {
      ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
      if (ret!=0) {
        return nums;
      }
      ret = CPU_COUNT(&cpuset);
      if (ret > 0 && ret < nums) nums = ret;
      return nums;
    }
  #endif
  #endif
  }
  #endif
  #endif
  264. #ifdef OS_ANDROID
  /*
   * Return the number of processors configured on this system (Android).
   *
   * The value comes from sysconf(_SC_NPROCESSORS_CONF) and is cached in a
   * function-local static after the first successful query.  sysconf() reports
   * failure with -1; in that case nothing is cached and 1 is returned, so the
   * caller never receives a non-positive processor count.
   */
  int get_num_procs(void) {
    static int nums = 0;

    if (nums == 0) {
      long configured = sysconf(_SC_NPROCESSORS_CONF);

      if (configured < 1) return 1;
      nums = (int)configured;
    }
    return nums;
  }
  270. #endif
  271. #ifdef OS_HAIKU
  /*
   * Return the number of processors configured on this system (Haiku).
   *
   * The value comes from sysconf(_SC_NPROCESSORS_CONF) and is cached in a
   * function-local static after the first successful query.  sysconf() reports
   * failure with -1; in that case nothing is cached and 1 is returned, so the
   * caller never receives a non-positive processor count.
   */
  int get_num_procs(void) {
    static int nums = 0;

    if (nums == 0) {
      long configured = sysconf(_SC_NPROCESSORS_CONF);

      if (configured < 1) return 1;
      nums = (int)configured;
    }
    return nums;
  }
  277. #endif
  278. #ifdef OS_AIX
  /*
   * Return the number of processors configured on this system (AIX).
   *
   * The value comes from sysconf(_SC_NPROCESSORS_CONF) and is cached in a
   * function-local static after the first successful query.  sysconf() reports
   * failure with -1; in that case nothing is cached and 1 is returned, so the
   * caller never receives a non-positive processor count.
   */
  int get_num_procs(void) {
    static int nums = 0;

    if (nums == 0) {
      long configured = sysconf(_SC_NPROCESSORS_CONF);

      if (configured < 1) return 1;
      nums = (int)configured;
    }
    return nums;
  }
  284. #endif
  285. #ifdef OS_WINDOWS
  286. int get_num_procs(void) {
  287. static int nums = 0;
  288. if (nums == 0) {
  289. SYSTEM_INFO sysinfo;
  290. GetSystemInfo(&sysinfo);
  291. nums = sysinfo.dwNumberOfProcessors;
  292. }
  293. return nums;
  294. }
  295. #endif
  296. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
  297. int get_num_procs(void) {
  298. static int nums = 0;
  299. int m[2];
  300. size_t len;
  301. if (nums == 0) {
  302. m[0] = CTL_HW;
  303. m[1] = HW_NCPU;
  304. len = sizeof(int);
  305. sysctl(m, 2, &nums, &len, NULL, 0);
  306. }
  307. return nums;
  308. }
  309. #endif
  310. #if defined(OS_DARWIN)
  /*
   * Return the value of the "hw.physicalcpu" sysctl (Darwin).
   *
   * The value is cached in a function-local static after the first successful
   * query.  If sysctlbyname() fails or reports fewer than one CPU, nothing is
   * cached and 1 is returned so that callers never see a processor count of
   * zero.
   */
  int get_num_procs(void) {
    static int nums = 0;

    if (nums == 0) {
      int ncpu = 0;
      size_t len = sizeof(ncpu);

      if (sysctlbyname("hw.physicalcpu", &ncpu, &len, NULL, 0) != 0 || ncpu < 1) return 1;
      nums = ncpu;
    }
    return nums;
  }
  320. /*
  321. void set_stack_limit(int limitMB){
  322. int result=0;
  323. struct rlimit rl;
  324. rlim_t StackSize;
  325. StackSize=limitMB*1024*1024;
  326. result=getrlimit(RLIMIT_STACK, &rl);
  327. if(result==0){
  328. if(rl.rlim_cur < StackSize){
  329. rl.rlim_cur=StackSize;
  330. result=setrlimit(RLIMIT_STACK, &rl);
  331. if(result !=0){
  332. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  333. }
  334. }
  335. }
  336. }
  337. */
  338. #endif
  /*
   * Number of CPU cores OpenBLAS uses for multithreading.
   * It can be set with openblas_set_num_threads(int num_threads).
   */
  int blas_cpu_number = 0;

  /*
   * Number of threads in the thread pool.
   * This value is greater than or equal to blas_cpu_number; when it is
   * greater, the surplus threads are asleep.
   */
  int blas_num_threads = 0;

  /* Report the current value of blas_cpu_number. */
  int goto_get_num_procs (void)
  {
    return blas_cpu_number;
  }
  static void blas_memory_init();

  /*
   * Install the library's fork handlers.
   *
   * On SMP_SERVER builds, except native Windows and Android, this registers
   * blas_thread_shutdown to run before fork() and blas_memory_init to run in
   * the child afterwards.  A registration failure is only reported through
   * openblas_warning(); on every other build the function does nothing.
   */
  void openblas_fork_handler()
  {
    // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS
    // is built with "make USE_OPENMP=0".
    // A hang is still possible when OpenBLAS is built against the libgomp
    // implementation of OpenMP; that problem is tracked at:
    // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
    // Until it is resolved, build with USE_OPENMP=0 or link against another
    // implementation of OpenMP.
  #if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_ANDROID) && defined(SMP_SERVER)
    int err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);

    if (err) {
      openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
    }
  #endif
  }
  369. extern int openblas_num_threads_env();
  370. extern int openblas_goto_num_threads_env();
  371. extern int openblas_omp_num_threads_env();
  372. int blas_get_cpu_number(void){
  373. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  374. int max_num;
  375. #endif
  376. int blas_goto_num = 0;
  377. int blas_omp_num = 0;
  378. if (blas_num_threads) return blas_num_threads;
  379. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  380. max_num = get_num_procs();
  381. #endif
  382. // blas_goto_num = 0;
  383. #ifndef USE_OPENMP_UNUSED
  384. blas_goto_num=openblas_num_threads_env();
  385. if (blas_goto_num < 0) blas_goto_num = 0;
  386. if (blas_goto_num == 0) {
  387. blas_goto_num=openblas_goto_num_threads_env();
  388. if (blas_goto_num < 0) blas_goto_num = 0;
  389. }
  390. #endif
  391. // blas_omp_num = 0;
  392. blas_omp_num=openblas_omp_num_threads_env();
  393. if (blas_omp_num < 0) blas_omp_num = 0;
  394. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  395. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  396. else blas_num_threads = MAX_CPU_NUMBER;
  397. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  398. if (blas_num_threads > max_num) blas_num_threads = max_num;
  399. #endif
  400. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  401. #ifdef DEBUG
  402. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  403. #endif
  404. blas_cpu_number = blas_num_threads;
  405. return blas_num_threads;
  406. }
  407. #endif
  408. int openblas_get_num_procs(void) {
  409. #ifndef SMP
  410. return 1;
  411. #else
  412. return get_num_procs();
  413. #endif
  414. }
  415. int openblas_get_num_threads(void) {
  416. #ifndef SMP
  417. return 1;
  418. #else
  419. // init blas_cpu_number if needed
  420. blas_get_cpu_number();
  421. return blas_cpu_number;
  422. #endif
  423. }
  424. int hugetlb_allocated = 0;
  425. #if defined(OS_WINDOWS)
  426. #define LIKELY_ONE(x) (x)
  427. #else
  428. #define LIKELY_ONE(x) (__builtin_expect(x, 1))
  429. #endif
  430. /* Stores information about the allocation and how to release it */
  431. struct alloc_t {
  432. /* Whether this allocation is being used */
  433. int used;
  434. /* Any special attributes needed when releasing this allocation */
  435. int attr;
  436. /* Function that can properly release this memory */
  437. void (*release_func)(struct alloc_t *);
  438. /* Pad to 64-byte alignment */
  439. char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
  440. };
  441. /* Convenience macros for storing release funcs */
  442. #define STORE_RELEASE_FUNC(address, func) \
  443. if (address != (void *)-1) { \
  444. struct alloc_t *alloc_info = (struct alloc_t *)address; \
  445. alloc_info->release_func = func; \
  446. }
  447. #define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
  448. if (address != (void *)-1) { \
  449. struct alloc_t *alloc_info = (struct alloc_t *)address; \
  450. alloc_info->release_func = func; \
  451. alloc_info->attr = attr; \
  452. }
  453. /* The number of bytes that will be allocated for each buffer. When allocating
  454. memory, we store an alloc_t followed by the actual buffer memory. This means
  455. that each allocation always has its associated alloc_t, without the need
  456. for an auxiliary tracking structure. */
  457. static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
#if defined(SMP)
/* local_storage_key: thread-local-storage key under which get_memory_table()
   keeps each thread's allocation table; created by blas_memory_init().
   lsk: scratch copy of local_storage_key written by get_memory_table().
   NOTE(review): lsk has external linkage and is not thread-local, so all
   threads share the one object. */
#  if defined(OS_WINDOWS)
static DWORD local_storage_key = 0;
DWORD lsk;

#  else
static pthread_key_t local_storage_key = 0;
pthread_key_t lsk;

#  endif /* defined(OS_WINDOWS) */
#endif /* defined(SMP) */

#if defined(OS_LINUX) && !defined(NO_WARMUP)
/* Warm-up state: 0 initially, set to 1 by gotoblas_memory_init() and to 2 by
   alloc_mmap() once it has picked a benchmarked mapping; _touch_memory()
   skips its work when the value is 2. */
static int hot_alloc = 0;
#endif

/* Global lock for memory allocation; blas_memory_alloc() takes it around the
   one-time initialisation guarded by memory_initialized. */

#if   defined(USE_PTHREAD_LOCK)
static pthread_mutex_t    alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
#else
static BLASULONG  alloc_lock = 0UL;
#endif

/* Lock taken by get_memory_table() around accesses to local_storage_key. */
#if   defined(USE_PTHREAD_LOCK)
static pthread_mutex_t    key_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t key_lock = 0;
#else
static BLASULONG  key_lock = 0UL;
#endif
  485. /* Returns a pointer to the start of the per-thread memory allocation data */
  486. static __inline struct alloc_t ** get_memory_table() {
  487. #if defined(SMP)
  488. LOCK_COMMAND(&key_lock);
  489. lsk=local_storage_key;
  490. UNLOCK_COMMAND(&key_lock);
  491. if (!lsk) {
  492. blas_memory_init();
  493. }
  494. # if defined(OS_WINDOWS)
  495. struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key);
  496. # else
  497. struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key);
  498. # endif /* defined(OS_WINDOWS) */
  499. #else
  500. static struct alloc_t ** local_memory_table = NULL;
  501. #endif /* defined(SMP) */
  502. #if defined (SMP)
  503. LOCK_COMMAND(&key_lock);
  504. lsk=local_storage_key;
  505. UNLOCK_COMMAND(&key_lock);
  506. if (lsk && !local_memory_table) {
  507. #else
  508. if (!local_memory_table) {
  509. #endif /* defined(SMP) */
  510. local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS);
  511. memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
  512. #if defined(SMP)
  513. # if defined(OS_WINDOWS)
  514. LOCK_COMMAND(&key_lock);
  515. TlsSetValue(local_storage_key, (void*)local_memory_table);
  516. UNLOCK_COMMAND(&key_lock);
  517. # else
  518. LOCK_COMMAND(&key_lock);
  519. pthread_setspecific(local_storage_key, (void*)local_memory_table);
  520. UNLOCK_COMMAND(&key_lock);
  521. # endif /* defined(OS_WINDOWS) */
  522. #endif /* defined(SMP) */
  523. }
  524. return local_memory_table;
  525. }
  526. #ifdef ALLOC_MMAP
  527. static void alloc_mmap_free(struct alloc_t *alloc_info){
  528. if (munmap(alloc_info, allocation_block_size)) {
  529. printf("OpenBLAS : munmap failed\n");
  530. }
  531. }
  532. #ifdef NO_WARMUP
  533. static void *alloc_mmap(void *address){
  534. void *map_address;
  535. if (address){
  536. map_address = mmap(address,
  537. allocation_block_size,
  538. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  539. } else {
  540. map_address = mmap(address,
  541. allocation_block_size,
  542. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  543. }
  544. STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
  545. #ifdef OS_LINUX
  546. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  547. #endif
  548. return map_address;
  549. }
  550. #else
  551. #define BENCH_ITERATION 4
  552. #define SCALING 2
/* Times a walk along the page-to-page pointer chain that alloc_mmap() writes
   into a freshly mapped region.
   address: start of the window to measure (first word of a page in the chain)
   size:    window length in bytes
   Returns the smallest rpcc() difference seen over BENCH_ITERATION walks of
   size / PAGESIZE links each. */
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {

  BLASULONG original, *p;
  BLASULONG start, stop, min;
  int iter, i, count;

  min = (BLASULONG)-1;

  /* Temporarily point the window's last page back at its first page so the
     walk stays inside the window; the overwritten link is saved in original. */
  original = *(BLASULONG *)(address + size - PAGESIZE);

  *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;

  for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {

    p = (BLASULONG *)address;

    count = size / PAGESIZE;

    start = rpcc();

    /* dependent loads: each step reads the address of the next page */
    for (i = 0; i < count; i ++) {
      p = (BLASULONG *)(*p);
    }

    stop = rpcc();

    if (min > stop - start) min = stop - start;
  }

  /* Put the saved link back. The final pointer is also written 8 bytes in —
     presumably so the compiler cannot discard the walk; TODO confirm. */
  *(BLASULONG *)(address + size - PAGESIZE +  0) = original;
  *(BLASULONG *)(address + size - PAGESIZE +  8) = (BLASULONG)p;

  return min;
}
/* mmap-based allocator with optional placement benchmarking (the variant
   compiled when NO_WARMUP is not defined).
   address: requested placement (mapped with MAP_FIXED) or NULL.
   With a NULL address and hot_alloc != 0 (on Linux), a region SCALING times
   larger than needed is mapped, every candidate page offset is timed with
   run_bench(), and only the fastest allocation_block_size window is kept.
   Returns the mapping, with alloc_mmap_free recorded in its header, or
   (void *)-1 if mmap failed. */
static void *alloc_mmap(void *address){
  void *map_address, *best_address;
  BLASULONG best, start, current, original;
  BLASULONG allocsize;

  if (address){
    /* Just give up use advanced operation */
    map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);

#ifdef OS_LINUX
    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif

  } else {
#if defined(OS_LINUX) && !defined(NO_WARMUP)
    /* hot_alloc == 0: plain mapping, no benchmarking */
    if (hot_alloc == 0) {
      map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);

#ifdef OS_LINUX
      my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif

    } else {
#endif

      /* Over-allocate so there is room to slide the window around. */
      map_address = mmap(NULL, allocation_block_size * SCALING,
			 MMAP_ACCESS, MMAP_POLICY, -1, 0);

      if (map_address != (void *)-1) {

#ifdef OS_LINUX
#ifdef DEBUG
		  int ret=0;
		  ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
		  if(ret==-1){
			  int errsv=errno;
			  perror("OpenBLAS alloc_mmap:");
			  printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
		  }

#else
		  my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif

	/* size of the window handed to run_bench() */
	allocsize = DGEMM_P * DGEMM_Q * sizeof(double);

	start   = (BLASULONG)map_address;
	current = (SCALING - 1) * allocation_block_size;
	original = current;

	/* Store in the first word of each page the address of the next page;
	   the "current <= original" test stops the loop if the unsigned
	   counter wraps. */
	while(current > 0 && current <= original) {
	  *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
	  start += PAGESIZE;
	  current -= PAGESIZE;
	}

	/* close the chain: the last page written points back to the first */
	*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;

	start = (BLASULONG)map_address;

	best = (BLASULONG)-1;
	best_address = map_address;

	/* Try every page-aligned window start and remember the fastest. */
	while ((start + allocsize  < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {

	  current = run_bench(start, allocsize);

	  if (best > current) {
	    best = current;
	    best_address = (void *)start;
	  }

	  start += PAGESIZE;

	}

	/* Unmap what lies before the chosen window ... */
	if ((BLASULONG)best_address > (BLASULONG)map_address)
	  munmap(map_address,  (BLASULONG)best_address - (BLASULONG)map_address);

	/* ... and what lies after it. */
	munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);

	map_address = best_address;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
	/* tells _touch_memory() that no further warm-up is needed */
	hot_alloc = 2;
#endif
      }
    }
#if defined(OS_LINUX) && !defined(NO_WARMUP)
  }
#endif

  STORE_RELEASE_FUNC(map_address, alloc_mmap_free);

  return map_address;
}
  645. #endif
  646. #endif
  647. #ifdef ALLOC_MALLOC
  648. static void alloc_malloc_free(struct alloc_t *alloc_info){
  649. free(alloc_info);
  650. }
  651. static void *alloc_malloc(void *address){
  652. void *map_address;
  653. map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
  654. if (map_address == (void *)NULL) map_address = (void *)-1;
  655. STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
  656. return map_address;
  657. }
  658. #endif
  659. #ifdef ALLOC_QALLOC
  660. void *qalloc(int flags, size_t bytes);
  661. void *qfree (void *address);
  662. #define QNONCACHE 0x1
  663. #define QCOMMS 0x2
  664. #define QFAST 0x4
  665. static void alloc_qalloc_free(struct alloc_t *alloc_info){
  666. qfree(alloc_info);
  667. }
  668. static void *alloc_qalloc(void *address){
  669. void *map_address;
  670. map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
  671. if (map_address == (void *)NULL) map_address = (void *)-1;
  672. STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
  673. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  674. }
  675. #endif
  676. #ifdef ALLOC_WINDOWS
  677. static void alloc_windows_free(struct alloc_t *alloc_info){
  678. VirtualFree(alloc_info, 0, MEM_RELEASE);
  679. }
  680. static void *alloc_windows(void *address){
  681. void *map_address;
  682. map_address = VirtualAlloc(address,
  683. allocation_block_size,
  684. MEM_RESERVE | MEM_COMMIT,
  685. PAGE_READWRITE);
  686. if (map_address == (void *)NULL) map_address = (void *)-1;
  687. STORE_RELEASE_FUNC(map_address, alloc_windows_free);
  688. return map_address;
  689. }
  690. #endif
  691. #ifdef ALLOC_DEVICEDRIVER
  692. #ifndef DEVICEDRIVER_NAME
  693. #define DEVICEDRIVER_NAME "/dev/mapper"
  694. #endif
  695. static void alloc_devicedirver_free(struct alloc_t *alloc_info){
  696. int attr = alloc_info -> attr;
  697. if (munmap(address, allocation_block_size)) {
  698. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  699. }
  700. if (close(attr)) {
  701. printf("OpenBLAS : Bugphysarea close failed.\n");
  702. }
  703. }
  704. static void *alloc_devicedirver(void *address){
  705. int fd;
  706. void *map_address;
  707. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  708. return (void *)-1;
  709. }
  710. map_address = mmap(address, allocation_block_size,
  711. PROT_READ | PROT_WRITE,
  712. MAP_FILE | MAP_SHARED,
  713. fd, 0);
  714. STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
  715. return map_address;
  716. }
  717. #endif
  718. #ifdef ALLOC_SHM
  719. static void alloc_shm_free(struct alloc_t *alloc_info){
  720. if (shmdt(alloc_info)) {
  721. printf("OpenBLAS : Shared memory unmap failed.\n");
  722. }
  723. }
  724. static void *alloc_shm(void *address){
  725. void *map_address;
  726. int shmid;
  727. shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
  728. map_address = (void *)shmat(shmid, address, 0);
  729. if (map_address != (void *)-1){
  730. #ifdef OS_LINUX
  731. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  732. #endif
  733. shmctl(shmid, IPC_RMID, 0);
  734. struct alloc_t *alloc_info = (struct alloc_t *)map_address;
  735. alloc_info->release_func = alloc_shm_free;
  736. alloc_info->attr = shmid;
  737. }
  738. return map_address;
  739. }
  740. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  741. static void alloc_hugetlb_free(struct alloc_t *alloc_info){
  742. #if defined(OS_LINUX) || defined(OS_AIX)
  743. if (shmdt(alloc_info)) {
  744. printf("OpenBLAS : Hugepage unmap failed.\n");
  745. }
  746. #endif
  747. #ifdef __sun__
  748. munmap(alloc_info, allocation_block_size);
  749. #endif
  750. #ifdef OS_WINDOWS
  751. VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE);
  752. #endif
  753. }
  754. static void *alloc_hugetlb(void *address){
  755. void *map_address = (void *)-1;
  756. #if defined(OS_LINUX) || defined(OS_AIX)
  757. int shmid;
  758. shmid = shmget(IPC_PRIVATE, allocation_block_size,
  759. #ifdef OS_LINUX
  760. SHM_HUGETLB |
  761. #endif
  762. #ifdef OS_AIX
  763. SHM_LGPAGE | SHM_PIN |
  764. #endif
  765. IPC_CREAT | SHM_R | SHM_W);
  766. if (shmid != -1) {
  767. map_address = (void *)shmat(shmid, address, SHM_RND);
  768. #ifdef OS_LINUX
  769. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  770. #endif
  771. if (map_address != (void *)-1){
  772. shmctl(shmid, IPC_RMID, 0);
  773. }
  774. }
  775. #endif
  776. #ifdef __sun__
  777. struct memcntl_mha mha;
  778. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  779. mha.mha_flags = 0;
  780. mha.mha_pagesize = HUGE_PAGESIZE;
  781. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  782. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
  783. #endif
  784. #ifdef OS_WINDOWS
  785. HANDLE hToken;
  786. TOKEN_PRIVILEGES tp;
  787. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  788. tp.PrivilegeCount = 1;
  789. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  790. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
  791. CloseHandle(hToken);
  792. return (void*)-1;
  793. }
  794. if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
  795. CloseHandle(hToken);
  796. return (void*)-1;
  797. }
  798. map_address = (void *)VirtualAlloc(address,
  799. allocation_block_size,
  800. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  801. PAGE_READWRITE);
  802. tp.Privileges[0].Attributes = 0;
  803. AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
  804. if (map_address == (void *)NULL) map_address = (void *)-1;
  805. #endif
  806. STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
  807. return map_address;
  808. }
  809. #endif
  810. #endif
  811. #ifdef ALLOC_HUGETLBFILE
  812. static int hugetlb_pid = 0;
  813. static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
  814. int attr = alloc_info -> attr;
  815. if (munmap(alloc_info, allocation_block_size)) {
  816. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  817. }
  818. if (close(attr)) {
  819. printf("OpenBLAS : HugeTLBfs close failed.\n");
  820. }
  821. }
  822. static void *alloc_hugetlbfile(void *address){
  823. void *map_address = (void *)-1;
  824. int fd;
  825. char filename[64];
  826. if (!hugetlb_pid) hugetlb_pid = getpid();
  827. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  828. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  829. return (void *)-1;
  830. }
  831. unlink(filename);
  832. map_address = mmap(address, allocation_block_size,
  833. PROT_READ | PROT_WRITE,
  834. MAP_SHARED,
  835. fd, 0);
  836. STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
  837. return map_address;
  838. }
  839. #endif
  840. #ifdef SEEK_ADDRESS
  841. static BLASULONG base_address = 0UL;
  842. #else
  843. static BLASULONG base_address = BASE_ADDRESS;
  844. #endif
  845. #if __STDC_VERSION__ >= 201112L
  846. static _Atomic int memory_initialized = 0;
  847. #else
  848. static volatile int memory_initialized = 0;
  849. #endif
  850. /* Memory allocation routine */
  851. /* procpos ... indicates where it comes from */
  852. /* 0 : Level 3 functions */
  853. /* 1 : Level 2 functions */
  854. /* 2 : Thread */
  855. static void blas_memory_cleanup(void* ptr){
  856. if (ptr) {
  857. struct alloc_t ** table = (struct alloc_t **)ptr;
  858. int pos;
  859. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  860. struct alloc_t *alloc_info = table[pos];
  861. if (alloc_info) {
  862. alloc_info->release_func(alloc_info);
  863. table[pos] = (void *)0;
  864. }
  865. }
  866. free(table);
  867. }
  868. }
  869. static void blas_memory_init(){
  870. #if defined(SMP)
  871. # if defined(OS_WINDOWS)
  872. local_storage_key = TlsAlloc();
  873. # else
  874. pthread_key_create(&local_storage_key, blas_memory_cleanup);
  875. # endif /* defined(OS_WINDOWS) */
  876. #endif /* defined(SMP) */
  877. }
  878. void *blas_memory_alloc(int procpos){
  879. int position;
  880. void *map_address;
  881. void *(*memoryalloc[])(void *address) = {
  882. #ifdef ALLOC_DEVICEDRIVER
  883. alloc_devicedirver,
  884. #endif
  885. /* Hugetlb implicitly assumes ALLOC_SHM */
  886. #ifdef ALLOC_SHM
  887. alloc_shm,
  888. #endif
  889. #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
  890. alloc_hugetlb,
  891. #endif
  892. #ifdef ALLOC_MMAP
  893. alloc_mmap,
  894. #endif
  895. #ifdef ALLOC_QALLOC
  896. alloc_qalloc,
  897. #endif
  898. #ifdef ALLOC_WINDOWS
  899. alloc_windows,
  900. #endif
  901. #ifdef ALLOC_MALLOC
  902. alloc_malloc,
  903. #endif
  904. NULL,
  905. };
  906. void *(**func)(void *address);
  907. struct alloc_t * alloc_info;
  908. struct alloc_t ** alloc_table;
  909. #if defined(SMP) && !defined(USE_OPENMP)
  910. int mi;
  911. LOCK_COMMAND(&alloc_lock);
  912. mi=memory_initialized;
  913. UNLOCK_COMMAND(&alloc_lock);
  914. if (!LIKELY_ONE(mi)) {
  915. #else
  916. if (!LIKELY_ONE(memory_initialized)) {
  917. #endif
  918. #if defined(SMP) && !defined(USE_OPENMP)
  919. /* Only allow a single thread to initialize memory system */
  920. LOCK_COMMAND(&alloc_lock);
  921. if (!memory_initialized) {
  922. #endif
  923. blas_memory_init();
  924. #ifdef DYNAMIC_ARCH
  925. gotoblas_dynamic_init();
  926. #endif
  927. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  928. gotoblas_affinity_init();
  929. #endif
  930. #ifdef SMP
  931. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  932. #endif
  933. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
  934. #ifndef DYNAMIC_ARCH
  935. blas_set_parameter();
  936. #endif
  937. #endif
  938. memory_initialized = 1;
  939. #if defined(SMP) && !defined(USE_OPENMP)
  940. }
  941. UNLOCK_COMMAND(&alloc_lock);
  942. #endif
  943. }
  944. #ifdef DEBUG
  945. printf("Alloc Start ...\n");
  946. #endif
  947. position = 0;
  948. alloc_table = get_memory_table();
  949. do {
  950. if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
  951. position ++;
  952. } while (position < NUM_BUFFERS);
  953. goto error;
  954. allocation :
  955. #ifdef DEBUG
  956. printf(" Position -> %d\n", position);
  957. #endif
  958. alloc_info = alloc_table[position];
  959. if (!alloc_info) {
  960. do {
  961. #ifdef DEBUG
  962. printf("Allocation Start : %lx\n", base_address);
  963. #endif
  964. map_address = (void *)-1;
  965. func = &memoryalloc[0];
  966. while ((func != NULL) && (map_address == (void *) -1)) {
  967. map_address = (*func)((void *)base_address);
  968. #ifdef ALLOC_DEVICEDRIVER
  969. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  970. fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
  971. }
  972. #endif
  973. #ifdef ALLOC_HUGETLBFILE
  974. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  975. #ifndef OS_WINDOWS
  976. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
  977. #endif
  978. }
  979. #endif
  980. #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
  981. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  982. #endif
  983. func ++;
  984. }
  985. #ifdef DEBUG
  986. printf(" Success -> %08lx\n", map_address);
  987. #endif
  988. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  989. if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
  990. } while ((BLASLONG)map_address == -1);
  991. alloc_table[position] = alloc_info = map_address;
  992. #ifdef DEBUG
  993. printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
  994. #endif
  995. }
  996. #ifdef DEBUG
  997. printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
  998. #endif
  999. alloc_info->used = 1;
  1000. return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
  1001. error:
  1002. printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
  1003. return NULL;
  1004. }
  1005. void blas_memory_free(void *buffer){
  1006. #ifdef DEBUG
  1007. int position;
  1008. struct alloc_t ** alloc_table;
  1009. #endif
  1010. /* Since we passed an offset pointer to the caller, get back to the actual allocation */
  1011. struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
  1012. #ifdef DEBUG
  1013. printf("Unmapped Start : %p ...\n", alloc_info);
  1014. #endif
  1015. alloc_info->used = 0;
  1016. #ifdef DEBUG
  1017. printf("Unmap Succeeded.\n\n");
  1018. #endif
  1019. return;
  1020. #ifdef DEBUG
  1021. alloc_table = get_memory_table();
  1022. for (position = 0; position < NUM_BUFFERS; position++){
  1023. if (alloc_table[position]) {
  1024. printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
  1025. }
  1026. }
  1027. #endif
  1028. return;
  1029. }
  1030. void *blas_memory_alloc_nolock(int unused) {
  1031. void *map_address;
  1032. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  1033. return map_address;
  1034. }
  1035. void blas_memory_free_nolock(void * map_address) {
  1036. free(map_address);
  1037. }
  1038. #ifdef SMP
  1039. void blas_thread_memory_cleanup(void) {
  1040. blas_memory_cleanup((void*)get_memory_table());
  1041. }
  1042. #endif
  1043. void blas_shutdown(void){
  1044. #ifdef SMP
  1045. BLASFUNC(blas_thread_shutdown)();
  1046. #endif
  1047. #ifdef SMP
  1048. /* Only cleanupIf we were built for threading and TLS was initialized */
  1049. if (local_storage_key)
  1050. #endif
  1051. blas_thread_memory_cleanup();
  1052. #ifdef SEEK_ADDRESS
  1053. base_address = 0UL;
  1054. #else
  1055. base_address = BASE_ADDRESS;
  1056. #endif
  1057. return;
  1058. }
  1059. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1060. #ifdef SMP
  1061. #if defined(USE_PTHREAD_LOCK)
  1062. static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
  1063. #elif defined(USE_PTHREAD_SPINLOCK)
  1064. static pthread_spinlock_t init_lock = 0;
  1065. #else
  1066. static BLASULONG init_lock = 0UL;
  1067. #endif
  1068. #endif
  1069. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  1070. void *sa, void *sb, BLASLONG pos) {
  1071. #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
  1072. size_t size;
  1073. BLASULONG buffer;
  1074. size = allocation_block_size - PAGESIZE;
  1075. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  1076. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1077. if (hot_alloc != 2) {
  1078. #endif
  1079. #ifdef SMP
  1080. LOCK_COMMAND(&init_lock);
  1081. #endif
  1082. while (size > 0) {
  1083. *(int *)buffer = size;
  1084. buffer += PAGESIZE;
  1085. size -= PAGESIZE;
  1086. }
  1087. #ifdef SMP
  1088. UNLOCK_COMMAND(&init_lock);
  1089. #endif
  1090. size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
  1091. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  1092. while (size > 0) {
  1093. *(int *)buffer = size;
  1094. buffer += 64;
  1095. size -= 64;
  1096. }
  1097. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1098. }
  1099. #endif
  1100. #endif
  1101. }
  1102. #ifdef SMP
  1103. static void _init_thread_memory(void *buffer) {
  1104. blas_queue_t queue[MAX_CPU_NUMBER];
  1105. int num_cpu;
  1106. for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
  1107. blas_queue_init(&queue[num_cpu]);
  1108. queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
  1109. queue[num_cpu].routine = &_touch_memory;
  1110. queue[num_cpu].args = NULL;
  1111. queue[num_cpu].next = &queue[num_cpu + 1];
  1112. }
  1113. queue[num_cpu - 1].next = NULL;
  1114. queue[0].sa = buffer;
  1115. exec_blas(num_cpu, queue);
  1116. }
  1117. #endif
  1118. static void gotoblas_memory_init(void) {
  1119. void *buffer;
  1120. hot_alloc = 1;
  1121. buffer = (void *)blas_memory_alloc(0);
  1122. #ifdef SMP
  1123. if (blas_cpu_number == 0) blas_get_cpu_number();
  1124. #ifdef SMP_SERVER
  1125. if (blas_server_avail == 0) blas_thread_init();
  1126. #endif
  1127. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  1128. #else
  1129. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  1130. #endif
  1131. blas_memory_free(buffer);
  1132. }
  1133. #endif
  1134. /* Initialization for all function; this function should be called before main */
  1135. static int gotoblas_initialized = 0;
  1136. extern void openblas_read_env();
/* Library initialisation entry point, marked CONSTRUCTOR. Runs at most once
   per initialise/quit cycle, guarded by gotoblas_initialized. Which steps run
   depends on the build configuration (SMP, DYNAMIC_ARCH, OS_LINUX, ...). */
void CONSTRUCTOR gotoblas_init(void) {

  /* already initialised: nothing to do */
  if (gotoblas_initialized) return;

#ifdef SMP
  /* install the fork handlers before any thread is started */
  openblas_fork_handler();
#endif

  openblas_read_env();

#ifdef PROFILE
   moncontrol (0);
#endif

#ifdef DYNAMIC_ARCH
   gotoblas_dynamic_init();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
   gotoblas_affinity_init();
#endif

#if defined(OS_LINUX) && !defined(NO_WARMUP)
   /* allocate and touch one buffer up front */
   gotoblas_memory_init();
#endif

   /* Disabled code: would raise the soft stack limit to the hard limit. */
//#if defined(OS_LINUX)
#if 0
   struct rlimit curlimit;
   if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
   {
	if ( curlimit.rlim_cur != curlimit.rlim_max )
	{
		curlimit.rlim_cur = curlimit.rlim_max;
		setrlimit(RLIMIT_STACK, &curlimit);
	}
   }
#endif

#ifdef SMP
  /* make sure the thread count is known and, for SMP_SERVER builds, that the
     thread pool has been started */
  if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
  if (blas_server_avail == 0) blas_thread_init();
#endif
#endif

#ifdef FUNCTION_PROFILE
   gotoblas_profile_init();
#endif

   gotoblas_initialized = 1;

#ifdef PROFILE
   moncontrol (1);
#endif

}
  1181. void DESTRUCTOR gotoblas_quit(void) {
  1182. if (gotoblas_initialized == 0) return;
  1183. blas_shutdown();
  1184. #if defined(SMP)
  1185. #if defined(OS_WINDOWS)
  1186. TlsFree(local_storage_key);
  1187. #else
  1188. pthread_key_delete(local_storage_key);
  1189. #endif
  1190. #endif
  1191. #ifdef PROFILE
  1192. moncontrol (0);
  1193. #endif
  1194. #ifdef FUNCTION_PROFILE
  1195. gotoblas_profile_quit();
  1196. #endif
  1197. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  1198. gotoblas_affinity_quit();
  1199. #endif
  1200. #ifdef DYNAMIC_ARCH
  1201. gotoblas_dynamic_quit();
  1202. #endif
  1203. gotoblas_initialized = 0;
  1204. #ifdef PROFILE
  1205. moncontrol (1);
  1206. #endif
  1207. }
  1208. #if defined(_MSC_VER) && !defined(__clang__)
  1209. BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
  1210. {
  1211. switch (ul_reason_for_call)
  1212. {
  1213. case DLL_PROCESS_ATTACH:
  1214. gotoblas_init();
  1215. break;
  1216. case DLL_THREAD_ATTACH:
  1217. break;
  1218. case DLL_THREAD_DETACH:
  1219. #if defined(SMP)
  1220. blas_thread_memory_cleanup();
  1221. #endif
  1222. break;
  1223. case DLL_PROCESS_DETACH:
  1224. gotoblas_quit();
  1225. break;
  1226. default:
  1227. break;
  1228. }
  1229. return TRUE;
  1230. }
  1231. /*
  1232. This is to allow static linking.
  1233. Code adapted from Google performance tools:
  1234. https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
  1235. Reference:
  1236. https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
  1237. http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
  1238. */
  1239. static int on_process_term(void)
  1240. {
  1241. gotoblas_quit();
  1242. return 0;
  1243. }
  1244. #ifdef _WIN64
  1245. #pragma comment(linker, "/INCLUDE:_tls_used")
  1246. #else
  1247. #pragma comment(linker, "/INCLUDE:__tls_used")
  1248. #endif
  1249. #ifdef _WIN64
  1250. #pragma const_seg(".CRT$XLB")
  1251. #else
  1252. #pragma data_seg(".CRT$XLB")
  1253. #endif
  1254. static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
  1255. #ifdef _WIN64
  1256. #pragma const_seg()
  1257. #else
  1258. #pragma data_seg()
  1259. #endif
  1260. #ifdef _WIN64
  1261. #pragma const_seg(".CRT$XTU")
  1262. #else
  1263. #pragma data_seg(".CRT$XTU")
  1264. #endif
  1265. static int(*p_process_term)(void) = on_process_term;
  1266. #ifdef _WIN64
  1267. #pragma const_seg()
  1268. #else
  1269. #pragma data_seg()
  1270. #endif
  1271. #endif
  1272. #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
  1273. /* Don't call me; this is just work around for PGI / Sun bug */
void gotoblas_dummy_for_PGI(void) {

  /* Reference both entry points directly. */
  gotoblas_init();
  gotoblas_quit();
#if __PGIC__ < 19
#if 0
  /* Disabled alternative: list the two functions in .ctors / .dtors. */
  asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
#else
  /* Emit calls to gotoblas_init / gotoblas_quit into the .init and .fini
     sections, then switch back to .text. */
  asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
  1287. #endif
  1288. #else
  1289. /* USE_TLS / COMPILE_TLS not set */
  1290. #include <errno.h>
  1291. #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
  1292. #define ALLOC_WINDOWS
  1293. #ifndef MEM_LARGE_PAGES
  1294. #define MEM_LARGE_PAGES 0x20000000
  1295. #endif
  1296. #else
  1297. #define ALLOC_MMAP
  1298. #define ALLOC_MALLOC
  1299. #endif
  1300. #include <stdlib.h>
  1301. #include <stdio.h>
  1302. #include <fcntl.h>
  1303. #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
  1304. #include <sys/mman.h>
  1305. #ifndef NO_SYSV_IPC
  1306. #include <sys/shm.h>
  1307. #endif
  1308. #include <sys/ipc.h>
  1309. #endif
  1310. #include <sys/types.h>
  1311. #ifdef OS_LINUX
  1312. #include <sys/sysinfo.h>
  1313. #include <sched.h>
  1314. #include <errno.h>
  1315. #include <linux/unistd.h>
  1316. #include <sys/syscall.h>
  1317. #include <sys/time.h>
  1318. #include <sys/resource.h>
  1319. #endif
  1320. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
  1321. #include <sys/sysctl.h>
  1322. #include <sys/resource.h>
  1323. #endif
  1324. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  1325. #include <conio.h>
  1326. #undef printf
  1327. #define printf _cprintf
  1328. #endif
  1329. #ifdef OS_LINUX
  1330. #ifndef MPOL_PREFERRED
  1331. #define MPOL_PREFERRED 1
  1332. #endif
  1333. #endif
  1334. #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  1335. #define NO_WARMUP
  1336. #endif
  1337. #ifndef SHM_HUGETLB
  1338. #define SHM_HUGETLB 04000
  1339. #endif
  1340. #ifndef FIXED_PAGESIZE
  1341. #define FIXED_PAGESIZE 4096
  1342. #endif
  1343. #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  1344. #if defined(_MSC_VER) && !defined(__clang__)
  1345. #define CONSTRUCTOR __cdecl
  1346. #define DESTRUCTOR __cdecl
  1347. #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
  1348. #define CONSTRUCTOR __attribute__ ((constructor))
  1349. #define DESTRUCTOR __attribute__ ((destructor))
  1350. #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
  1351. #define CONSTRUCTOR __attribute__ ((constructor(101)))
  1352. #define DESTRUCTOR __attribute__ ((destructor(101)))
  1353. #else
  1354. #define CONSTRUCTOR __attribute__ ((constructor))
  1355. #define DESTRUCTOR __attribute__ ((destructor))
  1356. #endif
  1357. #ifdef DYNAMIC_ARCH
  1358. gotoblas_t *gotoblas = NULL;
  1359. #endif
  1360. extern void openblas_warning(int verbose, const char * msg);
  1361. #ifndef SMP
  1362. #define blas_cpu_number 1
  1363. #define blas_num_threads 1
  1364. /* Dummy Function */
  /* Single-threaded build: exactly one processor is ever reported. */
  int goto_get_num_procs (void) { return 1; }
  /* Single-threaded build: the requested thread count is ignored.
     (The stray ';' that used to follow both bodies was an empty
     file-scope declaration, which ISO C does not permit.) */
  void goto_set_num_threads(int num_threads) { (void)num_threads; }
  1367. #else
  1368. #if defined(OS_LINUX) || defined(OS_SUNOS)
  1369. #ifndef NO_AFFINITY
  1370. int get_num_procs(void);
  1371. #else
  1372. int get_num_procs(void) {
  1373. static int nums = 0;
  1374. cpu_set_t cpuset,*cpusetp;
  1375. size_t size;
  1376. int ret;
  1377. #if defined(__GLIBC_PREREQ)
  1378. #if !__GLIBC_PREREQ(2, 7)
  1379. int i;
  1380. #if !__GLIBC_PREREQ(2, 6)
  1381. int n;
  1382. #endif
  1383. #endif
  1384. #endif
  1385. if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  1386. #if !defined(OS_LINUX)
  1387. return nums;
  1388. #endif
  1389. #if !defined(__GLIBC_PREREQ)
  1390. return nums;
  1391. #else
  1392. #if !__GLIBC_PREREQ(2, 3)
  1393. return nums;
  1394. #endif
  1395. #if !__GLIBC_PREREQ(2, 7)
  1396. ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
  1397. if (ret!=0) return nums;
  1398. n=0;
  1399. #if !__GLIBC_PREREQ(2, 6)
  1400. for (i=0;i<nums;i++)
  1401. if (CPU_ISSET(i,&cpuset)) n++;
  1402. nums=n;
  1403. #else
  1404. nums = CPU_COUNT(sizeof(cpuset),&cpuset);
  1405. #endif
  1406. return nums;
  1407. #else
  1408. if (nums >= CPU_SETSIZE) {
  1409. cpusetp = CPU_ALLOC(nums);
  1410. if (cpusetp == NULL) {
  1411. return nums;
  1412. }
  1413. size = CPU_ALLOC_SIZE(nums);
  1414. ret = sched_getaffinity(0,size,cpusetp);
  1415. if (ret!=0) {
  1416. CPU_FREE(cpusetp);
  1417. return nums;
  1418. }
  1419. ret = CPU_COUNT_S(size,cpusetp);
  1420. if (ret > 0 && ret < nums) nums = ret;
  1421. CPU_FREE(cpusetp);
  1422. return nums;
  1423. } else {
  1424. ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
  1425. if (ret!=0) {
  1426. return nums;
  1427. }
  1428. ret = CPU_COUNT(&cpuset);
  1429. if (ret > 0 && ret < nums) nums = ret;
  1430. return nums;
  1431. }
  1432. #endif
  1433. #endif
  1434. }
  1435. #endif
  1436. #endif
  1437. #ifdef OS_ANDROID
  /* Android: report the configured processor count, queried once and cached. */
  int get_num_procs(void) {
    static int cached_count = 0;
    if (cached_count == 0) {
      cached_count = sysconf(_SC_NPROCESSORS_CONF);
    }
    return cached_count;
  }
  1443. #endif
  1444. #ifdef OS_HAIKU
  /* Haiku: report the configured processor count, queried once and cached. */
  int get_num_procs(void) {
    static int cached_count = 0;
    if (cached_count == 0) {
      cached_count = sysconf(_SC_NPROCESSORS_CONF);
    }
    return cached_count;
  }
  1450. #endif
  1451. #ifdef OS_AIX
  /* AIX: report the configured processor count, queried once and cached. */
  int get_num_procs(void) {
    static int cached_count = 0;
    if (cached_count == 0) {
      cached_count = sysconf(_SC_NPROCESSORS_CONF);
    }
    return cached_count;
  }
  1457. #endif
  1458. #ifdef OS_WINDOWS
  1459. int get_num_procs(void) {
  1460. static int nums = 0;
  1461. if (nums == 0) {
  1462. SYSTEM_INFO sysinfo;
  1463. GetSystemInfo(&sysinfo);
  1464. nums = sysinfo.dwNumberOfProcessors;
  1465. }
  1466. return nums;
  1467. }
  1468. #endif
  1469. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
  1470. int get_num_procs(void) {
  1471. static int nums = 0;
  1472. int m[2];
  1473. size_t len;
  1474. if (nums == 0) {
  1475. m[0] = CTL_HW;
  1476. m[1] = HW_NCPU;
  1477. len = sizeof(int);
  1478. sysctl(m, 2, &nums, &len, NULL, 0);
  1479. }
  1480. return nums;
  1481. }
  1482. #endif
  1483. #if defined(OS_DARWIN)
  /*
   * Darwin: read hw.physicalcpu through sysctlbyname(), cached after the
   * first successful query.
   *
   * If the query fails or yields a value below 1, a single processor is
   * reported and nothing is cached, so a later call can try again.
   * Previously the sysctlbyname() result was ignored and 0 could be
   * returned, which callers then used as the upper bound for the thread
   * count.
   */
  int get_num_procs(void) {
    static int nums = 0;
    size_t len;
    if (nums == 0){
      len = sizeof(int);
      if (sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0) != 0 || nums < 1) {
        nums = 0;
        return 1;
      }
    }
    return nums;
  }
  1493. /*
  1494. void set_stack_limit(int limitMB){
  1495. int result=0;
  1496. struct rlimit rl;
  1497. rlim_t StackSize;
  1498. StackSize=limitMB*1024*1024;
  1499. result=getrlimit(RLIMIT_STACK, &rl);
  1500. if(result==0){
  1501. if(rl.rlim_cur < StackSize){
  1502. rl.rlim_cur=StackSize;
  1503. result=setrlimit(RLIMIT_STACK, &rl);
  1504. if(result !=0){
  1505. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  1506. }
  1507. }
  1508. }
  1509. }
  1510. */
  1511. #endif
  /*
  OpenBLAS uses the number of CPU cores for multithreading.
  It can be set with openblas_set_num_threads(int num_threads);
  */
  1516. int blas_cpu_number = 0;
  /*
  The number of threads in the thread pool.
  This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
  */
  1521. int blas_num_threads = 0;
  /* SMP build: despite its name, this returns the current value of
     blas_cpu_number rather than querying the hardware. */
  int goto_get_num_procs (void) {
    return blas_cpu_number;
  }
  /*
   * Register blas_thread_shutdown as the pthread_atfork "prepare" handler.
   * Compiled to a no-op on native Windows, on Android, and when SMP_SERVER
   * is not defined. A warning is issued if registration fails.
   */
  void openblas_fork_handler()
  {
    // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
    // built with "make USE_OPENMP=0".
    // Hanging can still happen when OpenBLAS is built against the libgomp
    // implementation of OpenMP. The problem is tracked at:
    // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
    // In the mean time build with USE_OPENMP=0 or link against another
    // implementation of OpenMP.
  #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
    int status = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
    if (status != 0) {
      openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
    }
  #endif
  }
  1541. extern int openblas_num_threads_env();
  1542. extern int openblas_goto_num_threads_env();
  1543. extern int openblas_omp_num_threads_env();
  1544. int blas_get_cpu_number(void){
  1545. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  1546. int max_num;
  1547. #endif
  1548. int blas_goto_num = 0;
  1549. int blas_omp_num = 0;
  1550. if (blas_num_threads) return blas_num_threads;
  1551. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  1552. max_num = get_num_procs();
  1553. #endif
  1554. // blas_goto_num = 0;
  1555. #ifndef USE_OPENMP
  1556. blas_goto_num=openblas_num_threads_env();
  1557. if (blas_goto_num < 0) blas_goto_num = 0;
  1558. if (blas_goto_num == 0) {
  1559. blas_goto_num=openblas_goto_num_threads_env();
  1560. if (blas_goto_num < 0) blas_goto_num = 0;
  1561. }
  1562. #endif
  1563. // blas_omp_num = 0;
  1564. blas_omp_num=openblas_omp_num_threads_env();
  1565. if (blas_omp_num < 0) blas_omp_num = 0;
  1566. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  1567. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  1568. else blas_num_threads = MAX_CPU_NUMBER;
  1569. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  1570. if (blas_num_threads > max_num) blas_num_threads = max_num;
  1571. #endif
  1572. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  1573. #ifdef DEBUG
  1574. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  1575. #endif
  1576. blas_cpu_number = blas_num_threads;
  1577. return blas_num_threads;
  1578. }
  1579. #endif
  /* Public query for the processor count: get_num_procs() in SMP builds,
     always 1 otherwise. */
  int openblas_get_num_procs(void) {
  #ifdef SMP
    return get_num_procs();
  #else
    return 1;
  #endif
  }
  /* Public query for the thread count: blas_cpu_number in SMP builds,
     always 1 otherwise. */
  int openblas_get_num_threads(void) {
  #ifdef SMP
    /* Make sure blas_cpu_number has been computed before reporting it. */
    blas_get_cpu_number();
    return blas_cpu_number;
  #else
    return 1;
  #endif
  }
  /* Describes one mapped region and how to give it back to the system. */
  struct release_t {
    void *address;                     /* start of the region, as returned by the allocator */
    void (*func)(struct release_t *);  /* routine that releases the region */
    long attr;                         /* allocator-specific extra (a file descriptor or shm id) */
  };
  /* Set to 1 by blas_memory_alloc() once alloc_hugetlb() has succeeded. */
  int hugetlb_allocated = 0;
  /* Release records appended by the alloc_* routines; blas_shutdown() calls
     func on each of the first release_pos entries. */
  static struct release_t release_info[NUM_BUFFERS];
  /* Index of the next unused entry in release_info. */
  static int release_pos = 0;
  1604. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1605. static int hot_alloc = 0;
  1606. #endif
  1607. /* Global lock for memory allocation */
  1608. #if defined(USE_PTHREAD_LOCK)
  1609. static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  1610. #elif defined(USE_PTHREAD_SPINLOCK)
  1611. static pthread_spinlock_t alloc_lock = 0;
  1612. #else
  1613. static BLASULONG alloc_lock = 0UL;
  1614. #endif
  1615. #ifdef ALLOC_MMAP
  1616. static void alloc_mmap_free(struct release_t *release){
  1617. if (!release->address) return;
  1618. if (munmap(release -> address, BUFFER_SIZE)) {
  1619. int errsv=errno;
  1620. perror("OpenBLAS : munmap failed:");
  1621. printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
  1622. }
  1623. }
  1624. #ifdef NO_WARMUP
  1625. static void *alloc_mmap(void *address){
  1626. void *map_address;
  1627. if (address){
  1628. map_address = mmap(address,
  1629. BUFFER_SIZE,
  1630. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  1631. } else {
  1632. map_address = mmap(address,
  1633. BUFFER_SIZE,
  1634. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  1635. }
  1636. if (map_address != (void *)-1) {
  1637. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  1638. LOCK_COMMAND(&alloc_lock);
  1639. #endif
  1640. release_info[release_pos].address = map_address;
  1641. release_info[release_pos].func = alloc_mmap_free;
  1642. release_pos ++;
  1643. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  1644. UNLOCK_COMMAND(&alloc_lock);
  1645. #endif
  1646. } else {
  1647. #ifdef DEBUG
  1648. int errsv=errno;
  1649. perror("OpenBLAS : mmap failed:");
  1650. printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
  1651. #endif
  1652. }
  1653. #ifdef OS_LINUX
  1654. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  1655. #endif
  1656. return map_address;
  1657. }
  1658. #else
  1659. #define BENCH_ITERATION 4
  1660. #define SCALING 2
  /*
   * Time a pointer chase through the region [address, address + size).
   *
   * The caller is expected to have stored, in the first word of each page,
   * the address of the next page. This routine temporarily makes the first
   * word of the last page point back to 'address', follows size / PAGESIZE
   * links while timing the walk with rpcc(), and repeats that
   * BENCH_ITERATION times.
   *
   * Returns the smallest elapsed rpcc() difference observed.
   */
  static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
    BLASULONG original, *p;
    BLASULONG start, stop, min;
    int iter, i, count;
    min = (BLASULONG)-1;
    /* Save the last page's link and close the chain into a loop. */
    original = *(BLASULONG *)(address + size - PAGESIZE);
    *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
    for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
      p = (BLASULONG *)address;
      count = size / PAGESIZE;
      start = rpcc();
      for (i = 0; i < count; i ++) {
        p = (BLASULONG *)(*p);
      }
      stop = rpcc();
      if (min > stop - start) min = stop - start;
    }
    /* Restore the saved link. */
    *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
    /* NOTE(review): storing the final pointer presumably keeps the compiler
       from discarding the chase loop -- confirm. */
    *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
    return min;
  }
  /*
   * Map BUFFER_SIZE bytes with mmap() (variant used when NO_WARMUP is not
   * set).
   *
   * address: preferred placement. When non-NULL, the region is mapped with
   *          MAP_FIXED at that address and no benchmarking is done.
   * Returns the mapped address, or (void *)-1 when mmap() fails.
   *
   * Without an address, and unless hot_alloc is 0 on Linux, the routine maps
   * SCALING * BUFFER_SIZE bytes, benchmarks page-aligned candidate start
   * addresses with run_bench(), keeps the fastest one and unmaps the memory
   * before it and beyond BUFFER_SIZE bytes after it.
   * A successful mapping is recorded in release_info under alloc_lock.
   */
  static void *alloc_mmap(void *address){
    void *map_address, *best_address;
    BLASULONG best, start, current;
    BLASULONG allocsize;
    if (address){
      /* Just give up use advanced operation */
      map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  #ifdef OS_LINUX
      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  #endif
    } else {
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
      /* hot_alloc == 0: plain mapping without benchmarking. */
      if (hot_alloc == 0) {
        map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
  #ifdef OS_LINUX
        my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  #endif
      } else {
  #endif
        /* Over-allocate so that several candidate windows can be compared. */
        map_address = mmap(NULL, BUFFER_SIZE * SCALING,
                           MMAP_ACCESS, MMAP_POLICY, -1, 0);
        if (map_address != (void *)-1) {
  #ifdef OS_LINUX
  #ifdef DEBUG
          int ret=0;
          ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
          if(ret==-1){
            int errsv=errno;
            perror("OpenBLAS alloc_mmap:");
            printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
          }
  #else
          my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  #endif
  #endif
          /* Size of the window that run_bench() walks for each candidate. */
          allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
          /* Build the page chain: the first word of each page in the first
             (SCALING - 1) * BUFFER_SIZE bytes holds the next page's address. */
          start = (BLASULONG)map_address;
          current = (SCALING - 1) * BUFFER_SIZE;
          while(current > 0) {
            *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
            start += PAGESIZE;
            current -= PAGESIZE;
          }
          /* The last chained page points back to the beginning. */
          *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
          start = (BLASULONG)map_address;
          best = (BLASULONG)-1;
          best_address = map_address;
          /* Try every page-aligned start whose window fits in the chained area
             and remember the one with the lowest benchmark time. */
          while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
            current = run_bench(start, allocsize);
            if (best > current) {
              best = current;
              best_address = (void *)start;
            }
            start += PAGESIZE;
          }
          /* Give back the memory in front of the chosen window ... */
          if ((BLASULONG)best_address > (BLASULONG)map_address)
            munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
          /* ... and everything past BUFFER_SIZE bytes after its start. */
          munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
          map_address = best_address;
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
          hot_alloc = 2;
  #endif
        }
      }
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
    }
  #endif
    /* Record the region so that it can be released at shutdown. */
    if (map_address != (void *)-1) {
  #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
      LOCK_COMMAND(&alloc_lock);
  #endif
      release_info[release_pos].address = map_address;
      release_info[release_pos].func = alloc_mmap_free;
      release_pos ++;
  #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
      UNLOCK_COMMAND(&alloc_lock);
  #endif
    }
    return map_address;
  }
  1762. #endif
  1763. #endif
  1764. #ifdef ALLOC_MALLOC
  1765. static void alloc_malloc_free(struct release_t *release){
  1766. free(release -> address);
  1767. }
  1768. static void *alloc_malloc(void *address){
  1769. void *map_address;
  1770. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  1771. if (map_address == (void *)NULL) map_address = (void *)-1;
  1772. if (map_address != (void *)-1) {
  1773. release_info[release_pos].address = map_address;
  1774. release_info[release_pos].func = alloc_malloc_free;
  1775. release_pos ++;
  1776. }
  1777. return map_address;
  1778. }
  1779. #endif
  1780. #ifdef ALLOC_QALLOC
  1781. void *qalloc(int flags, size_t bytes);
  1782. void *qfree (void *address);
  1783. #define QNONCACHE 0x1
  1784. #define QCOMMS 0x2
  1785. #define QFAST 0x4
  1786. static void alloc_qalloc_free(struct release_t *release){
  1787. qfree(release -> address);
  1788. }
  1789. static void *alloc_qalloc(void *address){
  1790. void *map_address;
  1791. map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
  1792. if (map_address == (void *)NULL) map_address = (void *)-1;
  1793. if (map_address != (void *)-1) {
  1794. release_info[release_pos].address = map_address;
  1795. release_info[release_pos].func = alloc_qalloc_free;
  1796. release_pos ++;
  1797. }
  1798. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  1799. }
  1800. #endif
  1801. #ifdef ALLOC_WINDOWS
  1802. static void alloc_windows_free(struct release_t *release){
  1803. VirtualFree(release -> address, 0, MEM_RELEASE);
  1804. }
  1805. static void *alloc_windows(void *address){
  1806. void *map_address;
  1807. map_address = VirtualAlloc(address,
  1808. BUFFER_SIZE,
  1809. MEM_RESERVE | MEM_COMMIT,
  1810. PAGE_READWRITE);
  1811. if (map_address == (void *)NULL) map_address = (void *)-1;
  1812. if (map_address != (void *)-1) {
  1813. release_info[release_pos].address = map_address;
  1814. release_info[release_pos].func = alloc_windows_free;
  1815. release_pos ++;
  1816. }
  1817. return map_address;
  1818. }
  1819. #endif
  1820. #ifdef ALLOC_DEVICEDRIVER
  1821. #ifndef DEVICEDRIVER_NAME
  1822. #define DEVICEDRIVER_NAME "/dev/mapper"
  1823. #endif
  1824. static void alloc_devicedirver_free(struct release_t *release){
  1825. if (munmap(release -> address, BUFFER_SIZE)) {
  1826. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  1827. }
  1828. if (close(release -> attr)) {
  1829. printf("OpenBLAS : Bugphysarea close failed.\n");
  1830. }
  1831. }
  1832. static void *alloc_devicedirver(void *address){
  1833. int fd;
  1834. void *map_address;
  1835. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  1836. return (void *)-1;
  1837. }
  1838. map_address = mmap(address, BUFFER_SIZE,
  1839. PROT_READ | PROT_WRITE,
  1840. MAP_FILE | MAP_SHARED,
  1841. fd, 0);
  1842. if (map_address != (void *)-1) {
  1843. release_info[release_pos].address = map_address;
  1844. release_info[release_pos].attr = fd;
  1845. release_info[release_pos].func = alloc_devicedirver_free;
  1846. release_pos ++;
  1847. }
  1848. return map_address;
  1849. }
  1850. #endif
  1851. #ifdef ALLOC_SHM
  1852. static void alloc_shm_free(struct release_t *release){
  1853. if (shmdt(release -> address)) {
  1854. printf("OpenBLAS : Shared memory unmap failed.\n");
  1855. }
  1856. }
  1857. static void *alloc_shm(void *address){
  1858. void *map_address;
  1859. int shmid;
  1860. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
  1861. map_address = (void *)shmat(shmid, address, 0);
  1862. if (map_address != (void *)-1){
  1863. #ifdef OS_LINUX
  1864. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  1865. #endif
  1866. shmctl(shmid, IPC_RMID, 0);
  1867. release_info[release_pos].address = map_address;
  1868. release_info[release_pos].attr = shmid;
  1869. release_info[release_pos].func = alloc_shm_free;
  1870. release_pos ++;
  1871. }
  1872. return map_address;
  1873. }
  1874. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  /*
   * Release callback for regions obtained by alloc_hugetlb().
   *
   *  - Linux / AIX: detach the shared memory segment.
   *  - Sun:         the block comes from memalign(), so it is returned with
   *                 free() rather than munmap().
   *  - Windows:     VirtualFree() with MEM_RELEASE only. MEM_LARGE_PAGES is
   *                 an allocation flag, not a valid free type.
   */
  static void alloc_hugetlb_free(struct release_t *release){
  #if defined(OS_LINUX) || defined(OS_AIX)
    if (shmdt(release -> address)) {
      printf("OpenBLAS : Hugepage unmap failed.\n");
    }
  #endif
  #ifdef __sun__
    free(release -> address);
  #endif
  #ifdef OS_WINDOWS
    VirtualFree(release -> address, 0, MEM_RELEASE);
  #endif
  }
  1888. static void *alloc_hugetlb(void *address){
  1889. void *map_address = (void *)-1;
  1890. #if defined(OS_LINUX) || defined(OS_AIX)
  1891. int shmid;
  1892. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
  1893. #ifdef OS_LINUX
  1894. SHM_HUGETLB |
  1895. #endif
  1896. #ifdef OS_AIX
  1897. SHM_LGPAGE | SHM_PIN |
  1898. #endif
  1899. IPC_CREAT | SHM_R | SHM_W);
  1900. if (shmid != -1) {
  1901. map_address = (void *)shmat(shmid, address, SHM_RND);
  1902. #ifdef OS_LINUX
  1903. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  1904. #endif
  1905. if (map_address != (void *)-1){
  1906. shmctl(shmid, IPC_RMID, 0);
  1907. }
  1908. }
  1909. #endif
  1910. #ifdef __sun__
  1911. struct memcntl_mha mha;
  1912. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  1913. mha.mha_flags = 0;
  1914. mha.mha_pagesize = HUGE_PAGESIZE;
  1915. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  1916. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
  1917. #endif
  1918. #ifdef OS_WINDOWS
  1919. HANDLE hToken;
  1920. TOKEN_PRIVILEGES tp;
  1921. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  1922. tp.PrivilegeCount = 1;
  1923. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  1924. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
  1925. CloseHandle(hToken);
  1926. return (void*)-1;
  1927. }
  1928. if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
  1929. CloseHandle(hToken);
  1930. return (void*)-1;
  1931. }
  1932. map_address = (void *)VirtualAlloc(address,
  1933. BUFFER_SIZE,
  1934. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  1935. PAGE_READWRITE);
  1936. tp.Privileges[0].Attributes = 0;
  1937. AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
  1938. if (map_address == (void *)NULL) map_address = (void *)-1;
  1939. #endif
  1940. if (map_address != (void *)-1){
  1941. release_info[release_pos].address = map_address;
  1942. release_info[release_pos].func = alloc_hugetlb_free;
  1943. release_pos ++;
  1944. }
  1945. return map_address;
  1946. }
  1947. #endif
  1948. #endif
  1949. #ifdef ALLOC_HUGETLBFILE
  1950. static int hugetlb_pid = 0;
  1951. static void alloc_hugetlbfile_free(struct release_t *release){
  1952. if (munmap(release -> address, BUFFER_SIZE)) {
  1953. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  1954. }
  1955. if (close(release -> attr)) {
  1956. printf("OpenBLAS : HugeTLBfs close failed.\n");
  1957. }
  1958. }
  1959. static void *alloc_hugetlbfile(void *address){
  1960. void *map_address = (void *)-1;
  1961. int fd;
  1962. char filename[64];
  1963. if (!hugetlb_pid) hugetlb_pid = getpid();
  1964. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  1965. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  1966. return (void *)-1;
  1967. }
  1968. unlink(filename);
  1969. map_address = mmap(address, BUFFER_SIZE,
  1970. PROT_READ | PROT_WRITE,
  1971. MAP_SHARED,
  1972. fd, 0);
  1973. if (map_address != (void *)-1) {
  1974. release_info[release_pos].address = map_address;
  1975. release_info[release_pos].attr = fd;
  1976. release_info[release_pos].func = alloc_hugetlbfile_free;
  1977. release_pos ++;
  1978. }
  1979. return map_address;
  1980. }
  1981. #endif
  1982. #ifdef SEEK_ADDRESS
  1983. static BLASULONG base_address = 0UL;
  1984. #else
  1985. static BLASULONG base_address = BASE_ADDRESS;
  1986. #endif
  /*
   * Table of buffer slots handed out by blas_memory_alloc(); one entry per
   * buffer.
   */
  static volatile struct {
    BLASULONG lock;   /* per-slot lock, taken with blas_lock() in OpenMP builds */
    void *addr;       /* address of the buffer mapped for this slot; 0 if none yet */
  #if defined(WHEREAMI) && !defined(USE_OPENMP)
    int pos;          /* value of mypos recorded when the slot was first used; -1 initially */
  #endif
    int used;         /* non-zero while the slot is handed out */
    /* NOTE(review): padding, presumably to keep entries on separate cache
       lines -- confirm the intended entry size. */
  #ifndef __64BIT__
    char dummy[48];
  #else
    char dummy[40];
  #endif
  } memory[NUM_BUFFERS];
  2000. static int memory_initialized = 0;
  2001. /* Memory allocation routine */
  2002. /* procpos ... indicates where it comes from */
  2003. /* 0 : Level 3 functions */
  2004. /* 1 : Level 2 functions */
  2005. /* 2 : Thread */
  2006. void *blas_memory_alloc(int procpos){
  2007. int position;
  2008. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  2009. int mypos = 0;
  2010. #endif
  2011. void *map_address;
  2012. void *(*memoryalloc[])(void *address) = {
  2013. #ifdef ALLOC_DEVICEDRIVER
  2014. alloc_devicedirver,
  2015. #endif
  2016. /* Hugetlb implicitly assumes ALLOC_SHM */
  2017. #ifdef ALLOC_SHM
  2018. alloc_shm,
  2019. #endif
  2020. #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
  2021. alloc_hugetlb,
  2022. #endif
  2023. #ifdef ALLOC_MMAP
  2024. alloc_mmap,
  2025. #endif
  2026. #ifdef ALLOC_QALLOC
  2027. alloc_qalloc,
  2028. #endif
  2029. #ifdef ALLOC_WINDOWS
  2030. alloc_windows,
  2031. #endif
  2032. #ifdef ALLOC_MALLOC
  2033. alloc_malloc,
  2034. #endif
  2035. NULL,
  2036. };
  2037. void *(**func)(void *address);
  2038. #if defined(USE_OPENMP)
  2039. if (!memory_initialized) {
  2040. #endif
  2041. LOCK_COMMAND(&alloc_lock);
  2042. if (!memory_initialized) {
  2043. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  2044. for (position = 0; position < NUM_BUFFERS; position ++){
  2045. memory[position].addr = (void *)0;
  2046. memory[position].pos = -1;
  2047. memory[position].used = 0;
  2048. memory[position].lock = 0;
  2049. }
  2050. #endif
  2051. #ifdef DYNAMIC_ARCH
  2052. gotoblas_dynamic_init();
  2053. #endif
  2054. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  2055. gotoblas_affinity_init();
  2056. #endif
  2057. #ifdef SMP
  2058. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  2059. #endif
  2060. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
  2061. #ifndef DYNAMIC_ARCH
  2062. blas_set_parameter();
  2063. #endif
  2064. #endif
  2065. memory_initialized = 1;
  2066. }
  2067. UNLOCK_COMMAND(&alloc_lock);
  2068. #if defined(USE_OPENMP)
  2069. }
  2070. #endif
  2071. #ifdef DEBUG
  2072. printf("Alloc Start ...\n");
  2073. #endif
  2074. /* #if defined(WHEREAMI) && !defined(USE_OPENMP)
  2075. mypos = WhereAmI();
  2076. position = mypos;
  2077. while (position >= NUM_BUFFERS) position >>= 1;
  2078. do {
  2079. if (!memory[position].used && (memory[position].pos == mypos)) {
  2080. #if defined(SMP) && !defined(USE_OPENMP)
  2081. LOCK_COMMAND(&alloc_lock);
  2082. #else
  2083. blas_lock(&memory[position].lock);
  2084. #endif
  2085. if (!memory[position].used) goto allocation;
  2086. #if defined(SMP) && !defined(USE_OPENMP)
  2087. UNLOCK_COMMAND(&alloc_lock);
  2088. #else
  2089. blas_unlock(&memory[position].lock);
  2090. #endif
  2091. }
  2092. position ++;
  2093. } while (position < NUM_BUFFERS);
  2094. #endif */
  2095. position = 0;
  2096. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  2097. LOCK_COMMAND(&alloc_lock);
  2098. #endif
  2099. do {
  2100. #if defined(USE_OPENMP)
  2101. if (!memory[position].used) {
  2102. blas_lock(&memory[position].lock);
  2103. #endif
  2104. if (!memory[position].used) goto allocation;
  2105. #if defined(USE_OPENMP)
  2106. blas_unlock(&memory[position].lock);
  2107. }
  2108. #endif
  2109. position ++;
  2110. } while (position < NUM_BUFFERS);
  2111. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  2112. UNLOCK_COMMAND(&alloc_lock);
  2113. #endif
  2114. goto error;
  2115. allocation :
  2116. #ifdef DEBUG
  2117. printf(" Position -> %d\n", position);
  2118. #endif
  2119. memory[position].used = 1;
  2120. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  2121. UNLOCK_COMMAND(&alloc_lock);
  2122. #else
  2123. blas_unlock(&memory[position].lock);
  2124. #endif
  2125. if (!memory[position].addr) {
  2126. do {
  2127. #ifdef DEBUG
  2128. printf("Allocation Start : %lx\n", base_address);
  2129. #endif
  2130. map_address = (void *)-1;
  2131. func = &memoryalloc[0];
  2132. while ((func != NULL) && (map_address == (void *) -1)) {
  2133. map_address = (*func)((void *)base_address);
  2134. #ifdef ALLOC_DEVICEDRIVER
  2135. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  2136. fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
  2137. }
  2138. #endif
  2139. #ifdef ALLOC_HUGETLBFILE
  2140. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  2141. #ifndef OS_WINDOWS
  2142. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
  2143. #endif
  2144. }
  2145. #endif
  2146. #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
  2147. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  2148. #endif
  2149. func ++;
  2150. }
  2151. #ifdef DEBUG
  2152. printf(" Success -> %08lx\n", map_address);
  2153. #endif
  2154. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  2155. if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
  2156. } while ((BLASLONG)map_address == -1);
  2157. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  2158. LOCK_COMMAND(&alloc_lock);
  2159. #endif
  2160. memory[position].addr = map_address;
  2161. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  2162. UNLOCK_COMMAND(&alloc_lock);
  2163. #endif
  2164. #ifdef DEBUG
  2165. printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
  2166. #endif
  2167. }
  2168. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  2169. if (memory[position].pos == -1) memory[position].pos = mypos;
  2170. #endif
  2171. #ifdef DYNAMIC_ARCH
  2172. if (memory_initialized == 1) {
  2173. LOCK_COMMAND(&alloc_lock);
  2174. if (memory_initialized == 1) {
  2175. if (!gotoblas) gotoblas_dynamic_init();
  2176. memory_initialized = 2;
  2177. }
  2178. UNLOCK_COMMAND(&alloc_lock);
  2179. }
  2180. #endif
  2181. #ifdef DEBUG
  2182. printf("Mapped : %p %3d\n\n",
  2183. (void *)memory[position].addr, position);
  2184. #endif
  2185. return (void *)memory[position].addr;
  2186. error:
  2187. printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
  2188. return NULL;
  2189. }
  2190. void blas_memory_free(void *free_area){
  2191. int position;
  2192. #ifdef DEBUG
  2193. printf("Unmapped Start : %p ...\n", free_area);
  2194. #endif
  2195. position = 0;
  2196. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  2197. LOCK_COMMAND(&alloc_lock);
  2198. #endif
  2199. while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
  2200. position++;
  2201. if (memory[position].addr != free_area) goto error;
  2202. #ifdef DEBUG
  2203. printf(" Position : %d\n", position);
  2204. #endif
  2205. // arm: ensure all writes are finished before other thread takes this memory
  2206. WMB;
  2207. memory[position].used = 0;
  2208. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  2209. UNLOCK_COMMAND(&alloc_lock);
  2210. #endif
  2211. #ifdef DEBUG
  2212. printf("Unmap Succeeded.\n\n");
  2213. #endif
  2214. return;
  2215. error:
  2216. printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
  2217. #ifdef DEBUG
  2218. for (position = 0; position < NUM_BUFFERS; position++)
  2219. printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
  2220. #endif
  2221. #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  2222. UNLOCK_COMMAND(&alloc_lock);
  2223. #endif
  2224. return;
  2225. }
  2226. void *blas_memory_alloc_nolock(int unused) {
  2227. void *map_address;
  2228. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  2229. return map_address;
  2230. }
  /*
   * Hand a buffer back to free(); no table lookup and no locking.
   *
   * map_address: pointer to release (passed unchanged to free()).
   */
  void blas_memory_free_nolock(void * map_address)
  {
    free(map_address);
  }
  2234. void blas_shutdown(void){
  2235. int pos;
  2236. #ifdef SMP
  2237. BLASFUNC(blas_thread_shutdown)();
  2238. #endif
  2239. LOCK_COMMAND(&alloc_lock);
  2240. for (pos = 0; pos < release_pos; pos ++) {
  2241. release_info[pos].func(&release_info[pos]);
  2242. }
  2243. #ifdef SEEK_ADDRESS
  2244. base_address = 0UL;
  2245. #else
  2246. base_address = BASE_ADDRESS;
  2247. #endif
  2248. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  2249. memory[pos].addr = (void *)0;
  2250. memory[pos].used = 0;
  2251. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  2252. memory[pos].pos = -1;
  2253. #endif
  2254. memory[pos].lock = 0;
  2255. }
  2256. UNLOCK_COMMAND(&alloc_lock);
  2257. return;
  2258. }
  2259. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  /* Lock taken by _touch_memory() around its page-by-page write loop.
     It exists only in SMP / USE_LOCKING builds, and its type follows the
     locking primitive selected at build time. */
  #if defined(SMP) || defined(USE_LOCKING)
  #if defined(USE_PTHREAD_LOCK)
  static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
  #elif defined(USE_PTHREAD_SPINLOCK)
  static pthread_spinlock_t init_lock = 0;
  #else
  static BLASULONG init_lock = 0UL;
  #endif
  #endif
  2269. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  2270. void *sa, void *sb, BLASLONG pos) {
  2271. #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
  2272. size_t size;
  2273. BLASULONG buffer;
  2274. size = BUFFER_SIZE - PAGESIZE;
  2275. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  2276. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  2277. if (hot_alloc != 2) {
  2278. #endif
  2279. #if defined(SMP) || defined(USE_LOCKING)
  2280. LOCK_COMMAND(&init_lock);
  2281. #endif
  2282. while (size > 0) {
  2283. *(int *)buffer = size;
  2284. buffer += PAGESIZE;
  2285. size -= PAGESIZE;
  2286. }
  2287. #if defined(SMP) || defined(USE_LOCKING)
  2288. UNLOCK_COMMAND(&init_lock);
  2289. #endif
  2290. size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  2291. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  2292. while (size > 0) {
  2293. *(int *)buffer = size;
  2294. buffer += 64;
  2295. size -= 64;
  2296. }
  2297. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  2298. }
  2299. #endif
  2300. #endif
  2301. }
  #ifdef SMP
  /*
   * Build a chain of blas_num_threads queue entries that each run
   * _touch_memory, then pass the chain to exec_blas().
   *
   * buffer: stored as the `sa` field of the first queue entry only; the other
   *         entries keep whatever blas_queue_init() left in `sa`.
   */
  static void _init_thread_memory(void *buffer) {
    blas_queue_t queue[MAX_CPU_NUMBER];
    int count = 0;

    while (count < blas_num_threads) {
      blas_queue_t *entry = &queue[count];

      blas_queue_init(entry);
      entry->mode    = BLAS_DOUBLE | BLAS_REAL;
      entry->routine = &_touch_memory;
      entry->args    = NULL;
      entry->next    = entry + 1;   /* provisional link; the last one is cut below */

      count++;
    }

    /* Terminate the chain at the last initialized entry */
    queue[count - 1].next = NULL;
    queue[0].sa = buffer;

    exec_blas(count, queue);
  }
  #endif
  2318. static void gotoblas_memory_init(void) {
  2319. void *buffer;
  2320. hot_alloc = 1;
  2321. buffer = (void *)blas_memory_alloc(0);
  2322. #ifdef SMP
  2323. if (blas_cpu_number == 0) blas_get_cpu_number();
  2324. #ifdef SMP_SERVER
  2325. if (blas_server_avail == 0) blas_thread_init();
  2326. #endif
  2327. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  2328. #else
  2329. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  2330. #endif
  2331. blas_memory_free(buffer);
  2332. }
  2333. #endif
  /* Initialization for all functions; gotoblas_init() should be called before main */
  /* Set to 1 at the end of gotoblas_init() and back to 0 by gotoblas_quit();
     both functions check it to avoid running twice. */
  static int gotoblas_initialized = 0;
  /* Defined outside this part of the file; called from gotoblas_init() */
  extern void openblas_read_env();
  2337. void CONSTRUCTOR gotoblas_init(void) {
  2338. if (gotoblas_initialized) return;
  2339. #ifdef SMP
  2340. openblas_fork_handler();
  2341. #endif
  2342. openblas_read_env();
  2343. #ifdef PROFILE
  2344. moncontrol (0);
  2345. #endif
  2346. #ifdef DYNAMIC_ARCH
  2347. gotoblas_dynamic_init();
  2348. #endif
  2349. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  2350. gotoblas_affinity_init();
  2351. #endif
  2352. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  2353. gotoblas_memory_init();
  2354. #endif
  2355. //#if defined(OS_LINUX)
  2356. #if 0
  2357. struct rlimit curlimit;
  2358. if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
  2359. {
  2360. if ( curlimit.rlim_cur != curlimit.rlim_max )
  2361. {
  2362. curlimit.rlim_cur = curlimit.rlim_max;
  2363. setrlimit(RLIMIT_STACK, &curlimit);
  2364. }
  2365. }
  2366. #endif
  2367. #ifdef SMP
  2368. if (blas_cpu_number == 0) blas_get_cpu_number();
  2369. #ifdef SMP_SERVER
  2370. if (blas_server_avail == 0) blas_thread_init();
  2371. #endif
  2372. #endif
  2373. #ifdef FUNCTION_PROFILE
  2374. gotoblas_profile_init();
  2375. #endif
  2376. gotoblas_initialized = 1;
  2377. #ifdef PROFILE
  2378. moncontrol (1);
  2379. #endif
  2380. }
  2381. void DESTRUCTOR gotoblas_quit(void) {
  2382. if (gotoblas_initialized == 0) return;
  2383. blas_shutdown();
  2384. #ifdef PROFILE
  2385. moncontrol (0);
  2386. #endif
  2387. #ifdef FUNCTION_PROFILE
  2388. gotoblas_profile_quit();
  2389. #endif
  2390. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  2391. gotoblas_affinity_quit();
  2392. #endif
  2393. #ifdef DYNAMIC_ARCH
  2394. gotoblas_dynamic_quit();
  2395. #endif
  2396. gotoblas_initialized = 0;
  2397. #ifdef PROFILE
  2398. moncontrol (1);
  2399. #endif
  2400. }
  2401. #if defined(_MSC_VER) && !defined(__clang__)
  2402. BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
  2403. {
  2404. switch (ul_reason_for_call)
  2405. {
  2406. case DLL_PROCESS_ATTACH:
  2407. gotoblas_init();
  2408. break;
  2409. case DLL_THREAD_ATTACH:
  2410. break;
  2411. case DLL_THREAD_DETACH:
  2412. break;
  2413. case DLL_PROCESS_DETACH:
  2414. gotoblas_quit();
  2415. break;
  2416. default:
  2417. break;
  2418. }
  2419. return TRUE;
  2420. }
  2421. /*
  2422. This is to allow static linking.
  2423. Code adapted from Google performance tools:
  2424. https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
  2425. Reference:
  2426. https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
  2427. http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
  2428. */
  /* Termination hook: shuts the library down via gotoblas_quit() and
     reports success (0). Referenced through p_process_term. */
  static int on_process_term(void) {
    gotoblas_quit();
    return 0;
  }
  /* Ask the linker to keep the TLS-used symbol; the symbol name is
     _tls_used on 64-bit targets and __tls_used on 32-bit ones. */
  #ifdef _WIN64
  #pragma comment(linker, "/INCLUDE:_tls_used")
  #else
  #pragma comment(linker, "/INCLUDE:__tls_used")
  #endif
  /* Switch the current const (64-bit) or data (32-bit) section to .CRT$XLB
     for the definition below. */
  #ifdef _WIN64
  #pragma const_seg(".CRT$XLB")
  #else
  #pragma data_seg(".CRT$XLB")
  #endif
  /* Pointer to DllMain, defined while .CRT$XLB is the selected section, so
     that DllMain also runs when the library is linked statically.
     NOTE(review): const_seg is documented to apply to const data, and this
     variable is not const-qualified; confirm it really lands in .CRT$XLB on
     _WIN64. */
  static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
  /* Restore the default section */
  #ifdef _WIN64
  #pragma const_seg()
  #else
  #pragma data_seg()
  #endif
  /* Switch the current const (64-bit) or data (32-bit) section to .CRT$XTU
     for the definition below. */
  #ifdef _WIN64
  #pragma const_seg(".CRT$XTU")
  #else
  #pragma data_seg(".CRT$XTU")
  #endif
  /* Pointer to on_process_term, defined while .CRT$XTU is the selected
     section; presumably the CRT walks this section at process termination
     (TODO confirm).
     NOTE(review): as this variable is not const-qualified, confirm that
     const_seg actually places it in .CRT$XTU on _WIN64. */
  static int(*p_process_term)(void) = on_process_term;
  /* Restore the default section */
  #ifdef _WIN64
  #pragma const_seg()
  #else
  #pragma data_seg()
  #endif
  2461. #endif
  /* Only built for x86 / x86_64 with the PGI C compiler, or with the Sun
     Fortran interface when the C compiler is not Sun's. */
  #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
  /* Don't call me; this is just a workaround for a PGI / Sun bug */
  void gotoblas_dummy_for_PGI(void) {
  /* References to the init/quit routines (the function itself is not meant
     to be called, per the comment above) */
  gotoblas_init();
  gotoblas_quit();
  /* For PGI versions before 19, emit assembler directives that register the
     init/quit routines explicitly. */
  #if __PGIC__ < 19
  #if 0
  /* Disabled variant: append the routine addresses to .ctors / .dtors */
  asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
  #else
  /* Active variant: place calls to the routines in the .init / .fini sections */
  asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
  #endif
  #endif
  }
  #endif
  2478. #endif