| @@ -170,11 +170,20 @@ int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFL | |||||
| BLASLONG n_count, k_count; | BLASLONG n_count, k_count; | ||||
| #ifndef ALPHA_ONE | #ifndef ALPHA_ONE | ||||
| FLOAT *tmp_c = malloc(sizeof(FLOAT) * m * n); | |||||
| memset(tmp_c, 0, sizeof(FLOAT) * m * n); | |||||
| // make sure each row is 64 bytes aligned | |||||
| BLASLONG cn = (n & 31) ? (n & ~31) + 32 : n; | |||||
| FLOAT *raw_tmp_c; | |||||
| if (k < 32) { | |||||
| // only need to zero buff in this situation | |||||
| raw_tmp_c = (FLOAT *)calloc(1, sizeof(FLOAT) * m * cn + 64); | |||||
| } else { | |||||
| raw_tmp_c = (FLOAT *)malloc(sizeof(FLOAT) * m * cn + 64); | |||||
| } | |||||
| // align buf to 64 byte boundary | |||||
| FLOAT *tmp_c = (FLOAT *)(((uintptr_t) raw_tmp_c + 63) & ~(uintptr_t)63); | |||||
| ptr_c = tmp_c; | ptr_c = tmp_c; | ||||
| BLASLONG ldc_o = ldc; | BLASLONG ldc_o = ldc; | ||||
| ldc = n; | |||||
| ldc = cn; | |||||
| #endif | #endif | ||||
| IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); | IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); | ||||
| IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); | IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); | ||||
| @@ -515,7 +524,7 @@ int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFL | |||||
| MASK_APLPHA_STORE(0); | MASK_APLPHA_STORE(0); | ||||
| } | } | ||||
| } | } | ||||
| free(tmp_c); | |||||
| free(raw_tmp_c); | |||||
| #endif | #endif | ||||
| return 0; | return 0; | ||||
| } | } | ||||