You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

index.html 36 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986
  1. <!doctype html>
  2. <html lang="en" class="no-js">
  3. <head>
  4. <meta charset="utf-8">
  5. <meta name="viewport" content="width=device-width,initial-scale=1">
  6. <link rel="canonical" href="https://openblas.net/docs/developers/">
  7. <link rel="prev" href="../extensions/">
  8. <link rel="next" href="../build_system/">
  9. <link rel="icon" href="../logo.svg">
  10. <meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.14">
  11. <title>Developer manual - OpenBLAS</title>
  12. <link rel="stylesheet" href="../assets/stylesheets/main.342714a4.min.css">
  13. <link rel="stylesheet" href="../assets/stylesheets/palette.06af60db.min.css">
  14. <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  15. <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
  16. <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
  17. <script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
  18. </head>
  19. <body dir="ltr" data-md-color-scheme="slate" data-md-color-primary="blue-grey" data-md-color-accent="indigo">
  20. <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
  21. <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
  22. <label class="md-overlay" for="__drawer"></label>
  23. <div data-md-component="skip">
  24. <a href="#developer-manual" class="md-skip">
  25. Skip to content
  26. </a>
  27. </div>
  28. <div data-md-component="announce">
  29. </div>
  30. <header class="md-header md-header--shadow" data-md-component="header">
  31. <nav class="md-header__inner md-grid" aria-label="Header">
  32. <a href=".." title="OpenBLAS" class="md-header__button md-logo" aria-label="OpenBLAS" data-md-component="logo">
  33. <img src="../logo.svg" alt="logo">
  34. </a>
  35. <label class="md-header__button md-icon" for="__drawer">
  36. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
  37. </label>
  38. <div class="md-header__title" data-md-component="header-title">
  39. <div class="md-header__ellipsis">
  40. <div class="md-header__topic">
  41. <span class="md-ellipsis">
  42. OpenBLAS
  43. </span>
  44. </div>
  45. <div class="md-header__topic" data-md-component="header-topic">
  46. <span class="md-ellipsis">
  47. Developer manual
  48. </span>
  49. </div>
  50. </div>
  51. </div>
  52. <form class="md-header__option" data-md-component="palette">
  53. <input class="md-option" data-md-color-media="" data-md-color-scheme="slate" data-md-color-primary="blue-grey" data-md-color-accent="indigo" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_0">
  54. <label class="md-header__button md-icon" title="Switch to light mode" for="__palette_1" hidden>
  55. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
  56. </label>
  57. <input class="md-option" data-md-color-media="" data-md-color-scheme="default" data-md-color-primary="blue-grey" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_1">
  58. <label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_0" hidden>
  59. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
  60. </label>
  61. </form>
  62. <script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
  63. <label class="md-header__button md-icon" for="__search">
  64. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
  65. </label>
  66. <div class="md-search" data-md-component="search" role="dialog">
  67. <label class="md-search__overlay" for="__search"></label>
  68. <div class="md-search__inner" role="search">
  69. <form class="md-search__form" name="search">
  70. <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
  71. <label class="md-search__icon md-icon" for="__search">
  72. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
  73. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
  74. </label>
  75. <nav class="md-search__options" aria-label="Search">
  76. <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
  77. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
  78. </button>
  79. </nav>
  80. </form>
  81. <div class="md-search__output">
  82. <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
  83. <div class="md-search-result" data-md-component="search-result">
  84. <div class="md-search-result__meta">
  85. Initializing search
  86. </div>
  87. <ol class="md-search-result__list" role="presentation"></ol>
  88. </div>
  89. </div>
  90. </div>
  91. </div>
  92. </div>
  93. <div class="md-header__source">
  94. <a href="https://github.com/OpenMathLib/OpenBLAS" title="Go to repository" class="md-source" data-md-component="source">
  95. <div class="md-source__icon md-icon">
  96. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81"/></svg>
  97. </div>
  98. <div class="md-source__repository">
  99. GitHub
  100. </div>
  101. </a>
  102. </div>
  103. </nav>
  104. </header>
  105. <div class="md-container" data-md-component="container">
  106. <main class="md-main" data-md-component="main">
  107. <div class="md-main__inner md-grid">
  108. <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
  109. <div class="md-sidebar__scrollwrap">
  110. <div class="md-sidebar__inner">
  111. <nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
  112. <label class="md-nav__title" for="__drawer">
  113. <a href=".." title="OpenBLAS" class="md-nav__button md-logo" aria-label="OpenBLAS" data-md-component="logo">
  114. <img src="../logo.svg" alt="logo">
  115. </a>
  116. OpenBLAS
  117. </label>
  118. <div class="md-nav__source">
  119. <a href="https://github.com/OpenMathLib/OpenBLAS" title="Go to repository" class="md-source" data-md-component="source">
  120. <div class="md-source__icon md-icon">
  121. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81"/></svg>
  122. </div>
  123. <div class="md-source__repository">
  124. GitHub
  125. </div>
  126. </a>
  127. </div>
  128. <ul class="md-nav__list" data-md-scrollfix>
  129. <li class="md-nav__item">
  130. <a href=".." class="md-nav__link">
  131. <span class="md-ellipsis">
  132. Home
  133. </span>
  134. </a>
  135. </li>
  136. <li class="md-nav__item">
  137. <a href="../install/" class="md-nav__link">
  138. <span class="md-ellipsis">
  139. Install OpenBLAS
  140. </span>
  141. </a>
  142. </li>
  143. <li class="md-nav__item">
  144. <a href="../user_manual/" class="md-nav__link">
  145. <span class="md-ellipsis">
  146. User manual
  147. </span>
  148. </a>
  149. </li>
  150. <li class="md-nav__item">
  151. <a href="../extensions/" class="md-nav__link">
  152. <span class="md-ellipsis">
  153. Extensions
  154. </span>
  155. </a>
  156. </li>
  157. <li class="md-nav__item md-nav__item--active">
  158. <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
  159. <label class="md-nav__link md-nav__link--active" for="__toc">
  160. <span class="md-ellipsis">
  161. Developer manual
  162. </span>
  163. <span class="md-nav__icon md-icon"></span>
  164. </label>
  165. <a href="./" class="md-nav__link md-nav__link--active">
  166. <span class="md-ellipsis">
  167. Developer manual
  168. </span>
  169. </a>
  170. <nav class="md-nav md-nav--secondary" aria-label="Table of contents">
  171. <label class="md-nav__title" for="__toc">
  172. <span class="md-nav__icon md-icon"></span>
  173. Table of contents
  174. </label>
  175. <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
  176. <li class="md-nav__item">
  177. <a href="#source-code-layout" class="md-nav__link">
  178. <span class="md-ellipsis">
  179. Source code layout
  180. </span>
  181. </a>
  182. </li>
  183. <li class="md-nav__item">
  184. <a href="#optimizing-gemm-for-a-given-hardware" class="md-nav__link">
  185. <span class="md-ellipsis">
  186. Optimizing GEMM for a given hardware
  187. </span>
  188. </a>
  189. </li>
  190. <li class="md-nav__item">
  191. <a href="#running-openblas-tests" class="md-nav__link">
  192. <span class="md-ellipsis">
  193. Running OpenBLAS tests
  194. </span>
  195. </a>
  196. </li>
  197. <li class="md-nav__item">
  198. <a href="#benchmarking" class="md-nav__link">
  199. <span class="md-ellipsis">
  200. Benchmarking
  201. </span>
  202. </a>
  203. </li>
  204. <li class="md-nav__item">
  205. <a href="#adding-autodetection-support-for-a-new-revision-or-variant-of-a-supported-cpu" class="md-nav__link">
  206. <span class="md-ellipsis">
  207. Adding autodetection support for a new revision or variant of a supported CPU
  208. </span>
  209. </a>
  210. </li>
  211. <li class="md-nav__item">
  212. <a href="#adding-dedicated-support-for-a-new-cpu-model" class="md-nav__link">
  213. <span class="md-ellipsis">
  214. Adding dedicated support for a new CPU model
  215. </span>
  216. </a>
  217. </li>
  218. <li class="md-nav__item">
  219. <a href="#adding-support-for-an-entirely-new-architecture" class="md-nav__link">
  220. <span class="md-ellipsis">
  221. Adding support for an entirely new architecture
  222. </span>
  223. </a>
  224. </li>
  225. </ul>
  226. </nav>
  227. </li>
  228. <li class="md-nav__item">
  229. <a href="../build_system/" class="md-nav__link">
  230. <span class="md-ellipsis">
  231. Build system
  232. </span>
  233. </a>
  234. </li>
  235. <li class="md-nav__item">
  236. <a href="../runtime_variables/" class="md-nav__link">
  237. <span class="md-ellipsis">
  238. Runtime variables
  239. </span>
  240. </a>
  241. </li>
  242. <li class="md-nav__item">
  243. <a href="../distributing/" class="md-nav__link">
  244. <span class="md-ellipsis">
  245. Redistributing OpenBLAS
  246. </span>
  247. </a>
  248. </li>
  249. <li class="md-nav__item">
  250. <a href="../ci/" class="md-nav__link">
  251. <span class="md-ellipsis">
  252. CI jobs
  253. </span>
  254. </a>
  255. </li>
  256. <li class="md-nav__item">
  257. <a href="../about/" class="md-nav__link">
  258. <span class="md-ellipsis">
  259. About
  260. </span>
  261. </a>
  262. </li>
  263. <li class="md-nav__item">
  264. <a href="../faq/" class="md-nav__link">
  265. <span class="md-ellipsis">
  266. FAQ
  267. </span>
  268. </a>
  269. </li>
  270. </ul>
  271. </nav>
  272. </div>
  273. </div>
  274. </div>
  275. <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
  276. <div class="md-sidebar__scrollwrap">
  277. <div class="md-sidebar__inner">
  278. <nav class="md-nav md-nav--secondary" aria-label="Table of contents">
  279. <label class="md-nav__title" for="__toc">
  280. <span class="md-nav__icon md-icon"></span>
  281. Table of contents
  282. </label>
  283. <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
  284. <li class="md-nav__item">
  285. <a href="#source-code-layout" class="md-nav__link">
  286. <span class="md-ellipsis">
  287. Source code layout
  288. </span>
  289. </a>
  290. </li>
  291. <li class="md-nav__item">
  292. <a href="#optimizing-gemm-for-a-given-hardware" class="md-nav__link">
  293. <span class="md-ellipsis">
  294. Optimizing GEMM for a given hardware
  295. </span>
  296. </a>
  297. </li>
  298. <li class="md-nav__item">
  299. <a href="#running-openblas-tests" class="md-nav__link">
  300. <span class="md-ellipsis">
  301. Running OpenBLAS tests
  302. </span>
  303. </a>
  304. </li>
  305. <li class="md-nav__item">
  306. <a href="#benchmarking" class="md-nav__link">
  307. <span class="md-ellipsis">
  308. Benchmarking
  309. </span>
  310. </a>
  311. </li>
  312. <li class="md-nav__item">
  313. <a href="#adding-autodetection-support-for-a-new-revision-or-variant-of-a-supported-cpu" class="md-nav__link">
  314. <span class="md-ellipsis">
  315. Adding autodetection support for a new revision or variant of a supported CPU
  316. </span>
  317. </a>
  318. </li>
  319. <li class="md-nav__item">
  320. <a href="#adding-dedicated-support-for-a-new-cpu-model" class="md-nav__link">
  321. <span class="md-ellipsis">
  322. Adding dedicated support for a new CPU model
  323. </span>
  324. </a>
  325. </li>
  326. <li class="md-nav__item">
  327. <a href="#adding-support-for-an-entirely-new-architecture" class="md-nav__link">
  328. <span class="md-ellipsis">
  329. Adding support for an entirely new architecture
  330. </span>
  331. </a>
  332. </li>
  333. </ul>
  334. </nav>
  335. </div>
  336. </div>
  337. </div>
  338. <div class="md-content" data-md-component="content">
  339. <article class="md-content__inner md-typeset">
  340. <h1 id="developer-manual">Developer manual</h1>
  341. <h2 id="source-code-layout">Source code layout</h2>
  342. <div class="highlight"><pre><span></span><code>OpenBLAS/
  343. ├── benchmark Benchmark codes for BLAS
  344. ├── cmake CMakefiles
  345. ├── ctest Test codes for CBLAS interfaces
  346. ├── driver Implemented in C
  347. │   ├── level2
  348. │   ├── level3
  349. │   ├── mapper
  350. │   └── others Memory management, threading, etc
  351. ├── exports Generate shared library
  352. ├── interface Implement BLAS and CBLAS interfaces (calling driver or kernel)
  353. │   ├── lapack
  354. │   └── netlib
  355. ├── kernel Optimized assembly kernels for CPU architectures
  356. │   ├── alpha Original GotoBLAS kernels for DEC Alpha
  357. │   ├── arm ARMV5,V6,V7 kernels (including generic C codes used by other architectures)
  358. │   ├── arm64 ARMV8
  359. │   ├── generic General kernel codes written in plain C, parts used by many architectures.
  360. │   ├── ia64 Original GotoBLAS kernels for Intel Itanium
  361. │   ├── mips
  362. │   ├── mips64
  363. │   ├── power
  364. │   ├── riscv64
  365. │   ├── simd Common code for Universal Intrinsics, used by some x86_64 and arm64 kernels
  366. │   ├── sparc
  367. │   ├── x86
  368. │   ├── x86_64
  369. │   └── zarch
  370. ├── lapack Optimized LAPACK codes (replacing those in regular LAPACK)
  371. │   ├── getf2
  372. │   ├── getrf
  373. │   ├── getrs
  374. │   ├── laswp
  375. │   ├── lauu2
  376. │   ├── lauum
  377. │   ├── potf2
  378. │   ├── potrf
  379. │   ├── trti2
  380. │   ├── trtri
  381. │   └── trtrs
  382. ├── lapack-netlib LAPACK codes from netlib reference implementation
  383. ├── reference BLAS Fortran reference implementation (unused)
  384. ├── relapack Elmar Peise&#39;s recursive LAPACK (implemented on top of regular LAPACK)
  385. ├── test Test codes for BLAS
  386. └── utest Regression test
  387. </code></pre></div>
  388. <p>A call tree for <code>dgemm</code> looks as follows:</p>
  389. <div class="highlight"><pre><span></span><code>interface/gemm.c
  390. driver/level3/level3.c
  391. gemm assembly kernels at kernel/
  392. </code></pre></div>
  393. <p>To find the kernel currently used for a particular supported CPU, please check the corresponding <code>kernel/$(ARCH)/KERNEL.$(CPU)</code> file.</p>
  394. <p>Here is an example for <code>kernel/x86_64/KERNEL.HASWELL</code>:</p>
  395. <div class="highlight"><pre><span></span><code>...
  396. DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c
  397. DGEMMKERNEL = dgemm_kernel_4x8_haswell.S
  398. ...
  399. </code></pre></div>
  400. <p>According to the above <code>KERNEL.HASWELL</code>, the OpenBLAS Haswell dgemm kernel file is <code>dgemm_kernel_4x8_haswell.S</code>.</p>
  401. <h2 id="optimizing-gemm-for-a-given-hardware">Optimizing GEMM for a given hardware</h2>
  402. <div class="admonition abstract">
  403. <p class="admonition-title">Read the Goto paper to understand the algorithm</p>
  404. <p>Goto, Kazushige; van de Geijn, Robert A. (2008).
  405. <a href="https://doi.org/10.1145/1356052.1356053">"Anatomy of High-Performance Matrix Multiplication"</a>.
  406. ACM Transactions on Mathematical Software 34 (3): Article 12</p>
  407. <p>(The above link is available only to ACM members, but this and many related
  408. papers are also available on <a href="https://www.cs.utexas.edu/~flame/web/FLAMEPublications.html">the pages of van de Geijn's FLAME project</a>)</p>
  409. </div>
  410. <p>The <code>driver/level3/level3.c</code> is the implementation of Goto's algorithm.
  411. Meanwhile, you can look at <code>kernel/generic/gemmkernel_2x2.c</code>, which is a naive
  412. <code>2x2</code> register blocking <code>gemm</code> kernel in C. Then:</p>
  413. <ul>
  414. <li>Write optimized assembly kernels. Consider instruction pipeline, available registers, memory/cache access.</li>
  415. <li>Tune cache block sizes (<code>Mc</code>, <code>Kc</code>, and <code>Nc</code>)</li>
  416. </ul>
  417. <p>Note that not all of the CPU-specific parameters in <code>param.h</code> are actively used in algorithms.
  418. <code>DNUMOPT</code> only appears as a scale factor in profiling output of the level3 <code>syrk</code> interface code,
  419. while its counterpart <code>SNUMOPT</code> (aliased as <code>NUMOPT</code> in <code>common.h</code>) is not used anywhere at all. </p>
  420. <p><code>SYMV_P</code> is only used in the generic kernels for the <code>symv</code> and <code>chemv</code>/<code>zhemv</code> functions -
  421. at least some of those are usually overridden by CPU-specific implementations, so if you start
  422. by cloning the existing implementation for a related CPU you need to check its <code>KERNEL</code> file
  423. to see if tuning <code>SYMV_P</code> would have any effect at all.</p>
  424. <p><code>GEMV_UNROLL</code> is only used by some older x86-64 kernels, so not all sections in <code>param.h</code> define it.
  425. Similarly, not all of the CPU parameters like L2 or L3 cache sizes are necessarily used in current
  426. kernels for a given model - by all indications the CPU identification code was imported from some
  427. other project originally.</p>
  428. <h2 id="running-openblas-tests">Running OpenBLAS tests</h2>
  429. <p>We use tests for Netlib BLAS, CBLAS, and LAPACK. In addition, we use
  430. OpenBLAS-specific regression tests. They can be run with Make:</p>
  431. <ul>
  432. <li><code>make -C test</code> for BLAS tests</li>
  433. <li><code>make -C ctest</code> for CBLAS tests</li>
  434. <li><code>make -C utest</code> for OpenBLAS regression tests</li>
  435. <li><code>make lapack-test</code> for LAPACK tests</li>
  436. </ul>
  437. <p>We also use the <a href="https://github.com/xianyi/BLAS-Tester">BLAS-Tester</a> tests for regression testing.
  438. It is basically the ATLAS test suite adapted for building with OpenBLAS.</p>
  439. <p>The project makes use of several Continuous Integration (CI) services
  440. conveniently interfaced with GitHub to automatically run tests on a number of
  441. platforms and build configurations.</p>
  442. <p>Also note that the test suites included with "numerically heavy" projects like
  443. Julia, NumPy, SciPy, Octave or QuantumEspresso can be used for regression
  444. testing, when those projects are built such that they use OpenBLAS.</p>
  445. <h2 id="benchmarking">Benchmarking</h2>
  446. <p>A number of benchmarking methods are used by OpenBLAS:</p>
  447. <ul>
  448. <li>Several simple C benchmarks for performance testing individual BLAS functions
  449. are available in the <code>benchmark</code> folder. They can be run locally through the
  450. <code>Makefile</code> in that directory. And the <code>benchmark/scripts</code> subdirectory
  451. contains similar benchmarks that use OpenBLAS via NumPy, SciPy, Octave and R.</li>
  452. <li>On pull requests, a representative set of functions is tested for performance
  453. regressions with Codspeed; results can be viewed at
  454. <a href="https://codspeed.io/OpenMathLib/OpenBLAS">https://codspeed.io/OpenMathLib/OpenBLAS</a>.</li>
  455. <li>The <a href="https://github.com/OpenMathLib/BLAS-Benchmarks">OpenMathLib/BLAS-Benchmarks</a> repository
  456. contains an <a href="https://github.com/airspeed-velocity/asv/">Airspeed Velocity</a>-based benchmark
  457. suite which is run on several CPU architectures in cron jobs. Results are published
  458. to a dashboard: <a href="http://www.openmathlib.org/BLAS-Benchmarks/">http://www.openmathlib.org/BLAS-Benchmarks/</a>.</li>
  459. </ul>
  460. <p>Benchmarking code for BLAS libraries, and specific performance analysis results, can be found
  461. in a number of places. For example:</p>
  462. <ul>
  463. <li><a href="https://github.com/RoyiAvital/MatlabJuliaMatrixOperationsBenchmark">MatlabJuliaMatrixOperationsBenchmark</a>
  464. (various matrix operations in Julia and Matlab)</li>
  465. <li><a href="https://github.com/mmperf/mmperf/">mmperf/mmperf</a> (single-core matrix multiplication)</li>
  466. </ul>
  467. <h2 id="adding-autodetection-support-for-a-new-revision-or-variant-of-a-supported-cpu">Adding autodetection support for a new revision or variant of a supported CPU</h2>
  468. <p>Especially relevant for x86-64, a new CPU model may be a "refresh" (die shrink and/or different number of cores) within an existing
  469. model family without significant changes to its instruction set (e.g., Intel Skylake and Kaby Lake still are fundamentally the same architecture as Haswell,
  470. low end Goldmont etc. are Nehalem). In this case, compilation with the appropriate older <code>TARGET</code> will already lead to a satisfactory build.</p>
  471. <p>To achieve autodetection of the new model, its CPUID (or an equivalent identifier) needs to be added in the <code>cpuid_&lt;architecture&gt;.c</code>
  472. relevant for its general architecture, with the returned name for the new type set appropriately. For x86, which has the most complex
  473. <code>cpuid</code> file, there are two functions that need to be edited: <code>get_cpuname()</code> to return, e.g., <code>CPUTYPE_HASWELL</code> and <code>get_corename()</code> for the (broader)
  474. core family returning, e.g., <code>CORE_HASWELL</code>.<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>
  475. <p>For architectures where <code>DYNAMIC_ARCH</code> builds are supported, a similar but simpler code section for the corresponding
  476. runtime detection of the CPU exists in <code>driver/others/dynamic.c</code> (for x86), and <code>driver/others/dynamic_&lt;arch&gt;.c</code> for other architectures.
  477. Note that for x86 the CPUID is compared after splitting it into its family, extended family, model and extended model parts, so the single decimal
  478. number returned by Linux in <code>/proc/cpuinfo</code> for the model has to be converted back to hexadecimal before splitting into its constituent
  479. digits. For example, decimal <code>142</code> is hexadecimal <code>0x8E</code>, which translates to extended model 8, model 14 (<code>0xE</code>).</p>
  480. <h2 id="adding-dedicated-support-for-a-new-cpu-model">Adding dedicated support for a new CPU model</h2>
  481. <p>Usually it will be possible to start from an existing model, clone its <code>KERNEL</code> configuration file to the new name to use for this
  482. <code>TARGET</code> and eventually replace individual kernels with versions better suited for peculiarities of the new CPU model.
  483. In addition, it is necessary to add (or clone at first) the corresponding section of <code>GEMM_UNROLL</code> parameters in the top-level <code>param.h</code>,
  484. and possibly to add definitions such as <code>USE_TRMM</code> (governing whether <code>TRMM</code> functions use the respective <code>GEMM</code> kernel or a separate source file)
  485. to the <code>Makefile</code>s (and <code>CMakeLists.txt</code>) in the kernel directory. The new CPU name needs to be added to <code>TargetList.txt</code>,
  486. and the CPU auto-detection code used by the <code>getarch</code> helper program - contained in
  487. the <code>cpuid_&lt;architecture&gt;.c</code> file - needs to be amended to include the CPUID (or equivalent) information processing required (see preceding section).</p>
  488. <h2 id="adding-support-for-an-entirely-new-architecture">Adding support for an entirely new architecture</h2>
  489. <p>This endeavour is best started by cloning the entire support structure for 32-bit ARM, and within that the ARMv5 CPU in particular,
  490. as this is implemented through plain C kernels only. An example providing a convenient "shopping list" can be seen in pull request
  491. <a href="https://github.com/OpenMathLib/OpenBLAS/pull/1526">#1526</a>.</p>
  492. <div class="footnote">
  493. <hr />
  494. <ol>
  495. <li id="fn:1">
  496. <p>This information ends up in the <code>Makefile.conf</code> and <code>config.h</code> files generated by <code>getarch</code>. Failure to
  497. set either will typically lead to a missing definition of the <code>GEMM_UNROLL</code> parameters later in the build,
  498. as <code>getarch_2nd</code> will be unable to find a matching parameter section in <code>param.h</code>.&#160;<a class="footnote-backref" href="#fnref:1" title="Jump back to footnote 1 in the text">&#8617;</a></p>
  499. </li>
  500. </ol>
  501. </div>
  502. <aside class="md-source-file">
  503. <span class="md-source-file__fact">
  504. <span class="md-icon" title="Last update">
  505. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1zM12.5 7v5.2l4 2.4-1 1L11 13V7zM11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2z"/></svg>
  506. </span>
  507. <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="June 30, 2024 15:50:01 UTC">June 30, 2024</span>
  508. </span>
  509. <span class="md-source-file__fact">
  510. <span class="md-icon" title="Created">
  511. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M14.47 15.08 11 13V7h1.5v5.25l3.08 1.83c-.41.28-.79.62-1.11 1m-1.39 4.84c-.36.05-.71.08-1.08.08-4.42 0-8-3.58-8-8s3.58-8 8-8 8 3.58 8 8c0 .37-.03.72-.08 1.08.69.1 1.33.32 1.92.64.1-.56.16-1.13.16-1.72 0-5.5-4.5-10-10-10S2 6.5 2 12s4.47 10 10 10c.59 0 1.16-.06 1.72-.16-.32-.59-.54-1.23-.64-1.92M18 15v3h-3v2h3v3h2v-3h3v-2h-3v-3z"/></svg>
  512. </span>
  513. <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date" title="August 4, 2023 07:21:01 UTC">August 4, 2023</span>
  514. </span>
  515. </aside>
  516. </article>
  517. </div>
  518. <script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
  519. </div>
  520. </main>
  521. <footer class="md-footer">
  522. <div class="md-footer-meta md-typeset">
  523. <div class="md-footer-meta__inner md-grid">
  524. <div class="md-copyright">
  525. <div class="md-copyright__highlight">
  526. Copyright &copy; 2012- OpenBLAS contributors
  527. </div>
  528. Made with
  529. <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
  530. Material for MkDocs
  531. </a>
  532. </div>
  533. <div class="md-social">
  534. <a href="https://github.com/OpenMathLib/OpenBLAS" target="_blank" rel="noopener" title="github.com" class="md-social__link">
  535. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6m-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3m44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9M244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8M97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1m-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7m32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1m-11.4-14.7c-1.6 1-1.6 3.6 0 5.9s4.3 3.3 5.6 2.3c1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2"/></svg>
  536. </a>
  537. <a href="https://github.com/OpenMathLib/OpenBLAS/blob/develop/LICENSE" target="_blank" rel="noopener" title="github.com" class="md-social__link">
  538. <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9 10a3.04 3.04 0 0 1 3-3 3.04 3.04 0 0 1 3 3 3.04 3.04 0 0 1-3 3 3.04 3.04 0 0 1-3-3m3 9 4 1v-3.08A7.54 7.54 0 0 1 12 18a7.54 7.54 0 0 1-4-1.08V20m4-16a5.78 5.78 0 0 0-4.24 1.74A5.78 5.78 0 0 0 6 10a5.78 5.78 0 0 0 1.76 4.23A5.78 5.78 0 0 0 12 16a5.78 5.78 0 0 0 4.24-1.77A5.78 5.78 0 0 0 18 10a5.78 5.78 0 0 0-1.76-4.26A5.78 5.78 0 0 0 12 4m8 6a8 8 0 0 1-.57 2.8A7.8 7.8 0 0 1 18 15.28V23l-6-2-6 2v-7.72A7.9 7.9 0 0 1 4 10a7.68 7.68 0 0 1 2.33-5.64A7.73 7.73 0 0 1 12 2a7.73 7.73 0 0 1 5.67 2.36A7.68 7.68 0 0 1 20 10"/></svg>
  539. </a>
  540. </div>
  541. </div>
  542. </div>
  543. </footer>
  544. </div>
  545. <div class="md-dialog" data-md-component="dialog">
  546. <div class="md-dialog__inner md-typeset"></div>
  547. </div>
  548. <script id="__config" type="application/json">{"base": "..", "features": ["header.autohide"], "search": "../assets/javascripts/workers/search.d50fe291.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
  549. <script src="../assets/javascripts/bundle.13a4f30d.min.js"></script>
  550. </body>
  551. </html>

OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.