You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and be up to 35 characters long.

kernel_exec.py 41 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102
  1. #!/usr/bin/env python3
  2. # coding: utf-8
  3. # Copyright 2019-2021 Huawei Technologies Co., Ltd
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """util"""
  17. import sys
  18. import gc
  19. import inspect
  20. import datetime
  21. import os
  22. import uuid
  23. import logging
  24. import time
  25. import random
  26. import subprocess
  27. import re
  28. import logging
  29. import tvm
  30. from timeit import default_timer as timer
  31. from threading import Thread
  32. from functools import reduce
  33. import numpy as np
  34. import akg
  35. from akg.build_module import help_tiling_level
  36. from akg import backend as cce
  37. import akg.tvm
  38. from akg.tvm import autotvm
  39. from akg.tvm import rpc
  40. from akg.utils import result_analysis as ra_util
  41. from akg.utils import format_transform as ft_util
  42. from akg.utils import custom_tiling as ct_util
  43. from akg.utils import validation_check as vc_util
  44. from akg.utils.dsl_create import TensorUtils
# Route all root-logger output to stdout at INFO level.
sh = logging.StreamHandler(sys.stdout)
logging.getLogger().addHandler(sh)
logging.getLogger().setLevel(logging.INFO)
# RPC server registry: rpc_machine maps server id -> [host, port];
# rpc_lb maps server id -> load-balancing weight (lower is preferred).
rpc_machine = {}
rpc_lb = {}
# Backend selector used by mod_launch_air ("CCE" or "CUDA").
kc_air_mode = "CCE"
PERFORMANCE_TEST_FILE = "PERFORMANCE_TEST_FILE"
BINDS = "binds"
CUDA = "cuda"
CCE = "cce"
RANDOM_SEED_NUM = 20
# Sentinel returned by profiling_analyse when cycle parsing fails.
PROF_ERROR_CODE = 9999999999
# Element bit widths and hardware block sizes used to derive the
# per-element byte sizes below (// 8 converts bits to bytes).
WGT_WIDTH = 16
INP_WIDTH = 16
OUT_WIDTH = 16
BLOCK_IN = 16
BLOCK_OUT = 16
BLOCK_REDUCE = 16
INP_ELEM_BYTES = (BLOCK_IN * BLOCK_REDUCE * INP_WIDTH // 8)
WGT_ELEM_BYTES = (BLOCK_OUT * BLOCK_REDUCE * WGT_WIDTH // 8)
OUT_ELEM_BYTES = (BLOCK_IN * BLOCK_OUT * OUT_WIDTH // 8)
GLB_ELEM_BYTES = (16 * OUT_WIDTH // 8)
def debug_mode(debug_flag):
    """
    Pass to enable tpu debug mode.

    Args:
        debug_flag (int): The debug flag to be passed.

    Returns:
        list of function, the pass to set to build_config(add_lower_pass=tpu.debug_mode(mode)).
    """
    # the number in pass_list such as 0,1,2,3 represents the order of the pass called
    pass_list = []
    if debug_flag == 1:
        # NOTE(review): `ir_pass` is not imported anywhere in this module's
        # visible code, so this branch would raise NameError if reached.
        # Presumably akg.tvm.ir_pass was intended -- confirm before use.
        pass_list.append((0, ir_pass.inject_dma_intrin))
    return pass_list
  80. def func_time_required(func_name):
  81. """Checking the Time Required for Function Running."""
  82. def wrapper(*args, **kwargs):
  83. t0 = time.time()
  84. result = func_name(*args, **kwargs)
  85. t1 = time.time()
  86. logging.info("func_time_required func:%s, running:%lf seconds", func_name.__name__, (t1 - t0))
  87. return result
  88. return wrapper
  89. def create_code(kernel_name, code_path=None, code=None, code_type=CCE):
  90. """
  91. Create cce or cuda file.
  92. Args:
  93. kernel_name: file name.
  94. code_path: file path.
  95. code: code.
  96. code_type: code type.
  97. """
  98. if code_type == CCE:
  99. postfix = ".cce"
  100. elif code_type == CUDA:
  101. postfix = ".cu"
  102. else:
  103. logging.info("the target code type %s is not supported.", code_type)
  104. if not code_path:
  105. code_path = "./"
  106. if code_type == CCE and len(code_path) > 4 and code_path[-4:].lower() == postfix:
  107. real_path = code_path
  108. elif code_type == CUDA and len(code_path) > 3 and code_path[-3:].lower() == postfix:
  109. real_path = code_path
  110. else:
  111. if code_path[-1] == r"/":
  112. real_path = code_path + kernel_name + postfix
  113. else:
  114. real_path = code_path + r"/" + kernel_name + postfix
  115. dir_path = r"/".join(real_path.split(r"/")[:-1])
  116. if not os.path.isdir(dir_path):
  117. os.makedirs(dir_path)
  118. with open(real_path, 'wt') as ss:
  119. ss.write(code)
  120. def gen_name_kernel(kernel, dtype, shapes):
  121. """generate kernel name."""
  122. def _flat_array(srclist, dstlist):
  123. for i in srclist:
  124. if isinstance(i, (list, tuple)):
  125. _flat_array(i, dstlist)
  126. else:
  127. dstlist.append(i)
  128. res = ''
  129. flat = []
  130. _flat_array(shapes, flat)
  131. for s in flat:
  132. res = "%s%s'_'" % (res, s)
  133. res = "%s_%s%s" % (kernel, res, dtype)
  134. return res
def load_rpc_server_info(mode):
    """
    load rpc server host and port info.

    Populates the module-level ``rpc_machine`` (id -> [host, port]) and
    ``rpc_lb`` (id -> load weight) dicts from the JSON file named by the
    RPC_SERVER_INFO_FILE environment variable.

    Args:
        mode (str): string of runtime choose, can set ca aic and rpc.

    Raises:
        Exception: when the required environment variables are missing.
    """
    env_dic = os.environ
    # An explicit RPC_HOST/RPC_PORT pair overrides any server-info file.
    if env_dic.get('RPC_HOST') and env_dic.get('RPC_PORT'):
        return None
    if mode == 'rpc_cloud':
        logging.error("runtime_mode=rpc_cloud must set 1980 host ip and port!")
        raise Exception("ERROR:runtime_mode=rpc_cloud must set 1980 host ip and port!")
    rpc_server_info_config = env_dic.get('RPC_SERVER_INFO_FILE')
    if not rpc_server_info_config:
        logging.error("runtime_mode=rpc must set RPC_SERVER_INFO_FILE for rpc server info config")
        raise Exception("ERROR:runtime_mode=rpc must set RPC_SERVER_INFO_FILE for rpc server info config")
    # load rpc server host and port info from local file.
    import json
    with open(rpc_server_info_config, 'r') as f:
        info = json.load(f)
        for i in info:
            rpc_machine[i] = info[i]
            rpc_lb[i] = 0.0  # every server starts with a neutral load weight
    return None
  159. def dispatch(rank=0):
  160. """Function for lock waiting dispatch handle version 1."""
  161. def _sort_by_value(d):
  162. items = list(d.items())
  163. random.shuffle(items)
  164. items.sort(key=lambda x: x[1])
  165. return [item[0] for item in items]
  166. for k, v in rpc_lb.items():
  167. logging.info("######rpc_lb[%s]=%f", rpc_machine[k][0], v)
  168. lb_list = _sort_by_value(rpc_lb)
  169. if len(lb_list) > rank:
  170. return lb_list[rank]
  171. return lb_list[len(lb_list) - 1]
def commit(remote, weight):
    """Record the observed load ``weight`` for RPC server ``remote`` in ``rpc_lb``."""
    rpc_lb[remote] = weight
  174. @func_time_required
  175. def mod_launch_rpc_worker(mod, args, outputs, host, port, tuning=False):
  176. """internal RPC worker, should be called by mod_launch_rpc_thread."""
  177. logging.info("%s:====start connect to rpc ip: %s, rpc port: %d ",
  178. datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), host, port)
  179. remote = rpc.connect(host, port, session_timeout=300)
  180. logging.info("%s:====connect to rpc ip: %s, rpc port: %d finished ",
  181. datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), host, port)
  182. uuid_str = uuid.uuid4().hex
  183. temp_file_name = "stackvm_%s.o" % uuid_str
  184. mod.save(temp_file_name)
  185. remote.upload(temp_file_name)
  186. remote_mod = remote.load_module(temp_file_name)
  187. ctx = remote.cce()
  188. arg_list = []
  189. for a in args:
  190. arg_list.append(akg.tvm.nd.array(a, ctx))
  191. start_time = timer()
  192. remote_mod(*arg_list)
  193. ctx.sync()
  194. if os.path.exists(temp_file_name):
  195. os.remove(temp_file_name)
  196. out_list = []
  197. for i in outputs:
  198. out = arg_list[len(arg_list) + i if i < 0 else i].asnumpy()
  199. out_list.append(out)
  200. # this time measure is no accurate now, to be improved soon
  201. t = timer() - start_time
  202. if not tuning:
  203. return out_list[0] if len(out_list) == 1 else tuple(out_list)
  204. stat_info = {"run_time": t}
  205. return out_list[0] if len(out_list) == 1 else tuple(out_list), stat_info
def mod_launch_rpc_thread(mode, mod, args, outputs, results, need_retry, retry, tuning=False):
    """internal RPC thread, should be called by mod_launch_rpc_multithread.

    Runs mod_launch_rpc_worker against a server chosen either from the
    environment (RPC_HOST/RPC_PORT) or from the load-balanced pool; stores
    the result in ``results[retry]`` or sets ``need_retry[retry]`` on failure.
    """
    remoteevb = '0'
    host = None
    port = None
    env_dic = os.environ
    if env_dic.get('RPC_HOST') and env_dic.get('RPC_PORT'):
        host = env_dic.get('RPC_HOST')
        port = int(env_dic.get('RPC_PORT'))
    else:
        if mode == 'rpc_cloud':
            logging.error("runtime_mode=rpc_cloud must set 1980 host ip and port!")
            raise Exception("ERROR:runtime_mode=rpc_cloud must set 1980 host ip and port!")
        # Pick the retry-th lightest-loaded server so each retry targets a
        # different machine.
        remoteevb = dispatch(retry)
        host = rpc_machine[remoteevb][0]
        port = rpc_machine[remoteevb][1]
    start_time = timer()
    end_time = 0.0
    logging.debug("rpc ip: %s, rpc port: %d", host, port)
    try:
        out_list = mod_launch_rpc_worker(mod, args, outputs, host, port, tuning=tuning)
        end_time = timer()
        t = end_time - start_time
        if not env_dic.get('RPC_HOST'):
            # Cap the committed weight at 20 so a single slow run does not
            # permanently deprioritize a server.
            commit(remoteevb, 20 if t > 20 else t)
        logging.info("===this round host is %s time is %f", host, (end_time - start_time))
        results[retry] = out_list
    except RuntimeError:
        need_retry[retry] = True
        end_time = timer()
        logging.error("===Failed! this round host is %s time is %f", host, (end_time - start_time))
        if not env_dic.get('RPC_HOST'):
            # Penalize the failed server increasingly with the retry count.
            commit(remoteevb, end_time - start_time + 20 * (retry + 1))
        logging.error("rpc retry error: %d %s", retry, sys.exc_info())
def mod_launch_rpc(mode, mod, args, outputs, tuning=False):
    """
    launch rpc or rpc_cloud module with retry.

    Note:
        To minimize waiting time of struggler RPC servers, we wait for a short timeout and spawn
        a new thread after the timeout.
        In normal case, RPC would complete before the short timeout, so, only one thread will be created.
        When the RPC server is slow, we create multiple threads that run concurrently.
        We wait for the first thread that successfully completes its work and return the result.
        If a thread fails (an exception is raised), we spawn a new thread to retry.
        Newly spawned threads will use different RPC servers.
        We bound the maximum number of threads, i.e. maximum number of retries.
    """
    max_num_threads = 5
    import operator
    # Estimate upload time from the total number of tensor elements so the
    # spawn timeout scales with payload size.
    arg_filter = filter(lambda x: isinstance(x, np.ndarray), args)
    arg_tensor = list(arg_filter)
    tensor_size = reduce(operator.add, [reduce(operator.mul, arg.shape) for arg in arg_tensor])
    expected_upload_speed = 5e6  # presumably elements/second, empirical -- TODO confirm
    expected_upload_time = int(tensor_size / expected_upload_speed)
    timeout_before_spawning_new_thread = 200 + expected_upload_time
    poll_interval = 1
    thread_timeout = 400 + expected_upload_time * 3
    load_rpc_server_info(mode)
    threads = [None] * max_num_threads
    results = [None] * max_num_threads
    need_retry = [None] * max_num_threads
    retried = [False] * max_num_threads
    for thread_index in range(max_num_threads):
        if thread_index > 0:
            logging.error("Thread %d run for %d seconds, spawn a new thread to retry",
                          (thread_index - 1), timeout_before_spawning_new_thread)
        threads[thread_index] = Thread(target=mod_launch_rpc_thread,
                                       args=(mode, mod, args, outputs, results, need_retry, thread_index, tuning))
        # daemonize the thread to prevent long running threads from hanging the whole process
        threads[thread_index].daemon = True
        threads[thread_index].start()
        poll_count = timeout_before_spawning_new_thread // poll_interval
        while poll_count > 0:
            poll_count -= 1
            # wait for the newly created thread, because it is most likely to complete first
            threads[thread_index].join(poll_interval)
            for poll_index in range(thread_index + 1):
                if not threads[poll_index].is_alive() and not need_retry[poll_index]:
                    return results[poll_index]
                if need_retry[poll_index] and not retried[poll_index]:
                    logging.error("Thread %d exit with error, spawn a new thread immediately", poll_index)
                    # Zeroing poll_count exits the wait loop so the outer loop
                    # spawns the next thread right away.
                    poll_count = 0
                    retried[poll_index] = True
    logging.error("All %d threads are created, poll the threads until the first one exits normally, \
or all threads exit abnormally or timeout", max_num_threads)
    poll_count = thread_timeout // poll_interval
    for _ in range(poll_count):
        threads[max_num_threads - 1].join(poll_interval)
        exit_thread_count = 0
        for poll_index in range(max_num_threads):
            if not threads[poll_index].is_alive() and not need_retry[poll_index]:
                return results[poll_index]
            if not threads[poll_index].is_alive():
                exit_thread_count += 1
        if exit_thread_count == max_num_threads:
            logging.error("All %d threads exit abnormally", max_num_threads)
            return None
    logging.error("All %d threads timeout", max_num_threads)
    return None
def profiling_mode_run(mod, args, outputs, tuning, device_id):
    """
    Function for collecting cycle data from device.

    Args:
        mod: CCE Module.
        args: list or tuple of numpy array.
        outputs: list or tuple of output argment index.
        tuning: tuning model.
        device_id: device_id on device.
    """
    ctx = akg.tvm.ndarray.cce(device_id)
    arg_list = []
    for a in args:
        arg_list.append(akg.tvm.nd.array(a, ctx))
    mod(*arg_list)
    ctx.sync()
    out_list = []
    # Parse the device profiling logs for the cycle count of this run.
    cycle = profiling_analyse(device_id)
    for i in outputs:
        # Negative indices address outputs from the end of the argument list.
        out = arg_list[len(arg_list) + i if i < 0 else i].asnumpy()
        out_list.append(out)
    logging.info('=====parsing cycles==============================')
    if cycle != PROF_ERROR_CODE:
        logging.info(cycle)
    else:
        logging.error("OOPS, can't correctly parsing cycles!")
    # NOTE(review): TestUtils is not defined or imported in this module's
    # visible code -- confirm it is provided elsewhere, otherwise this line
    # raises NameError.
    TestUtils.record_cycle(cycle)
    logging.info('=====parsing cycles==============================')
    if tuning:
        return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycle}
    return out_list[0] if len(out_list) == 1 else tuple(out_list)
  336. def profiling_analyse(device_id):
  337. """analyse profiling."""
  338. def exec_cmds_with_pipe(cmd_list):
  339. cmd_num = len(cmd_list)
  340. if cmd_num <= 1:
  341. raise RuntimeError("length of cmd_list should be greater than 1.")
  342. ps = []
  343. for i, cmd in enumerate(cmd_list):
  344. if i == 0:
  345. p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
  346. else:
  347. p = subprocess.Popen(cmd, stdin=ps[-1].stdout, stdout=subprocess.PIPE)
  348. ps.append(p)
  349. for p in ps:
  350. p.wait()
  351. return ps[-1].communicate()
  352. if not isinstance(device_id, int):
  353. raise TypeError("device_id must be an integer.")
  354. try:
  355. public_path = "/var/log/npu/profiling"
  356. cmd_list = [
  357. ["find", public_path, "-iname", "*.log.%d" % device_id, "-printf", "'%T+\t%p\n'"],
  358. ["grep", "JOB"],
  359. ["sort", "-r"],
  360. ["head", "-n10"],
  361. ["awk", "{print $2}"],
  362. ["head", "-n1"],
  363. ]
  364. p = exec_cmds_with_pipe(cmd_list)
  365. for _ in range(5):
  366. if p[0].decode('utf8').strip() == '':
  367. time.sleep(1)
  368. try:
  369. job_file = p[0].decode('utf8').strip().split('/')[-2]
  370. except BaseException:
  371. logging.warning("failed to decode profiling result")
  372. return None
  373. logging.debug("job file is: %s", job_file)
  374. from akg.backend import parsing_profiling_data
  375. return parsing_profiling_data.parsing(public_path + '/' + job_file)
  376. except SyntaxError as e:
  377. logging.error(e)
  378. return PROF_ERROR_CODE
  379. def mod_launch_air(mod, args, outputs):
  380. """launch mod on kc_air."""
  381. if kc_air_mode == "CUDA":
  382. ctx = akg.tvm.ndarray.gpu(0)
  383. else:
  384. ctx = akg.tvm.ndarray.cce(0)
  385. arg_list = []
  386. for a in args:
  387. if isinstance(a, np.ndarray):
  388. arg_list.append(akg.tvm.nd.array(a, ctx))
  389. elif isinstance(a, (list, tuple)):
  390. for aa in a:
  391. if isinstance(aa, np.ndarray):
  392. arg_list.append(akg.tvm.nd.array(aa, ctx))
  393. else:
  394. arg_list.append(aa)
  395. else:
  396. arg_list.append(a)
  397. for retry in range(3):
  398. need_retry = False
  399. try:
  400. mod(*arg_list)
  401. ctx.sync()
  402. out_list = []
  403. if not need_retry:
  404. for i in outputs:
  405. out = arg_list[len(arg_list) + i if i < 0 else i].asnumpy()
  406. out_list.append(out)
  407. return out_list[0] if len(out_list) == 1 else tuple(out_list)
  408. except RuntimeError:
  409. need_retry = True
  410. logging.error("kc_air retry error: %d %s", retry, sys.exc_info())
  411. logging.error("kc_air runtime error, please check!")
  412. return None
@func_time_required
def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None, repeat_time=400):
    """
    unified run CCE kernel api.

    Args:
        mod (str): CCE Module, string of runtime choose, can set ca aic and rpc.
        args (Union[list, tuple]): list or tuple of numpy array.
        outputs (Union[list, tuple]): list or tuple of output argment index.
        tuning (bool): tuning model.
        device_id: device_id on device.
        expect: when mode in ["compile_cloud", "compile_mini"], return it.
        repeat_time: repeat count used when measuring GPU cycles in tuning mode.

    Returns:
        output numpy array, or tuple of numpy array if multi-output.
    """
    gc.collect()
    # CUDA modules run directly on the local GPU context, bypassing the
    # runtime-mode dispatch below.
    if mod.imported_modules[0].type_key == CUDA:
        ctx = akg.tvm.context(CUDA, device_id)
        mod_args = [akg.tvm.nd.array(a, ctx) for a in args]
        mod(*mod_args)
        out_list = [mod_args[len(args) + i if i < 0 else i].asnumpy() for i in outputs]
        if not tuning:
            return out_list[0] if len(out_list) == 1 else tuple(out_list)
        else:
            cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True, repeat_time=repeat_time)
            return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles}
    stat_info = {}
    profiling_mode = get_profiling_mode()
    if profiling_mode:
        return profiling_mode_run(mod, args, outputs, tuning, device_id)
    # NOTE(review): aic_model, get_runtime_mode, get_profiling_mode and
    # get_gpu_cycles are not defined in this module's visible code --
    # presumably provided elsewhere in the file or package; confirm before
    # refactoring.
    mode = get_runtime_mode()
    if mode == 'aic':
        output = aic_model.launch(mod, args, outputs)
        if not tuning:
            return output
        ra_util.get_ticks(stat_info)
        return output, stat_info
    if mode == 'aic_cloud':
        output = aic_model.launch(mod, args, outputs, spec=aic_model.Spec.CLOUD)
        if not tuning:
            return output
        ra_util.get_ticks(stat_info)
        return output, stat_info
    if mode in ('rpc', 'rpc_cloud'):
        return mod_launch_rpc(mode, mod, args, outputs, tuning)
    if mode in ('ca', 'air', 'air_cloud'):
        return mod_launch_air(mod, args, outputs)
    if mode in ("compile_cloud", "compile_mini"):
        # Compile-only modes execute nothing and return the expected data.
        return expect
    if mode in ("csim", "ccesim", "cdiff"):
        from akg.backend.csim import csim_launch
        return csim_launch(args, outputs)
    if mode == "cpu":
        tvm_array = []
        ctx = akg.tvm.context("llvm", 0)
        for _, args_val in enumerate(args):
            tvm_temp = akg.tvm.nd.array(args_val, ctx)
            tvm_array.append(tvm_temp)
        mod(*tvm_array)
        # CPU mode treats the last argument as the single output.
        return tvm_array[-1].asnumpy()
    raise ValueError("mode must be aic, rpc, aic_cloud, ca, compile_cloud, compile_mini, cpu, csim, ccesim or cdiff")
  473. def gen_kernel_name(input_shapes, input_types, op_attrs=None, kernel_name=""):
  474. """generate kernel name."""
  475. dir_max_length = 250
  476. shape_info = ''
  477. for _, (shape, dtype) in enumerate(zip(input_shapes, input_types)):
  478. if isinstance(shape, (list, tuple)) and shape and isinstance(shape[0], (list, tuple)):
  479. for _, tmp_shape in enumerate(shape):
  480. vc_util.check_shape(tmp_shape)
  481. tmp_shape = list(tmp_shape)
  482. str_tmp_shape = [str(tmp) for tmp in tmp_shape]
  483. shape_info = "%s_%s_%s" % (shape_info, dtype, '_'.join(str_tmp_shape))
  484. elif isinstance(shape, akg.tvm.tensor.Tensor):
  485. for tmp_shape in shape.shape:
  486. if isinstance(tmp_shape, akg.tvm.expr.Var):
  487. str_shape = tmp_shape.name
  488. else:
  489. str_shape = str(tmp_shape)
  490. shape_info = "%s_%s_%s" % (shape_info, dtype, '_'.join(str_shape))
  491. else:
  492. vc_util.check_shape(shape)
  493. if isinstance(shape, akg.tvm.expr.Var):
  494. shape = [shape]
  495. shape = list(shape)
  496. str_shape = [str(i) for i in shape]
  497. shape_info = "%s_%s_%s" % (shape_info, dtype, '_'.join(str_shape))
  498. if op_attrs is not None:
  499. for tmp in op_attrs:
  500. if isinstance(tmp, (list, tuple)):
  501. for ele in tmp:
  502. if isinstance(ele, (list, tuple)):
  503. str_tmp = [str(i) for i in ele]
  504. shape_info = shape_info + '_' + '_'.join(str_tmp)
  505. else:
  506. shape_info = shape_info + '_' + str(ele)
  507. elif isinstance(tmp, (int, float)):
  508. shape_info = shape_info + '_' + str(tmp)
  509. elif isinstance(tmp, (str)):
  510. shape_info = shape_info + '_' + tmp
  511. elif isinstance(tmp, (np.ndarray)):
  512. shape = list(tmp.shape)
  513. str_shape = [str(i) for i in shape]
  514. shape_info = shape_info + '_' + '_'.join(str_shape)
  515. kernel_name = kernel_name + shape_info
  516. kernel_name = re.sub(r'[^0-9a-zA-Z]+', '_', kernel_name)
  517. if len(kernel_name) > dir_max_length:
  518. logging.info("Dir name %s exceed maximal length, use first %d char as dir name.", kernel_name, dir_max_length)
  519. kernel_name = kernel_name[:dir_max_length]
  520. return kernel_name
  521. @func_time_required
  522. def op_build_test(op_func, input_shapes, input_types, op_attrs=None, kernel_name="",
  523. attrs=None, log_cce=False, dump_ir=True, dump_code=True,
  524. polyhedral=True, tuning=False):
  525. """
  526. Return module from op_build with given inputs, distinguish tuning mode.
  527. Args:
  528. op_func (function returning an op or (op, [op_vars])): The op build function
  529. input_shapes(iterable of iterable of int): the dim sizes for input for op
  530. input_types (iterable of iterable of str): the dtypes for each input
  531. op_attrs (list or tuple): extra attributes for the op.
  532. kernel_name (str): name of op.
  533. attrs (dict): tiling parameter.
  534. log_cce (bool): False by default.
  535. dump_ir (bool): True by default.
  536. dump_code (bool): False by default.
  537. polyhedral (bool): True by default.
  538. tuning (bool): False by default.
  539. Return:
  540. module.
  541. """
  542. if isinstance(attrs, dict) and 'tuning' in attrs.keys():
  543. kernel_name = kernel_name
  544. else:
  545. kernel_name = gen_kernel_name(input_shapes, input_types, op_attrs, kernel_name)
  546. logging.debug('kernel_name---------- %s', str(kernel_name))
  547. mod = op_build(op_func, input_shapes, input_types, op_attrs, kernel_name,
  548. attrs, log_cce, dump_ir, dump_code,
  549. polyhedral, tuning)
  550. return mod
  551. def recursive_copy(obj):
  552. """
  553. Copy a container object recursively
  554. Args:
  555. obj (list, tuple, dict or object): input container object.
  556. Return:
  557. copied object.
  558. """
  559. if isinstance(obj, list):
  560. return [recursive_copy(it) for it in obj]
  561. if isinstance(obj, tuple):
  562. return tuple([recursive_copy(it) for it in obj])
  563. if isinstance(obj, dict):
  564. copy_obj = dict()
  565. for key in obj:
  566. copy_obj[key] = recursive_copy(obj[key])
  567. return copy_obj
  568. return obj
def gen_inputs_and_shape_params(input_shapes, input_types, inputs, shape_params):
    """
    Generate akg.tvm.placeholder as inputs for op with given input_shapes and input_types.

    Args:
        input_shapes(iterable of iterable of int): the dim sizes for input for op.
        input_types (iterable of iterable of str): the dtypes for each input.
        inputs (list): output list; placeholders/tensors are appended in order.
        shape_params (list): output list; symbolic shape Vars are appended.
    """
    for i, (shape, dtype) in enumerate(zip(input_shapes, input_types)):
        # Case 1: a list of shapes -> one placeholder per sub-shape, grouped
        # into a single nested entry in `inputs`.
        if isinstance(shape, (list, tuple)) and shape and isinstance(shape[0], (list, tuple)):
            tmp_input = []
            for j, tmp_shape in enumerate(shape):
                tmp_input.append(akg.tvm.placeholder(tmp_shape, dtype, "input_%d_%d" % (i + 1, j + 1)))
                for tmp in tmp_shape:
                    if isinstance(tmp, akg.tvm.expr.Var):
                        shape_params.append(tmp)
            inputs.append(tmp_input)
        # Case 2: a symbolic shape (leading Var) -> one placeholder, and every
        # Var dimension becomes a shape parameter.
        elif isinstance(shape, (list, tuple)) and shape and isinstance(shape[0], akg.tvm.expr.Var):
            inputs.append(akg.tvm.placeholder(shape, dtype, "input_%d" % (i + 1)))
            for tmp_shape in shape:
                if isinstance(tmp_shape, akg.tvm.expr.Var):
                    shape_params.append(tmp_shape)
        # Case 3: an existing tensor is used as-is; all of its dimensions are
        # recorded as shape parameters.
        elif isinstance(shape, akg.tvm.tensor.Tensor):
            inputs.append(shape)
            for tmp_shape in shape.shape:
                shape_params.append(tmp_shape)
        # Case 4: a plain static shape.
        else:
            inputs.append(akg.tvm.placeholder(shape, dtype, "input_%d" % (i + 1)))
  598. def gen_attrs_params(op_attrs, attrs_params):
  599. """
  600. Parsing attrs given by op_attrs.
  601. Args:
  602. op_attrs (list or tuple): extra attributes for the op.
  603. attrs_params (list): None by default.
  604. """
  605. for tmp_attr in op_attrs:
  606. if isinstance(tmp_attr, (list, tuple)) and tmp_attr and isinstance(tmp_attr[0], akg.tvm.expr.Var):
  607. for attr_param in tmp_attr:
  608. if isinstance(attr_param, akg.tvm.expr.Var):
  609. attrs_params.append(attr_param)
  610. elif isinstance(tmp_attr, akg.tvm.expr.Var):
  611. attrs_params.append(tmp_attr)
def get_dim_from_func_map(attrs, op_func, args):
    """
    Get tiling parameter from map defined in op_func.

    Args:
        attrs (dict): tiling parameter; may be None.
        op_func (function returning an op or (op, [op_vars])): The op build function.
        args: positional arguments forwarded to a registered dim function.

    Returns:
        dict: attrs with attrs['dim'] filled from the registered map when the
        caller did not provide one.

    Raises:
        RuntimeError: when the registered set_dim_map entry is neither a
            function nor a dict.
    """
    # Only compute a dim when the caller did not already provide one.
    if attrs is None or 'dim' not in attrs or not attrs['dim']:
        dim_info = ""
        if attrs is None:
            attrs = dict()
        if op_func.__name__ in ct_util.set_dim_func_map.keys():
            value = ct_util.set_dim_func_map[op_func.__name__]
            if inspect.isfunction(value):
                dim_info = value(*args)
            elif isinstance(value, dict):
                key = []
                # NOTE(review): input_shapes, input_types and op_attrs are not
                # defined in this function's scope -- this branch raises
                # NameError if reached. They were presumably meant to be
                # unpacked from `args`; confirm against callers before relying
                # on dict-style set_dim maps.
                key.append(ft_util.convert_to_list(input_shapes))
                key.append(ft_util.convert_to_list(input_types))
                if op_attrs is not None:
                    key.append(op_attrs)
                key = str(tuple(key))
                if key in value.keys():
                    dim_info = ct_util.set_dims(value[key])
            else:
                raise RuntimeError("Registered set_dim_map is invalid. Must be a function or a dict!")
        if isinstance(dim_info, (list, tuple)):
            # Some dim functions return (dim_info, key); keep only the dim.
            dim_info = dim_info[0]
        attrs['dim'] = dim_info
    return attrs
  642. def parsing_output(output, attrs, compute_func, sch_tmpl, gpu_binds):
  643. """
  644. Parsing the outputs of op.
  645. Args:
  646. output (iterable of iterable of akg.tvm.tensor): the outputs of op.
  647. attrs (dict): tiling parameter.
  648. compute_func (function): None by default, func for doing compute_inline or other.
  649. sch_tmpl (dict): None by default.
  650. gpu_binds (dict): None by default.
  651. """
  652. if isinstance(output, (list, tuple)):
  653. from inspect import isfunction
  654. new_outputs = []
  655. for elem in output:
  656. if isfunction(elem):
  657. compute_func = elem
  658. elif isinstance(elem, dict):
  659. for key, value in elem.items():
  660. if key not in attrs or not attrs[key]:
  661. attrs[key] = value
  662. elif isinstance(elem, (list, tuple)):
  663. new_outputs += elem
  664. else:
  665. new_outputs.append(elem)
  666. output = new_outputs
  667. elif isinstance(output, dict):
  668. sch_tmpl = output
  669. output = sch_tmpl['output']
  670. gpu_binds = sch_tmpl['binds']
  671. return output, compute_func, sch_tmpl, gpu_binds
  672. def gen_op_var(inputs, output, op_var):
  673. """
  674. Combine inputs and outputs about the op.
  675. Args:
  676. inputs(list): the inputs of op.
  677. output(list): the outputs of op.
  678. op_var (list): inputs and outputs for the op.
  679. """
  680. for xx in inputs:
  681. if isinstance(xx, list):
  682. for x in xx:
  683. op_var.append(x)
  684. else:
  685. op_var.append(xx)
  686. if isinstance(output, (list, tuple)):
  687. op_var = op_var + [i for i in output if TensorUtils.is_output_value(i)]
  688. else:
  689. if TensorUtils.is_output_value(output):
  690. op_var = op_var + [output]
  691. return op_var
  692. def gen_shape_var(attrs_params, shape_params, shape_var):
  693. """
  694. Combine shape of inputs and extra attributes about the op.
  695. Args:
  696. attrs_params(list): shape of inputs for the op
  697. shape_params(list): extra attributes for the op
  698. shape_var (list): shape of inputs and extra attributes for the op.
  699. """
  700. if attrs_params:
  701. [shape_var.append(i) for i in attrs_params if i not in shape_var]
  702. [shape_var.append(i) for i in shape_params if i not in shape_var]
  703. def gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target):
  704. """
  705. Generate tiling parameter.
  706. Args:
  707. op_func (function returning an op or (op, [op_vars])): The op build function.
  708. args (Union[list, tuple]): list or tuple of numpy array.
  709. s (dict): schedule of op.
  710. op_var (list): the akg.tvm.tensor of inputs and outputs for op.
  711. kernel_name (str): name of op.
  712. attrs (dict): tiling parameter.
  713. polyhedral (bool): True by default.
  714. tuning (bool): False by default.
  715. Return:
  716. tiling parameter.
  717. """
  718. set_dim_key = ""
  719. if op_func.__name__ in ct_util.set_dim_func_map.keys():
  720. func_ = ct_util.set_dim_func_map[op_func.__name__]
  721. if inspect.isfunction(func_):
  722. set_dim_key = func_(*args)[1]
  723. elif op_func.__name__ in ct_util.gen_key_func_map.keys():
  724. func_ = ct_util.gen_key_func_map[op_func.__name__]
  725. if inspect.isfunction(func_):
  726. set_dim_key = func_(*args)
  727. with akg.build_config(dump_pass_ir=True):
  728. spaces = akg.lower(s, op_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, tuning=tuning,
  729. target=target)
  730. if set_dim_key == "":
  731. set_dim_key = str(args)
  732. return spaces, set_dim_key
  733. def create_gpu_mod(sch_tmpl, s, op_func, op_var, shape_var, kernel_name, attrs, polyhedral, binds, dump_ir, dump_code,
  734. tuning):
  735. """
  736. Return module for op of gpu.
  737. Args:
  738. sch_tmpl (dict): schedule of op and the others.
  739. s (dict): schedule of op.
  740. op_func (function returning an op or (op, [op_vars])): The op build function.
  741. op_var (list): the akg.tvm.tensor of inputs and outputs for op.
  742. shape_var (list): shape of inputs and extra attributes for the op.
  743. kernel_name (str): name of op.
  744. attrs (dict): tiling parameter.
  745. polyhedral (bool): True by default.
  746. binds (dict): BINDS
  747. dump_ir (bool): True by default.
  748. dump_code (bool): False by default.
  749. tuning (bool): False by default.
  750. Return:
  751. module.
  752. """
  753. if sch_tmpl is not None or (attrs and attrs.get("target", "cce") == "cuda"):
  754. if kernel_name == "":
  755. kernel_name = op_func.__name__ if sch_tmpl is None else sch_tmpl['op_name']
  756. target = CUDA
  757. if sch_tmpl is not None:
  758. if sch_tmpl['target'] != CUDA:
  759. raise ValueError("Only support cuda as target when using schedule template.")
  760. global kc_air_mode
  761. kc_air_mode = "CUDA"
  762. with akg.tvm.target.cuda() as target:
  763. if not tuning:
  764. s = sch_tmpl['schedule'](sch_tmpl['output'])
  765. with akg.tvm.build_config(dump_pass_ir=dump_ir):
  766. mod = akg.build(s, op_var, "cuda", shape_var, name=kernel_name, attrs=attrs,
  767. polyhedral=False, binds=binds)
  768. else:
  769. @autotvm.template
  770. def _autotune_template():
  771. s = sch_tmpl['schedule'](sch_tmpl['output'])
  772. return (s, op_var)
  773. # create autotune task
  774. task = autotvm.task.create(_autotune_template,
  775. args=list(),
  776. target='cuda')
  777. print("task config: ", task.config_space)
  778. # set measure_option
  779. measure_option = autotvm.measure_option(
  780. builder=autotvm.LocalBuilder(),
  781. runner=autotvm.LocalRunner(repeat=5, min_repeat_ms=150, timeout=4)
  782. )
  783. # Begin tuning, log records to file `kernel_name.log`
  784. tuner = autotvm.tuner.RandomTuner(task)
  785. if not os.path.exists(kernel_name + '.log'):
  786. tuner.tune(n_trial=len(task.config_space),
  787. measure_option=measure_option,
  788. callbacks=[autotvm.callback.log_to_file(kernel_name + '.log')])
  789. # query best config
  790. dispatch_context = autotvm.apply_history_best(kernel_name + '.log')
  791. best_config = dispatch_context.query(task.target, task.workload)
  792. print("\nBest config is:")
  793. print(best_config)
  794. # apply best config
  795. with autotvm.apply_history_best(kernel_name + '.log'):
  796. s, op_var = _autotune_template()
  797. mod = akg.build(s, op_var, "cuda", shape_var, name=kernel_name, attrs=attrs,
  798. polyhedral=False, binds=gpu_binds)
  799. else :
  800. with akg.build_config(dump_pass_ir=dump_ir):
  801. mod = akg.build(s, op_var, target, shape_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral,
  802. binds=binds)
  803. if dump_code:
  804. source_code = mod.imported_modules[0].get_source()
  805. create_code(kernel_name, "./", source_code, CUDA)
  806. return mod
def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="",
             attrs=None, log_cce=False, dump_ir=True, dump_code=True,
             polyhedral=True, tuning=False):
    """
    Return module built from op_func with given inputs.

    Args:
        op_func (function returning an op or (op, [op_vars])): The op build function.
        input_shapes (iterable of iterable of int): the dim sizes for input for op.
        input_types (iterable of iterable of str): the dtypes for each input.
        op_attrs (list or tuple): extra attributes for the op.
        kernel_name (str): name of op.
        attrs (dict): tiling parameter.
        log_cce (bool): False by default.
        dump_ir (bool): True by default.
        dump_code (bool): True by default.
        polyhedral (bool): True by default.
        tuning (bool): False by default.

    Return:
        module.
    """
    inputs = []
    shape_params = []  # save all the shape params for dynamic_shape cases
    gen_inputs_and_shape_params(input_shapes, input_types, inputs, shape_params)
    attrs_params = []
    if op_attrs is not None:
        args = inputs + op_attrs
        gen_attrs_params(op_attrs, attrs_params)
    else:
        args = inputs
    # backup inputs because the tensor names may be updated inside op_func
    inputs_backup = recursive_copy(inputs)
    output = op_func(*args)
    # restore inputs to make sure that tensor names are not changed by op_func
    inputs = inputs_backup
    # set dim: look up any registered set_dim function for this op
    attrs = get_dim_from_func_map(attrs, op_func, args)
    compute_func = None  # func which is defined in dsl for doing compute_inline or other
    sch_tmpl = None
    gpu_binds = None
    output, compute_func, sch_tmpl, gpu_binds = parsing_output(output, attrs, compute_func, sch_tmpl, gpu_binds)
    op_var = []
    op_var = gen_op_var(inputs, output, op_var)
    shape_var = []
    gen_shape_var(attrs_params, shape_params, shape_var)
    # A schedule template implies a hand-written gpu schedule: build it directly.
    if sch_tmpl is not None:
        return create_gpu_mod(sch_tmpl, None, op_func, op_var, shape_var, kernel_name, attrs, polyhedral, gpu_binds,
                              dump_ir, dump_code, tuning)
    if isinstance(output, (list, tuple)):
        tmp = []
        for x in list(output):
            if isinstance(x, tuple):
                # tuple entries carry the tensor in slot 0
                tmp.append(x[0].op)
            else:
                tmp.append(x.op)
        s = akg.tvm.create_schedule(tmp)
    else:
        s = akg.tvm.create_schedule(output.op)
    if compute_func is not None:
        # A dsl-supplied compute func replaces the polyhedral scheduling.
        compute_func(s)
        polyhedral = False
    target = CCE
    if attrs and attrs.get("target", "cce") == CUDA:
        target = CUDA
    level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None
    # Tuning mode returns the tiling spaces instead of a built module.
    if tuning or (level is not None and level > help_tiling_level['None']):
        return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target)
    mode = get_runtime_mode()
    if mode == "cpu":
        mod = akg.tvm.build(s, op_var, "llvm")
        if not os.path.isdir("./cpu/ir/"):
            os.makedirs("./cpu/ir/")
        # Dump the lowered IR next to the cpu build artifacts (owner read-only file).
        with os.fdopen(os.open("./cpu/ir/" + kernel_name + ".cc", os.O_WRONLY | os.O_CREAT, 0o400), 'w') as irf:
            irf.write(akg.tvm.lower(s, op_var, shape_var, simple_mode=True))
        return mod
    binds = None if not attrs else attrs.pop(BINDS, None)
    if target == CUDA:
        return create_gpu_mod(None, s, op_func, op_var, shape_var, kernel_name, attrs, polyhedral, binds, dump_ir,
                              dump_code, tuning)
    target = CCE
    with akg.build_config(dump_pass_ir=dump_ir):
        mod = akg.build(s, op_var, target, shape_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, binds=binds)
    source_code = mod.imported_modules[0].get_source()
    if log_cce:
        logging.debug("#################cce code####################")
        logging.debug(source_code)
    if dump_code:
        create_code(kernel_name, "./", source_code, target)
    return mod
  895. def get_runtime_mode():
  896. """get runtime mode."""
  897. env_dic = os.environ
  898. if not env_dic.get('RUNTIME_MODE'):
  899. mode = 'rpc_cloud'
  900. else:
  901. mode = env_dic.get('RUNTIME_MODE')
  902. return mode
  903. def get_profiling_mode():
  904. """get profiling mode."""
  905. env_dic = os.environ
  906. if env_dic.get('PROFILING_MODE') and env_dic.get('PROFILING_MODE').lower() == "true":
  907. return True
  908. return False
  909. def product_is_mini():
  910. """check whether in mini environment."""
  911. mode = get_runtime_mode()
  912. if mode in ('rpc', 'air', 'aic', 'compile_mini'):
  913. return True
  914. return False
  915. def get_available_devices_num():
  916. """get available devives num."""
  917. env_dic = os.environ
  918. try:
  919. return int(env_dic.get('DEVICE_TOTAL_NUM').lower()) if env_dic.get('DEVICE_TOTAL_NUM') else 1
  920. except NameError as e:
  921. logging.error(e)
  922. return 1
  923. def get_device_id():
  924. """get device id."""
  925. env_dic = os.environ
  926. try:
  927. return int(env_dic.get('DEVICE_ID').lower()) if env_dic.get('DEVICE_ID') else 0
  928. except NameError as e:
  929. logging.error(e)
  930. return 0
  931. def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False, repeat_time=400):
  932. "get gpu profiling cycles."
  933. func = tvm.get_global_func('GPUProfilerInit')
  934. func("")
  935. from akg.utils.result_analysis import gpu_profiling
  936. gpu_profiling(mod, *mod_args, repeat_time=repeat_time, device_id=device_id)
  937. func = tvm.get_global_func('GPUProfilerStop')
  938. a = func()
  939. return int(a)
  940. class TestUtils:
  941. """Class for getting cycle and core num."""
  942. @staticmethod
  943. def record_cycle(cycle):
  944. if os.environ.get(PERFORMANCE_TEST_FILE):
  945. result_file = os.environ.get(PERFORMANCE_TEST_FILE)
  946. with open(result_file, "a+") as f:
  947. f.write("{0}\n".format(cycle))
  948. @staticmethod
  949. def record_core(stmt):
  950. """Function for getting performance data from cores."""
  951. def get_core_num():
  952. core_num = 1
  953. if hasattr(stmt, 'attr_key') and stmt.attr_key == 'thread_extent':
  954. core_num = stmt.value
  955. return core_num
  956. if os.environ.get(PERFORMANCE_TEST_FILE):
  957. result_file = os.environ.get(PERFORMANCE_TEST_FILE)
  958. with open(result_file, "a+") as f:
  959. f.write("{0}; ".format(get_core_num()))

AKG(Auto Kernel Generator)对深度神经网络中的算子进行优化,并提供特定模式下的算子自动融合功能。AKG与MindSpore的图算融合功能协同工作,可提升在不同硬件后端上运行网络的性能。