You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

kernel_exec.py 42 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126
  1. #!/usr/bin/env python3
  2. # coding: utf-8
  3. # Copyright 2019-2021 Huawei Technologies Co., Ltd
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. """util"""
  17. import sys
  18. import gc
  19. import inspect
  20. import datetime
  21. import os
  22. import uuid
  23. import logging
  24. import time
  25. import random
  26. import subprocess
  27. import re
  28. import logging
  29. import tvm
  30. from timeit import default_timer as timer
  31. from threading import Thread
  32. from functools import reduce
  33. import numpy as np
  34. import akg
  35. from akg.build_module import help_tiling_level
  36. from akg import backend as cce
  37. import akg.tvm
  38. from akg.tvm import autotvm
  39. from akg.tvm import rpc
  40. from akg.utils import result_analysis as ra_util
  41. from akg.utils import format_transform as ft_util
  42. from akg.utils import custom_tiling as ct_util
  43. from akg.utils import validation_check as vc_util
  44. from akg.utils.dsl_create import TensorUtils
  45. from akg.backend.parsing_profiling_data import HWTSLogParser
  46. from akg.backend.parsing_profiling_data import validate_and_normalize_path
# Send root-logger output to stdout and enable INFO level for the whole process.
sh = logging.StreamHandler(sys.stdout)
logging.getLogger().addHandler(sh)
logging.getLogger().setLevel(logging.INFO)

# RPC server table and per-server load-balancing weight, both keyed by server name.
# They are filled by load_rpc_server_info and read by dispatch / commit.
rpc_machine = {}
rpc_lb = {}
# Selects the device context in mod_launch_air: "CUDA" uses gpu(0), anything else cce(0).
kc_air_mode = "CCE"
PERFORMANCE_TEST_FILE = "PERFORMANCE_TEST_FILE"
BINDS = "binds"
# Code / target type identifiers (see create_code and mod_launch).
CUDA = "cuda"
CCE = "cce"
RANDOM_SEED_NUM = 20
# Value returned by profiling_analyse when parsing the profiling data fails.
PROF_ERROR_CODE = 9999999999

# NOTE(review): the names suggest bit widths and tile sizes of the hardware
# compute unit - confirm against the hardware documentation.
WGT_WIDTH = 16
INP_WIDTH = 16
OUT_WIDTH = 16
BLOCK_IN = 16
BLOCK_OUT = 16
BLOCK_REDUCE = 16
# Sizes derived from the values above; "// 8" presumably converts bits to bytes.
INP_ELEM_BYTES = (BLOCK_IN * BLOCK_REDUCE * INP_WIDTH // 8)
WGT_ELEM_BYTES = (BLOCK_OUT * BLOCK_REDUCE * WGT_WIDTH // 8)
OUT_ELEM_BYTES = (BLOCK_IN * BLOCK_OUT * OUT_WIDTH // 8)
GLB_ELEM_BYTES = (16 * OUT_WIDTH // 8)
  69. def debug_mode(debug_flag):
  70. """
  71. Pass to enable tpu debug mode.
  72. Args:
  73. debug_flag (int): The dbeug flag to be passed.
  74. Returns:
  75. list of function, the pass to set to build_config(add_lower_pass=tpu.debug_mode(mode)).
  76. """
  77. # the number in pass_list such as 0,1,2,3 represents the order of the pass called
  78. pass_list = []
  79. if debug_flag == 1:
  80. pass_list.append((0, ir_pass.inject_dma_intrin))
  81. return pass_list
  82. def func_time_required(func_name):
  83. """Checking the Time Required for Function Running."""
  84. def wrapper(*args, **kwargs):
  85. t0 = time.time()
  86. result = func_name(*args, **kwargs)
  87. t1 = time.time()
  88. logging.info("func_time_required func:%s, running:%lf seconds", func_name.__name__, (t1 - t0))
  89. return result
  90. return wrapper
  91. def create_code(kernel_name, code_path=None, code=None, code_type=CCE):
  92. """
  93. Create cce or cuda file.
  94. Args:
  95. kernel_name: file name.
  96. code_path: file path.
  97. code: code.
  98. code_type: code type.
  99. """
  100. if code_type == CCE:
  101. postfix = ".cce"
  102. elif code_type == CUDA:
  103. postfix = ".cu"
  104. else:
  105. logging.info("the target code type %s is not supported.", code_type)
  106. if not code_path:
  107. code_path = "./"
  108. if code_type == CCE and len(code_path) > 4 and code_path[-4:].lower() == postfix:
  109. real_path = code_path
  110. elif code_type == CUDA and len(code_path) > 3 and code_path[-3:].lower() == postfix:
  111. real_path = code_path
  112. else:
  113. if code_path[-1] == r"/":
  114. real_path = code_path + kernel_name + postfix
  115. else:
  116. real_path = code_path + r"/" + kernel_name + postfix
  117. dir_path = r"/".join(real_path.split(r"/")[:-1])
  118. if not os.path.isdir(dir_path):
  119. os.makedirs(dir_path)
  120. with open(real_path, 'wt') as ss:
  121. ss.write(code)
  122. def gen_name_kernel(kernel, dtype, shapes):
  123. """generate kernel name."""
  124. def _flat_array(srclist, dstlist):
  125. for i in srclist:
  126. if isinstance(i, (list, tuple)):
  127. _flat_array(i, dstlist)
  128. else:
  129. dstlist.append(i)
  130. res = ''
  131. flat = []
  132. _flat_array(shapes, flat)
  133. for s in flat:
  134. res = "%s%s'_'" % (res, s)
  135. res = "%s_%s%s" % (kernel, res, dtype)
  136. return res
  137. def load_rpc_server_info(mode):
  138. """
  139. load rpc server host and port info.
  140. Args:
  141. mode (str): string of runtime choose, can set ca aic and rpc.
  142. """
  143. env_dic = os.environ
  144. if env_dic.get('RPC_HOST') and env_dic.get('RPC_PORT'):
  145. return None
  146. if mode == 'rpc_cloud':
  147. logging.error("runtime_mode=rpc_cloud must set 1980 host ip and port!")
  148. raise Exception("ERROR:runtime_mode=rpc_cloud must set 1980 host ip and port!")
  149. rpc_server_info_config = env_dic.get('RPC_SERVER_INFO_FILE')
  150. if not rpc_server_info_config:
  151. logging.error("runtime_mode=rpc must set RPC_SERVER_INFO_FILE for rpc server info config")
  152. raise Exception("ERROR:runtime_mode=rpc must set RPC_SERVER_INFO_FILE for rpc server info config")
  153. # load rpc server host and port info from local file.
  154. import json
  155. with open(rpc_server_info_config, 'r') as f:
  156. info = json.load(f)
  157. for i in info:
  158. rpc_machine[i] = info[i]
  159. rpc_lb[i] = 0.0
  160. return None
  161. def dispatch(rank=0):
  162. """Function for lock waiting dispatch handle version 1."""
  163. def _sort_by_value(d):
  164. items = list(d.items())
  165. random.shuffle(items)
  166. items.sort(key=lambda x: x[1])
  167. return [item[0] for item in items]
  168. for k, v in rpc_lb.items():
  169. logging.info("######rpc_lb[%s]=%f", rpc_machine[k][0], v)
  170. lb_list = _sort_by_value(rpc_lb)
  171. if len(lb_list) > rank:
  172. return lb_list[rank]
  173. return lb_list[len(lb_list) - 1]
  174. def commit(remote, weight):
  175. rpc_lb[remote] = weight
  176. @func_time_required
  177. def mod_launch_rpc_worker(mod, args, outputs, host, port, tuning=False):
  178. """internal RPC worker, should be called by mod_launch_rpc_thread."""
  179. logging.info("%s:====start connect to rpc ip: %s, rpc port: %d ",
  180. datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), host, port)
  181. remote = rpc.connect(host, port, session_timeout=300)
  182. logging.info("%s:====connect to rpc ip: %s, rpc port: %d finished ",
  183. datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), host, port)
  184. uuid_str = uuid.uuid4().hex
  185. temp_file_name = "stackvm_%s.o" % uuid_str
  186. mod.save(temp_file_name)
  187. remote.upload(temp_file_name)
  188. remote_mod = remote.load_module(temp_file_name)
  189. ctx = remote.cce()
  190. arg_list = []
  191. for a in args:
  192. arg_list.append(akg.tvm.nd.array(a, ctx))
  193. start_time = timer()
  194. remote_mod(*arg_list)
  195. ctx.sync()
  196. if os.path.exists(temp_file_name):
  197. os.remove(temp_file_name)
  198. out_list = []
  199. for i in outputs:
  200. out = arg_list[len(arg_list) + i if i < 0 else i].asnumpy()
  201. out_list.append(out)
  202. # this time measure is no accurate now, to be improved soon
  203. t = timer() - start_time
  204. if not tuning:
  205. return out_list[0] if len(out_list) == 1 else tuple(out_list)
  206. stat_info = {"run_time": t}
  207. return out_list[0] if len(out_list) == 1 else tuple(out_list), stat_info
  208. def mod_launch_rpc_thread(mode, mod, args, outputs, results, need_retry, retry, tuning=False):
  209. """internal RPC thread, should be called by mod_launch_rpc_multithread."""
  210. remoteevb = '0'
  211. host = None
  212. port = None
  213. env_dic = os.environ
  214. if env_dic.get('RPC_HOST') and env_dic.get('RPC_PORT'):
  215. host = env_dic.get('RPC_HOST')
  216. port = int(env_dic.get('RPC_PORT'))
  217. else:
  218. if mode == 'rpc_cloud':
  219. logging.error("runtime_mode=rpc_cloud must set 1980 host ip and port!")
  220. raise Exception("ERROR:runtime_mode=rpc_cloud must set 1980 host ip and port!")
  221. remoteevb = dispatch(retry)
  222. host = rpc_machine[remoteevb][0]
  223. port = rpc_machine[remoteevb][1]
  224. start_time = timer()
  225. end_time = 0.0
  226. logging.debug("rpc ip: %s, rpc port: %d", host, port)
  227. try:
  228. out_list = mod_launch_rpc_worker(mod, args, outputs, host, port, tuning=tuning)
  229. end_time = timer()
  230. t = end_time - start_time
  231. if not env_dic.get('RPC_HOST'):
  232. commit(remoteevb, 20 if t > 20 else t)
  233. logging.info("===this round host is %s time is %f", host, (end_time - start_time))
  234. results[retry] = out_list
  235. except RuntimeError:
  236. need_retry[retry] = True
  237. end_time = timer()
  238. logging.error("===Failed! this round host is %s time is %f", host, (end_time - start_time))
  239. if not env_dic.get('RPC_HOST'):
  240. commit(remoteevb, end_time - start_time + 20 * (retry + 1))
  241. logging.error("rpc retry error: %d %s", retry, sys.exc_info())
  242. def mod_launch_rpc(mode, mod, args, outputs, tuning=False):
  243. """
  244. launch rpc or rpc_cloud module with retry.
  245. Note:
  246. To minimize waiting time of struggler RPC servers, we wait for a short timeout and spawn
  247. a new thread after the timeout.
  248. In normal case, RPC would complete before the short timeout, so, only one thread will be created.
  249. When the RPC server is slow, we create multiple threads that run concurrently.
  250. We wait for the first thread that successfully completes its work and return the result.
  251. If a thread fails (an exception is raised), we spawn a new thread to retry.
  252. Newly spawned threads will use different RPC servers.
  253. We bound the maximum number of threads, i.e. maximum number of retries.
  254. """
  255. max_num_threads = 5
  256. import operator
  257. arg_filter = filter(lambda x: isinstance(x, np.ndarray), args)
  258. arg_tensor = list(arg_filter)
  259. tensor_size = reduce(operator.add, [reduce(operator.mul, arg.shape) for arg in arg_tensor])
  260. expected_upload_speed = 5e6
  261. expected_upload_time = int(tensor_size / expected_upload_speed)
  262. timeout_before_spawning_new_thread = 200 + expected_upload_time
  263. poll_interval = 1
  264. thread_timeout = 400 + expected_upload_time * 3
  265. load_rpc_server_info(mode)
  266. threads = [None] * max_num_threads
  267. results = [None] * max_num_threads
  268. need_retry = [None] * max_num_threads
  269. retried = [False] * max_num_threads
  270. for thread_index in range(max_num_threads):
  271. if thread_index > 0:
  272. logging.error("Thread %d run for %d seconds, spawn a new thread to retry",
  273. (thread_index - 1), timeout_before_spawning_new_thread)
  274. threads[thread_index] = Thread(target=mod_launch_rpc_thread,
  275. args=(mode, mod, args, outputs, results, need_retry, thread_index, tuning))
  276. # daemonize the thread to prevent long running threads from hanging the whole process
  277. threads[thread_index].daemon = True
  278. threads[thread_index].start()
  279. poll_count = timeout_before_spawning_new_thread // poll_interval
  280. while poll_count > 0:
  281. poll_count -= 1
  282. # wait for the newly created thread, because it is most likely to complete first
  283. threads[thread_index].join(poll_interval)
  284. for poll_index in range(thread_index + 1):
  285. if not threads[poll_index].is_alive() and not need_retry[poll_index]:
  286. return results[poll_index]
  287. if need_retry[poll_index] and not retried[poll_index]:
  288. logging.error("Thread %d exit with error, spawn a new thread immediately", poll_index)
  289. poll_count = 0
  290. retried[poll_index] = True
  291. logging.error("All %d threads are created, poll the threads until the first one exits normally, \
  292. or all threads exit abnormally or timeout", max_num_threads)
  293. poll_count = thread_timeout // poll_interval
  294. for _ in range(poll_count):
  295. threads[max_num_threads - 1].join(poll_interval)
  296. exit_thread_count = 0
  297. for poll_index in range(max_num_threads):
  298. if not threads[poll_index].is_alive() and not need_retry[poll_index]:
  299. return results[poll_index]
  300. if not threads[poll_index].is_alive():
  301. exit_thread_count += 1
  302. if exit_thread_count == max_num_threads:
  303. logging.error("All %d threads exit abnormally", max_num_threads)
  304. return None
  305. logging.error("All %d threads timeout", max_num_threads)
  306. return None
  307. def profiling_mode_run(mod, args, outputs, tuning, device_id):
  308. """
  309. Function for collecting cycle data from device.
  310. Args:
  311. mod: CCE Module.
  312. args: list or tuple of numpy array.
  313. outputs: list or tuple of output argment index.
  314. tuning: tuning model.
  315. device_id: device_id on device.
  316. """
  317. ctx = akg.tvm.ndarray.cce(device_id)
  318. tvm.get_global_func("ascend_start_profiling")(device_id)
  319. arg_list = []
  320. for a in args:
  321. arg_list.append(akg.tvm.nd.array(a, ctx))
  322. time_before_launch = time.time()
  323. mod(*arg_list)
  324. ctx.sync()
  325. tvm.get_global_func("ascend_stop_profiling")()
  326. out_list = []
  327. cycle = profiling_analyse(device_id, time_before_launch)
  328. for i in outputs:
  329. out = arg_list[len(arg_list) + i if i < 0 else i].asnumpy()
  330. out_list.append(out)
  331. logging.info('=====parsing cycles==============================')
  332. if cycle != PROF_ERROR_CODE:
  333. logging.info(cycle)
  334. else:
  335. logging.error("OOPS, can't correctly parsing cycles!")
  336. TestUtils.record_cycle(cycle)
  337. logging.info('=====parsing cycles==============================')
  338. if tuning:
  339. return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycle}
  340. return out_list[0] if len(out_list) == 1 else tuple(out_list)
  341. def profiling_analyse(device_id, time_before_launch):
  342. """analyse profiling."""
  343. def exec_cmds_with_pipe(cmd_list):
  344. cmd_num = len(cmd_list)
  345. if cmd_num <= 1:
  346. raise RuntimeError("length of cmd_list should be greater than 1.")
  347. ps = []
  348. for i, cmd in enumerate(cmd_list):
  349. if i == 0:
  350. p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
  351. else:
  352. p = subprocess.Popen(cmd, stdin=ps[-1].stdout, stdout=subprocess.PIPE)
  353. ps.append(p)
  354. for p in ps:
  355. p.wait()
  356. return ps[-1].communicate()
  357. if not isinstance(device_id, int):
  358. raise TypeError("device_id must be an integer.")
  359. try:
  360. public_path = os.getenv('PROFILING_DIR')
  361. if public_path is None:
  362. raise RuntimeError("Environment PROFILING_DIR not set!")
  363. return None
  364. public_path = validate_and_normalize_path(public_path)
  365. cmd_list = [
  366. ["find", public_path, "-iname", "*.log.%d" % device_id, "-printf", "'%T+\t%p\n'"],
  367. ["grep", "JOB"],
  368. ["sort", "-r"],
  369. ["head", "-n10"],
  370. ["awk", "{print $2}"],
  371. ["head", "-n1"],
  372. ]
  373. p = exec_cmds_with_pipe(cmd_list)
  374. for _ in range(5):
  375. if p[0].decode('utf8').strip() == '':
  376. time.sleep(1)
  377. try:
  378. job_file = p[0].decode('utf8').strip().split('/')[-2]
  379. except BaseException:
  380. logging.warning("failed to decode profiling result")
  381. return None
  382. logging.debug("job file is: %s", job_file)
  383. file_abs_path = public_path + "/" + job_file
  384. file_create_time = os.path.getctime(file_abs_path)
  385. if file_create_time < time_before_launch:
  386. raise RuntimeError("The JOB file is too old")
  387. return None
  388. hwtslog_parser = HWTSLogParser(file_abs_path)
  389. return hwtslog_parser.execute()
  390. except SyntaxError as e:
  391. logging.error(e)
  392. return PROF_ERROR_CODE
  393. def mod_launch_air(mod, args, outputs):
  394. """launch mod on kc_air."""
  395. if kc_air_mode == "CUDA":
  396. ctx = akg.tvm.ndarray.gpu(0)
  397. else:
  398. ctx = akg.tvm.ndarray.cce(0)
  399. arg_list = []
  400. for a in args:
  401. if isinstance(a, np.ndarray):
  402. arg_list.append(akg.tvm.nd.array(a, ctx))
  403. elif isinstance(a, (list, tuple)):
  404. for aa in a:
  405. if isinstance(aa, np.ndarray):
  406. arg_list.append(akg.tvm.nd.array(aa, ctx))
  407. else:
  408. arg_list.append(aa)
  409. else:
  410. arg_list.append(a)
  411. for retry in range(3):
  412. need_retry = False
  413. try:
  414. mod(*arg_list)
  415. ctx.sync()
  416. out_list = []
  417. if not need_retry:
  418. for i in outputs:
  419. out = arg_list[len(arg_list) + i if i < 0 else i].asnumpy()
  420. out_list.append(out)
  421. return out_list[0] if len(out_list) == 1 else tuple(out_list)
  422. except RuntimeError:
  423. need_retry = True
  424. logging.error("kc_air retry error: %d %s", retry, sys.exc_info())
  425. logging.error("kc_air runtime error, please check!")
  426. return None
  427. @func_time_required
  428. def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None, repeat_time=400):
  429. """
  430. unified run CCE kernel api.
  431. Args:
  432. mod (str): CCE Module, string of runtime choose, can set ca aic and rpc.
  433. args (Union[list, tuple]): list or tuple of numpy array.
  434. outputs (Union[list, tuple]): list or tuple of output argment index.
  435. tuning (bool): tuning model.
  436. device_id: device_id on device.
  437. expect: when mode in ["compile_cloud", "compile_mini"], return it.
  438. Returns:
  439. output numpy array, or tuple of numpy array if multi-output.
  440. """
  441. gc.collect()
  442. if mod.imported_modules[0].type_key == CUDA:
  443. ctx = akg.tvm.context(CUDA, device_id)
  444. mod_args = [akg.tvm.nd.array(a, ctx) for a in args]
  445. mod(*mod_args)
  446. out_list = [mod_args[len(args) + i if i < 0 else i].asnumpy() for i in outputs]
  447. if not tuning:
  448. return out_list[0] if len(out_list) == 1 else tuple(out_list)
  449. else:
  450. cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True, repeat_time=repeat_time)
  451. return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles}
  452. stat_info = {}
  453. profiling_mode = get_profiling_mode()
  454. if profiling_mode:
  455. return profiling_mode_run(mod, args, outputs, tuning, device_id)
  456. mode = get_runtime_mode()
  457. if mode == 'aic':
  458. output = aic_model.launch(mod, args, outputs)
  459. if not tuning:
  460. return output
  461. ra_util.get_ticks(stat_info)
  462. return output, stat_info
  463. if mode == 'aic_cloud':
  464. output = aic_model.launch(mod, args, outputs, spec=aic_model.Spec.CLOUD)
  465. if not tuning:
  466. return output
  467. ra_util.get_ticks(stat_info)
  468. return output, stat_info
  469. if mode in ('rpc', 'rpc_cloud'):
  470. return mod_launch_rpc(mode, mod, args, outputs, tuning)
  471. if mode in ('ca', 'air', 'air_cloud'):
  472. return mod_launch_air(mod, args, outputs)
  473. if mode in ("compile_cloud", "compile_mini"):
  474. return expect
  475. if mode in ("csim", "ccesim", "cdiff"):
  476. from akg.backend.csim import csim_launch
  477. return csim_launch(args, outputs)
  478. if mode == "cpu":
  479. tvm_array = []
  480. ctx = akg.tvm.context("llvm", 0)
  481. for _, args_val in enumerate(args):
  482. tvm_temp = akg.tvm.nd.array(args_val, ctx)
  483. tvm_array.append(tvm_temp)
  484. mod(*tvm_array)
  485. return tvm_array[-1].asnumpy()
  486. raise ValueError("mode must be aic, rpc, aic_cloud, ca, compile_cloud, compile_mini, cpu, csim, ccesim or cdiff")
  487. def gen_kernel_name(input_shapes, input_types, op_attrs=None, kernel_name=""):
  488. """generate kernel name."""
  489. dir_max_length = 250
  490. shape_info = ''
  491. for _, (shape, dtype) in enumerate(zip(input_shapes, input_types)):
  492. if isinstance(shape, (list, tuple)) and shape and isinstance(shape[0], (list, tuple)):
  493. for _, tmp_shape in enumerate(shape):
  494. vc_util.check_shape(tmp_shape)
  495. tmp_shape = list(tmp_shape)
  496. str_tmp_shape = [str(tmp) for tmp in tmp_shape]
  497. shape_info = "%s_%s_%s" % (shape_info, dtype, '_'.join(str_tmp_shape))
  498. elif isinstance(shape, akg.tvm.tensor.Tensor):
  499. for tmp_shape in shape.shape:
  500. if isinstance(tmp_shape, akg.tvm.expr.Var):
  501. str_shape = tmp_shape.name
  502. else:
  503. str_shape = str(tmp_shape)
  504. shape_info = "%s_%s_%s" % (shape_info, dtype, '_'.join(str_shape))
  505. else:
  506. vc_util.check_shape(shape)
  507. if isinstance(shape, akg.tvm.expr.Var):
  508. shape = [shape]
  509. shape = list(shape)
  510. str_shape = [str(i) for i in shape]
  511. shape_info = "%s_%s_%s" % (shape_info, dtype, '_'.join(str_shape))
  512. if op_attrs is not None:
  513. for tmp in op_attrs:
  514. if isinstance(tmp, (list, tuple)):
  515. for ele in tmp:
  516. if isinstance(ele, (list, tuple)):
  517. str_tmp = [str(i) for i in ele]
  518. shape_info = shape_info + '_' + '_'.join(str_tmp)
  519. else:
  520. shape_info = shape_info + '_' + str(ele)
  521. elif isinstance(tmp, (int, float)):
  522. shape_info = shape_info + '_' + str(tmp)
  523. elif isinstance(tmp, (str)):
  524. shape_info = shape_info + '_' + tmp
  525. elif isinstance(tmp, (np.ndarray)):
  526. shape = list(tmp.shape)
  527. str_shape = [str(i) for i in shape]
  528. shape_info = shape_info + '_' + '_'.join(str_shape)
  529. kernel_name = kernel_name + shape_info
  530. kernel_name = re.sub(r'[^0-9a-zA-Z]+', '_', kernel_name)
  531. if len(kernel_name) > dir_max_length:
  532. logging.info("Dir name %s exceed maximal length, use first %d char as dir name.", kernel_name, dir_max_length)
  533. kernel_name = kernel_name[:dir_max_length]
  534. return kernel_name
  535. @func_time_required
  536. def op_build_test(op_func, input_shapes, input_types, op_attrs=None, kernel_name="",
  537. attrs=None, log_cce=False, dump_ir=True, dump_code=True,
  538. polyhedral=True, tuning=False):
  539. """
  540. Return module from op_build with given inputs, distinguish tuning mode.
  541. Args:
  542. op_func (function returning an op or (op, [op_vars])): The op build function
  543. input_shapes(iterable of iterable of int): the dim sizes for input for op
  544. input_types (iterable of iterable of str): the dtypes for each input
  545. op_attrs (list or tuple): extra attributes for the op.
  546. kernel_name (str): name of op.
  547. attrs (dict): tiling parameter.
  548. log_cce (bool): False by default.
  549. dump_ir (bool): True by default.
  550. dump_code (bool): False by default.
  551. polyhedral (bool): True by default.
  552. tuning (bool): False by default.
  553. Return:
  554. module.
  555. """
  556. if isinstance(attrs, dict) and 'tuning' in attrs.keys():
  557. kernel_name = kernel_name
  558. else:
  559. kernel_name = gen_kernel_name(input_shapes, input_types, op_attrs, kernel_name)
  560. logging.debug('kernel_name---------- %s', str(kernel_name))
  561. mod = op_build(op_func, input_shapes, input_types, op_attrs, kernel_name,
  562. attrs, log_cce, dump_ir, dump_code,
  563. polyhedral, tuning)
  564. return mod
  565. def recursive_copy(obj):
  566. """
  567. Copy a container object recursively
  568. Args:
  569. obj (list, tuple, dict or object): input container object.
  570. Return:
  571. copied object.
  572. """
  573. if isinstance(obj, list):
  574. return [recursive_copy(it) for it in obj]
  575. if isinstance(obj, tuple):
  576. return tuple([recursive_copy(it) for it in obj])
  577. if isinstance(obj, dict):
  578. copy_obj = dict()
  579. for key in obj:
  580. copy_obj[key] = recursive_copy(obj[key])
  581. return copy_obj
  582. return obj
  583. def gen_inputs_and_shape_params(input_shapes, input_types, inputs, shape_params):
  584. """
  585. Generate akg.tvm.placeholder as inputs for op with given input_shapes and input_types
  586. Args:
  587. input_shapes(iterable of iterable of int): the dim sizes for input for op.
  588. input_types (iterable of iterable of str): the dtypes for each input.
  589. inputs (list): None by default.
  590. shape_params (list): None by default.
  591. """
  592. for i, (shape, dtype) in enumerate(zip(input_shapes, input_types)):
  593. if isinstance(shape, (list, tuple)) and shape and isinstance(shape[0], (list, tuple)):
  594. tmp_input = []
  595. for j, tmp_shape in enumerate(shape):
  596. tmp_input.append(akg.tvm.placeholder(tmp_shape, dtype, "input_%d_%d" % (i + 1, j + 1)))
  597. for tmp in tmp_shape:
  598. if isinstance(tmp, akg.tvm.expr.Var):
  599. shape_params.append(tmp)
  600. inputs.append(tmp_input)
  601. elif isinstance(shape, (list, tuple)) and shape and isinstance(shape[0], akg.tvm.expr.Var):
  602. inputs.append(akg.tvm.placeholder(shape, dtype, "input_%d" % (i + 1)))
  603. for tmp_shape in shape:
  604. if isinstance(tmp_shape, akg.tvm.expr.Var):
  605. shape_params.append(tmp_shape)
  606. elif isinstance(shape, akg.tvm.tensor.Tensor):
  607. inputs.append(shape)
  608. for tmp_shape in shape.shape:
  609. shape_params.append(tmp_shape)
  610. else:
  611. inputs.append(akg.tvm.placeholder(shape, dtype, "input_%d" % (i + 1)))
  612. def gen_attrs_params(op_attrs, attrs_params):
  613. """
  614. Parsing attrs given by op_attrs.
  615. Args:
  616. op_attrs (list or tuple): extra attributes for the op.
  617. attrs_params (list): None by default.
  618. """
  619. for tmp_attr in op_attrs:
  620. if isinstance(tmp_attr, (list, tuple)) and tmp_attr and isinstance(tmp_attr[0], akg.tvm.expr.Var):
  621. for attr_param in tmp_attr:
  622. if isinstance(attr_param, akg.tvm.expr.Var):
  623. attrs_params.append(attr_param)
  624. elif isinstance(tmp_attr, akg.tvm.expr.Var):
  625. attrs_params.append(tmp_attr)
  626. def get_dim_from_func_map(attrs, op_func, args):
  627. """
  628. Get tiling parameter from map defined in op_func.
  629. Args:
  630. attrs (dict): tiling parameter.
  631. op_func (function returning an op or (op, [op_vars])): The op build function.
  632. """
  633. if attrs is None or 'dim' not in attrs or not attrs['dim']:
  634. dim_info = ""
  635. if attrs is None:
  636. attrs = dict()
  637. if op_func.__name__ in ct_util.set_dim_func_map.keys():
  638. value = ct_util.set_dim_func_map[op_func.__name__]
  639. if inspect.isfunction(value):
  640. dim_info = value(*args)
  641. elif isinstance(value, dict):
  642. key = []
  643. key.append(ft_util.convert_to_list(input_shapes))
  644. key.append(ft_util.convert_to_list(input_types))
  645. if op_attrs is not None:
  646. key.append(op_attrs)
  647. key = str(tuple(key))
  648. if key in value.keys():
  649. dim_info = ct_util.set_dims(value[key])
  650. else:
  651. raise RuntimeError("Registered set_dim_map is invalid. Must be a function or a dict!")
  652. if isinstance(dim_info, (list, tuple)):
  653. dim_info = dim_info[0]
  654. attrs['dim'] = dim_info
  655. return attrs
  656. def parsing_output(output, attrs, compute_func, sch_tmpl, gpu_binds):
  657. """
  658. Parsing the outputs of op.
  659. Args:
  660. output (iterable of iterable of akg.tvm.tensor): the outputs of op.
  661. attrs (dict): tiling parameter.
  662. compute_func (function): None by default, func for doing compute_inline or other.
  663. sch_tmpl (dict): None by default.
  664. gpu_binds (dict): None by default.
  665. """
  666. if isinstance(output, (list, tuple)):
  667. from inspect import isfunction
  668. new_outputs = []
  669. for elem in output:
  670. if isfunction(elem):
  671. compute_func = elem
  672. elif isinstance(elem, dict):
  673. for key, value in elem.items():
  674. if key not in attrs or not attrs[key]:
  675. attrs[key] = value
  676. elif isinstance(elem, (list, tuple)):
  677. new_outputs += elem
  678. else:
  679. new_outputs.append(elem)
  680. output = new_outputs
  681. elif isinstance(output, dict):
  682. sch_tmpl = output
  683. output = sch_tmpl['output']
  684. gpu_binds = sch_tmpl['binds']
  685. return output, compute_func, sch_tmpl, gpu_binds
  686. def gen_op_var(inputs, output, op_var):
  687. """
  688. Combine inputs and outputs about the op.
  689. Args:
  690. inputs(list): the inputs of op.
  691. output(list): the outputs of op.
  692. op_var (list): inputs and outputs for the op.
  693. """
  694. for xx in inputs:
  695. if isinstance(xx, list):
  696. for x in xx:
  697. op_var.append(x)
  698. else:
  699. op_var.append(xx)
  700. if isinstance(output, (list, tuple)):
  701. op_var = op_var + [i for i in output if TensorUtils.is_output_value(i)]
  702. else:
  703. if TensorUtils.is_output_value(output):
  704. op_var = op_var + [output]
  705. return op_var
  706. def gen_shape_var(attrs_params, shape_params, shape_var):
  707. """
  708. Combine shape of inputs and extra attributes about the op.
  709. Args:
  710. attrs_params(list): shape of inputs for the op
  711. shape_params(list): extra attributes for the op
  712. shape_var (list): shape of inputs and extra attributes for the op.
  713. """
  714. if attrs_params:
  715. [shape_var.append(i) for i in attrs_params if i not in shape_var]
  716. [shape_var.append(i) for i in shape_params if i not in shape_var]
  717. def gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target):
  718. """
  719. Generate tiling parameter.
  720. Args:
  721. op_func (function returning an op or (op, [op_vars])): The op build function.
  722. args (Union[list, tuple]): list or tuple of numpy array.
  723. s (dict): schedule of op.
  724. op_var (list): the akg.tvm.tensor of inputs and outputs for op.
  725. kernel_name (str): name of op.
  726. attrs (dict): tiling parameter.
  727. polyhedral (bool): True by default.
  728. tuning (bool): False by default.
  729. Return:
  730. tiling parameter.
  731. """
  732. set_dim_key = ""
  733. if op_func.__name__ in ct_util.set_dim_func_map.keys():
  734. func_ = ct_util.set_dim_func_map[op_func.__name__]
  735. if inspect.isfunction(func_):
  736. set_dim_key = func_(*args)[1]
  737. elif op_func.__name__ in ct_util.gen_key_func_map.keys():
  738. func_ = ct_util.gen_key_func_map[op_func.__name__]
  739. if inspect.isfunction(func_):
  740. set_dim_key = func_(*args)
  741. with akg.build_config(dump_pass_ir=True):
  742. spaces = akg.lower(s, op_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, tuning=tuning,
  743. target=target)
  744. if set_dim_key == "":
  745. set_dim_key = str(args)
  746. return spaces, set_dim_key
  747. def create_gpu_mod(sch_tmpl, s, op_func, op_var, shape_var, kernel_name, attrs, polyhedral, binds, dump_ir, dump_code,
  748. tuning):
  749. """
  750. Return module for op of gpu.
  751. Args:
  752. sch_tmpl (dict): schedule of op and the others.
  753. s (dict): schedule of op.
  754. op_func (function returning an op or (op, [op_vars])): The op build function.
  755. op_var (list): the akg.tvm.tensor of inputs and outputs for op.
  756. shape_var (list): shape of inputs and extra attributes for the op.
  757. kernel_name (str): name of op.
  758. attrs (dict): tiling parameter.
  759. polyhedral (bool): True by default.
  760. binds (dict): BINDS
  761. dump_ir (bool): True by default.
  762. dump_code (bool): False by default.
  763. tuning (bool): False by default.
  764. Return:
  765. module.
  766. """
  767. if sch_tmpl is not None or (attrs and attrs.get("target", "cce") == "cuda"):
  768. if kernel_name == "":
  769. kernel_name = op_func.__name__ if sch_tmpl is None else sch_tmpl['op_name']
  770. target = CUDA
  771. if sch_tmpl is not None:
  772. if sch_tmpl['target'] != CUDA:
  773. raise ValueError("Only support cuda as target when using schedule template.")
  774. global kc_air_mode
  775. kc_air_mode = "CUDA"
  776. with akg.tvm.target.cuda() as target:
  777. if not tuning:
  778. s = sch_tmpl['schedule'](sch_tmpl['output'])
  779. with akg.tvm.build_config(dump_pass_ir=dump_ir):
  780. mod = akg.build(s, op_var, "cuda", shape_var, name=kernel_name, attrs=attrs,
  781. polyhedral=False, binds=binds)
  782. else:
  783. @autotvm.template
  784. def _autotune_template():
  785. s = sch_tmpl['schedule'](sch_tmpl['output'])
  786. return (s, op_var)
  787. # create autotune task
  788. task = autotvm.task.create(_autotune_template,
  789. args=list(),
  790. target='cuda')
  791. print("task config: ", task.config_space)
  792. # set measure_option
  793. measure_option = autotvm.measure_option(
  794. builder=autotvm.LocalBuilder(),
  795. runner=autotvm.LocalRunner(repeat=5, min_repeat_ms=150, timeout=4)
  796. )
  797. # Begin tuning, log records to file `kernel_name.log`
  798. tuner = autotvm.tuner.RandomTuner(task)
  799. if not os.path.exists(kernel_name + '.log'):
  800. tuner.tune(n_trial=len(task.config_space),
  801. measure_option=measure_option,
  802. callbacks=[autotvm.callback.log_to_file(kernel_name + '.log')])
  803. # query best config
  804. dispatch_context = autotvm.apply_history_best(kernel_name + '.log')
  805. best_config = dispatch_context.query(task.target, task.workload)
  806. print("\nBest config is:")
  807. print(best_config)
  808. # apply best config
  809. with autotvm.apply_history_best(kernel_name + '.log'):
  810. s, op_var = _autotune_template()
  811. mod = akg.build(s, op_var, "cuda", shape_var, name=kernel_name, attrs=attrs,
  812. polyhedral=False, binds=gpu_binds)
  813. else :
  814. with akg.build_config(dump_pass_ir=dump_ir):
  815. mod = akg.build(s, op_var, target, shape_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral,
  816. binds=binds)
  817. if dump_code:
  818. source_code = mod.imported_modules[0].get_source()
  819. create_code(kernel_name, "./", source_code, CUDA)
  820. return mod
  821. def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="",
  822. attrs=None, log_cce=False, dump_ir=True, dump_code=True,
  823. polyhedral=True, tuning=False):
  824. """
  825. Return module built from op_func with given inputs.
  826. Args:
  827. op_func (function returning an op or (op, [op_vars])): The op build function.
  828. input_shapes(iterable of iterable of int): the dim sizes for input for op.
  829. input_types (iterable of iterable of str): the dtypes for each input.
  830. op_attrs (list or tuple): extra attributes for the op.
  831. kernel_name (str): name of op.
  832. attrs (dict): tiling parameter.
  833. log_cce (bool): False by default.
  834. dump_ir (bool): True by default.
  835. dump_code (bool): False by default.
  836. polyhedral (bool): True by default.
  837. tuning (bool): False by default.
  838. Return:
  839. module.
  840. """
  841. inputs = []
  842. shape_params = [] # save all the shape params for dynamic_shape cases
  843. gen_inputs_and_shape_params(input_shapes, input_types, inputs, shape_params)
  844. attrs_params = []
  845. if op_attrs is not None:
  846. args = inputs + op_attrs
  847. gen_attrs_params(op_attrs, attrs_params)
  848. else:
  849. args = inputs
  850. # backup inputs because the tensor names may be updated inside op_func
  851. inputs_backup = recursive_copy(inputs)
  852. output = op_func(*args)
  853. # restore inputs to make sure that tensor names are not changed by op_func
  854. inputs = inputs_backup
  855. # set dim
  856. attrs = get_dim_from_func_map(attrs, op_func, args)
  857. compute_func = None # func which is defined in dsl for doing compute_inline or other
  858. sch_tmpl = None
  859. gpu_binds = None
  860. output, compute_func, sch_tmpl, gpu_binds = parsing_output(output, attrs, compute_func, sch_tmpl, gpu_binds)
  861. op_var = []
  862. op_var = gen_op_var(inputs, output, op_var)
  863. shape_var = []
  864. gen_shape_var(attrs_params, shape_params, shape_var)
  865. if sch_tmpl is not None:
  866. return create_gpu_mod(sch_tmpl, None, op_func, op_var, shape_var, kernel_name, attrs, polyhedral, gpu_binds,
  867. dump_ir, dump_code, tuning)
  868. if isinstance(output, (list, tuple)):
  869. tmp = []
  870. for x in list(output):
  871. if isinstance(x, tuple):
  872. tmp.append(x[0].op)
  873. else:
  874. tmp.append(x.op)
  875. s = akg.tvm.create_schedule(tmp)
  876. else:
  877. s = akg.tvm.create_schedule(output.op)
  878. if compute_func is not None:
  879. compute_func(s)
  880. polyhedral = False
  881. target = CCE
  882. if attrs and attrs.get("target", "cce") == CUDA:
  883. target = CUDA
  884. level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None
  885. if tuning or (level is not None and level > help_tiling_level['None']):
  886. return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target)
  887. mode = get_runtime_mode()
  888. if mode == "cpu":
  889. mod = akg.tvm.build(s, op_var, "llvm")
  890. if not os.path.isdir("./cpu/ir/"):
  891. os.makedirs("./cpu/ir/")
  892. with os.fdopen(os.open("./cpu/ir/" + kernel_name + ".cc", os.O_WRONLY | os.O_CREAT, 0o400), 'w') as irf:
  893. irf.write(akg.tvm.lower(s, op_var, shape_var, simple_mode=True))
  894. return mod
  895. binds = None if not attrs else attrs.pop(BINDS, None)
  896. if target == CUDA:
  897. return create_gpu_mod(None, s, op_func, op_var, shape_var, kernel_name, attrs, polyhedral, binds, dump_ir,
  898. dump_code, tuning)
  899. target = CCE
  900. with akg.build_config(dump_pass_ir=dump_ir):
  901. mod = akg.build(s, op_var, target, shape_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, binds=binds)
  902. source_code = mod.imported_modules[0].get_source()
  903. if log_cce:
  904. logging.debug("#################cce code####################")
  905. logging.debug(source_code)
  906. if dump_code:
  907. create_code(kernel_name, "./", source_code, target)
  908. return mod
  909. def get_runtime_mode():
  910. """get runtime mode."""
  911. env_dic = os.environ
  912. if not env_dic.get('RUNTIME_MODE'):
  913. mode = 'rpc_cloud'
  914. else:
  915. mode = env_dic.get('RUNTIME_MODE')
  916. return mode
  917. def get_profiling_mode():
  918. """get profiling mode."""
  919. env_dic = os.environ
  920. if env_dic.get('PROFILING_MODE') and env_dic.get('PROFILING_MODE').lower() == "true":
  921. return True
  922. return False
  923. def product_is_mini():
  924. """check whether in mini environment."""
  925. mode = get_runtime_mode()
  926. if mode in ('rpc', 'air', 'aic', 'compile_mini'):
  927. return True
  928. return False
  929. def get_available_devices_num():
  930. """get available devives num."""
  931. env_dic = os.environ
  932. try:
  933. return int(env_dic.get('DEVICE_TOTAL_NUM').lower()) if env_dic.get('DEVICE_TOTAL_NUM') else 1
  934. except NameError as e:
  935. logging.error(e)
  936. return 1
  937. def get_device_id():
  938. """get device id."""
  939. env_dic = os.environ
  940. try:
  941. return int(env_dic.get('DEVICE_ID').lower()) if env_dic.get('DEVICE_ID') else 0
  942. except NameError as e:
  943. logging.error(e)
  944. return 0
  945. def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False, repeat_time=400):
  946. "get gpu profiling cycles."
  947. func = tvm.get_global_func('GPUProfilerInit')
  948. func("")
  949. from akg.utils.result_analysis import gpu_profiling
  950. gpu_profiling(mod, *mod_args, repeat_time=repeat_time, device_id=device_id)
  951. func = tvm.get_global_func('GPUProfilerStop')
  952. a = func()
  953. return int(a)
  954. class TestUtils:
  955. """Class for getting cycle and core num."""
  956. @staticmethod
  957. def record_cycle(cycle):
  958. if os.environ.get(PERFORMANCE_TEST_FILE):
  959. result_file = os.environ.get(PERFORMANCE_TEST_FILE)
  960. with open(result_file, "a+") as f:
  961. f.write("{0}\n".format(cycle))
  962. @staticmethod
  963. def record_core(stmt):
  964. """Function for getting performance data from cores."""
  965. def get_core_num():
  966. core_num = 1
  967. if hasattr(stmt, 'attr_key') and stmt.attr_key == 'thread_extent':
  968. core_num = stmt.value
  969. return core_num
  970. if os.environ.get(PERFORMANCE_TEST_FILE):
  971. result_file = os.environ.get(PERFORMANCE_TEST_FILE)
  972. with open(result_file, "a+") as f:
  973. f.write("{0}; ".format(get_core_num()))