GitOrigin-RevId: b092699dee
tags/v1.7.0
| @@ -74,7 +74,6 @@ option(MGE_ENABLE_EXCEPTIONS "Build with exceptions" ON) | |||
| option(MGE_WITH_TEST "Enable test for MegEngine." OFF) | |||
| option(MGE_WITH_DISTRIBUTED "Build with distributed support" ON) | |||
| option(MGE_BUILD_IMPERATIVE_RT "Build _imperative_rt Python Module " ON) | |||
| option(MGE_BUILD_SDK "Build load_and_run" ON) | |||
| option(MGE_INFERENCE_ONLY "Build inference only library." OFF) | |||
| option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support" ON) | |||
| option(MGE_WITH_ROCM "Enable ROCM support" OFF) | |||
| @@ -542,6 +541,8 @@ if(MGE_WITH_TEST) | |||
| include(cmake/gtest.cmake) | |||
| endif() | |||
| include(cmake/gflags.cmake) | |||
| if(MGE_BUILD_IMPERATIVE_RT) | |||
| set(CMAKE_CXX_STANDARD 17) | |||
| endif() | |||
| @@ -1147,10 +1148,6 @@ endif() | |||
| add_subdirectory(src) | |||
| if(MGE_BUILD_SDK) | |||
| add_subdirectory(sdk/load-and-run) | |||
| endif() | |||
| if(MGE_BUILD_IMPERATIVE_RT) | |||
| add_subdirectory(imperative) | |||
| message(STATUS "Enable imperative python wrapper runtime") | |||
| @@ -0,0 +1 @@ | |||
| add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags ${CMAKE_CURRENT_BINARY_DIR}/gflags) | |||
| @@ -150,6 +150,9 @@ if(MGE_WITH_TEST) | |||
| add_subdirectory(test) | |||
| endif() | |||
| # load_and_run | |||
| add_subdirectory(load_and_run) | |||
| # tools and example | |||
| add_executable(rc4_encryptor tools/rc4_encrypt.cpp) | |||
| @@ -0,0 +1,38 @@ | |||
| load("//brain/megbrain/lite:flags.bzl","pthread_select") | |||
| cc_library( | |||
| name = "mgblar", | |||
| copts = ["-std=c++14"], | |||
| srcs = glob(["src/**/*.cpp"], exclude = ["src/main.cpp"]), | |||
| hdrs = glob(["src/**/*.h"]), | |||
| includes = ["src"], | |||
| features = if_opt([ | |||
| "no_exceptions", | |||
| "no_rtti", | |||
| ]), | |||
| defines = [ | |||
| "LITE_BUILD_WITH_MGE=1", | |||
| ], | |||
| deps = ["//brain/megbrain/lite:lite_static_test"]+ | |||
| pthread_select( | |||
| ["@com_github_gflags_gflags//:gflags_nothreads"], | |||
| ["//external:gflags"] | |||
| ), | |||
| alwayslink = 1, | |||
| visibility = ["//visibility:public"], | |||
| ) | |||
| cc_megvii_binary( | |||
| name = "load_and_run", | |||
| copts = ["-std=c++14"], | |||
| srcs = ["src/main.cpp"], | |||
| features = if_opt([ | |||
| "no_exceptions", | |||
| "no_rtti", | |||
| ]), | |||
| internal_deps = [":mgblar"], | |||
| visibility = ["//visibility:public"], | |||
| ) | |||
| @@ -0,0 +1,29 @@ | |||
| # Build load_and_run for lite | |||
| include_directories(PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/load_and_run/src>) | |||
| file (GLOB_RECURSE SOURCES ./*.cpp) | |||
| add_executable (load_and_run ${SOURCES}) | |||
| target_link_libraries(load_and_run lite_static) | |||
| target_link_libraries(load_and_run megbrain) | |||
| target_link_libraries(load_and_run gflags) | |||
| if(LITE_BUILD_WITH_RKNPU) | |||
| # rknn sdk 1.0.0 depends on libc++_shared; use gold to drop the NEEDED shared-object symbol check | |||
| target_link_options(load_and_run PRIVATE "-fuse-ld=gold") | |||
| endif() | |||
| if(MGE_WITH_ROCM) | |||
| # FIXME: hip objects cannot find cpp objects through lite_static alone | |||
| target_link_libraries(load_and_run megdnn) | |||
| endif() | |||
| if(UNIX) | |||
| if(APPLE OR ANDROID) | |||
| target_link_libraries(load_and_run dl) | |||
| else() | |||
| target_link_libraries(load_and_run dl rt) | |||
| endif() | |||
| endif() | |||
| install (TARGETS load_and_run EXPORT ${LITE_EXPORT_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) | |||
| @@ -0,0 +1,404 @@ | |||
| #!/usr/bin/env mdl | |||
| # -*- coding: utf-8 -*- | |||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| # | |||
| # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, | |||
| # software distributed under the License is distributed on an | |||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| from megskull.graph import NodeFilter, FpropEnv | |||
| from megskull.opr.all import AssertEqual, DataProvider, BatchNormalization | |||
| from megskull.utils.logconf import get_logger | |||
| from meghair.utils import io | |||
| import megbrain as mgb | |||
| import argparse | |||
| import struct | |||
| import re | |||
| import os | |||
| import numpy as np | |||
| import cv2 | |||
| logger = get_logger(__name__) | |||
| def auto_reformat_image(args, path, data, dst_shape): | |||
| """reformat image to target shape | |||
| :param data: image data as numpy array | |||
| :param dst_shape: target shape | |||
| """ | |||
| dim3_format = False # required input format does not contain batch | |||
| hwc_format = False # required input format is NHWC | |||
| if len(dst_shape) == 3: | |||
| dst_shape = (1, ) + dst_shape | |||
| dim3_format = True | |||
| assert len(dst_shape) == 4, 'bad dst_shape: {}'.format(dst_shape) | |||
| chl = dst_shape[1] | |||
| if chl in [1, 3]: | |||
| n, c, h, w = dst_shape | |||
| dst_shape = (n, h, w, c) | |||
| else: | |||
| chl = dst_shape[3] | |||
| assert chl in [1, 3], ( | |||
| 'can not infer input format from shape: {}'.format(dst_shape)) | |||
| hwc_format = True | |||
| # dst_shape has now been normalized to NHWC format | |||
| if args.resize_input: | |||
| h, w = dst_shape[1:3] | |||
| data = cv2.resize(data, (w, h)) | |||
| logger.info('input {} resized to {}'.format(path, data.shape)) | |||
| if chl == 1: | |||
| data = cv2.cvtColor(data, cv2.COLOR_BGR2GRAY) | |||
| data = data[:, :, np.newaxis] | |||
| assert data.ndim == 3 | |||
| data = data[np.newaxis] | |||
| # data normalized to NHWC format | |||
| if not hwc_format: | |||
| data = np.transpose(data, (0, 3, 1, 2)) | |||
| if dim3_format: | |||
| data = np.squeeze(data, 0) | |||
| return data | |||
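For reference, a minimal standalone sketch of the layout normalization above, assuming a 3-channel HWC image and an NCHW destination shape (all concrete values hypothetical):

```python
import numpy as np

data = np.zeros((224, 224, 3), dtype=np.uint8)   # HWC image, as cv2.imread returns
dst_shape = (1, 3, 224, 224)                     # chl = dst_shape[1] == 3, so NCHW
data = data[np.newaxis]                          # (1, 224, 224, 3): batched NHWC
data = np.transpose(data, (0, 3, 1, 2))          # (1, 3, 224, 224): NCHW
assert data.shape == dst_shape
```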
| def read_input_data(args, dst_shape, dtype, path, repeat): | |||
| def check_shape_equal(dst_shape, data_shape): | |||
| assert len(data_shape) == len(dst_shape), ( | |||
| 'input/data shapes mismatch: {} vs {}'.format( | |||
| dst_shape, data_shape)) | |||
| if data_shape[1:] != dst_shape[1:]: | |||
| logger.warning('dst_shape is {}; data_shape is {}'.format( | |||
| dst_shape, data_shape)) | |||
| if path.startswith('#'): | |||
| assert not args.resize_input | |||
| assert not args.input_transform | |||
| spec = path | |||
| m = re.match( | |||
| r'^#rand\(([-0-9.]*)\s*,\s*([-0-9.]*)\s*(,[^\)]+)?\)$', spec) | |||
| assert m, 'bad spec {}'.format(spec) | |||
| rng_min = float(m.group(1)) | |||
| rng_max = float(m.group(2)) | |||
| if m.group(3): | |||
| shape_str = m.group(3) | |||
| try: | |||
| shape = shape_str[1:].split(',') | |||
| if shape[-1].strip() == '...': | |||
| shape = shape[:-1] | |||
| shape.extend(list(dst_shape[len(shape):])) | |||
| data_shape = tuple(map(int, shape)) | |||
| except ValueError as e: | |||
| raise ValueError('bad spec {}: {}'.format(spec, e.args)) | |||
| else: | |||
| data_shape = dst_shape | |||
| check_shape_equal(dst_shape, data_shape) | |||
| return np.random.uniform(rng_min, rng_max, data_shape).astype(dtype) | |||
| # try to load image | |||
| data = cv2.imread(path, cv2.IMREAD_COLOR) | |||
| if data is None: | |||
| assert not args.resize_input | |||
| data = io.load(path) | |||
| assert isinstance(data, np.ndarray) | |||
| else: | |||
| # image load succeeded, so we expect the input to be in image format | |||
| data = auto_reformat_image(args, path, data, dst_shape) | |||
| data = np.repeat(data, repeat, axis=0) | |||
| if repeat > 1: | |||
| logger.info('repeat input for {} times, data shape is {}'.format( | |||
| repeat, data.shape)) | |||
| check_shape_equal(dst_shape, data.shape) | |||
| if args.input_transform: | |||
| data = eval(args.input_transform, {'data': data, 'np': np}) | |||
| return data | |||
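The `#rand(...)` spec is parsed with the regex above; a small sketch of what the groups capture (spec string hypothetical):

```python
import re

spec = '#rand(0, 255, 1, ...)'
m = re.match(r'^#rand\(([-0-9.]*)\s*,\s*([-0-9.]*)\s*(,[^\)]+)?\)$', spec)
rng_min, rng_max = float(m.group(1)), float(m.group(2))   # 0.0, 255.0
shape = [s.strip() for s in m.group(3)[1:].split(',')]    # ['1', '...']
assert shape[-1] == '...'  # trailing '...' keeps the remaining dst_shape dims
```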
| def gen_one_testcase(args, inputs, spec): | |||
| paths = spec.split(';') | |||
| if len(paths) != len(inputs): | |||
| if len(paths) == 1 and paths[0].startswith('#'): | |||
| paths = ['{}:{}'.format(name, paths[0]) for name in inputs.keys()] | |||
| assert len(paths) == len(inputs), ( | |||
| 'required inputs: {}; data paths: {}'.format(inputs.keys(), paths)) | |||
| if len(paths) == 1 and ':' not in paths[0]: | |||
| paths[0] = next(iter(inputs.keys())) + ':' + paths[0] | |||
| ret = {} | |||
| for path in paths: | |||
| var, path = path.split(':') | |||
| if args.repeat: | |||
| repeat = args.repeat | |||
| else: | |||
| repeat = 1 | |||
| ret[var] = read_input_data(args, inputs[var].imm_shape, | |||
| inputs[var].dtype, path, repeat) | |||
| return ret | |||
| def make_feeds(args): | |||
| outputs = io.load_network(args.input).outputs | |||
| if not args.no_assert: | |||
| env = FpropEnv(verbose_fprop=False) | |||
| # set flag so ExternCOprPlaceholder produces expected output | |||
| env.flags.user['extern_c_opr_eval'] = True | |||
| func = env.comp_graph.compile(None, [mgb.copy_output(env.get_mgbvar(i)) | |||
| for i in outputs]) | |||
| def expect_name(var): return 'expect:{}'.format(var.name) | |||
| nf = NodeFilter.make_all_deps(*outputs) | |||
| inputs = {i.name: i for i in nf.data_provider()} | |||
| if args.init_bn: | |||
| for i in nf: | |||
| if isinstance(i, BatchNormalization): | |||
| if i._iter.get_value() == 0: | |||
| i._iter.set_value(1) | |||
| i._variance.set_value(np.ones(i._variance.shape)) | |||
| testcases = [] | |||
| np.set_printoptions(precision=2, threshold=4, suppress=True) | |||
| data_list = [] | |||
| for item in args.data: | |||
| if item.startswith('@'): | |||
| with open(item[1:], 'r') as f: | |||
| data_list.extend([ line.rstrip() for line in f if line.rstrip() != '']) | |||
| else: | |||
| data_list.append(item) | |||
| for inp_spec in data_list: | |||
| cur_testcase = gen_one_testcase(args, inputs, inp_spec) | |||
| assert len(cur_testcase) == len(inputs), ( | |||
| 'required inputs: {}; given data: {}'.format( | |||
| inputs.keys(), cur_testcase.keys())) | |||
| if not args.no_assert: | |||
| outputs_get = func(**cur_testcase) | |||
| for var, val in zip(outputs, outputs_get): | |||
| cur_testcase[expect_name(var)] = val | |||
| logger.info( | |||
| 'generate test groundtruth: var={} shape={} range=({}, {})' | |||
| ' mean={} var={}'.format( | |||
| var, val.shape, val.min(), val.max(), | |||
| np.mean(val), np.var(val))) | |||
| testcases.append(cur_testcase) | |||
| logger.info('add testcase: \n {}'.format( | |||
| '\n '.join('{}: shape={} dtype={} range=({:.2f},{:.2f}) ' | |||
| 'mean={:.2f} sd={:.2f}'.format( | |||
| k, v.shape, v.dtype, v.min(), v.max(), np.mean(v), | |||
| np.std(v)) | |||
| for k, v in sorted(cur_testcase.items())))) | |||
| if not args.no_assert: | |||
| def expect_shp(var): | |||
| ret = var.partial_shape.determined_shape | |||
| if ret: | |||
| return ret | |||
| return testcases[0][expect_name(var)].shape | |||
| verbose = not args.silent | |||
| outputs = [AssertEqual(DataProvider(expect_name(i), expect_shp(i), | |||
| dtype=i.dtype, | |||
| comp_node=i.comp_node), | |||
| i, verbose=verbose, maxerr=args.maxerr) | |||
| for i in outputs] | |||
| return {'outputs': outputs, 'testcases': testcases} | |||
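Each generated testcase is a plain dict: one entry per network input plus one `expect:<output>` entry per output holding the groundtruth computed by `func`. Roughly (names and shapes hypothetical):

```python
import numpy as np

testcase = {
    'data': np.random.uniform(0, 255, (1, 3, 224, 224)).astype('float32'),  # input var
    'expect:prob': np.zeros((1, 1000), dtype='float32'),  # groundtruth for output 'prob'
}
```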
| def optimize_for_inference(args, outputs): | |||
| args_map = { | |||
| 'enable_io16xc32': 'f16_io_f32_comp', | |||
| 'enable_ioc16': 'f16_io_comp', | |||
| 'enable_hwcd4': 'use_nhwcd4', | |||
| 'enable_nchw4': 'use_nchw4', | |||
| 'enable_nchw88': 'use_nchw88', | |||
| 'enable_nchw44': 'use_nchw44', | |||
| 'enable_nchw44_dot': 'use_nchw44_dot', | |||
| 'enable_nchw32': 'use_nchw32', | |||
| 'enable_chwn4': 'use_chwn4', | |||
| 'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity', | |||
| 'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z', | |||
| 'enable_nchw64': 'use_nchw64', | |||
| 'enable_fuse_preprocess': 'fuse_preprocess', | |||
| } | |||
| kwargs = {} | |||
| for k, v in args_map.items(): | |||
| if getattr(args, k): | |||
| assert args.optimize_for_inference, ( | |||
| 'optimize_for_inference should be set when {} is given'.format( | |||
| k)) | |||
| kwargs[v] = True | |||
| if args.optimize_for_inference: | |||
| return mgb.optimize_for_inference(outputs, **kwargs) | |||
| return outputs | |||
| def main(): | |||
| parser = argparse.ArgumentParser( | |||
| description='Pack computing graph, input values and expected output ' | |||
| 'values into one file for checking correctness. README.md gives more ' | |||
| 'details on the usage', | |||
| formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |||
| parser.add_argument('input', help='input file; see README for details') | |||
| parser.add_argument('-o', '--output', help='output file', required=True) | |||
| parser.add_argument('--init-bn', action='store_true', | |||
| help='initialize untrained batch-normalization, to ' | |||
| 'avoid NaN or Inf results') | |||
| parser.add_argument( | |||
| '-d', '--data', default=[], action='append', | |||
| help='Give input test data when the input file is a network, ' | |||
| 'and the current network output will be used as groundtruth. ' | |||
| 'The format is var0:file0;var1:file1... to specify data files for ' | |||
| 'input vars. It can also be #rand(min,max,shape...) for generating ' | |||
| 'random input data, for example, #rand(0,255), ' | |||
| '#rand(0,255,1,3,224,224) or #rand(0, 255, 1, ...) where `...` means ' | |||
| 'the remaining part of the original shape. ' | |||
| 'If the shape is not specified, the shape of ' | |||
| 'corresponding DataProvider in the network will be used. ' | |||
| 'If there is only one input var, its name can be omitted. ' | |||
| 'Each data file can either be an image which can be loaded by opencv, ' | |||
| 'or a pickled numpy.ndarray. ' | |||
| 'This option can be given multiple times to add multiple testcases. ' | |||
| ' *NOTE* ' | |||
| 'If you start the data with the letter @, the rest should be a ' | |||
| 'filename, and each line in the file should be a single datum in ' | |||
| 'the format described above. ' | |||
| ) | |||
| parser.add_argument( | |||
| '--repeat', type=int, default=1, | |||
| help='Specify how many times the input image is repeated. ' | |||
| 'Useful when running benchmark for batch size other than one. ' | |||
| 'Has no effect on randomly generated input data.') | |||
| parser.add_argument('--silent', action='store_true', | |||
| help='set verbose to False in AssertEqual opr') | |||
| parser.add_argument('--optimize-for-inference', action='store_true', | |||
| help='enable optimization for inference') | |||
| parser.add_argument('--no-assert', action='store_true', | |||
| help='do not insert AssertEqual opr to check result; ' | |||
| 'this option is useful for benchmarking') | |||
| parser.add_argument('--maxerr', type=float, default=AssertEqual.maxerr, | |||
| help='max error for AssertEqual check during runtime') | |||
| parser.add_argument('--resize-input', action='store_true', | |||
| help='resize input image to fit input var shape') | |||
| parser.add_argument('--input-transform', | |||
| help='a python expression to transform the input data. ' | |||
| 'Example: data / np.std(data)') | |||
| parser.add_argument('--discard-var-name', action='store_true', | |||
| help='discard variable and param names in the ' | |||
| 'generated output') | |||
| parser.add_argument('--output-strip-info', action='store_true', | |||
| help='output code strip information') | |||
| parser.add_argument('--enable-io16xc32', action='store_true', | |||
| help='transform the mode to float16 io float32 compute') | |||
| parser.add_argument('--enable-ioc16', action='store_true', | |||
| help='transform the dtype of the model to float16 io ' | |||
| 'and compute') | |||
| parser.add_argument('--enable-fuse-conv-bias-nonlinearity', | |||
| action='store_true', | |||
| help='fuse convolution bias and nonlinearity opr to a ' | |||
| 'conv_bias opr and compute') | |||
| parser.add_argument('--enable-hwcd4', action='store_true', | |||
| help='transform the model format from NCHW to NHWCD4 ' | |||
| 'for inference; you may need to disable CUDA and set ' | |||
| 'MGB_USE_MEGDNN_DBG=2') | |||
| parser.add_argument('--enable-nchw4', action='store_true', | |||
| help='transform the model format from NCHW to NCHW4 ' | |||
| 'for inference') | |||
| parser.add_argument('--enable-nchw88', action='store_true', | |||
| help='transform the model format from NCHW to NCHW88 ' | |||
| 'for inference') | |||
| parser.add_argument('--enable-nchw44', action='store_true', | |||
| help='transform the model format from NCHW to NCHW44 ' | |||
| 'for inference') | |||
| parser.add_argument('--enable-nchw44-dot', action='store_true', | |||
| help='transform the model format from NCHW to NCHW44_DOT ' | |||
| 'for optimizing armv8.2 dot in inference') | |||
| parser.add_argument('--enable-chwn4', action='store_true', | |||
| help='transform the model format to CHWN4 ' | |||
| 'for inference, mainly used for nvidia tensorcore') | |||
| parser.add_argument('--enable-nchw32', action='store_true', | |||
| help='transform the model format from NCHW4 to NCHW32 ' | |||
| 'for inference on nvidia TensorCore') | |||
| parser.add_argument('--enable-nchw64', action='store_true', | |||
| help='transform the model format from NCHW to NCHW64 ' | |||
| 'for inference on Nvidia GPU') | |||
| parser.add_argument('--enable-fuse-conv-bias-with-z', action='store_true', | |||
| help='fuse conv_bias with z input for inference on ' | |||
| 'nvidia GPU (this optimization pass will result in mismatch ' | |||
| 'of the precision of output of training and inference)') | |||
| parser.add_argument('--enable-fuse-preprocess', action='store_true', | |||
| help='fuse astype/pad_channel/dimshuffle etc. oprs ' | |||
| 'from the h2d opr') | |||
| args = parser.parse_args() | |||
| if args.data: | |||
| feeds = make_feeds(args) | |||
| else: | |||
| feeds = io.load(args.input) | |||
| assert isinstance(feeds, dict) and feeds['testcases'], ( | |||
| 'testcases can not be empty') | |||
| env = FpropEnv(verbose_fprop=False) | |||
| outputs = feeds['outputs'] | |||
| output_mgbvars = list(map(env.get_mgbvar, outputs)) | |||
| output_mgbvars = optimize_for_inference(args, output_mgbvars) | |||
| inputs = sorted(((i.name, i.dtype) for i in | |||
| NodeFilter.make_all_deps(*outputs).data_provider())) | |||
| if args.discard_var_name: | |||
| sereg_kwargs = dict(keep_var_name=0, keep_param_name=False) | |||
| else: | |||
| sereg_kwargs = dict(keep_var_name=2, keep_param_name=True) | |||
| with open(args.output, 'wb') as fout: | |||
| fout.write(b'mgbtest0') | |||
| fout.write(struct.pack('I', len(feeds['testcases']))) | |||
| stat = mgb.serialize_comp_graph_to_file( | |||
| args.output, output_mgbvars, append=True, | |||
| output_strip_info=args.output_strip_info, | |||
| **sereg_kwargs) | |||
| logger.info('graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'. | |||
| format(stat.tot_bytes / 1024, | |||
| (stat.tot_bytes - stat.tensor_value_bytes) / 1024)) | |||
| for testcase in feeds['testcases']: | |||
| assert isinstance(testcase, dict) | |||
| cg = mgb.comp_graph() | |||
| cn = mgb.comp_node('cpux') | |||
| output_mgbvars = [] | |||
| for name, dtype in inputs: | |||
| output_mgbvars.append(cg.make_shared(cn, value=testcase.pop(name), | |||
| dtype=dtype)) | |||
| assert not testcase, 'extra inputs provided in testcase: {}'.format( | |||
| testcase.keys()) | |||
| mgb.serialize_comp_graph_to_file( | |||
| args.output, | |||
| output_mgbvars, | |||
| append=True, | |||
| output_strip_info=args.output_strip_info, | |||
| append_json=True) | |||
| if __name__ == '__main__': | |||
| main() | |||
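The dump prefix written in `main()` is an 8-byte magic plus a native-endian uint32 testcase count, followed by the serialized graph(s); a minimal reader for that prefix, assuming only what the code above writes:

```python
import struct

def read_testcase_count(path):
    """Read back the prefix written by main(): b'mgbtest0' + uint32 count."""
    with open(path, 'rb') as f:
        assert f.read(8) == b'mgbtest0'
        (num_testcases,) = struct.unpack('I', f.read(4))
    return num_testcases
```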
| @@ -0,0 +1,535 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| # | |||
| # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, | |||
| # software distributed under the License is distributed on an | |||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| import argparse | |||
| import os | |||
| import re | |||
| import struct | |||
| import cv2 | |||
| import numpy as np | |||
| import megengine as mge | |||
| import megengine.core._imperative_rt as rt | |||
| import megengine.core.tensor.megbrain_graph as G | |||
| from megengine import tensor | |||
| from megengine.core._imperative_rt.core2 import apply | |||
| from megengine.core.ops import builtin | |||
| from megengine.utils import comp_graph_tools as cgtools | |||
| logger = mge.get_logger(__name__) | |||
| def auto_reformat_image(args, path, data, dst_shape): | |||
| """reformat image to target shape | |||
| :param data: image data as numpy array | |||
| :param dst_shape: target shape | |||
| """ | |||
| dim3_format = False # required input format does not contain batch | |||
| hwc_format = False # required input format is NHWC | |||
| if not dst_shape: # input tensor shape is not predefined | |||
| if len(data.shape) == 2: | |||
| chl = 1 | |||
| h = data.shape[0] | |||
| w = data.shape[1] | |||
| else: | |||
| assert len(data.shape) == 3, "Input image must be of dimension 2 or 3" | |||
| h, w, chl = data.shape | |||
| dst_shape = (1, chl, h, w) | |||
| if len(dst_shape) == 3: | |||
| dst_shape = (1,) + dst_shape | |||
| dim3_format = True | |||
| assert len(dst_shape) == 4, "bad dst_shape: {}".format(dst_shape) | |||
| chl = dst_shape[1] | |||
| if chl in [1, 3]: | |||
| n, c, h, w = dst_shape | |||
| dst_shape = (n, h, w, c) | |||
| else: | |||
| chl = dst_shape[3] | |||
| assert chl in [1, 3], "can not infer input format from shape: {}".format( | |||
| dst_shape | |||
| ) | |||
| hwc_format = True | |||
| # dst_shape has now been normalized to NHWC format | |||
| if args.resize_input: | |||
| h, w = dst_shape[1:3] | |||
| data = cv2.resize(data, (w, h)) | |||
| logger.info("input {} resized to {}".format(path, data.shape)) | |||
| if chl == 1: | |||
| data = cv2.cvtColor(data, cv2.COLOR_BGR2GRAY) | |||
| data = data[:, :, np.newaxis] | |||
| assert data.ndim == 3 | |||
| data = data[np.newaxis] | |||
| # data normalized to NHWC format | |||
| if not hwc_format: | |||
| data = np.transpose(data, (0, 3, 1, 2)) | |||
| if dim3_format: | |||
| data = np.squeeze(data, 0) | |||
| return data | |||
| def read_input_data(args, dst_shape, dtype, path, repeat): | |||
| def check_shape_equal(dst_shape, data_shape): | |||
| if len(dst_shape): | |||
| assert len(data_shape) == len( | |||
| dst_shape | |||
| ), "input/data shapes mismatch: {} vs {}".format(dst_shape, data_shape) | |||
| if data_shape[1:] != dst_shape[1:]: | |||
| logger.warning( | |||
| "dst_shape is {}; data_shape is {}".format(dst_shape, data_shape) | |||
| ) | |||
| if path.startswith("#"): | |||
| assert not args.resize_input | |||
| assert not args.input_transform | |||
| spec = path | |||
| m = re.match(r"^#rand\(([-0-9.]*)\s*,\s*([-0-9.]*)\s*(,[^\)]+)?\)$", spec) | |||
| assert m, "bad spec {}".format(spec) | |||
| rng_min = float(m.group(1)) | |||
| rng_max = float(m.group(2)) | |||
| if m.group(3): | |||
| shape_str = m.group(3) | |||
| try: | |||
| shape = shape_str[1:].split(",") | |||
| if shape[-1].strip() == "...": | |||
| shape = shape[:-1] | |||
| shape.extend(list(dst_shape[len(shape) :])) | |||
| data_shape = tuple(map(int, shape)) | |||
| except ValueError as e: | |||
| raise ValueError("bad spec {}: {}".format(spec, e.args)) | |||
| else: | |||
| data_shape = dst_shape | |||
| check_shape_equal(dst_shape, data_shape) | |||
| return np.random.uniform(rng_min, rng_max, data_shape).astype(dtype) | |||
| # try to load image | |||
| data = cv2.imread(path, cv2.IMREAD_COLOR) | |||
| if data is None: | |||
| assert not args.resize_input | |||
| data = np.load(path) | |||
| assert isinstance(data, np.ndarray) | |||
| else: | |||
| # image load succeeded, so we expect the input to be in image format | |||
| data = auto_reformat_image(args, path, data, dst_shape) | |||
| data = np.repeat(data, repeat, axis=0) | |||
| if repeat > 1: | |||
| logger.info( | |||
| "repeat input for {} times, data shape is {}".format(repeat, data.shape) | |||
| ) | |||
| check_shape_equal(dst_shape, data.shape) | |||
| if args.input_transform: | |||
| data = eval(args.input_transform, {"data": data, "np": np}) | |||
| return data | |||
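The `--input-transform` expression is evaluated with `data` and `np` in scope, as in the `eval` call above; a sketch using the help string's own example:

```python
import numpy as np

data = np.random.uniform(0, 255, (1, 3, 8, 8)).astype('float32')
expr = '(data - np.mean(data)) / np.std(data)'   # e.g. --input-transform "..."
data = eval(expr, {'data': data, 'np': np})      # same environment as above
```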
| def gen_one_testcase(args, inputs, spec): | |||
| paths = spec.split(";") | |||
| if len(paths) != len(inputs): | |||
| if len(paths) == 1 and paths[0].startswith("#"): | |||
| paths = ["{}:{}".format(name, paths[0]) for name in inputs.keys()] | |||
| assert len(paths) == len(inputs), "required inputs: {}; data paths: {}".format( | |||
| inputs.keys(), paths | |||
| ) | |||
| if len(paths) == 1 and ":" not in paths[0]: | |||
| paths[0] = next(iter(inputs.keys())) + ":" + paths[0] | |||
| ret = {} | |||
| for path in paths: | |||
| var, path = path.split(":") | |||
| if args.repeat: | |||
| repeat = args.repeat | |||
| else: | |||
| repeat = 1 | |||
| ret[var] = read_input_data( | |||
| args, inputs[var].shape, inputs[var].dtype, path, repeat | |||
| ) | |||
| return ret | |||
| def make_feeds(args): | |||
| ret = G.load_graph(args.input) | |||
| cg_rt, outputs = ret.graph, ret.output_vars_list | |||
| inputs = cgtools.get_dep_vars(outputs, "Host2DeviceCopy") | |||
| inputs = {i.name: i for i in inputs} | |||
| if not args.no_assert: | |||
| replace_varmap = {} | |||
| inp_map = {} | |||
| # replace var use InputNode | |||
| for name, var in inputs.items(): | |||
| inp = G.InputNode( | |||
| device="xpux", dtype=var.dtype, shape=var.shape, graph=cg_rt | |||
| ) | |||
| replace_varmap[var] = inp.outputs[0] | |||
| inp_map[name] = inp | |||
| new = cgtools.replace_vars(outputs, replace_varmap) | |||
| if isinstance(new, rt.VarNode): | |||
| new = list(new) | |||
| output_nodes = [G.OutputNode(var) for var in new] | |||
| func = cg_rt.compile([node.outputs[0] for node in output_nodes]) | |||
| def make_dev_tensor(value, dtype=None, device=None): | |||
| return tensor(value, dtype=dtype, device=device)._dev_tensor() | |||
| def calculate(*args, **kwargs): | |||
| output_val = [] | |||
| # set inputs value | |||
| for name, var in inputs.items(): | |||
| val = kwargs.pop(name, None) | |||
| assert val is not None, "missing input name {}".format(name) | |||
| dev_tensor = make_dev_tensor(val, dtype=var.dtype, device="xpux") | |||
| inp_map[name].set_value(dev_tensor) | |||
| func.execute() | |||
| for res in output_nodes: | |||
| output_val.append(res.get_value().numpy()) | |||
| return output_val | |||
| def expect_name(var): | |||
| return "{}:expect".format(var.name) | |||
| testcases = [] | |||
| np.set_printoptions(precision=2, threshold=4, suppress=True) | |||
| data_list = [] | |||
| for item in args.data: | |||
| if item.startswith("@"): | |||
| with open(item[1:], "r") as f: | |||
| data_list.extend([line.rstrip() for line in f if line.rstrip() != ""]) | |||
| else: | |||
| data_list.append(item) | |||
| for inp_spec in data_list: | |||
| cur_testcase = gen_one_testcase(args, inputs, inp_spec) | |||
| assert len(cur_testcase) == len( | |||
| inputs | |||
| ), "required inputs: {}; given data: {}".format( | |||
| inputs.keys(), cur_testcase.keys() | |||
| ) | |||
| if not args.no_assert: | |||
| outputs_get = calculate(**cur_testcase) | |||
| for var, val in zip(outputs, outputs_get): | |||
| cur_testcase[expect_name(var)] = val | |||
| logger.info( | |||
| "generate test groundtruth: var={} shape={} range=({}, {})" | |||
| " mean={} var={}".format( | |||
| var, val.shape, val.min(), val.max(), np.mean(val), np.var(val) | |||
| ) | |||
| ) | |||
| testcases.append(cur_testcase) | |||
| logger.info( | |||
| "add testcase: \n {}".format( | |||
| "\n ".join( | |||
| "{}: shape={} dtype={} range=({:.2f},{:.2f}) " | |||
| "mean={:.2f} sd={:.2f}".format( | |||
| k, v.shape, v.dtype, v.min(), v.max(), np.mean(v), np.std(v) | |||
| ) | |||
| for k, v in sorted(cur_testcase.items()) | |||
| ) | |||
| ) | |||
| ) | |||
| if not args.no_assert: | |||
| def expect_shp(var): | |||
| ret = var.shape | |||
| if ret: | |||
| return ret | |||
| return testcases[0][expect_name(var)].shape | |||
| def assert_equal(expect, real, **kwargs): | |||
| op = builtin.AssertEqual(**kwargs) | |||
| (res,) = G.apply_normal_varnode(op, expect, real) | |||
| return res | |||
| verbose = not args.silent | |||
| outputs_new = [] | |||
| for i in outputs: | |||
| device = rt.CompNode("xpux") | |||
| dtype = i.dtype | |||
| name = expect_name(i) | |||
| shape = expect_shp(i) | |||
| # make expect output as one input of model. | |||
| expect_get = rt.make_h2d(cg_rt, device, dtype, shape, name) | |||
| # insert assert opr to check expect and real. | |||
| outputs_new.append( | |||
| assert_equal( | |||
| expect_get, | |||
| i, | |||
| verbose=verbose, | |||
| maxerr=args.maxerr, | |||
| ) | |||
| ) | |||
| inputs[expect_name(i)] = expect_get | |||
| outputs = outputs_new | |||
| return {"outputs": outputs, "testcases": testcases} | |||
| def optimize_for_inference(args, outputs): | |||
| args_list = [ | |||
| "enable_io16xc32", | |||
| "enable_ioc16", | |||
| "enable_hwcd4", | |||
| "enable_nchw4", | |||
| "enable_nchw88", | |||
| "enable_nchw44", | |||
| "enable_nchw44_dot", | |||
| "enable_nchw32", | |||
| "enable_chwn4", | |||
| "enable_fuse_conv_bias_nonlinearity", | |||
| "enable_fuse_conv_bias_with_z", | |||
| "enable_fuse_preprocess", | |||
| ] | |||
| kwargs = {} | |||
| for k in args_list: | |||
| if getattr(args, k): | |||
| kwargs[k] = True | |||
| if args.optimize_for_inference: | |||
| outputs = G.optimize_for_inference(outputs, **kwargs) | |||
| return outputs | |||
| def main(): | |||
| parser = argparse.ArgumentParser( | |||
| description="Pack computing graph, input values and expected output " | |||
| "values into one file for checking correctness. README.md gives more " | |||
| "details on the usage", | |||
| formatter_class=argparse.ArgumentDefaultsHelpFormatter, | |||
| ) | |||
| parser.add_argument("input", help="MegEngine dumped model file") | |||
| parser.add_argument("-o", "--output", help="output file", required=True) | |||
| parser.add_argument( | |||
| "-d", | |||
| "--data", | |||
| default=[], | |||
| action="append", | |||
| required=True, | |||
| help="Given input test data when input file is a network, " | |||
| "and current network output would be used as groundtruth. " | |||
| "The format is var0:file0;var1:file1... to specify data files for " | |||
| "input vars. It can also be #rand(min,max,shape...) for generating " | |||
| "random input data, for example, #rand(0,255), " | |||
| "#rand(0,255,1,3,224,224) or #rand(0, 255, 1, ...) where `...` means " | |||
| "the remaining part of the original shape. " | |||
| "If the shape is not specified, the shape of " | |||
| "corresponding input tensors in the network will be used. " | |||
| "If there is only one input var, its name can be omitted. " | |||
| "Each data file can either be an image which can be loaded by opencv, " | |||
| "or a pickled numpy.ndarray. " | |||
| "This option can be given multiple times to add multiple testcases. " | |||
| " *NOTE* " | |||
| "If you start the data with the letter @, the rest should be a " | |||
| "filename, and each line in the file should be a single datum in " | |||
| "the format described above. ", | |||
| ) | |||
| parser.add_argument( | |||
| "--repeat", | |||
| type=int, | |||
| default=1, | |||
| help="Specify how many times the input image is repeated. " | |||
| "Useful when running benchmark for batch size other than one. " | |||
| "Have no effect on randomly generated input data.", | |||
| ) | |||
| parser.add_argument( | |||
| "--silent", | |||
| action="store_true", | |||
| help="set verbose to False in asserti_equal opr", | |||
| ) | |||
| parser.add_argument( | |||
| "--optimize-for-inference", | |||
| action="store_true", | |||
| help="enable optimization for inference", | |||
| ) | |||
| parser.add_argument( | |||
| "--no-assert", | |||
| action="store_true", | |||
| help="do not insert assert_equal opr to check result; " | |||
| "this option is useful for benchmarking", | |||
| ) | |||
| parser.add_argument( | |||
| "--maxerr", | |||
| type=float, | |||
| default=1e-4, | |||
| help="max error for assert_equal check during runtime", | |||
| ) | |||
| parser.add_argument( | |||
| "--resize-input", | |||
| action="store_true", | |||
| help="resize input image to fit input var shape", | |||
| ) | |||
| parser.add_argument( | |||
| "--input-transform", | |||
| help="a python expression to transform the input data. " | |||
| "Example: data / np.std(data)", | |||
| ) | |||
| parser.add_argument( | |||
| "--discard-var-name", | |||
| action="store_true", | |||
| help="discard variable and param names in the " "generated output", | |||
| ) | |||
| parser.add_argument( | |||
| "--output-strip-info", action="store_true", help="output code strip information" | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-io16xc32", | |||
| action="store_true", | |||
| help="transform the mode to float16 io float32 compute", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-ioc16", | |||
| action="store_true", | |||
| help="transform the dtype of the model to float16 io " "and compute", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-fuse-conv-bias-nonlinearity", | |||
| action="store_true", | |||
| help="fuse convolution bias and nonlinearity opr to a " | |||
| "conv_bias opr and compute", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-hwcd4", | |||
| action="store_true", | |||
| help="transform the model format from NCHW to NHWCD4 " | |||
| "for inference; you may need to disable CUDA and set " | |||
| "MGB_USE_MEGDNN_DBG=2", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-nchw4", | |||
| action="store_true", | |||
| help="transform the model format from NCHW to NCHW4 " "for inference", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-nchw88", | |||
| action="store_true", | |||
| help="transform the model format from NCHW to NCHW88 " "for inference", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-nchw44", | |||
| action="store_true", | |||
| help="transform the model format from NCHW to NCHW44 " "for inference", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-nchw44-dot", | |||
| action="store_true", | |||
| help="transform the model format from NCHW to NCHW44_DOT " | |||
| "for optimizing armv8.2 dot in inference", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-nchw32", | |||
| action="store_true", | |||
| help="transform the model format from NCHW4 to NCHW32 " | |||
| "for inference on nvidia TensoCore", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-chwn4", | |||
| action="store_true", | |||
| help="transform the model format to CHWN4 " | |||
| "for inference, mainly used for nvidia tensorcore", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-fuse-conv-bias-with-z", | |||
| action="store_true", | |||
| help="fuse conv_bias with z input for inference on " | |||
| "nvidia GPU (this optimization pass will result in mismatch " | |||
| "of the precision of output of training and inference)", | |||
| ) | |||
| parser.add_argument( | |||
| "--enable-fuse-preprocess", | |||
| action="store_true", | |||
| help="fuse astype\pad_channel\dimshuffle and etc opr " | |||
| "from h2d opr", | |||
| ) | |||
| args = parser.parse_args() | |||
| feeds = make_feeds(args) | |||
| assert isinstance(feeds, dict) and feeds["testcases"], "testcases can not be empty" | |||
| output_mgbvars = feeds["outputs"] | |||
| output_mgbvars = optimize_for_inference(args, output_mgbvars) | |||
| inputs = cgtools.get_dep_vars(output_mgbvars, "Host2DeviceCopy") | |||
| inputs = sorted((i.name, i.dtype) for i in inputs) | |||
| if args.discard_var_name: | |||
| sereg_kwargs = dict(keep_var_name=0, keep_param_name=False) | |||
| else: | |||
| sereg_kwargs = dict(keep_var_name=2, keep_param_name=True) | |||
| strip_info_file = args.output + ".json" if args.output_strip_info else None | |||
| with open(args.output, "wb") as fout: | |||
| fout.write(b"mgbtest0") | |||
| fout.write(struct.pack("I", len(feeds["testcases"]))) | |||
| dump_content, stat = G.dump_graph( | |||
| output_mgbvars, | |||
| append_json=True, | |||
| strip_info_file=strip_info_file, | |||
| **sereg_kwargs, | |||
| ) | |||
| fout.write(dump_content) | |||
| logger.info( | |||
| "graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB".format( | |||
| stat.tot_bytes / 1024, (stat.tot_bytes - stat.tensor_value_bytes) / 1024 | |||
| ) | |||
| ) | |||
| def make_dev_tensor(value, dtype=None, device=None): | |||
| return tensor(value, dtype=dtype, device=device)._dev_tensor() | |||
| for testcase in feeds["testcases"]: | |||
| assert isinstance(testcase, dict) | |||
| cg = G.Graph() | |||
| output_mgbvars = [] | |||
| for name, dtype in inputs: | |||
| output_mgbvars.append( | |||
| cg.make_const( | |||
| make_dev_tensor(testcase.pop(name), dtype=dtype, device="cpux") | |||
| ) | |||
| ) | |||
| assert not testcase, "extra inputs provided in testcase: {}".format( | |||
| testcase.keys() | |||
| ) | |||
| with open(args.output, "ab") as fout: | |||
| dump_content, _ = G.dump_graph( | |||
| output_mgbvars, strip_info_file=strip_info_file, append_json=True | |||
| ) | |||
| fout.write(dump_content) | |||
| if __name__ == "__main__": | |||
| main() | |||
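The `@file` form of `--data` handled in `make_feeds` reads one spec per line; a sketch of preparing such a file (file and input names hypothetical):

```python
with open('cases.txt', 'w') as f:      # hypothetical spec file
    f.write('#rand(0,255)\n')          # testcase 1: random data
    f.write('data:img0.ppm\n')         # testcase 2: image for input var `data`
# then invoke the dumper with:  -d @cases.txt
```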
| @@ -0,0 +1,74 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/common.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include <memory> | |||
| DECLARE_int32(thread); | |||
| namespace lar { | |||
| /*! | |||
| * \brief: stage of model running | |||
| */ | |||
| enum class RunStage { | |||
| BEFORE_MODEL_LOAD = 0, | |||
| AFTER_MODEL_LOAD = 1, | |||
| BEFORE_OUTSPEC_SET = 2, | |||
| //! used for dumping the static memory information SVG file | |||
| AFTER_OUTSPEC_SET = 3, | |||
| //! used for the external c opr library | |||
| MODEL_RUNNING = 4, | |||
| //! used for the output dumper | |||
| AFTER_RUNNING_WAIT = 5, | |||
| //! used for the external c opr library | |||
| AFTER_RUNNING_ITER = 6, | |||
| AFTER_MODEL_RUNNING = 7, | |||
| }; | |||
| /*! | |||
| * \brief: types of different models | |||
| */ | |||
| enum class ModelType { | |||
| LITE_MODEL = 0, | |||
| MEGDL_MODEL, | |||
| UNKNOWN, | |||
| }; | |||
| /*! | |||
| * \brief: parameters for running the model | |||
| */ | |||
| struct RuntimeParam { | |||
| RunStage stage = RunStage::AFTER_MODEL_LOAD; | |||
| size_t warmup_iter; //! number of warm-up iterations before running the model | |||
| size_t run_iter; //! number of iterations for running the model | |||
| size_t threads = FLAGS_thread; //! thread number for running the model (NOTE: it's | |||
| //! different from the multithread device) | |||
| size_t testcase_num = 1; //! testcase number for model with testcase | |||
| }; | |||
| /*! | |||
| * \brief: layout type for running model optimization | |||
| */ | |||
| enum class OptLayoutType { | |||
| NCHW4 = 1 << 0, | |||
| CHWN4 = 1 << 1, | |||
| NCHW44 = 1 << 2, | |||
| NCHW88 = 1 << 3, | |||
| NCHW32 = 1 << 4, | |||
| NCHW64 = 1 << 5, | |||
| NHWCD4 = 1 << 6, | |||
| NCHW44_DOT = 1 << 7 | |||
| }; | |||
| } // namespace lar | |||
| // vim: syntax=cpp.doxygen | |||
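The `OptLayoutType` values are distinct powers of two so several layout options can be OR-ed into one mask; a Python mirror of the same design, with the same values:

```python
from enum import IntFlag

class OptLayoutType(IntFlag):    # same bit values as the C++ enum above
    NCHW4 = 1 << 0
    CHWN4 = 1 << 1
    NCHW44 = 1 << 2
    NCHW88 = 1 << 3
    NCHW32 = 1 << 4
    NCHW64 = 1 << 5
    NHWCD4 = 1 << 6
    NCHW44_DOT = 1 << 7

mask = OptLayoutType.NCHW4 | OptLayoutType.NCHW32
assert OptLayoutType.NCHW4 in mask and OptLayoutType.NCHW88 not in mask
```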
| @@ -0,0 +1,266 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/data_parser.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "data_parser.h" | |||
| #include <sstream> | |||
| #include "json_loader.h" | |||
| #include "npy.h" | |||
| using namespace lar; | |||
| /*! | |||
| * \brief feed different data to different parsers | |||
| * \param path data file path or data string | |||
| */ | |||
| void DataParser::feed(const std::string& path) { | |||
| std::string blob_name = "data", blob_string = path; | |||
| size_t sep = path.find(":"); | |||
| if (sep != std::string::npos) { | |||
| blob_name = path.substr(0, sep); | |||
| blob_string = path.substr(sep + 1); | |||
| } | |||
| auto endWith = [blob_string](std::string suffix) -> bool { | |||
| return blob_string.rfind(suffix) == (blob_string.length() - suffix.length()); | |||
| }; | |||
| if (endWith(".ppm") || endWith(".pgm")) { | |||
| parse_image(blob_name, blob_string); | |||
| } else if (endWith(".json")) { | |||
| parse_json(blob_string); | |||
| } else if (endWith(".npy")) { | |||
| parse_npy(blob_name, blob_string); | |||
| } else { | |||
| parse_string(blob_name, blob_string); | |||
| } | |||
| } | |||
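`feed()` splits an optional `name:` prefix (defaulting the blob name to `data`) and dispatches on the file suffix; the same rule as a Python sketch:

```python
def classify(arg):
    name, sep, payload = arg.partition(':')
    if not sep:
        name, payload = 'data', arg        # default blob name, as in feed()
    for suffix, kind in (('.ppm', 'image'), ('.pgm', 'image'),
                         ('.json', 'json'), ('.npy', 'npy')):
        if payload.endswith(suffix):
            return name, kind
    return name, 'string'                  # fall through to parse_string()

assert classify('img0.ppm') == ('data', 'image')
assert classify('x:[[0,1],[2,3]]') == ('x', 'string')
```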
| void DataParser::parse_json(const std::string& path) { | |||
| mgb::JsonLoader json; | |||
| std::shared_ptr<mgb::JsonLoader::Value> root = json.load(path.c_str()); | |||
| mgb_assert(root != nullptr, "failed to parse json %s", path.c_str()); | |||
| // parse json to data map | |||
| const std::string SHAPE = "shape", TYPE = "type", RAW = "raw"; | |||
| for (auto& item : root->objects()) { | |||
| auto&& value = *item.second; | |||
| auto&& shape = value[SHAPE]; | |||
| mgb_assert(shape->is_array()); | |||
| auto&& type = value[TYPE]; | |||
| mgb_assert(type->is_str()); | |||
| auto&& raw = value[RAW]; | |||
| mgb_assert(raw->is_array()); | |||
| megdnn::SmallVector<size_t> data_shape; | |||
| for (auto&& shape_ptr : shape->array()) { | |||
| data_shape.append({static_cast<size_t>(std::round(shape_ptr->number()))}); | |||
| } | |||
| // get type | |||
| const std::map<std::string, megdnn::DType> type_map = { | |||
| {"float32", mgb::dtype::Float32()}, {"float", mgb::dtype::Float32()}, | |||
| {"int32", mgb::dtype::Int32()}, {"int", mgb::dtype::Int32()}, | |||
| {"int8", mgb::dtype::Int8()}, {"uint8", mgb::dtype::Uint8()}}; | |||
| const std::string& type_str = type->str(); | |||
| mgb_assert( | |||
| type_map.find(type_str) != type_map.end(), | |||
| "unknown json data type for --input"); | |||
| mgb::DType datatype = type_map.at(type_str); | |||
| mgb::HostTensorND hv; | |||
| hv.comp_node(mgb::CompNode::default_cpu(), true) | |||
| .dtype(datatype) | |||
| .resize(data_shape); | |||
| mgb::dt_byte* raw_ptr = hv.raw_ptr(); | |||
| size_t elem_size = datatype.size(); | |||
| // get raw | |||
| const size_t array_size = raw->len(); | |||
| for (size_t idx = 0; idx < array_size; ++idx) { | |||
| double tmp = (*raw)[idx]->number(); | |||
| switch (datatype.enumv()) { | |||
| case megdnn::DTypeEnum::Int32: { | |||
| int32_t ival = std::round(tmp); | |||
| memcpy(((char*)raw_ptr) + idx * elem_size, &ival, elem_size); | |||
| } break; | |||
| case megdnn::DTypeEnum::Uint8: | |||
| case megdnn::DTypeEnum::Int8: { | |||
| int8_t cval = std::round(tmp); | |||
| memcpy(((char*)raw_ptr) + idx, &cval, sizeof(int8_t)); | |||
| } break; | |||
| case megdnn::DTypeEnum::Float32: { | |||
| float fval = tmp; | |||
| memcpy(((char*)raw_ptr) + idx * elem_size, &fval, elem_size); | |||
| } break; | |||
| default: | |||
| break; | |||
| } | |||
| } | |||
| inputs.insert(std::make_pair(item.first, std::move(hv))); | |||
| } | |||
| } | |||
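A sketch of the JSON document shape `parse_json()` consumes: each top-level key is a blob name whose value carries `shape`, `type`, and flattened `raw` data (blob name and values hypothetical):

```python
import json

doc = {'data': {'shape': [1, 2, 2],      # keys read by parse_json(): shape/type/raw
                'type': 'float32',
                'raw': [0.0, 1.0, 2.0, 3.0]}}   # 4 values == product of shape
with open('input.json', 'w') as f:
    json.dump(doc, f)
```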
| void DataParser::parse_image(const std::string& name, const std::string& path) { | |||
| // load binary ppm/pgm | |||
| std::ifstream fin; | |||
| fin.open(path, std::ifstream::binary | std::ifstream::in); | |||
| mgb_assert(fin.is_open(), "open file %s failed for --input", path.c_str()); | |||
| size_t w = 0, h = 0, channel = 0; | |||
| char buf[128] = {0}; | |||
| fin.getline(buf, 128); | |||
| if ('5' == buf[1]) { | |||
| channel = 1; | |||
| } else if ('6' == buf[1]) { | |||
| channel = 3; | |||
| } else { | |||
| mgb_assert(0, "not a formal ppm/pgm"); | |||
| } | |||
| while (fin.getline(buf, 128)) { | |||
| if (buf[0] == '#') { | |||
| continue; | |||
| } | |||
| break; | |||
| } | |||
| std::stringstream ss; | |||
| ss << std::string(buf); | |||
| ss >> w; | |||
| ss >> h; | |||
| mgb_assert(w > 0 and h > 0); | |||
| mgb::HostTensorND hv; | |||
| hv.comp_node(mgb::CompNode::default_cpu(), true) | |||
| .dtype(mgb::dtype::Uint8()) | |||
| .resize({1, h, w, channel}); | |||
| fin.read((char*)(hv.raw_ptr()), hv.layout().total_nr_elems()); | |||
| fin.close(); | |||
| inputs.insert(std::make_pair(name, std::move(hv))); | |||
| } | |||
| void DataParser::parse_npy(const std::string& name, const std::string& path) { | |||
| std::string type_str; | |||
| std::vector<npy::ndarray_len_t> stl_shape; | |||
| std::vector<int8_t> raw; | |||
| npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); | |||
| megdnn::SmallVector<size_t> shape; | |||
| for (auto val : stl_shape) { | |||
| shape.append({static_cast<size_t>(val)}); | |||
| } | |||
| const std::map<std::string, megdnn::DType> type_map = { | |||
| {"f4", mgb::dtype::Float32()}, {"i4", mgb::dtype::Int32()}, | |||
| {"i2", mgb::dtype::Int16()}, {"u2", mgb::dtype::Uint16()}, | |||
| {"i1", mgb::dtype::Int8()}, {"u1", mgb::dtype::Uint8()}}; | |||
| megdnn::DType hv_type; | |||
| for (auto& item : type_map) { | |||
| if (type_str.find(item.first) != std::string::npos) { | |||
| hv_type = item.second; | |||
| break; | |||
| } | |||
| } | |||
| mgb::HostTensorND hv; | |||
| hv.comp_node(mgb::CompNode::default_cpu(), true).dtype(hv_type).resize(shape); | |||
| mgb::dt_byte* raw_ptr = hv.raw_ptr(); | |||
| memcpy(raw_ptr, raw.data(), raw.size()); | |||
| inputs.insert(std::make_pair(name, std::move(hv))); | |||
| } | |||
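The `type_map` keys above are numpy dtype codes as they appear in the `.npy` header descr; for example, on a little-endian host:

```python
import numpy as np

assert np.dtype(np.float32).str == '<f4'    # matched by the "f4" entry above
assert np.dtype(np.uint8).str == '|u1'      # matched by "u1"
np.save('blob.npy', np.arange(4, dtype=np.float32))   # readable via parse_npy
```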
| void DataParser::parse_string(const std::string name, const std::string& str) { | |||
| // data type | |||
| megdnn::DType data_type = mgb::dtype::Int32(); | |||
| if (str.find(".") != std::string::npos or str.find(".") != std::string::npos) { | |||
| data_type = mgb::dtype::Float32(); | |||
| } | |||
| // shape | |||
| size_t number_cnt = 0; | |||
| std::shared_ptr<Brace> brace_root = std::make_shared<Brace>(); | |||
| std::shared_ptr<Brace> cur = brace_root; | |||
| for (size_t i = 0; i < str.size(); ++i) { | |||
| char c = str[i]; | |||
| if (c == '[') { | |||
| std::shared_ptr<Brace> child = std::make_shared<Brace>(); | |||
| child->parent = cur; | |||
| cur->chidren.emplace_back(child); | |||
| cur = child; | |||
| } else if (c == ']') { | |||
| cur = cur->parent.lock(); | |||
| } else if (c == ',') { | |||
| number_cnt++; | |||
| } | |||
| continue; | |||
| } | |||
| ++number_cnt; | |||
| mgb_assert(cur == brace_root, "braces not closed for --input"); | |||
| megdnn::SmallVector<size_t> shape; | |||
| cur = brace_root; | |||
| while (not cur->chidren.empty()) { | |||
| shape.append({cur->chidren.size()}); | |||
| number_cnt /= cur->chidren.size(); | |||
| cur = cur->chidren[0]; | |||
| } | |||
| mgb_assert(number_cnt > 0); | |||
| shape.append({number_cnt}); | |||
| // data | |||
| std::string json_arr; | |||
| for (size_t i = 0; i < str.size(); ++i) { | |||
| char c = str[i]; | |||
| if (c != '[' and c != ']') { | |||
| json_arr += c; | |||
| } | |||
| } | |||
| json_arr = "[" + json_arr + "]"; | |||
| // reuse json parser to resolve raw data | |||
| mgb::JsonLoader json; | |||
| std::shared_ptr<mgb::JsonLoader::Value> json_root = | |||
| json.load(json_arr.data(), json_arr.size()); | |||
| mgb_assert(json_root != nullptr, "failed to parse json in parse_string"); | |||
| mgb::HostTensorND hv; | |||
| hv.comp_node(mgb::CompNode::default_cpu(), true).dtype(data_type).resize(shape); | |||
| mgb::dt_byte* raw_ptr = hv.raw_ptr(); | |||
| const size_t array_len = json_root->len(); | |||
| const size_t elem_size = data_type.size(); | |||
| for (size_t idx = 0; idx < array_len; ++idx) { | |||
| double tmp = json_root->array()[idx]->number(); | |||
| switch (data_type.enumv()) { | |||
| case megdnn::DTypeEnum::Int32: { | |||
| int32_t ival = std::round(tmp); | |||
| memcpy(((char*)raw_ptr) + idx * elem_size, &ival, elem_size); | |||
| } break; | |||
| case megdnn::DTypeEnum::Float32: { | |||
| float fval = tmp; | |||
| memcpy(((char*)raw_ptr) + idx * elem_size, &fval, elem_size); | |||
| } break; | |||
| default: | |||
| break; | |||
| } | |||
| } | |||
| inputs.insert(std::make_pair(name, std::move(hv))); | |||
| } | |||
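A sketch of `parse_string()`'s shape inference: every comma bumps the element count (for a complete nested array, commas always total numbers-1), and bracket nesting fixes the leading dimensions:

```python
s = '[[1,2,3],[4,5,6]]'
number_cnt = s.count(',') + 1     # 6: commas, including list separators, total numbers-1
top_level_children = 2            # two '[' groups directly under the root brace
shape = (top_level_children, number_cnt // top_level_children)
assert shape == (2, 3)
```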
| @@ -0,0 +1,48 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/data_parser.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include "megbrain/opr/io.h" | |||
| namespace lar { | |||
| /*! | |||
| * \brief data parser for --input | |||
| * support .json|.ppm|.pgm|.npy data and user-defined data strings | |||
| * data string format: [0,0,227,227] | |||
| */ | |||
| struct DataParser { | |||
| struct Brace { | |||
| std::weak_ptr<Brace> parent; | |||
| std::vector<std::shared_ptr<Brace>> chidren; | |||
| }; | |||
| void feed(const std::string& path); | |||
| std::unordered_map<std::string, mgb::HostTensorND> inputs; | |||
| private: | |||
| //! parser for json data | |||
| void parse_json(const std::string& path); | |||
| //! parser for .ppm .pgm image | |||
| void parse_image(const std::string& name, const std::string& path); | |||
| //! parser for .npy data | |||
| void parse_npy(const std::string& name, const std::string& path); | |||
| //! parser for user-defined strings | |||
| void parse_string(const std::string name, const std::string& str); | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,297 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/json_loader.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "json_loader.h" | |||
| using namespace mgb; | |||
| template <typename T> | |||
| T* JsonLoader::Value::safe_cast() { | |||
| T* ptr = (T*)(this); | |||
| if (nullptr == ptr) { | |||
| fprintf(stderr, "cast ptr is null\n"); | |||
| } | |||
| return ptr; | |||
| } | |||
| std::unique_ptr<JsonLoader::Value>& JsonLoader::Value::operator[]( | |||
| const std::string& key) { | |||
| mgb_assert(Type::OBJECT == m_type); | |||
| auto t = safe_cast<JsonLoader::ObjectValue>(); | |||
| return t->m_obj.at(key); | |||
| } | |||
| std::unique_ptr<JsonLoader::Value>& JsonLoader::Value::operator[](const size_t index) { | |||
| mgb_assert(Type::ARRAY == m_type); | |||
| auto t = safe_cast<JsonLoader::ArrayValue>(); | |||
| return t->m_obj[index]; | |||
| } | |||
| std::map<std::string, std::unique_ptr<JsonLoader::Value>>& JsonLoader::Value:: | |||
| objects() { | |||
| mgb_assert(Type::OBJECT == m_type); | |||
| auto t = safe_cast<JsonLoader::ObjectValue>(); | |||
| return t->m_obj; | |||
| } | |||
| size_t JsonLoader::Value::len() { | |||
| if (Type::ARRAY == m_type) { | |||
| auto t = safe_cast<JsonLoader::ArrayValue>(); | |||
| return t->m_obj.size(); | |||
| } else if (Type::OBJECT == m_type) { | |||
| auto t = safe_cast<JsonLoader::ObjectValue>(); | |||
| return t->m_obj.size(); | |||
| } | |||
| return 0; | |||
| } | |||
| megdnn::SmallVector<std::unique_ptr<JsonLoader::Value>>& JsonLoader::Value::array() { | |||
| mgb_assert(Type::ARRAY == m_type); | |||
| auto t = safe_cast<JsonLoader::ArrayValue>(); | |||
| return t->m_obj; | |||
| } | |||
| double JsonLoader::Value::number() { | |||
| mgb_assert(Type::NUMBER == m_type); | |||
| auto t = safe_cast<JsonLoader::NumberValue>(); | |||
| return t->value(); | |||
| } | |||
| std::string JsonLoader::Value::str() { | |||
| if (Type::STRING == m_type) { | |||
| auto t = safe_cast<StringValue>(); | |||
| return t->value(); | |||
| } | |||
| return std::string(); | |||
| } | |||
| void JsonLoader::expect(char c) { | |||
| mgb_assert(c == (*m_buf)); | |||
| m_buf++; | |||
| } | |||
| void JsonLoader::skip_whitespace() { | |||
| const char* p = m_buf; | |||
| while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') { | |||
| ++p; | |||
| } | |||
| m_buf = p; | |||
| } | |||
| std::unique_ptr<JsonLoader::Value> JsonLoader::parse_object() { | |||
| expect('{'); | |||
| skip_whitespace(); | |||
| std::unique_ptr<JsonLoader::Value> ret; | |||
| JsonLoader::ObjectValue* pObject = new JsonLoader::ObjectValue(); | |||
| if ('}' == *m_buf) { | |||
| m_buf = m_buf + 1; | |||
| ret.reset((JsonLoader::Value*)(pObject)); | |||
| return ret; | |||
| } | |||
| while (true) { | |||
| std::unique_ptr<JsonLoader::Value> key = parse_string(); | |||
| if (m_state != State::OK) { | |||
| return ret; | |||
| } | |||
| skip_whitespace(); | |||
| if (':' != (*m_buf)) { | |||
| m_state = State::MISS_COLON; | |||
| return ret; | |||
| } | |||
| m_buf++; | |||
| skip_whitespace(); | |||
| std::unique_ptr<JsonLoader::Value> pVal = parse_value(); | |||
| if (m_state != State::OK) { | |||
| return ret; | |||
| } | |||
| if (pObject->m_obj.find(key->str()) != pObject->m_obj.end()) { | |||
| m_state = State::KEY_NOT_UNIQUE; | |||
| return ret; | |||
| } | |||
| pObject->m_obj.insert(std::make_pair(key->str(), std::move(pVal))); | |||
| skip_whitespace(); | |||
| if (',' == (*m_buf)) { | |||
| m_buf++; | |||
| skip_whitespace(); | |||
| } else if ('}' == (*m_buf)) { | |||
| m_buf++; | |||
| break; | |||
| } else { | |||
| m_state = State::MISS_BRACE; | |||
| break; | |||
| } | |||
| } | |||
| ret.reset((JsonLoader::Value*)(pObject)); | |||
| return ret; | |||
| } | |||
| std::unique_ptr<JsonLoader::Value> JsonLoader::parse_array() { | |||
| expect('['); | |||
| skip_whitespace(); | |||
| std::unique_ptr<JsonLoader::Value> ret; | |||
| JsonLoader::ArrayValue* pArray = new JsonLoader::ArrayValue(); | |||
| if (']' == *m_buf) { | |||
| m_buf = m_buf + 1; | |||
| ret.reset((JsonLoader::Value*)(pArray)); | |||
| return ret; | |||
| } | |||
| while (true) { | |||
| std::unique_ptr<JsonLoader::Value> pVal = parse_value(); | |||
| if (m_state != State::OK) { | |||
| mgb_assert(0, "parse value failed during pase array"); | |||
| return ret; | |||
| } | |||
| pArray->m_obj.emplace_back(pVal.get()); | |||
| pVal.release(); | |||
| skip_whitespace(); | |||
| if (',' == *m_buf) { | |||
| m_buf++; | |||
| skip_whitespace(); | |||
| } else if (']' == *m_buf) { | |||
| m_buf++; | |||
| break; | |||
| } else { | |||
| m_state = State::BAD_ARRAY; | |||
| return ret; | |||
| } | |||
| } | |||
| ret.reset((JsonLoader::Value*)(pArray)); | |||
| return ret; | |||
| } | |||
| std::unique_ptr<JsonLoader::Value> JsonLoader::parse_string() { | |||
| expect('\"'); | |||
| std::unique_ptr<JsonLoader::Value> ret; | |||
| JsonLoader::StringValue* pStr = new JsonLoader::StringValue(); | |||
| const char* p = m_buf; | |||
| while (true) { | |||
| if (*p == '\"') { | |||
| p++; | |||
| break; | |||
| } else { | |||
| pStr->m_value += (*p); | |||
| p++; | |||
| } | |||
| } | |||
| m_buf = p; | |||
| ret.reset((JsonLoader::Value*)(pStr)); | |||
| return ret; | |||
| } | |||
| std::unique_ptr<JsonLoader::Value> JsonLoader::parse_number() { | |||
| const char* p = m_buf; | |||
| auto loop_digit = [this](const char*& p) { | |||
| if (not std::isdigit(*p)) { | |||
| m_state = State::BAD_DIGIT; | |||
| return; | |||
| } | |||
| while (std::isdigit(*p)) { | |||
| p++; | |||
| } | |||
| return; | |||
| }; | |||
| if (*p == '-') | |||
| p++; | |||
| if (*p == '0') | |||
| p++; | |||
| else { | |||
| loop_digit(std::ref(p)); | |||
| } | |||
| if (*p == '.') { | |||
| p++; | |||
| loop_digit(std::ref(p)); | |||
| } | |||
| if (*p == 'e' || *p == 'E') { | |||
| p++; | |||
| if (*p == '+' || *p == '-') | |||
| p++; | |||
| loop_digit(std::ref(p)); | |||
| } | |||
| JsonLoader::NumberValue* pNum = new JsonLoader::NumberValue(); | |||
| pNum->m_value = strtod(m_buf, nullptr); | |||
| m_buf = p; | |||
| std::unique_ptr<JsonLoader::Value> ret; | |||
| ret.reset((JsonLoader::Value*)(pNum)); | |||
| return ret; | |||
| } | |||
| std::unique_ptr<JsonLoader::Value> JsonLoader::parse_value() { | |||
| switch (*m_buf) { | |||
| case '[': | |||
| return parse_array(); | |||
| case '{': | |||
| return parse_object(); | |||
| case '\"': | |||
| return parse_string(); | |||
| case '\0': | |||
| m_state = State::BAD_TYPE; | |||
| break; | |||
| default: | |||
| return parse_number(); | |||
| } | |||
| return nullptr; | |||
| } | |||
| std::unique_ptr<JsonLoader::Value> JsonLoader::load( | |||
| const char* content, const size_t size) { | |||
| m_buf = content; | |||
| skip_whitespace(); | |||
| std::unique_ptr<JsonLoader::Value> value = parse_value(); | |||
| skip_whitespace(); | |||
| if (m_state != State::OK) { | |||
| return nullptr; | |||
| } | |||
| mgb_assert(size == static_cast<size_t>(m_buf - content)); | |||
| return value; | |||
| } | |||
| std::unique_ptr<JsonLoader::Value> JsonLoader::load(const char* path) { | |||
| std::unique_ptr<std::FILE, void (*)(std::FILE*)> fin( | |||
| std::fopen(path, "rb"), [](std::FILE* fp) { std::fclose(fp); }); | |||
| mgb_assert(fin.get(), "failed to open %s: %s", path, strerror(errno)); | |||
| std::fseek(fin.get(), 0, SEEK_END); | |||
| const size_t size = ftell(fin.get()); | |||
| std::fseek(fin.get(), 0, SEEK_SET); | |||
| std::unique_ptr<char, void (*)(void*)> buf(static_cast<char*>(malloc(size)), free); // malloc'd memory must be freed, not deleted | |||
| auto nr = std::fread(buf.get(), 1, size, fin.get()); | |||
| mgb_assert(nr == size); | |||
| return load(buf.get(), size); | |||
| } | |||
| @@ -0,0 +1,183 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/json_loader.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include <cctype> | |||
| #include <fstream> | |||
| #include <functional> | |||
| #include <iostream> | |||
| #include <map> | |||
| #include <memory> | |||
| #include "megbrain/common.h" | |||
| #include "megdnn/thin/small_vector.h" | |||
| namespace mgb { | |||
| /*! | |||
| * \brief JSON format data loader for --input | |||
| */ | |||
| class JsonLoader { | |||
| public: | |||
| // base class for different value format | |||
| class Value { | |||
| protected: | |||
| enum struct Type : uint8_t { UNKNOWN, NUMBER, STRING, OBJECT, ARRAY }; | |||
| Type m_type; | |||
| public: | |||
| template <typename T> | |||
| T* safe_cast(); | |||
| Value() { m_type = Type::UNKNOWN; } | |||
| Value(Type type) : m_type(type) {} | |||
| virtual ~Value() {} | |||
| bool is_array() { return Type::ARRAY == m_type; } | |||
| bool is_object() { return Type::OBJECT == m_type; } | |||
| bool is_number() { return Type::NUMBER == m_type; } | |||
| bool is_str() { return Type::STRING == m_type; } | |||
| std::unique_ptr<Value>& operator[](const std::string& key); | |||
| std::unique_ptr<Value>& operator[](const size_t index); | |||
| std::map<std::string, std::unique_ptr<Value>>& objects(); | |||
| size_t len(); | |||
| megdnn::SmallVector<std::unique_ptr<Value>>& array(); | |||
| double number(); | |||
| std::string str(); | |||
| }; | |||
| void expect(char c); | |||
| void skip_whitespace(); | |||
| std::unique_ptr<Value> parse_object(); | |||
| std::unique_ptr<Value> parse_array(); | |||
| std::unique_ptr<Value> parse_string(); | |||
| std::unique_ptr<Value> parse_number(); | |||
| std::unique_ptr<Value> parse_value(); | |||
| enum struct State : uint8_t { | |||
| OK = 0, | |||
| BAD_TYPE, | |||
| BAD_DIGIT, | |||
| BAD_ARRAY, | |||
| MISS_COLON, | |||
| MISS_BRACE, | |||
| KEY_NOT_UNIQUE | |||
| }; | |||
| JsonLoader() { m_state = State::OK; } | |||
| std::unique_ptr<Value> load(const char* content, const size_t size); | |||
| std::unique_ptr<Value> load(const char* path); | |||
| class NumberValue final : public Value { | |||
| friend std::unique_ptr<Value> JsonLoader::parse_number(); | |||
| double m_value; | |||
| public: | |||
| NumberValue() : Value(Type::NUMBER) {} | |||
| double value() { return m_value; } | |||
| }; | |||
| class StringValue final : public Value { | |||
| std::string m_value; | |||
| public: | |||
| StringValue() : Value(Type::STRING) {} | |||
| std::string value() { return m_value; } | |||
| friend std::unique_ptr<Value> JsonLoader::parse_string(); | |||
| }; | |||
| class ArrayValue final : public Value { | |||
| megdnn::SmallVector<std::unique_ptr<Value>> m_obj; | |||
| public: | |||
| ArrayValue() : Value(Type::ARRAY) {} | |||
| ArrayValue(ArrayValue& arr) : Value(arr) { | |||
| m_obj.clear(); | |||
| for (auto& item : arr.m_obj) { | |||
| m_obj.emplace_back(item.get()); | |||
| item.release(); | |||
| } | |||
| } | |||
| ArrayValue(ArrayValue&& arr) : Value(arr) { | |||
| m_obj.clear(); | |||
| for (auto& item : arr.m_obj) { | |||
| m_obj.emplace_back(item.get()); | |||
| item.release(); | |||
| } | |||
| } | |||
| friend std::unique_ptr<Value> JsonLoader::parse_array(); | |||
| friend std::unique_ptr<JsonLoader::Value>& JsonLoader::Value::operator[]( | |||
| const size_t index); | |||
| friend megdnn::SmallVector<std::unique_ptr<JsonLoader::Value>>& JsonLoader:: | |||
| Value::array(); | |||
| friend size_t JsonLoader::Value::len(); | |||
| }; | |||
| class ObjectValue final : public Value { | |||
| std::map<std::string, std::unique_ptr<Value>> m_obj; | |||
| public: | |||
| ObjectValue() : Value(Type::OBJECT) {} | |||
| ObjectValue(ObjectValue& arr) : Value(arr) { | |||
| m_obj.clear(); | |||
| for (auto itra = arr.m_obj.begin(); itra != arr.m_obj.end(); ++itra) { | |||
| m_obj.emplace(std::make_pair(itra->first, std::move(itra->second))); | |||
| } | |||
| } | |||
| ObjectValue(ObjectValue&& arr) : Value(arr) { | |||
| m_obj.clear(); | |||
| for (auto itra = arr.m_obj.begin(); itra != arr.m_obj.end(); ++itra) { | |||
| m_obj.emplace(std::make_pair(itra->first, std::move(itra->second))); | |||
| } | |||
| } | |||
| friend std::unique_ptr<Value> JsonLoader::parse_object(); | |||
| friend std::unique_ptr<JsonLoader::Value>& JsonLoader::Value::operator[]( | |||
| const std::string&); | |||
| friend std::map<std::string, std::unique_ptr<JsonLoader::Value>>& JsonLoader:: | |||
| Value::objects(); | |||
| friend size_t JsonLoader::Value::len(); | |||
| }; | |||
| private: | |||
| const char* m_buf; | |||
| State m_state; | |||
| }; | |||
| } // namespace mgb | |||
| @@ -0,0 +1,615 @@ | |||
| /* | |||
| Copyright 2017 Leon Merten Lohse | |||
| Permission is hereby granted, free of charge, to any person obtaining a copy | |||
| of this software and associated documentation files (the "Software"), to deal | |||
| in the Software without restriction, including without limitation the rights | |||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
| copies of the Software, and to permit persons to whom the Software is | |||
| furnished to do so, subject to the following conditions: | |||
| The above copyright notice and this permission notice shall be included in | |||
| all copies or substantial portions of the Software. | |||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
| SOFTWARE. | |||
| */ | |||
| #ifndef NPY_H | |||
| #define NPY_H | |||
| #include <algorithm> | |||
| #include <complex> | |||
| #include <cstdint> | |||
| #include <cstring> | |||
| #include <fstream> | |||
| #include <iostream> | |||
| #include <regex> | |||
| #include <sstream> | |||
| #include <stdexcept> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| namespace npy { | |||
| /* Compile-time test for byte order. | |||
| If your compiler does not define these per default, you may want to define | |||
| one of these constants manually. | |||
| Defaults to little endian order. */ | |||
| #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ | |||
| defined(__BIG_ENDIAN__) || defined(__ARMEB__) || defined(__THUMBEB__) || \ | |||
| defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || \ | |||
| defined(__MIBSEB__) | |||
| const bool big_endian = true; | |||
| #else | |||
| const bool big_endian = false; | |||
| #endif | |||
| const char magic_string[] = "\x93NUMPY"; | |||
| const size_t magic_string_length = 6; | |||
| const char little_endian_char = '<'; | |||
| const char big_endian_char = '>'; | |||
| const char no_endian_char = '|'; | |||
| constexpr char host_endian_char = (big_endian ? big_endian_char : little_endian_char); | |||
| /* npy array length */ | |||
| typedef unsigned long int ndarray_len_t; | |||
| inline void write_magic( | |||
| std::ostream& ostream, unsigned char v_major = 1, unsigned char v_minor = 0) { | |||
| ostream.write(magic_string, magic_string_length); | |||
| ostream.put(v_major); | |||
| ostream.put(v_minor); | |||
| } | |||
| inline void read_magic( | |||
| std::istream& istream, unsigned char& v_major, unsigned char& v_minor) { | |||
| char buf[magic_string_length + 2]; | |||
| istream.read(buf, magic_string_length + 2); | |||
| if (!istream) { | |||
| fprintf(stderr, "io error: failed reading file"); | |||
| } | |||
| if (0 != std::memcmp(buf, magic_string, magic_string_length)) { | |||
| fprintf(stderr, "this file does not have a valid npy format."); | |||
| } | |||
| v_major = buf[magic_string_length]; | |||
| v_minor = buf[magic_string_length + 1]; | |||
| } | |||
| // typestring magic | |||
| struct Typestring { | |||
| private: | |||
| char c_endian; | |||
| char c_type; | |||
| int len; | |||
| public: | |||
| inline std::string str() { | |||
| const size_t max_buflen = 16; | |||
| char buf[max_buflen]; | |||
| std::sprintf(buf, "%c%c%u", c_endian, c_type, len); | |||
| return std::string(buf); | |||
| } | |||
| Typestring(const std::vector<float>&) | |||
| : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} | |||
| Typestring(const std::vector<double>&) | |||
| : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} | |||
| Typestring(const std::vector<long double>&) | |||
| : c_endian{host_endian_char}, c_type{'f'}, len{sizeof(long double)} {} | |||
| Typestring(const std::vector<char>&) | |||
| : c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} | |||
| Typestring(const std::vector<short>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} | |||
| Typestring(const std::vector<int>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} | |||
| Typestring(const std::vector<long>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} | |||
| Typestring(const std::vector<long long>&) | |||
| : c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} | |||
| Typestring(const std::vector<unsigned char>&) | |||
| : c_endian{no_endian_char}, c_type{'u'}, len{sizeof(unsigned char)} {} | |||
| Typestring(const std::vector<unsigned short>&) | |||
| : c_endian{host_endian_char}, c_type{'u'}, len{sizeof(unsigned short)} {} | |||
| Typestring(const std::vector<unsigned int>&) | |||
| : c_endian{host_endian_char}, c_type{'u'}, len{sizeof(unsigned int)} {} | |||
| Typestring(const std::vector<unsigned long>&) | |||
| : c_endian{host_endian_char}, c_type{'u'}, len{sizeof(unsigned long)} {} | |||
| Typestring(const std::vector<unsigned long long>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'u'}, | |||
| len{sizeof(unsigned long long)} {} | |||
| Typestring(const std::vector<std::complex<float>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<float>)} {} | |||
| Typestring(const std::vector<std::complex<double>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<double>)} {} | |||
| Typestring(const std::vector<std::complex<long double>>&) | |||
| : c_endian{host_endian_char}, | |||
| c_type{'c'}, | |||
| len{sizeof(std::complex<long double>)} {} | |||
| }; | |||
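| // Worked example (illustrative): on a little-endian host, | |||
| //   Typestring(std::vector<float>{}).str()         == "<f4" | |||
| //   Typestring(std::vector<unsigned char>{}).str() == "|u1" | |||
| // i.e. endian char, type char ('f'/'i'/'u'/'c'), then the element width in bytes. | |||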
| inline void parse_typestring(std::string typestring) { | |||
| std::regex re("'([<>|])([ifuc])(\\d+)'"); | |||
| std::smatch sm; | |||
| std::regex_match(typestring, sm, re); | |||
| if (sm.size() != 4) { | |||
| fprintf(stderr, "invalid typestring"); | |||
| } | |||
| } | |||
| namespace pyparse { | |||
| /** | |||
| Removes leading and trailing whitespaces | |||
| */ | |||
| inline std::string trim(const std::string& str) { | |||
| const std::string whitespace = " \t"; | |||
| auto begin = str.find_first_not_of(whitespace); | |||
| if (begin == std::string::npos) | |||
| return ""; | |||
| auto end = str.find_last_not_of(whitespace); | |||
| return str.substr(begin, end - begin + 1); | |||
| } | |||
| inline std::string get_value_from_map(const std::string& mapstr) { | |||
| size_t sep_pos = mapstr.find_first_of(":"); | |||
| if (sep_pos == std::string::npos) | |||
| return ""; | |||
| std::string tmp = mapstr.substr(sep_pos + 1); | |||
| return trim(tmp); | |||
| } | |||
| /** | |||
| Parses the string representation of a Python dict | |||
| The keys need to be known and may not appear anywhere else in the data. | |||
| */ | |||
| inline std::unordered_map<std::string, std::string> parse_dict( | |||
| std::string in, std::vector<std::string>& keys) { | |||
| std::unordered_map<std::string, std::string> map; | |||
| if (keys.size() == 0) | |||
| return map; | |||
| in = trim(in); | |||
| // unwrap dictionary | |||
| if ((in.front() == '{') && (in.back() == '}')) | |||
| in = in.substr(1, in.length() - 2); | |||
| else { | |||
| fprintf(stderr, "Not a Python dictionary."); | |||
| } | |||
| std::vector<std::pair<size_t, std::string>> positions; | |||
| for (auto const& value : keys) { | |||
| size_t pos = in.find("'" + value + "'"); | |||
| if (pos == std::string::npos) { | |||
| fprintf(stderr, "Missing %s key.", value.c_str()); | |||
| } | |||
| std::pair<size_t, std::string> position_pair{pos, value}; | |||
| positions.push_back(position_pair); | |||
| } | |||
| // sort by position in dict | |||
| std::sort(positions.begin(), positions.end()); | |||
| for (size_t i = 0; i < positions.size(); ++i) { | |||
| std::string raw_value; | |||
| size_t begin{positions[i].first}; | |||
| size_t end{std::string::npos}; | |||
| std::string key = positions[i].second; | |||
| if (i + 1 < positions.size()) | |||
| end = positions[i + 1].first; | |||
| raw_value = in.substr(begin, end - begin); | |||
| raw_value = trim(raw_value); | |||
| if (raw_value.back() == ',') | |||
| raw_value.pop_back(); | |||
| map[key] = get_value_from_map(raw_value); | |||
| } | |||
| return map; | |||
| } | |||
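| // Worked example (illustrative): for a typical npy v1.0 header dict | |||
| //   {'descr': '<f4', 'fortran_order': False, 'shape': (3, 4), } | |||
| // parse_dict with keys {"descr", "fortran_order", "shape"} returns | |||
| //   map["descr"] == "'<f4'" (quotes kept; parse_str strips them later), | |||
| //   map["fortran_order"] == "False", map["shape"] == "(3, 4)". | |||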
| /** | |||
| Parses the string representation of a Python boolean | |||
| */ | |||
| inline bool parse_bool(const std::string& in) { | |||
| if (in == "True") | |||
| return true; | |||
| if (in == "False") | |||
| return false; | |||
| fprintf(stderr, "Invalid Python boolean."); | |||
| return false; | |||
| } | |||
| /** | |||
| Parses the string representation of a Python str | |||
| */ | |||
| inline std::string parse_str(const std::string& in) { | |||
| if ((in.front() == '\'') && (in.back() == '\'')) | |||
| return in.substr(1, in.length() - 2); | |||
| fprintf(stderr, "Invalid Python string."); | |||
| return ""; | |||
| } | |||
| /** | |||
| Parses the string representation of a Python tuple into a vector of its items | |||
| */ | |||
| inline std::vector<std::string> parse_tuple(std::string in) { | |||
| std::vector<std::string> v; | |||
| const char separator = ','; | |||
| in = trim(in); | |||
| if ((in.front() == '(') && (in.back() == ')')) | |||
| in = in.substr(1, in.length() - 2); | |||
| else { | |||
| fprintf(stderr, "Invalid Python tuple."); | |||
| } | |||
| std::istringstream iss(in); | |||
| for (std::string token; std::getline(iss, token, separator);) { | |||
| v.push_back(token); | |||
| } | |||
| return v; | |||
| } | |||
| template <typename T> | |||
| inline std::string write_tuple(const std::vector<T>& v) { | |||
| if (v.size() == 0) | |||
| return ""; | |||
| std::ostringstream ss; | |||
| if (v.size() == 1) { | |||
| ss << "(" << v.front() << ",)"; | |||
| } else { | |||
| const std::string delimiter = ", "; | |||
| // v.size() > 1 | |||
| ss << "("; | |||
| std::copy( | |||
| v.begin(), v.end() - 1, | |||
| std::ostream_iterator<T>(ss, delimiter.c_str())); | |||
| ss << v.back(); | |||
| ss << ")"; | |||
| } | |||
| return ss.str(); | |||
| } | |||
| inline std::string write_boolean(bool b) { | |||
| if (b) | |||
| return "True"; | |||
| else | |||
| return "False"; | |||
| } | |||
| } // namespace pyparse | |||
| inline void parse_header(std::string header, std::string& descr) { | |||
| /* | |||
| The first 6 bytes are a magic string: exactly "\x93NUMPY". | |||
| The next 1 byte is an unsigned byte: the major version number of the file | |||
| format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version | |||
| number of the file format, e.g. \x00. Note: the version of the file format | |||
| is not tied to the version of the numpy package. The next 2 bytes form a | |||
| little-endian unsigned short int: the length of the header data | |||
| HEADER_LEN. The next HEADER_LEN bytes form the header data describing the | |||
| array's format. It is an ASCII string which contains a Python literal | |||
| expression of a dictionary. It is terminated by a newline ('\n') and | |||
| padded with spaces | |||
| ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be | |||
| evenly divisible by 16 for alignment purposes. The dictionary contains | |||
| three keys: | |||
| "descr" : dtype.descr | |||
| An object that can be passed as an argument to the numpy.dtype() | |||
| constructor to create the array's dtype. For repeatability and | |||
| readability, this dictionary is formatted using pprint.pformat() so the | |||
| keys are in alphabetic order. | |||
| */ | |||
| // remove trailing newline | |||
| if (header.back() != '\n') | |||
| fprintf(stderr, "invalid header"); | |||
| header.pop_back(); | |||
| // parse the dictionary | |||
| std::vector<std::string> keys{"descr"}; | |||
| auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
| if (dict_map.size() == 0) | |||
| fprintf(stderr, "invalid dictionary in header"); | |||
| std::string descr_s = dict_map["descr"]; | |||
| parse_typestring(descr_s); | |||
| // remove | |||
| descr = npy::pyparse::parse_str(descr_s); | |||
| return; | |||
| } | |||
| inline void parse_header( | |||
| std::string header, std::string& descr, bool& fortran_order, | |||
| std::vector<ndarray_len_t>& shape) { | |||
| /* | |||
| The first 6 bytes are a magic string: exactly "\x93NUMPY". | |||
| The next 1 byte is an unsigned byte: the major version number of the file | |||
| format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version | |||
| number of the file format, e.g. \x00. Note: the version of the file format | |||
| is not tied to the version of the numpy package. The next 2 bytes form a | |||
| little-endian unsigned short int: the length of the header data | |||
| HEADER_LEN. The next HEADER_LEN bytes form the header data describing the | |||
| array's format. It is an ASCII string which contains a Python literal | |||
| expression of a dictionary. It is terminated by a newline ('\n') and | |||
| padded with spaces | |||
| ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be | |||
| evenly divisible by 16 for alignment purposes. The dictionary contains | |||
| three keys: | |||
| "descr" : dtype.descr | |||
| An object that can be passed as an argument to the numpy.dtype() | |||
| constructor to create the array's dtype. "fortran_order" : bool Whether | |||
| the array data is Fortran-contiguous or not. Since Fortran-contiguous | |||
| arrays are a common form of non-C-contiguity, we allow them to be written | |||
| directly to disk for efficiency. "shape" : tuple of int The shape of the | |||
| array. For repeatability and readability, this dictionary is formatted | |||
| using pprint.pformat() so the keys are in alphabetic order. | |||
| */ | |||
| // remove trailing newline | |||
| if (header.back() != '\n') | |||
| fprintf(stderr, "invalid header"); | |||
| header.pop_back(); | |||
| // parse the dictionary | |||
| std::vector<std::string> keys{"descr", "fortran_order", "shape"}; | |||
| auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
| if (dict_map.size() == 0) | |||
| fprintf(stderr, "invalid dictionary in header"); | |||
| std::string descr_s = dict_map["descr"]; | |||
| std::string fortran_s = dict_map["fortran_order"]; | |||
| std::string shape_s = dict_map["shape"]; | |||
| // TODO: extract info from typestring | |||
| parse_typestring(descr_s); | |||
| // remove | |||
| descr = npy::pyparse::parse_str(descr_s); | |||
| // convert literal Python bool to C++ bool | |||
| fortran_order = npy::pyparse::parse_bool(fortran_s); | |||
| // parse the shape tuple | |||
| auto shape_v = npy::pyparse::parse_tuple(shape_s); | |||
| if (shape_v.size() == 0) | |||
| fprintf(stderr, "invalid shape tuple in header"); | |||
| for (auto item : shape_v) { | |||
| ndarray_len_t dim = static_cast<ndarray_len_t>(std::stoul(item)); | |||
| shape.push_back(dim); | |||
| } | |||
| } | |||
| inline std::string write_header_dict( | |||
| const std::string& descr, bool fortran_order, | |||
| const std::vector<ndarray_len_t>& shape) { | |||
| std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); | |||
| std::string shape_s = npy::pyparse::write_tuple(shape); | |||
| return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + | |||
| ", 'shape': " + shape_s + ", }"; | |||
| } | |||
| inline void write_header( | |||
| std::ostream& out, const std::string& descr, bool fortran_order, | |||
| const std::vector<ndarray_len_t>& shape_v) { | |||
| std::string header_dict = write_header_dict(descr, fortran_order, shape_v); | |||
| size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; | |||
| unsigned char version[2] = {1, 0}; | |||
| if (length >= 255 * 255) { | |||
| length = magic_string_length + 2 + 4 + header_dict.length() + 1; | |||
| version[0] = 2; | |||
| version[1] = 0; | |||
| } | |||
| size_t padding_len = 16 - length % 16; | |||
| std::string padding(padding_len, ' '); | |||
| // write magic | |||
| write_magic(out, version[0], version[1]); | |||
| // write header length | |||
| if (version[0] == 1 && version[1] == 0) { | |||
| char header_len_le16[2]; | |||
| uint16_t header_len = | |||
| static_cast<uint16_t>(header_dict.length() + padding.length() + 1); | |||
| header_len_le16[0] = (header_len >> 0) & 0xff; | |||
| header_len_le16[1] = (header_len >> 8) & 0xff; | |||
| out.write(reinterpret_cast<char*>(header_len_le16), 2); | |||
| } else { | |||
| char header_len_le32[4]; | |||
| uint32_t header_len = | |||
| static_cast<uint32_t>(header_dict.length() + padding.length() + 1); | |||
| header_len_le32[0] = (header_len >> 0) & 0xff; | |||
| header_len_le32[1] = (header_len >> 8) & 0xff; | |||
| header_len_le32[2] = (header_len >> 16) & 0xff; | |||
| header_len_le32[3] = (header_len >> 24) & 0xff; | |||
| out.write(reinterpret_cast<char*>(header_len_le32), 4); | |||
| } | |||
| out << header_dict << padding << '\n'; | |||
| } | |||
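| // Worked example (assumed values): with descr "<f4", fortran_order False and | |||
| // shape (3, 4), write_header_dict() yields a 59-char dict string, so | |||
| // length = 6 (magic) + 2 (version) + 2 (HEADER_LEN field) + 59 + 1 ('\n') = 70, | |||
| // padding_len = 16 - 70 % 16 = 10 and HEADER_LEN = 59 + 10 + 1 = 70; the full | |||
| // header block is then 80 bytes, a multiple of 16 as the format requires. | |||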
| inline std::string read_header(std::istream& istream) { | |||
| // check magic bytes and version number | |||
| unsigned char v_major, v_minor; | |||
| read_magic(istream, v_major, v_minor); | |||
| uint32_t header_length = 0; | |||
| if (v_major == 1 && v_minor == 0) { | |||
| unsigned char header_len_le16[2]; | |||
| istream.read(reinterpret_cast<char*>(header_len_le16), 2); | |||
| // use unsigned bytes to avoid sign extension when widening | |||
| header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); | |||
| if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { | |||
| // TODO: display warning | |||
| } | |||
| } else if (v_major == 2 && v_minor == 0) { | |||
| unsigned char header_len_le32[4]; | |||
| istream.read(reinterpret_cast<char*>(header_len_le32), 4); | |||
| // use unsigned bytes to avoid sign extension when widening | |||
| header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) | | |||
| (header_len_le32[2] << 16) | (header_len_le32[3] << 24); | |||
| if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { | |||
| // TODO: display warning | |||
| } | |||
| } else { | |||
| fprintf(stderr, "unsupported file format version"); | |||
| } | |||
| auto buf_v = std::vector<char>(); | |||
| buf_v.resize(header_length); // resize (not reserve) so data() is valid to fill | |||
| istream.read(buf_v.data(), header_length); | |||
| std::string header(buf_v.data(), header_length); | |||
| return header; | |||
| } | |||
| inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) { | |||
| ndarray_len_t size = 1; | |||
| for (ndarray_len_t i : shape) | |||
| size *= i; | |||
| return size; | |||
| } | |||
| template <typename Scalar> | |||
| inline void SaveArrayAsNumpy( | |||
| const std::string& filename, bool fortran_order, unsigned int n_dims, | |||
| const unsigned long shape[], const std::vector<Scalar>& data) { | |||
| Typestring typestring_o(data); | |||
| std::string typestring = typestring_o.str(); | |||
| std::ofstream stream(filename, std::ofstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::vector<ndarray_len_t> shape_v(shape, shape + n_dims); | |||
| write_header(stream, typestring, fortran_order, shape_v); | |||
| auto size = static_cast<size_t>(comp_size(shape_v)); | |||
| stream.write(reinterpret_cast<const char*>(data.data()), sizeof(Scalar) * size); | |||
| } | |||
| template <typename Scalar> | |||
| inline void LoadArrayFromNumpy( | |||
| const std::string& filename, std::vector<unsigned long>& shape, | |||
| std::vector<Scalar>& data) { | |||
| bool fortran_order; | |||
| LoadArrayFromNumpy<Scalar>(filename, shape, fortran_order, data); | |||
| } | |||
| template <typename Scalar> | |||
| inline void LoadArrayFromNumpy( | |||
| const std::string& filename, std::vector<unsigned long>& shape, | |||
| bool& fortran_order, std::vector<Scalar>& data) { | |||
| std::ifstream stream(filename, std::ifstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::string header = read_header(stream); | |||
| // parse header | |||
| std::string typestr; | |||
| parse_header(header, typestr, fortran_order, shape); | |||
| // check if the typestring matches the given one | |||
| Typestring typestring_o{data}; | |||
| std::string expect_typestr = typestring_o.str(); | |||
| if (typestr != expect_typestr) { | |||
| fprintf(stderr, "formatting error: typestrings not matching"); | |||
| } | |||
| // compute the data size based on the shape | |||
| auto size = static_cast<size_t>(comp_size(shape)); | |||
| data.resize(size); | |||
| // read the data | |||
| stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar) * size); | |||
| } | |||
| inline void LoadArrayFromNumpy( | |||
| const std::string& filename, std::string& type_str, | |||
| std::vector<ndarray_len_t>& shape, std::vector<int8_t>& data) { | |||
| std::ifstream stream(filename, std::ifstream::binary); | |||
| if (!stream) { | |||
| fprintf(stderr, "io error: failed to open a file."); | |||
| } | |||
| std::string header = read_header(stream); | |||
| bool fortran_order; | |||
| // parse header | |||
| parse_header(header, type_str, fortran_order, shape); | |||
| // element size in bytes: the digits after the endian and type characters | |||
| std::string size_str = type_str.substr(2); | |||
| size_t elem_size = atoi(size_str.c_str()); | |||
| // compute the data size based on the shape | |||
| auto byte_size = elem_size * static_cast<size_t>(comp_size(shape)); | |||
| data.resize(byte_size); | |||
| // read the data | |||
| stream.read(reinterpret_cast<char*>(data.data()), byte_size); | |||
| } | |||
| } // namespace npy | |||
| #endif // NPY_H | |||
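| A round-trip sketch for the save/load entry points above; the file name "data.npy" is illustrative: | |||
| #include <vector> | |||
| #include "npy.h" | |||
| void npy_roundtrip() { | |||
|     std::vector<float> src{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}; | |||
|     unsigned long shape[] = {2, 3}; | |||
|     npy::SaveArrayAsNumpy("data.npy", false, 2, shape, src); | |||
|     std::vector<unsigned long> loaded_shape; | |||
|     std::vector<float> dst; | |||
|     npy::LoadArrayFromNumpy("data.npy", loaded_shape, dst); | |||
|     // expect loaded_shape == {2, 3} and dst == src | |||
| } | |||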
| @@ -0,0 +1,48 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/outdumper.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "outdumper.h" | |||
| #include "megbrain/utils/debug.h" | |||
| using namespace lar; | |||
| void OutputDumper::set(mgb::SymbolVarArray& symb_var) { | |||
| for (auto&& i : symb_var) { | |||
| auto&& var = i.node(); | |||
| DumpInfo info; | |||
| info.var_info = mgb::cg::dump_var_info({var}); | |||
| info.owner_inputs_info = mgb::cg::dump_var_info(var->owner_opr()->input()); | |||
| info.id = var->id(); | |||
| m_infos.push_back(info); | |||
| } | |||
| } | |||
| mgb::ComputingGraph::Callback OutputDumper::bind() { | |||
| auto& info = m_infos.at(m_bind_id++); | |||
| mgb::ComputingGraph::Callback cb = [&info](const mgb::DeviceTensorND& dv) { | |||
| info.hv.copy_from(dv); | |||
| }; | |||
| return cb; | |||
| } | |||
| void OutputDumper::write_to_file() { | |||
| if (!dump_file.empty()) { | |||
| for (auto&& info : m_infos) { | |||
| auto value = mgb::debug::dump_tensor( | |||
| info.hv, | |||
| mgb::ssprintf( | |||
| "var=%s owner_opr_inputs= %s", info.var_info.c_str(), | |||
| info.owner_inputs_info.c_str())); | |||
| mgb::debug::write_to_file( | |||
| mgb::ssprintf( | |||
| "%s/run%zu-var %zd", dump_file.c_str(), m_run_id, info.id) | |||
| .c_str(), | |||
| value); | |||
| } | |||
| } | |||
| m_run_id++; | |||
| } | |||
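| A usage sketch for OutputDumper; the output directory "/tmp/out" is illustrative and load_result is assumed to come from a GraphLoader, as in model_mdl.cpp below: | |||
| #include "outdumper.h" | |||
| #include "megbrain/serialization/serializer.h" | |||
| void dump_all_outputs(mgb::serialization::GraphLoader::LoadResult& load_result) { | |||
|     lar::OutputDumper dumper("/tmp/out"); | |||
|     dumper.set(load_result.output_var_list); | |||
|     mgb::ComputingGraph::OutputSpec spec; | |||
|     for (auto&& var : load_result.output_var_list) | |||
|         spec.emplace_back(var, dumper.bind()); // one callback per output var | |||
|     auto func = load_result.graph_compile(spec); | |||
|     func->execute(); | |||
|     func->wait(); | |||
|     dumper.write_to_file(); // emits run<N>-var<id> files under /tmp/out | |||
| } | |||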
| @@ -0,0 +1,42 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/outdumper.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include "megbrain/serialization/serializer.h" | |||
| namespace lar { | |||
| /*! | |||
| * \brief dumper for model outputs, used for --bin-out-dump | |||
| */ | |||
| class OutputDumper { | |||
| public: | |||
| struct DumpInfo { | |||
| mgb::HostTensorND hv = {}; | |||
| std::string var_info; | |||
| std::string owner_inputs_info; | |||
| size_t id; | |||
| }; | |||
| //! init the dump_file path | |||
| OutputDumper(const char* file) { dump_file = file; } | |||
| //! set the dump information | |||
| void set(mgb::SymbolVarArray& symb_var); | |||
| //! callback function for specifying outputs when compiling the computing graph | |||
| mgb::ComputingGraph::Callback bind(); | |||
| //! write dumped output into dump_file | |||
| void write_to_file(); | |||
| private: | |||
| mgb::SmallVector<DumpInfo> m_infos; | |||
| size_t m_run_id = 0; | |||
| size_t m_bind_id = 0; | |||
| std::string dump_file; | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,119 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/text_table.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "text_table.h" | |||
| using namespace mgb; | |||
| namespace { | |||
| inline void mid(std::ostream& os, const std::string& str, size_t max_w) { | |||
| size_t l = (max_w - str.length()) / 2 + str.length(); | |||
| size_t r = max_w - l; | |||
| os << std::setw(l) << std::right << str; | |||
| if (r > 0) | |||
| os << std::setw(r) << ' '; | |||
| } | |||
| inline size_t char_length(char c) { | |||
| return c ? 1 : 0; | |||
| } | |||
| } // namespace | |||
| void TextTable::adjuster_last_row() { | |||
| if (m_rows.empty()) | |||
| return; | |||
| auto& row = m_rows.back(); | |||
| if (row.params.horizontal == 0 or row.params.vertical == 0) { | |||
| row.params.corner = 0; | |||
| } | |||
| if (row.params.horizontal != 0 && row.params.vertical != 0 && | |||
| row.params.corner == 0) { | |||
| row.params.corner = row.params.horizontal; | |||
| } | |||
| } | |||
| void TextTable::show(std::ostream& os) { | |||
| if (m_rows.empty()) | |||
| return; | |||
| auto& last_row = m_rows.front(); | |||
| bool first = true; | |||
| for (auto& row : m_rows) { | |||
| auto& lrow = | |||
| (last_row.values.size() * char_length(last_row.params.horizontal)) > | |||
| (row.values.size() * char_length(row.params.horizontal)) | |||
| ? last_row | |||
| : row; | |||
| // line before row | |||
| if (lrow.params.horizontal) { | |||
| if (not first) | |||
| os << std::endl; | |||
| os << m_prefix; | |||
| if (lrow.params.corner) | |||
| os << lrow.params.corner; | |||
| size_t skip_size = 0; | |||
| // table name | |||
| if (first) { | |||
| os << m_name; | |||
| skip_size = m_name.length(); | |||
| } | |||
| for (size_t i = 0; i < lrow.values.size(); ++i) { | |||
| auto max_w = m_cols_max_w.at(i) + m_padding * 2; | |||
| if (max_w + char_length(lrow.params.corner) <= skip_size) { | |||
| skip_size = skip_size - max_w - char_length(lrow.params.corner); | |||
| continue; | |||
| } | |||
| size_t rest = max_w + char_length(lrow.params.corner) - skip_size; | |||
| skip_size = 0; | |||
| if (rest > char_length(lrow.params.corner)) { | |||
| os << std::string( | |||
| rest - char_length(lrow.params.corner), | |||
| lrow.params.horizontal); | |||
| rest = char_length(lrow.params.corner); | |||
| } | |||
| if (rest > 0 && lrow.params.corner) | |||
| os << lrow.params.corner; | |||
| } | |||
| } else if (first) { | |||
| os << m_prefix << ' ' << m_name; | |||
| } | |||
| first = false; | |||
| os << std::endl << m_prefix; | |||
| if (row.params.vertical) | |||
| os << row.params.vertical; | |||
| // row | |||
| for (size_t i = 0; i < row.values.size(); ++i) { | |||
| auto& str = row.values.at(i); | |||
| auto max_w = m_cols_max_w.at(i) + 2 * m_padding; | |||
| if (row.params.align == Align::Mid) { | |||
| mid(os, str, max_w); | |||
| } else if (row.params.align == Align::Left) { | |||
| os << std::setw(max_w) << std::left << str; | |||
| } else { | |||
| os << std::setw(max_w) << std::right << str; | |||
| } | |||
| if (row.params.vertical) | |||
| os << row.params.vertical; | |||
| } | |||
| last_row = row; | |||
| } | |||
| if (last_row.params.horizontal) { | |||
| os << std::endl << m_prefix; | |||
| if (last_row.params.corner) | |||
| os << last_row.params.corner; | |||
| for (size_t i = 0; i < last_row.values.size(); ++i) { | |||
| auto max_w = m_cols_max_w.at(i); | |||
| std::string tmp(max_w + m_padding * 2, last_row.params.horizontal); | |||
| os << tmp; | |||
| if (last_row.params.corner) | |||
| os << last_row.params.corner; | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,133 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/helpers/text_table.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include <array> | |||
| #include <iomanip> | |||
| #include <ostream> | |||
| #include <sstream> | |||
| #include <string> | |||
| #include <tuple> | |||
| #include <type_traits> | |||
| #include <vector> | |||
| #include "megbrain/common.h" | |||
| namespace mgb { | |||
| class TextTable { | |||
| public: | |||
| enum Level { Summary, Detail }; | |||
| enum class Align : int { Left, Right, Mid }; | |||
| explicit TextTable(const std::string& table_name) : m_name(table_name) {} | |||
| TextTable& horizontal(char c) { | |||
| m_row.params.horizontal = c; | |||
| return *this; | |||
| } | |||
| TextTable& vertical(char c) { | |||
| m_row.params.vertical = c; | |||
| return *this; | |||
| } | |||
| TextTable& corner(char c) { | |||
| m_row.params.corner = c; | |||
| return *this; | |||
| } | |||
| TextTable& align(Align v) { | |||
| m_row.params.align = v; | |||
| return *this; | |||
| } | |||
| TextTable& padding(size_t w) { | |||
| m_padding = w; | |||
| return *this; | |||
| } | |||
| TextTable& prefix(const std::string& str) { | |||
| m_prefix = str; | |||
| return *this; | |||
| } | |||
| //! string overload; keeping it non-template avoids overload | |||
| //! ambiguity with the arithmetic overloads below | |||
| TextTable& add(const std::string& value) { | |||
| m_row.values.emplace_back(value); | |||
| if (m_cols_max_w.size() < m_row.values.size()) { | |||
| m_cols_max_w.emplace_back(m_row.values.back().length()); | |||
| } else { | |||
| mgb_assert(m_row.values.size() >= 1); | |||
| size_t i = m_row.values.size() - 1; | |||
| m_cols_max_w[i] = std::max(m_cols_max_w[i], m_row.values.back().length()); | |||
| } | |||
| return *this; | |||
| } | |||
| template < | |||
| typename T, | |||
| typename std::enable_if<std::is_floating_point<T>::value, bool>::type = 0> | |||
| TextTable& add(const T& value) { | |||
| std::stringstream ss; | |||
| ss << std::setiosflags(std::ios::fixed) << std::setprecision(2); | |||
| ss << value; | |||
| m_row.values.emplace_back(ss.str()); | |||
| if (m_cols_max_w.size() < m_row.values.size()) { | |||
| m_cols_max_w.emplace_back(m_row.values.back().length()); | |||
| } else { | |||
| mgb_assert(m_row.values.size() >= 1); | |||
| size_t i = m_row.values.size() - 1; | |||
| m_cols_max_w[i] = std::max(m_cols_max_w[i], m_row.values.back().length()); | |||
| } | |||
| return *this; | |||
| } | |||
| template < | |||
| typename T, | |||
| typename std::enable_if<std::is_integral<T>::value, bool>::type = 0> | |||
| TextTable& add(const T& value) { | |||
| m_row.values.emplace_back(std::to_string(value)); | |||
| // keep column widths in sync, as the other overloads do | |||
| if (m_cols_max_w.size() < m_row.values.size()) { | |||
| m_cols_max_w.emplace_back(m_row.values.back().length()); | |||
| } else { | |||
| size_t i = m_row.values.size() - 1; | |||
| m_cols_max_w[i] = std::max(m_cols_max_w[i], m_row.values.back().length()); | |||
| } | |||
| return *this; | |||
| } | |||
| void eor() { | |||
| m_rows.emplace_back(m_row); | |||
| adjuster_last_row(); | |||
| m_row.values.clear(); | |||
| } | |||
| void reset() { | |||
| m_row = {}; | |||
| m_cols_max_w.clear(); | |||
| m_padding = 0; | |||
| m_rows.clear(); | |||
| } | |||
| void show(std::ostream& os); | |||
| private: | |||
| void adjuster_last_row(); | |||
| std::string m_name; | |||
| std::vector<size_t> m_cols_max_w; | |||
| size_t m_padding = 0; | |||
| std::string m_prefix = ""; | |||
| struct Row { | |||
| std::vector<std::string> values; | |||
| struct Params { | |||
| Align align = Align::Left; | |||
| char horizontal = '-', vertical = '|', corner = '+'; | |||
| } params; | |||
| }; | |||
| std::vector<Row> m_rows; | |||
| Row m_row; | |||
| }; | |||
| inline std::ostream& operator<<(std::ostream& stream, TextTable& table) { | |||
| table.show(stream); | |||
| return stream; | |||
| } | |||
| } // namespace mgb | |||
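| A usage sketch for TextTable; the row contents are illustrative: | |||
| #include <iostream> | |||
| #include "text_table.h" | |||
| void table_demo() { | |||
|     mgb::TextTable table("bench"); | |||
|     table.padding(1); | |||
|     table.add("opr").add("time(ms)"); | |||
|     table.eor(); // close the header row | |||
|     table.add("conv1").add("1.25"); | |||
|     table.eor(); | |||
|     std::cout << table << std::endl; // operator<< calls table.show() | |||
| } | |||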
| @@ -0,0 +1,31 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/main.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <gflags/gflags.h> | |||
| #include <string> | |||
| #include "strategys/strategy.h" | |||
| int main(int argc, char** argv) { | |||
| std::string usage = "load_and_run <model_path> [options...]"; | |||
| if (argc < 2) { | |||
| printf("usage: %s\n", usage.c_str()); | |||
| return -1; | |||
| } | |||
| gflags::SetUsageMessage(usage); | |||
| gflags::SetVersionString("1.0"); | |||
| gflags::ParseCommandLineFlags(&argc, &argv, true); | |||
| std::string model_path = argv[1]; | |||
| auto strategy = lar::StrategyBase::create_strategy(model_path); | |||
| strategy->run(); | |||
| gflags::ShutDownCommandLineFlags(); | |||
| return 0; | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
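| Typical invocations look like the following; the model file name is illustrative, --lite is defined in model.cpp below, and the device flags (--cpu, --cuda, --multithread) in device_options.cpp: | |||
| load_and_run model.mge | |||
| load_and_run model.mge --lite | |||
| load_and_run model.mge --cpu | |||
| load_and_run model.mge --multithread 4 | |||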
| @@ -0,0 +1,60 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/models/model.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "model.h" | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "model_lite.h" | |||
| #include "model_mdl.h" | |||
| using namespace lar; | |||
| ModelType ModelBase::get_model_type(std::string model_path) { | |||
| //! read magic number of dump file | |||
| FILE* fin = fopen(model_path.c_str(), "rb"); | |||
| mgb_assert(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno)); | |||
| char buf[16]; | |||
| mgb_assert(fread(buf, 1, 16, fin) == 16, "read model failed"); | |||
| fclose(fin); | |||
| // get model type | |||
| // uint32_t MGB_MAGIC = 0x5342474D | |||
| std::string tag(buf, sizeof(buf)); // buf is not null-terminated | |||
| ModelType type; | |||
| if (tag.substr(0, 7) == std::string("mgb0001") || | |||
| tag.substr(0, 8) == std::string("mgb0000a") || | |||
| tag.substr(0, 4) == std::string("MGBS") || | |||
| tag.substr(0, 8) == std::string("mgbtest0")) { | |||
| type = ModelType::MEGDL_MODEL; | |||
| } else { | |||
| type = ModelType::LITE_MODEL; | |||
| } | |||
| return type; | |||
| } | |||
| std::shared_ptr<ModelBase> ModelBase::create_model(std::string model_path) { | |||
| mgb_log_debug("model path %s\n", model_path.c_str()); | |||
| auto model_type = get_model_type(model_path); | |||
| if (ModelType::LITE_MODEL == model_type) { | |||
| return std::make_shared<ModelLite>(model_path); | |||
| } else if (ModelType::MEGDL_MODEL == model_type) { | |||
| if (FLAGS_lite) | |||
| return std::make_shared<ModelLite>(model_path); | |||
| else | |||
| return std::make_shared<ModelMdl>(model_path); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| DEFINE_bool(lite, false, "use the lite runtime to load and run an mdl model"); | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
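| A minimal driver sketch around ModelBase; the path is illustrative, and note that for an mdl model the run strategy still has to build the output spec (ModelMdl::make_output_spec) between load and run: | |||
| #include "model.h" | |||
| int run_once(const std::string& path) { | |||
|     auto model = lar::ModelBase::create_model(path); | |||
|     if (!model) | |||
|         return -1; | |||
|     model->set_shared_mem(false); | |||
|     model->load_model(); | |||
|     // ... graph compilation happens here for mdl models ... | |||
|     model->run_model(); | |||
|     model->wait(); // block until the asynchronous execution finishes | |||
|     return 0; | |||
| } | |||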
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/models/model.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include <string> | |||
| #include "helpers/common.h" | |||
| DECLARE_bool(lite); | |||
| namespace lar { | |||
| /*! | |||
| * \brief: base class of model | |||
| */ | |||
| class ModelBase { | |||
| public: | |||
| //! get model type by the magic number in dump file | |||
| static ModelType get_model_type(std::string model_path); | |||
| //! create model by different model type | |||
| static std::shared_ptr<ModelBase> create_model(std::string model_path); | |||
| //! type of the model | |||
| virtual ModelType type() = 0; | |||
| //! set model load state | |||
| virtual void set_shared_mem(bool state) = 0; | |||
| //! load model interface for load and run strategy | |||
| virtual void load_model() = 0; | |||
| //! run model interface for load and run strategy | |||
| virtual void run_model() = 0; | |||
| //! wait asynchronous function interface for load and run strategy | |||
| virtual void wait() = 0; | |||
| virtual ~ModelBase() = default; | |||
| }; | |||
| } // namespace lar | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/models/model_lite.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "model_lite.h" | |||
| #include <gflags/gflags.h> | |||
| #include <cstring> | |||
| #include "misc.h" | |||
| DECLARE_bool(share_param_mem); | |||
| using namespace lar; | |||
| ModelLite::ModelLite(const std::string& path) : model_path(path) { | |||
| LITE_WARN("create lite model using CPU as the default comp node"); | |||
| } | |||
| void ModelLite::load_model() { | |||
| m_network = std::make_shared<lite::Network>(config, IO); | |||
| if (share_model_mem) { | |||
| //! WARNING: sharing param memory this way may not always be safe | |||
| LITE_WARN("enable share model memory"); | |||
| FILE* fin = fopen(model_path.c_str(), "rb"); | |||
| LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno)); | |||
| fseek(fin, 0, SEEK_END); | |||
| size_t size = ftell(fin); | |||
| fseek(fin, 0, SEEK_SET); | |||
| void* ptr = malloc(size); | |||
| std::shared_ptr<void> buf{ptr, free}; | |||
| auto nr = fread(buf.get(), 1, size, fin); | |||
| LITE_ASSERT(nr == size, "read model file failed"); | |||
| fclose(fin); | |||
| m_network->load_model(buf.get(), size); | |||
| } else { | |||
| m_network->load_model(model_path); | |||
| } | |||
| } | |||
| void ModelLite::run_model() { | |||
| m_network->forward(); | |||
| } | |||
| void ModelLite::wait() { | |||
| m_network->wait(); | |||
| } | |||
| @@ -0,0 +1,73 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/models/model_lite.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <string> | |||
| #include "helpers/common.h" | |||
| #include "helpers/data_parser.h" | |||
| #include "lite/network.h" | |||
| #include "model.h" | |||
| namespace lar { | |||
| /*! | |||
| * \brief: megengine lite model | |||
| */ | |||
| class ModelLite : public ModelBase { | |||
| public: | |||
| using Strategy = LiteAlgoSelectStrategy; | |||
| ModelLite(const std::string& path); | |||
| //! model type | |||
| ModelType type() override { return ModelType::LITE_MODEL; } | |||
| //! set to load from shared memory | |||
| void set_shared_mem(bool state) override { share_model_mem = state; } | |||
| //! load model from dump file | |||
| void load_model() override; | |||
| //! run model with given runtime parameter | |||
| void run_model() override; | |||
| //! wait the end of asynchronous function execution | |||
| void wait() override; | |||
| //! get the network of lite model | |||
| std::shared_ptr<lite::Network> get_lite_network() { return m_network; } | |||
| //! get the config of lite model | |||
| lite::Config& get_config() { return config; } | |||
| //! get the networkIO of lite model | |||
| lite::NetworkIO& get_networkIO() { return IO; } | |||
| //! get the data parser | |||
| DataParser& get_input_parser() { return parser; } | |||
| //! set the strategy before load model | |||
| void set_lite_strategy(Strategy& u_strategy) { m_strategy = u_strategy; } | |||
| //! get algo strategy | |||
| Strategy& get_lite_strategy() { return m_strategy; } | |||
| private: | |||
| bool share_model_mem; | |||
| std::string model_path; | |||
| DataParser parser; | |||
| lite::Config config; | |||
| lite::NetworkIO IO; | |||
| std::shared_ptr<lite::Network> m_network; | |||
| Strategy m_strategy; | |||
| }; | |||
| } // namespace lar | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,105 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/models/model_mdl.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "model_mdl.h" | |||
| #include <gflags/gflags.h> | |||
| #include <iostream> | |||
| DECLARE_bool(share_param_mem); | |||
| using namespace lar; | |||
| ModelMdl::ModelMdl(const std::string& path) : model_path(path) { | |||
| mgb_log_warn("create mdl model using XPU as the default comp node"); | |||
| m_load_config.comp_graph = mgb::ComputingGraph::make(); | |||
| m_load_config.comp_graph->options().graph_opt_level = 0; | |||
| testcase_num = 0; | |||
| } | |||
| void ModelMdl::load_model() { | |||
| //! read dump file | |||
| if (share_model_mem) { | |||
| mgb_log_warn("enable share model memory"); | |||
| FILE* fin = fopen(model_path.c_str(), "rb"); | |||
| mgb_assert(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno)); | |||
| fseek(fin, 0, SEEK_END); | |||
| size_t size = ftell(fin); | |||
| fseek(fin, 0, SEEK_SET); | |||
| void* ptr = malloc(size); | |||
| std::shared_ptr<void> buf{ptr, free}; | |||
| auto nr = fread(buf.get(), 1, size, fin); | |||
| mgb_assert(nr == size, "read model file failed"); | |||
| fclose(fin); | |||
| m_model_file = mgb::serialization::InputFile::make_mem_proxy(buf, size); | |||
| } else { | |||
| m_model_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); | |||
| } | |||
| //! get dump_with_testcase model testcase number | |||
| char magic[8]; | |||
| m_model_file->read(magic, sizeof(magic)); | |||
| if (strncmp(magic, "mgbtest0", 8)) { | |||
| m_model_file->rewind(); | |||
| } else { | |||
| m_model_file->read(&testcase_num, sizeof(testcase_num)); | |||
| } | |||
| auto format = | |||
| mgb::serialization::GraphLoader::identify_graph_dump_format(*m_model_file); | |||
| mgb_assert( | |||
| format.valid(), | |||
| "invalid format, please make sure model is dumped by GraphDumper"); | |||
| //! load computing graph of model | |||
| m_loader = mgb::serialization::GraphLoader::make( | |||
| std::move(m_model_file), format.val()); | |||
| m_load_result = m_loader->load(m_load_config, false); | |||
| m_load_config.comp_graph.reset(); | |||
| // get testcase input generated by dump_with_testcase.py | |||
| if (testcase_num) { | |||
| for (auto&& i : m_load_result.tensor_map) { | |||
| test_input_tensors.emplace_back(i.first, i.second.get()); | |||
| } | |||
| std::sort(test_input_tensors.begin(), test_input_tensors.end()); | |||
| } | |||
| // initialize output callback | |||
| for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) { | |||
| mgb::ComputingGraph::Callback cb; | |||
| m_callbacks.push_back(cb); | |||
| } | |||
| } | |||
| void ModelMdl::make_output_spec() { | |||
| for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) { | |||
| auto item = m_load_result.output_var_list[i]; | |||
| m_output_spec.emplace_back(item, std::move(m_callbacks[i])); | |||
| } | |||
| m_asyc_exec = m_load_result.graph_compile(m_output_spec); | |||
| } | |||
| std::shared_ptr<mgb::serialization::GraphLoader>& ModelMdl::reset_loader() { | |||
| m_loader = mgb::serialization::GraphLoader::make( | |||
| m_loader->reset_file(), m_loader->format()); | |||
| return m_loader; | |||
| } | |||
| void ModelMdl::run_model() { | |||
| mgb_assert( | |||
| m_asyc_exec != nullptr, | |||
| "empty asynchronous function to execute after graph compiled"); | |||
| m_asyc_exec->execute(); | |||
| } | |||
| void ModelMdl::wait() { | |||
| m_asyc_exec->wait(); | |||
| } | |||
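| The full ModelMdl life cycle as the strategies drive it (sketch; the path is illustrative): | |||
| #include "model_mdl.h" | |||
| void run_mdl(const std::string& path) { | |||
|     lar::ModelMdl model(path); | |||
|     model.set_shared_mem(false); | |||
|     model.load_model();       // reads magic and testcase count, then the graph | |||
|     model.make_output_spec(); // compiles the graph into an AsyncExecutable | |||
|     model.run_model(); | |||
|     model.wait(); | |||
| } | |||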
| @@ -0,0 +1,117 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/models/model_mdl.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <string> | |||
| #include "megbrain/opr/search_policy/algo_chooser_helper.h" | |||
| #include "megbrain/plugin/opr_io_dump.h" | |||
| #include "megbrain/serialization/extern_c_opr.h" | |||
| #include "megbrain/serialization/serializer.h" | |||
| #include "megbrain/utils/debug.h" | |||
| #include "megbrain/plugin/num_range_checker.h" | |||
| #include "megbrain/plugin/profiler.h" | |||
| #include "helpers/common.h" | |||
| #include "helpers/data_parser.h" | |||
| #include "model.h" | |||
| namespace lar { | |||
| class ModelMdl : public ModelBase { | |||
| public: | |||
| using Strategy = mgb::opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
| //! interface implement of ModelBase | |||
| ModelMdl(const std::string& path); | |||
| ModelType type() override { return ModelType::MEGDL_MODEL; } | |||
| void set_shared_mem(bool state) override { share_model_mem = state; } | |||
| void load_model() override; | |||
| void make_output_spec(); | |||
| void run_model() override; | |||
| void wait() override; | |||
| //! get load result for megDL model | |||
| mgb::serialization::GraphLoader::LoadResult& get_mdl_load_result() { | |||
| return m_load_result; | |||
| } | |||
| //! get load config for megDL model | |||
| mgb::serialization::GraphLoadConfig& get_mdl_config() { return m_load_config; } | |||
| //! reset the graph loader for dump_with_testcase model | |||
| std::shared_ptr<mgb::serialization::GraphLoader>& reset_loader(); | |||
| //! algo strategy for running the model | |||
| void set_mdl_strategy(Strategy& u_strategy) { m_strategy = u_strategy; } | |||
| Strategy& get_mdl_strategy() { return m_strategy; } | |||
| //! get data parser | |||
| DataParser& get_input_parser() { return parser; } | |||
| uint32_t get_testcase_num() { return testcase_num; } | |||
| std::vector<std::pair<std::string, mgb::HostTensorND*>>& get_test_input() { | |||
| return test_input_tensors; | |||
| } | |||
| //! get output specified configuration | |||
| mgb::ComputingGraph::OutputSpec& get_output_spec() { return m_output_spec; } | |||
| std::unique_ptr<mgb::cg::AsyncExecutable>& get_async_func() { return m_asyc_exec; } | |||
| void set_output_callback(std::vector<mgb::ComputingGraph::Callback>& cb) { | |||
| mgb_assert( | |||
| m_callbacks.size() == cb.size(), | |||
| "invalid output callback list to set!"); | |||
| for (size_t i = 0; i < cb.size(); i++) { | |||
| m_callbacks[i] = cb[i]; | |||
| } | |||
| } | |||
| #if MGB_ENABLE_JSON | |||
| std::unique_ptr<mgb::GraphProfiler>& get_profiler() { return m_profiler; } | |||
| void set_profiler() { | |||
| m_profiler = | |||
| std::make_unique<mgb::GraphProfiler>(m_load_config.comp_graph.get()); | |||
| } | |||
| #endif | |||
| void set_num_range_checker(float range) { | |||
| m_num_range_checker = std::make_unique<mgb::NumRangeChecker>( | |||
| m_load_config.comp_graph.get(), range); | |||
| } | |||
| private: | |||
| bool share_model_mem; | |||
| std::string model_path; | |||
| std::unique_ptr<mgb::serialization::InputFile> m_model_file; | |||
| mgb::serialization::GraphLoadConfig m_load_config; | |||
| mgb::serialization::GraphLoader::LoadResult m_load_result; | |||
| std::shared_ptr<mgb::serialization::GraphLoader> m_loader; | |||
| std::unique_ptr<mgb::cg::AsyncExecutable> m_asyc_exec; | |||
| uint32_t testcase_num; | |||
| std::vector<std::pair<std::string, mgb::HostTensorND*>> test_input_tensors; | |||
| DataParser parser; | |||
| Strategy m_strategy = Strategy::HEURISTIC; | |||
| std::vector<mgb::ComputingGraph::Callback> m_callbacks; | |||
| mgb::ComputingGraph::OutputSpec m_output_spec; | |||
| std::unique_ptr<mgb::NumRangeChecker> m_num_range_checker; | |||
| #if MGB_ENABLE_JSON | |||
| std::unique_ptr<mgb::GraphProfiler> m_profiler; | |||
| #endif | |||
| }; | |||
| } // namespace lar | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,200 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/device_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <iostream> | |||
| #include <sstream> | |||
| #include "lite/global.h" | |||
| #include "megbrain/comp_node_env.h" | |||
| #include "misc.h" | |||
| #include "device_options.h" | |||
| #include "models/model_lite.h" | |||
| #include "models/model_mdl.h" | |||
| DECLARE_bool(weight_preprocess); | |||
| using namespace lar; | |||
| /////////////////// XPUDeviceOption ////////////////////// | |||
| namespace lar { | |||
| template <> | |||
| void XPUDeviceOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if ((enable_cpu) || (enable_cpu_default) || (enable_multithread) || | |||
| (enable_multithread_default)) { | |||
| LITE_WARN("using cpu device\n"); | |||
| model->get_config().device_type = LiteDeviceType::LITE_CPU; | |||
| } | |||
| #if MGE_WITH_CUDA | |||
| if (enable_cuda) { | |||
| model->get_config().device_type = LiteDeviceType::LITE_CUDA; | |||
| } | |||
| #endif | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| auto network = model->get_lite_network(); | |||
| if (enable_cpu_default) { | |||
| LITE_WARN("using cpu default device\n"); | |||
| lite::Runtime::set_cpu_inplace_mode(network); | |||
| } | |||
| if (enable_multithread) { | |||
| LITE_WARN("using multithread device\n"); | |||
| lite::Runtime::set_cpu_threads_number(network, thread_num); | |||
| } | |||
| if (enable_multithread_default) { | |||
| LITE_WARN("using multithread default device\n"); | |||
| lite::Runtime::set_cpu_inplace_mode(network); | |||
| lite::Runtime::set_cpu_threads_number(network, thread_num); | |||
| } | |||
| if (enable_set_core_ids) { | |||
| std::string core_str; | |||
| for (auto id : core_ids) { | |||
| core_str += std::to_string(id) + ","; | |||
| } | |||
| LITE_WARN("multi thread core ids: %s\n", core_str.c_str()); | |||
| lite::ThreadAffinityCallback affinity_callback = [&](size_t thread_id) { | |||
| mgb::sys::set_cpu_affinity({core_ids[thread_id]}); | |||
| }; | |||
| lite::Runtime::set_runtime_thread_affinity(network, affinity_callback); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void XPUDeviceOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (enable_cpu) { | |||
| mgb_log_warn("using cpu device\n"); | |||
| model->get_mdl_config().comp_node_mapper = [](mgb::CompNode::Locator& loc) { | |||
| loc.type = mgb::CompNode::DeviceType::CPU; | |||
| }; | |||
| } | |||
| #if MGE_WITH_CUDA | |||
| if (enable_cuda) { | |||
| mgb_log_warn("using cuda device\n"); | |||
| model->get_mdl_config().comp_node_mapper = [](mgb::CompNode::Locator& loc) { | |||
| loc.type = mgb::CompNode::DeviceType::CUDA; | |||
| }; | |||
| } | |||
| #endif | |||
| if (enable_cpu_default) { | |||
| mgb_log_warn("using cpu default device\n"); | |||
| model->get_mdl_config().comp_node_mapper = [](mgb::CompNode::Locator& loc) { | |||
| loc.type = mgb::CompNode::DeviceType::CPU; | |||
| loc.device = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT; | |||
| }; | |||
| } | |||
| if (enable_multithread) { | |||
| mgb_log_warn("using multithread device\n"); | |||
| model->get_mdl_config().comp_node_mapper = | |||
| [&](mgb::CompNode::Locator& loc) { | |||
| loc.type = mgb::CompNode::DeviceType::MULTITHREAD; | |||
| loc.device = 0; | |||
| loc.stream = thread_num; | |||
| }; | |||
| } | |||
| if (enable_multithread_default) { | |||
| mgb_log_warn("using multithread default device\n"); | |||
| model->get_mdl_config().comp_node_mapper = | |||
| [&](mgb::CompNode::Locator& loc) { | |||
| loc.type = mgb::CompNode::DeviceType::MULTITHREAD; | |||
| loc.device = mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT; | |||
| loc.stream = thread_num; | |||
| }; | |||
| } | |||
| if (enable_set_core_ids) { | |||
| std::string core_str; | |||
| for (auto id : core_ids) { | |||
| core_str += std::to_string(id) + ","; | |||
| } | |||
| mgb_log_warn("set multi thread core ids:%s\n", core_str.c_str()); | |||
| auto affinity_callback = [&](size_t thread_id) { | |||
| mgb::sys::set_cpu_affinity({core_ids[thread_id]}); | |||
| }; | |||
| mgb::CompNode::Locator loc; | |||
| model->get_mdl_config().comp_node_mapper(loc); | |||
| auto comp_node = mgb::CompNode::load(loc); | |||
| mgb::CompNodeEnv::from_comp_node(comp_node).cpu_env().set_affinity( | |||
| affinity_callback); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| XPUDeviceOption::XPUDeviceOption() { | |||
| m_option_name = "xpu_device"; | |||
| enable_cpu = FLAGS_cpu; | |||
| #if MGE_WITH_CUDA | |||
| enable_cuda = FLAGS_cuda; | |||
| #endif | |||
| enable_cpu_default = FLAGS_cpu_default; | |||
| if (FLAGS_multithread >= 0) { | |||
| thread_num = FLAGS_multithread; | |||
| enable_multithread = true; | |||
| } | |||
| if (FLAGS_multithread_default >= 0) { | |||
| thread_num = FLAGS_multithread_default; | |||
| enable_multithread_default = true; | |||
| } | |||
| if (!FLAGS_multi_thread_core_ids.empty()) { | |||
| mgb_assert(enable_multithread, "core ids should be set after --multithread"); | |||
| std::stringstream id_stream(FLAGS_multi_thread_core_ids); | |||
| std::string id; | |||
| size_t thread_cnt = 0; | |||
| while (getline(id_stream, id, ',')) { | |||
| thread_cnt++; | |||
| core_ids.push_back(atoi(id.c_str())); | |||
| } | |||
| mgb_assert( | |||
| thread_cnt == thread_num, | |||
| "core ids number should be same with thread number set before"); | |||
| enable_set_core_ids = true; | |||
| } | |||
| } | |||
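| // Illustrative example (values not from the source): "--multithread 4 | |||
| // --multi_thread_core_ids 0,1,2,3" parses into thread_num = 4 and | |||
| // core_ids = {0, 1, 2, 3}; the asserts above reject a missing | |||
| // --multithread or a core id count that differs from thread_num. | |||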
| bool XPUDeviceOption::is_valid() { | |||
| bool ret = FLAGS_cpu || FLAGS_cpu_default; | |||
| #if MGE_WITH_CUDA | |||
| ret = ret || FLAGS_cuda; | |||
| #endif | |||
| ret = ret || FLAGS_multithread >= 0; | |||
| ret = ret || FLAGS_multithread_default >= 0; | |||
| ret = ret || !FLAGS_multi_thread_core_ids.empty(); | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> XPUDeviceOption::create_option() { | |||
| static std::shared_ptr<lar::XPUDeviceOption> option(new XPUDeviceOption); | |||
| if (XPUDeviceOption::is_valid()) { | |||
| return std::static_pointer_cast<lar::OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void XPUDeviceOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ///////////////////////// xpu gflags //////////////////////////// | |||
| DEFINE_bool(cpu, false, "set CPU device as running device"); | |||
| #if MGE_WITH_CUDA | |||
| DEFINE_bool(cuda, false, "set CUDA device as running device "); | |||
| #endif | |||
| DEFINE_bool(cpu_default, false, "set running device as CPU device with inplace mode"); | |||
| DEFINE_int32(multithread, -1, "set multithread device as running device"); | |||
| DEFINE_int32( | |||
| multithread_default, -1, | |||
| "set multithread device as running device with inplace mode"); | |||
| DEFINE_string(multi_thread_core_ids, "", "set multithread core id"); | |||
| REGIST_OPTION_CREATOR(xpu_device, lar::XPUDeviceOption::create_option); | |||
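| // Hypothetical invocations exercising the flags defined above (model | |||
| // file name is illustrative): | |||
| //   load_and_run model.mge --cpu | |||
| //   load_and_run model.mge --multithread 4 --multi_thread_core_ids 0,1,2,3 | |||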
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/device_options.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| DECLARE_bool(cpu); | |||
| #if MGE_WITH_CUDA | |||
| DECLARE_bool(cuda); | |||
| #endif | |||
| DECLARE_bool(cpu_default); | |||
| DECLARE_int32(multithread); | |||
| DECLARE_int32(multithread_default); | |||
| DECLARE_string(multi_thread_core_ids); | |||
| namespace lar { | |||
| class XPUDeviceOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| XPUDeviceOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| bool enable_cpu; | |||
| #if MGE_WITH_CUDA | |||
| bool enable_cuda; | |||
| #endif | |||
| bool enable_cpu_default; | |||
| bool enable_multithread; | |||
| bool enable_multithread_default; | |||
| bool enable_set_core_ids; | |||
| size_t thread_num; | |||
| std::vector<int> core_ids; | |||
| std::string m_option_name; | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,216 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/extern_c_opr_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "extern_c_opr_options.h" | |||
| #include "megbrain/utils/debug.h" | |||
| #include "misc.h" | |||
| #include "models/model_lite.h" | |||
| #include "models/model_mdl.h" | |||
| namespace lar { | |||
| template <> | |||
| void COprLibOption::config_model_internel( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| MGB_MARK_USED_VAR(model); | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (!lib_path.empty()) { | |||
| lite::set_loader_lib_path(lib_path); | |||
| } | |||
| if (c_opr_args.is_run_c_opr_with_param) { | |||
| LITE_THROW( | |||
| "lite model dont't support run with external c opr " | |||
| "parmeter"); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void COprLibOption::config_model_internel( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (!lib_path.empty()) { | |||
| load_lib(); | |||
| } | |||
| if (c_opr_args.is_run_c_opr_with_param) { | |||
| mgb_assert( | |||
| c_opr_args.is_run_c_opr && | |||
| c_opr_args.copr_param_device_ptr_malloc && | |||
| c_opr_args.copr_param_device_ptr_free && | |||
| c_opr_args.copr_param_device_ptr_h2d, | |||
| "--c-opr-lib-with-param need config with --c-opr-lib, also " | |||
| "extern c opr loader need implemente " | |||
| "copr_param_device_ptr_malloc, copr_param_device_ptr_free " | |||
| "and copr_param_device_ptr_h2d symbols"); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::MODEL_RUNNING) { | |||
| if (model->get_testcase_num() && c_opr_args.is_run_c_opr_with_param) { | |||
| init_extern_param(model); | |||
| set_Copr_IO(model); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_RUNNING_ITER) { | |||
| if (model->get_testcase_num() && c_opr_args.is_run_c_opr_with_param) { | |||
| c_opr_args.copr_param_device_ptr_free(c_opr_param.get()); | |||
| free(c_opr_param->input); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| using namespace lar; | |||
| MGBDType COprLibOption::dtype_cpp2c(megdnn::DType dtype) { | |||
| switch (dtype.enumv()) { | |||
| case megdnn::DTypeEnum::Float32: | |||
| return MGB_DTYPE_FLOAT32; | |||
| case megdnn::DTypeEnum::Int32: | |||
| return MGB_DTYPE_INT32; | |||
| case megdnn::DTypeEnum::Int16: | |||
| return MGB_DTYPE_INT16; | |||
| case megdnn::DTypeEnum::Uint8: | |||
| return MGB_DTYPE_UINT8; | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| case megdnn::DTypeEnum::Float16: | |||
| return MGB_DTYPE_FLOAT16; | |||
| #endif | |||
| default: | |||
| mgb_throw( | |||
| mgb::InternalError, "unsupported dtype for extern C API: %s", | |||
| dtype.name()); | |||
| } | |||
| } | |||
| void COprLibOption::tensor_shape_to_c( | |||
| const megdnn::TensorShape& shape, MGBTensorShape& mgb_shape) { | |||
| mgb_assert( | |||
| shape.ndim <= MGB_TENSOR_MAX_NDIM, "shape ndim too large: %zu", shape.ndim); | |||
| mgb_shape.ndim = shape.ndim; | |||
| for (size_t i = 0; i < shape.ndim; ++i) { | |||
| mgb_shape.shape[i] = shape[i]; | |||
| } | |||
| } | |||
| void COprLibOption::init_extern_param(std::shared_ptr<ModelBase> model_ptr) { | |||
| auto model = std::static_pointer_cast<ModelMdl>(model_ptr); | |||
| auto inp_tensors = model->get_test_input(); | |||
| c_opr_param = std::make_shared<ExternCOprParam>(); | |||
| memset(c_opr_param.get(), 0, sizeof(ExternCOprParam)); | |||
| //! only the input is tested in the npu case, not the output, so | |||
| //! only the input shapes and dtypes are initialized here | |||
| c_opr_param->nr_input = inp_tensors.size(); | |||
| c_opr_param->input = (ExternDeviceTensor*)malloc( | |||
| sizeof(ExternDeviceTensor) * inp_tensors.size()); | |||
| memset(c_opr_param->input, 0, sizeof(ExternDeviceTensor) * inp_tensors.size()); | |||
| //! init input ExternDeviceTensor shape and dtype | |||
| for (size_t input_idx = 0; input_idx < inp_tensors.size(); input_idx++) { | |||
| auto& mgb_tensor_layout = c_opr_param->input[input_idx].layout; | |||
| auto host_tensor_nd_p = inp_tensors[input_idx].second; | |||
| mgb_tensor_layout.dtype = dtype_cpp2c(host_tensor_nd_p->dtype()); | |||
| tensor_shape_to_c( | |||
| inp_tensors[input_idx].second->shape(), mgb_tensor_layout.shape); | |||
| } | |||
| c_opr_param->nr_output = 0; | |||
| //! now call copr_param_device_ptr_malloc to malloc | |||
| //! device_ptr | |||
| c_opr_args.copr_param_device_ptr_malloc(c_opr_param.get()); | |||
| } | |||
| void COprLibOption::load_lib() { | |||
| auto handle = dlopen(lib_path.c_str(), RTLD_LAZY); | |||
| mgb_assert(handle, "failed to open c opr lib %s: %s", lib_path.c_str(), dlerror()); | |||
| const char* entry = MGB_C_OPR_INIT_FUNC_STR; | |||
| auto func = dlsym(handle, entry); | |||
| mgb_assert(func, "can not resolve %s: %s", entry, dlerror()); | |||
| typedef void (*entry_f_t)(void*); | |||
| reinterpret_cast<entry_f_t>(func)( | |||
| reinterpret_cast<void*>(&mgb_get_extern_c_opr_api_versioned)); | |||
| printf("loaded C opr library: %s\n", lib_path.c_str()); | |||
| entry = "copr_param_device_ptr_malloc"; | |||
| func = dlsym(handle, entry); | |||
| if (func) { | |||
| printf("get %s from: %s\n", entry, lib_path.c_str()); | |||
| c_opr_args.copr_param_device_ptr_malloc = | |||
| reinterpret_cast<COprArgs::COPR_PARAM_DEVICE_PTR_MEM_T>(func); | |||
| } | |||
| entry = "copr_param_device_ptr_free"; | |||
| func = dlsym(handle, entry); | |||
| if (func) { | |||
| printf("get %s from: %s\n", entry, lib_path.c_str()); | |||
| c_opr_args.copr_param_device_ptr_free = | |||
| reinterpret_cast<COprArgs::COPR_PARAM_DEVICE_PTR_MEM_T>(func); | |||
| } | |||
| entry = "copr_param_device_ptr_h2d"; | |||
| func = dlsym(handle, entry); | |||
| if (func) { | |||
| printf("get %s from: %s\n", entry, lib_path.c_str()); | |||
| c_opr_args.copr_param_device_ptr_h2d = | |||
| reinterpret_cast<COprArgs::COPR_PARAM_DEVICE_PTR_H2D_T>(func); | |||
| } | |||
| } | |||
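| // Note: dlsym failures for the three copr_param_device_ptr_* symbols | |||
| // are tolerated here (the pointers stay nullptr); the assert in | |||
| // config_model_internel<ModelMdl> only demands them together with | |||
| // --c-opr-lib-with-param. | |||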
| void COprLibOption::set_Copr_IO(std::shared_ptr<ModelBase> model_ptr) { | |||
| auto model = std::static_pointer_cast<ModelMdl>(model_ptr); | |||
| auto inp_tensors = model->get_test_input(); | |||
| auto loader = model->reset_loader(); | |||
| auto testcase = loader->load(model->get_mdl_config(), false); | |||
| mgb_assert(testcase.output_var_list.size() == inp_tensors.size()); | |||
| for (size_t i = 0; i < inp_tensors.size(); ++i) { | |||
| auto&& opr = testcase.output_var_list[i] | |||
| .node() | |||
| ->owner_opr() | |||
| ->cast_final_safe<mgb::opr::SharedDeviceTensor>(); | |||
| c_opr_args.copr_param_device_ptr_h2d( | |||
| c_opr_param.get(), opr.dev_data()->raw_ptr(), i); | |||
| } | |||
| //! now config c opr dynamic param | |||
| config_extern_c_opr_dynamic_param(model->get_async_func(), c_opr_param); | |||
| } | |||
| COprLibOption::COprLibOption() { | |||
| m_option_name = "c_opr_lib"; | |||
| lib_path = FLAGS_c_opr_lib; | |||
| c_opr_args.is_run_c_opr = !lib_path.empty(); | |||
| c_opr_args.is_run_c_opr_with_param = FLAGS_c_opr_lib_with_param; | |||
| } | |||
| bool COprLibOption::is_valid() { | |||
| return !FLAGS_c_opr_lib.empty() || FLAGS_c_opr_lib_with_param; | |||
| } | |||
| std::shared_ptr<OptionBase> COprLibOption::create_option() { | |||
| static std::shared_ptr<COprLibOption> option(new COprLibOption); | |||
| if (COprLibOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void COprLibOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| DEFINE_string( | |||
| c_opr_lib, "", | |||
| "Load external operator library. It must implement " | |||
| "MGB_C_OPR_INIT_FUNC_STR as the entry point"); | |||
| DEFINE_bool( | |||
| c_opr_lib_with_param, false, | |||
| "Run c opr lib with param, use to benchmark speed and check result, " | |||
| "need c opr loader implemente `copr_param_device_ptr_malloc, " | |||
| "copr_param_device_ptr_free and copr_param_device_ptr_h2d' symbols"); | |||
| REGIST_OPTION_CREATOR(c_opr_lib, lar::COprLibOption::create_option); | |||
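| // Hypothetical invocation (library path and model name illustrative): | |||
| //   load_and_run model.mge --c-opr-lib ./libcustom_opr.so --c-opr-lib-with-param | |||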
| @@ -0,0 +1,64 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/extern_c_opr_options.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include "megbrain/graph/extern_copr_api.h" | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| DECLARE_bool(c_opr_lib_with_param); | |||
| DECLARE_string(c_opr_lib); | |||
| namespace lar { | |||
| struct COprArgs { | |||
| //! for run c opr | |||
| bool is_run_c_opr = false; | |||
| bool is_run_c_opr_with_param = false; | |||
| typedef void (*COPR_PARAM_DEVICE_PTR_MEM_T)(ExternCOprParam* param); | |||
| typedef void (*COPR_PARAM_DEVICE_PTR_H2D_T)( | |||
| ExternCOprParam* param, void* host_ptr, size_t extern_device_tensor_id); | |||
| COPR_PARAM_DEVICE_PTR_MEM_T copr_param_device_ptr_malloc = nullptr; | |||
| COPR_PARAM_DEVICE_PTR_MEM_T copr_param_device_ptr_free = nullptr; | |||
| COPR_PARAM_DEVICE_PTR_H2D_T copr_param_device_ptr_h2d = nullptr; | |||
| }; | |||
| class COprLibOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| COprLibOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| void load_lib(); | |||
| MGBDType dtype_cpp2c(megdnn::DType dtype); | |||
| void tensor_shape_to_c(const megdnn::TensorShape& shape, MGBTensorShape& mgb_shape); | |||
| void init_extern_param(std::shared_ptr<ModelBase> model); | |||
| void set_Copr_IO(std::shared_ptr<ModelBase> model); | |||
| std::string m_option_name; | |||
| COprArgs c_opr_args; | |||
| std::string lib_path; | |||
| std::shared_ptr<ExternCOprParam> c_opr_param; | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,231 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/fastrun_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <gflags/gflags.h> | |||
| #if defined(_WIN32) | |||
| #include <io.h> | |||
| #define F_OK 0 | |||
| #define access(a, b) _access(a, b) | |||
| #elif __linux__ || __unix__ || __APPLE__ | |||
| #include <unistd.h> | |||
| #endif | |||
| #include "fastrun_options.h" | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/utils/infile_persistent_cache.h" | |||
| #include "misc.h" | |||
| #include "models/model_lite.h" | |||
| #include "models/model_mdl.h" | |||
| namespace lar { | |||
| template <> | |||
| void FastRunOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| //! set the algo policy before model load | |||
| using Strategy = ModelLite::Strategy; | |||
| uint32_t strategy = 0; | |||
| #if MGB_ENABLE_FASTRUN | |||
| if (enable_full_run) { | |||
| LITE_WARN("enable full-run strategy for algo profile"); | |||
| strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_PROFILE) | strategy; | |||
| } else if (enable_fast_run) { | |||
| LITE_WARN("enable fast-run strategy for algo profile"); | |||
| strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_PROFILE) | | |||
| static_cast<uint32_t>(Strategy::LITE_ALGO_OPTIMIZED) | strategy; | |||
| } else { | |||
| strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_HEURISTIC) | strategy; | |||
| } | |||
| #else | |||
| strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_HEURISTIC) | strategy; | |||
| #endif | |||
| if (batch_binary_equal || enable_reproducible) { | |||
| LITE_WARN("enable reproducible strategy for algo profile"); | |||
| if (batch_binary_equal) | |||
| strategy = static_cast<uint32_t>(Strategy::LITE_ALGO_REPRODUCIBLE) | | |||
| strategy; | |||
| } | |||
| auto lite_strategy = static_cast<Strategy>(strategy); | |||
| model->set_lite_strategy(lite_strategy); | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| auto lite_network = model->get_lite_network(); | |||
| auto lite_strategy = model->get_lite_strategy(); | |||
| //! set algo policy for model | |||
| lite::Runtime::set_network_algo_policy( | |||
| lite_network, lite_strategy, share_batch_size, batch_binary_equal); | |||
| if (!m_fast_run_cache.empty()) { | |||
| if (!access(m_fast_run_cache.c_str(), F_OK)) { | |||
| lite::set_persistent_cache(m_fast_run_cache); | |||
| } else { | |||
| lite::set_persistent_cache(m_fast_run_cache, true); | |||
| } | |||
| //! TODO: this comes from the mdl model settings but has no | |||
| //! matching setting in the lite model yet | |||
| // if (!enable_full_run && !enable_fast_run) | |||
| // mgb::gopt::enable_opr_use_profiling_cache_inplace(vars); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) { | |||
| #if MGB_ENABLE_FASTRUN | |||
| //! dump algo cache | |||
| if (!m_fast_run_cache.empty()) { | |||
| lite::dump_persistent_cache(m_fast_run_cache); | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| template <> | |||
| void FastRunOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| //! set the algo policy before model load | |||
| using Strategy = ModelMdl::Strategy; | |||
| auto strategy = static_cast<Strategy>(0); | |||
| #if MGB_ENABLE_FASTRUN | |||
| if (enable_full_run) { | |||
| mgb_log_warn("enable full-run strategy for algo profile"); | |||
| strategy = Strategy::PROFILE | strategy; | |||
| } else if (enable_fast_run) { | |||
| mgb_log_warn("enable fast-run strategy for algo profile"); | |||
| strategy = Strategy::PROFILE | Strategy::OPTIMIZED | strategy; | |||
| } else { | |||
| strategy = Strategy::HEURISTIC | strategy; | |||
| } | |||
| #else | |||
| strategy = Strategy::HEURISTIC | strategy; | |||
| #endif | |||
| if (batch_binary_equal || enable_reproducible) { | |||
| mgb_log_warn("enable reproducible strategy for algo profile"); | |||
| strategy = Strategy::REPRODUCIBLE | strategy; | |||
| } | |||
| model->set_mdl_strategy(strategy); | |||
| //! set binary_equal_between_batch and shared_batch_size | |||
| if (batch_binary_equal) { | |||
| mgb_log_warn("enable batch binary equal"); | |||
| model->get_mdl_config() | |||
| .comp_graph->options() | |||
| .fast_run_config.binary_equal_between_batch = true; | |||
| } | |||
| if (share_batch_size > 0) { | |||
| mgb_log_warn("set shared shared batch"); | |||
| model->get_mdl_config() | |||
| .comp_graph->options() | |||
| .fast_run_config.shared_batch_size = share_batch_size; | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| auto vars = model->get_mdl_load_result().output_var_list; | |||
| auto strategy = model->get_mdl_strategy(); | |||
| mgb::gopt::modify_opr_algo_strategy_inplace(vars, strategy); | |||
| // set algo cache path | |||
| if (!m_fast_run_cache.empty()) { | |||
| if (!access(m_fast_run_cache.c_str(), F_OK)) { | |||
| mgb::PersistentCache::set_impl( | |||
| std::make_shared<mgb::InFilePersistentCache>( | |||
| m_fast_run_cache.c_str())); | |||
| } else { | |||
| mgb::PersistentCache::set_impl( | |||
| std::make_shared<mgb::InFilePersistentCache>()); | |||
| } | |||
| #if MGB_ENABLE_FASTRUN | |||
| if (!enable_full_run && !enable_fast_run) | |||
| #endif | |||
| mgb::gopt::enable_opr_use_profiling_cache_inplace(vars); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) { | |||
| #if MGB_ENABLE_FASTRUN | |||
| //! dump algo cache | |||
| if (!m_fast_run_cache.empty()) { | |||
| static_cast<mgb::InFilePersistentCache&>(mgb::PersistentCache::inst()) | |||
| .dump_cache(m_fast_run_cache.c_str()); | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| } // namespace lar | |||
| using namespace lar; | |||
| FastRunOption::FastRunOption() { | |||
| m_option_name = "fastrun"; | |||
| #if MGB_ENABLE_FASTRUN | |||
| enable_fast_run = FLAGS_fast_run; | |||
| enable_full_run = FLAGS_full_run; | |||
| #endif | |||
| batch_binary_equal = FLAGS_binary_equal_between_batch; | |||
| enable_reproducible = FLAGS_reproducible; | |||
| m_fast_run_cache = FLAGS_fast_run_algo_policy; | |||
| share_batch_size = FLAGS_fast_run_shared_batch_size; | |||
| #if MGB_ENABLE_FASTRUN | |||
| //! when the fastrun cache file path is not empty but can't be accessed | |||
| if (!m_fast_run_cache.empty() && access(m_fast_run_cache.c_str(), F_OK)) { | |||
| mgb_assert( | |||
| enable_full_run || enable_fast_run, | |||
| "--fast-run or --full-run should be enabled"); | |||
| } | |||
| if (share_batch_size) { | |||
| mgb_assert( | |||
| enable_full_run || enable_fast_run || !m_fast_run_cache.empty(), | |||
| "--fast-run-shared-batch-size should be used with " | |||
| "--fast-run|--full-run|--fast-run-algo-policy"); | |||
| } | |||
| #endif | |||
| } | |||
| bool FastRunOption::is_valid() { | |||
| bool ret = false; | |||
| #if MGB_ENABLE_FASTRUN | |||
| ret = ret || FLAGS_fast_run; | |||
| ret = ret || FLAGS_full_run; | |||
| #endif | |||
| ret = ret || FLAGS_binary_equal_between_batch; | |||
| ret = ret || FLAGS_fast_run_shared_batch_size > 0; | |||
| ret = ret || FLAGS_reproducible; | |||
| ret = ret || FLAGS_fast_run_algo_policy.size() > 0; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> FastRunOption::create_option() { | |||
| static std::shared_ptr<FastRunOption> option(new FastRunOption); | |||
| if (FastRunOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void FastRunOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| #if MGB_ENABLE_FASTRUN | |||
| DEFINE_bool(fast_run, false, "whether to use fast-run in model run"); | |||
| DEFINE_bool(full_run, false, "whether to use full-run in model run"); | |||
| #endif | |||
| DEFINE_bool( | |||
| binary_equal_between_batch, false, | |||
| "Each batch of output is promised binary equal if each batch of " | |||
| "input is binary equal\n Note that if this option is turned on, " | |||
| "`--reproducible` will also be turned on."); | |||
| DEFINE_bool( | |||
| reproducible, false, | |||
| "Enable choose algo which is reproducible. It mainly used for " | |||
| "cudnn algos.See " | |||
| "https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/" | |||
| "index.html#reproducibility" | |||
| "for more details."); | |||
| DEFINE_uint32(fast_run_shared_batch_size, 0, "Set the batch size used during fastrun"); | |||
| DEFINE_string(fast_run_algo_policy, "", "fast-run cache path."); | |||
| REGIST_OPTION_CREATOR(fastrun, lar::FastRunOption::create_option); | |||
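| // Hypothetical two-step usage (cache file name illustrative): profile and | |||
| // dump the cache once, then later runs load it via the access() branch above: | |||
| //   load_and_run model.mge --full-run --fast-run-algo-policy algo.cache | |||
| //   load_and_run model.mge --fast-run-algo-policy algo.cache | |||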
| @@ -0,0 +1,57 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/fastrun_options.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| #if MGB_ENABLE_FASTRUN | |||
| DECLARE_bool(fast_run); | |||
| DECLARE_bool(full_run); | |||
| #endif | |||
| DECLARE_bool(reproducible); | |||
| DECLARE_bool(binary_equal_between_batch); | |||
| DECLARE_uint32(fast_run_shared_batch_size); | |||
| DECLARE_string(fast_run_algo_policy); | |||
| namespace lar { | |||
| class FastRunOption final : public OptionBase { | |||
| public: | |||
| //! get the condition for constructing FastRunOption | |||
| static bool is_valid(); | |||
| //! create the option from cmdline args when the condition is met | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| //! configure model for different runtime_param | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| //! get option name for quick search | |||
| std::string option_name() const override { return m_option_name; } | |||
| private: | |||
| FastRunOption(); | |||
| //! config template for different model | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>) {} | |||
| #if MGB_ENABLE_FASTRUN | |||
| bool enable_fast_run; //! fast run strategy flag | |||
| bool enable_full_run; //! full run strategy flag | |||
| #endif | |||
| bool batch_binary_equal; //! fast run strategy setting | |||
| bool enable_reproducible; //! enable reproducible strategy | |||
| size_t share_batch_size; //! fast run strategy share batch size setting | |||
| std::string m_fast_run_cache; //! fast run cache file path | |||
| std::string m_option_name; //! option name | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,295 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/io_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <map> | |||
| #include "helpers/data_parser.h" | |||
| #include "misc.h" | |||
| #include "models/model_lite.h" | |||
| #include "models/model_mdl.h" | |||
| #include "io_options.h" | |||
| namespace lar { | |||
| template <> | |||
| void InputOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto parser = model->get_input_parser(); | |||
| auto io = model->get_networkIO(); | |||
| for (size_t idx = 0; idx < data_path.size(); ++idx) { | |||
| parser.feed(data_path[idx].c_str()); | |||
| } | |||
| auto inputs = parser.inputs; | |||
| bool is_host = true; | |||
| for (auto& i : inputs) { | |||
| io.inputs.push_back({i.first, is_host}); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| auto config = model->get_config(); | |||
| auto parser = model->get_input_parser(); | |||
| auto network = model->get_lite_network(); | |||
| //! data type map from mgb data type to lite data type | |||
| std::map<megdnn::DTypeEnum, LiteDataType> type_map = { | |||
| {megdnn::DTypeEnum::Float32, LiteDataType::LITE_FLOAT}, | |||
| {megdnn::DTypeEnum::Int32, LiteDataType::LITE_INT}, | |||
| {megdnn::DTypeEnum::Int8, LiteDataType::LITE_INT8}, | |||
| {megdnn::DTypeEnum::Uint8, LiteDataType::LITE_UINT8}}; | |||
| for (auto& i : parser.inputs) { | |||
| //! get tensor information from data parser | |||
| auto tensor = i.second; | |||
| auto data_type = tensor.dtype(); | |||
| auto tensor_shape = tensor.shape(); | |||
| mgb::dt_byte* src = tensor.raw_ptr(); | |||
| //! set lite layout | |||
| lite::Layout layout; | |||
| layout.ndim = tensor_shape.ndim; | |||
| for (size_t idx = 0; idx < tensor_shape.ndim; idx++) { | |||
| layout.shapes[idx] = tensor_shape[idx]; | |||
| } | |||
| layout.data_type = type_map[data_type.enumv()]; | |||
| //! set network input tensor | |||
| std::shared_ptr<lite::Tensor> input_tensor = | |||
| network->get_io_tensor(i.first); | |||
| input_tensor->reset(src, layout); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void InputOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto parser = model->get_input_parser(); | |||
| for (size_t idx = 0; idx < data_path.size(); ++idx) { | |||
| parser.feed(data_path[idx].c_str()); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| auto parser = model->get_input_parser(); | |||
| auto network = model->get_mdl_load_result(); | |||
| auto tensormap = network.tensor_map; | |||
| for (auto& i : parser.inputs) { | |||
| mgb_assert( | |||
| tensormap.find(i.first) != tensormap.end(), | |||
| "can't find tesnor named %s", i.first.c_str()); | |||
| auto& in = tensormap.find(i.first)->second; | |||
| in->copy_from(i.second); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void IOdumpOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| if (enable_io_dump) { | |||
| LITE_WARN("enable text io dump"); | |||
| lite::Runtime::enable_io_txt_dump(model->get_lite_network(), dump_path); | |||
| } | |||
| if (enable_bin_io_dump) { | |||
| LITE_WARN("enable binary io dump"); | |||
| lite::Runtime::enable_io_bin_dump(model->get_lite_network(), dump_path); | |||
| } | |||
| //! FIXME: complete this when the corresponding API is added in lite | |||
| if (enable_io_dump_stdout || enable_io_dump_stderr) { | |||
| LITE_THROW("lite model don't support the stdout or stderr io dump"); | |||
| } | |||
| if (enable_bin_out_dump) { | |||
| LITE_THROW("lite model don't support the binary output dump"); | |||
| } | |||
| if (enable_copy_to_host) { | |||
| LITE_WARN("lite model set copy to host defaultly"); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void IOdumpOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (enable_io_dump) { | |||
| mgb_log_warn("enable text io dump"); | |||
| auto iodump = std::make_unique<mgb::TextOprIODump>( | |||
| model->get_mdl_config().comp_graph.get(), dump_path.c_str()); | |||
| iodump->print_addr(false); | |||
| io_dumper = std::move(iodump); | |||
| } | |||
| if (enable_io_dump_stdout) { | |||
| mgb_log_warn("enable text io dump to stdout"); | |||
| std::shared_ptr<FILE> std_out(stdout, [](FILE*) {}); | |||
| auto iodump = std::make_unique<mgb::TextOprIODump>( | |||
| model->get_mdl_config().comp_graph.get(), std_out); | |||
| iodump->print_addr(false); | |||
| io_dumper = std::move(iodump); | |||
| } | |||
| if (enable_io_dump_stderr) { | |||
| mgb_log_warn("enable text io dump to stderr"); | |||
| std::shared_ptr<FILE> std_err(stderr, [](FILE*) {}); | |||
| auto iodump = std::make_unique<mgb::TextOprIODump>( | |||
| model->get_mdl_config().comp_graph.get(), std_err); | |||
| iodump->print_addr(false); | |||
| io_dumper = std::move(iodump); | |||
| } | |||
| if (enable_bin_io_dump) { | |||
| mgb_log_warn("enable binary io dump"); | |||
| auto iodump = std::make_unique<mgb::BinaryOprIODump>( | |||
| model->get_mdl_config().comp_graph.get(), dump_path); | |||
| io_dumper = std::move(iodump); | |||
| } | |||
| if (enable_bin_out_dump) { | |||
| mgb_log_warn("enable binary output dump"); | |||
| out_dumper = std::make_unique<OutputDumper>(dump_path.c_str()); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| if (enable_bin_out_dump) { | |||
| auto load_result = model->get_mdl_load_result(); | |||
| out_dumper->set(load_result.output_var_list); | |||
| std::vector<mgb::ComputingGraph::Callback> cb; | |||
| for (size_t i = 0; i < load_result.output_var_list.size(); i++) { | |||
| cb.push_back(out_dumper->bind()); | |||
| } | |||
| model->set_output_callback(cb); | |||
| } | |||
| if (enable_copy_to_host) { | |||
| auto load_result = model->get_mdl_load_result(); | |||
| std::vector<mgb::ComputingGraph::Callback> cb; | |||
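| // each callback captures its own HostTensorND by value (mutable), so | |||
| // copy_from() pulls every output back to host memory after each run | |||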
| for (size_t i = 0; i < load_result.output_var_list.size(); i++) { | |||
| mgb::HostTensorND val; | |||
| auto callback = [val](const mgb::DeviceTensorND& dv) mutable { | |||
| val.copy_from(dv); | |||
| }; | |||
| cb.push_back(callback); | |||
| } | |||
| model->set_output_callback(cb); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_RUNNING_WAIT) { | |||
| if (enable_bin_out_dump) { | |||
| out_dumper->write_to_file(); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| ////////////////////// Input options //////////////////////// | |||
| using namespace lar; | |||
| InputOption::InputOption() { | |||
| m_option_name = "input"; | |||
| size_t start = 0; | |||
| auto end = FLAGS_input.find(";", start); | |||
| while (end != std::string::npos) { | |||
| std::string path = FLAGS_input.substr(start, end - start); | |||
| data_path.emplace_back(path); | |||
| start = end + 1; | |||
| end = FLAGS_input.find(";", start); | |||
| } | |||
| data_path.emplace_back(FLAGS_input.substr(start)); | |||
| } | |||
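| // e.g. (illustrative): --input "data0.json;data1.json" splits on ';' into | |||
| // data_path = {"data0.json", "data1.json"}; a single path needs no separator. | |||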
| std::shared_ptr<lar::OptionBase> lar::InputOption::create_option() { | |||
| static std::shared_ptr<InputOption> m_option(new InputOption); | |||
| if (InputOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(m_option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void InputOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ////////////////////// OprIOdump options //////////////////////// | |||
| IOdumpOption::IOdumpOption() { | |||
| m_option_name = "iodump"; | |||
| size_t valid_flag = 0; | |||
| if (!FLAGS_io_dump.empty()) { | |||
| dump_path = FLAGS_io_dump; | |||
| enable_io_dump = true; | |||
| valid_flag = valid_flag | (1 << 0); | |||
| } | |||
| if (!FLAGS_bin_io_dump.empty()) { | |||
| dump_path = FLAGS_bin_io_dump; | |||
| enable_bin_io_dump = true; | |||
| valid_flag = valid_flag | (1 << 1); | |||
| } | |||
| if (!FLAGS_bin_out_dump.empty()) { | |||
| dump_path = FLAGS_bin_out_dump; | |||
| enable_bin_out_dump = true; | |||
| valid_flag = valid_flag | (1 << 2); | |||
| } | |||
| if (FLAGS_io_dump_stdout) { | |||
| enable_io_dump_stdout = FLAGS_io_dump_stdout; | |||
| valid_flag = valid_flag | (1 << 3); | |||
| } | |||
| if (FLAGS_io_dump_stderr) { | |||
| enable_io_dump_stderr = FLAGS_io_dump_stderr; | |||
| valid_flag = valid_flag | (1 << 4); | |||
| } | |||
| // more than one dump option was set | |||
| if (valid_flag && (valid_flag & (valid_flag - 1))) { | |||
| mgb_log_warn( | |||
| "ONLY the last io dump option is validate and others is " | |||
| "skipped!!!"); | |||
| } | |||
| enable_copy_to_host = FLAGS_copy_to_host; | |||
| } | |||
| bool IOdumpOption::is_valid() { | |||
| bool ret = !FLAGS_io_dump.empty(); | |||
| ret = ret || FLAGS_io_dump_stdout; | |||
| ret = ret || FLAGS_io_dump_stderr; | |||
| ret = ret || !FLAGS_bin_io_dump.empty(); | |||
| ret = ret || !FLAGS_bin_out_dump.empty(); | |||
| ret = ret || FLAGS_copy_to_host; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> IOdumpOption::create_option() { | |||
| static std::shared_ptr<IOdumpOption> option(new IOdumpOption); | |||
| if (IOdumpOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void IOdumpOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ////////////////////// Input gflags //////////////////////// | |||
| DEFINE_string( | |||
| input, "", "Set up inputs data for model --input [ file_path | data_string]"); | |||
| ////////////////////// OprIOdump gflags //////////////////////// | |||
| DEFINE_string(io_dump, "", "set the io dump file path in text format"); | |||
| DEFINE_bool(io_dump_stdout, false, "dump io opr to stdout in text format"); | |||
| DEFINE_bool(io_dump_stderr, false, "dump io opr to stderr in text format"); | |||
| DEFINE_string(bin_io_dump, "", "set the io dump file path in binary format"); | |||
| DEFINE_string(bin_out_dump, "", "set the out dump file path in binary format"); | |||
| DEFINE_bool(copy_to_host, false, "copy device data to host"); | |||
| REGIST_OPTION_CREATOR(input, lar::InputOption::create_option); | |||
| REGIST_OPTION_CREATOR(iodump, lar::IOdumpOption::create_option); | |||
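| // Hypothetical invocation (file names illustrative): | |||
| //   load_and_run model.mge --input data.json --io-dump dump.txt | |||
| // dumps each opr's io in text form via the dumpers configured above. | |||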
| @@ -0,0 +1,78 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/io_options.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include "helpers/outdumper.h" | |||
| #include "megbrain/plugin/opr_io_dump.h" | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| DECLARE_string(input); | |||
| DECLARE_string(io_dump); | |||
| DECLARE_bool(io_dump_stdout); | |||
| DECLARE_bool(io_dump_stderr); | |||
| DECLARE_string(bin_io_dump); | |||
| DECLARE_string(bin_out_dump); | |||
| DECLARE_bool(copy_to_host); | |||
| namespace lar { | |||
| /*! | |||
| * \brief: input option set by --input | |||
| */ | |||
| class InputOption final : public OptionBase { | |||
| public: | |||
| //! static functions for registering the option | |||
| static bool is_valid() { return !FLAGS_input.empty(); }; | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| //! interface implement from OptionBase | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| InputOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| std::vector<std::string> data_path; // data string or data file path | |||
| }; | |||
| class IOdumpOption : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| //! config the model; dispatch to the model-specific configure code | |||
| //! when implementations differ | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| IOdumpOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| bool enable_io_dump; | |||
| bool enable_io_dump_stdout; | |||
| bool enable_io_dump_stderr; | |||
| bool enable_bin_io_dump; | |||
| bool enable_bin_out_dump; | |||
| bool enable_copy_to_host; | |||
| std::string m_option_name; | |||
| std::string dump_path; | |||
| std::unique_ptr<mgb::OprIODumpBase> io_dumper; | |||
| std::unique_ptr<OutputDumper> out_dumper; | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,171 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/layout_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <gflags/gflags.h> | |||
| #include "misc.h" | |||
| #include "models/model_lite.h" | |||
| #include "models/model_mdl.h" | |||
| #include "layout_options.h" | |||
| namespace lar { | |||
| template <> | |||
| void LayoutOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| #define ENABLE_LAYOUT(layout) \ | |||
| LITE_WARN("enable " #layout " optimization"); \ | |||
| model->get_config().options.enable_##layout = true; \ | |||
| break; | |||
| switch (option_flag) { | |||
| case OptLayoutType::NCHW4: | |||
| ENABLE_LAYOUT(nchw4) | |||
| case OptLayoutType::CHWN4: | |||
| LITE_THROW("lite model unsupport chwn4 layout"); | |||
| break; | |||
| case OptLayoutType::NCHW44: | |||
| ENABLE_LAYOUT(nchw44) | |||
| case OptLayoutType::NCHW88: | |||
| ENABLE_LAYOUT(nchw88) | |||
| case OptLayoutType::NCHW32: | |||
| ENABLE_LAYOUT(nchw32) | |||
| case OptLayoutType::NCHW64: | |||
| ENABLE_LAYOUT(nchw64) | |||
| case OptLayoutType::NHWCD4: | |||
| ENABLE_LAYOUT(nhwcd4) | |||
| case OptLayoutType::NCHW44_DOT: | |||
| ENABLE_LAYOUT(nchw44_dot) | |||
| default: | |||
| break; | |||
| } | |||
| #undef ENABLE_LAYOUT | |||
| } | |||
| } | |||
| template <> | |||
| void lar::LayoutOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| mgb_log_debug("mdl layout config start"); | |||
| #define ENABLE_LAYOUT(layout) \ | |||
| mgb_log_warn("enable " #layout " optimization"); \ | |||
| model->get_mdl_config().comp_graph->options().graph_opt.enable_##layout(); \ | |||
| break; | |||
| switch (option_flag) { | |||
| case OptLayoutType::NCHW4: | |||
| ENABLE_LAYOUT(nchw4) | |||
| case OptLayoutType::CHWN4: | |||
| ENABLE_LAYOUT(chwn4) | |||
| case OptLayoutType::NCHW44: | |||
| ENABLE_LAYOUT(nchw44) | |||
| case OptLayoutType::NCHW88: | |||
| ENABLE_LAYOUT(nchw88) | |||
| case OptLayoutType::NCHW32: | |||
| ENABLE_LAYOUT(nchw32) | |||
| case OptLayoutType::NCHW64: | |||
| ENABLE_LAYOUT(nchw64) | |||
| case OptLayoutType::NHWCD4: | |||
| ENABLE_LAYOUT(nhwcd4) | |||
| case OptLayoutType::NCHW44_DOT: | |||
| ENABLE_LAYOUT(nchw44_dot) | |||
| default: | |||
| break; | |||
| } | |||
| mgb_log_debug("mdl layout config end"); | |||
| #undef ENABLE_LAYOUT | |||
| } | |||
| } | |||
| } // namespace lar | |||
| using namespace lar; | |||
| OptLayoutType LayoutOption::option_flag; | |||
| LayoutOption::LayoutOption() { | |||
| m_option_name = "layout"; | |||
| } | |||
| bool LayoutOption::is_valid() { | |||
| size_t valid_flag = 0; | |||
| if (FLAGS_enable_nchw4) { | |||
| valid_flag = valid_flag | (1 << 0); | |||
| } | |||
| if (FLAGS_enable_chwn4) { | |||
| valid_flag = valid_flag | (1 << 1); | |||
| } | |||
| if (FLAGS_enable_nchw44) { | |||
| valid_flag = valid_flag | (1 << 2); | |||
| } | |||
| if (FLAGS_enable_nchw88) { | |||
| valid_flag = valid_flag | (1 << 3); | |||
| } | |||
| if (FLAGS_enable_nchw32) { | |||
| valid_flag = valid_flag | (1 << 4); | |||
| } | |||
| if (FLAGS_enable_nchw64) { | |||
| valid_flag = valid_flag | (1 << 5); | |||
| } | |||
| if (FLAGS_enable_nhwcd4) { | |||
| valid_flag = valid_flag | (1 << 6); | |||
| } | |||
| if (FLAGS_enable_nchw44_dot) { | |||
| valid_flag = valid_flag | (1 << 7); | |||
| } | |||
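| // x & (x - 1) clears the lowest set bit, so the check below is true | |||
| // iff exactly one layout flag bit is set in valid_flag | |||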
| bool ret = valid_flag && !(valid_flag & (valid_flag - 1)); | |||
| if (ret) { | |||
| option_flag = static_cast<OptLayoutType>(valid_flag); | |||
| } else { | |||
| option_flag = static_cast<OptLayoutType>(0); | |||
| } | |||
| return ret; | |||
| }; | |||
| std::shared_ptr<OptionBase> LayoutOption::create_option() { | |||
| static std::shared_ptr<LayoutOption> option(new LayoutOption); | |||
| if (LayoutOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void LayoutOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| DEFINE_bool(enable_nchw4, false, "enable nchw4 layout optimization!!"); | |||
| DEFINE_bool(enable_chwn4, false, "enable chwn4 layout optimization!!"); | |||
| DEFINE_bool(enable_nchw44, false, "enable nchw44 layout optimization!!"); | |||
| DEFINE_bool(enable_nchw88, false, "enable nchw88 layout optimization!!"); | |||
| DEFINE_bool(enable_nchw32, false, "enable nchw32 layout optimization!!"); | |||
| DEFINE_bool(enable_nchw64, false, "enable nchw64 layout optimization!!"); | |||
| DEFINE_bool(enable_nhwcd4, false, "enable nhwcd4 layout optimization!!"); | |||
| DEFINE_bool(enable_nchw44_dot, false, "enable nchw444-dot layout optimization!!"); | |||
| REGIST_OPTION_CREATOR(layout, lar::LayoutOption::create_option); | |||
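| // Hypothetical invocation (model name illustrative): | |||
| //   load_and_run model.mge --enable-nchw44 | |||
| // sets option_flag to NCHW44; passing two layout flags makes is_valid() | |||
| // return false, so no layout option is created. | |||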
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/layout_options.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include "helpers/common.h" | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| DECLARE_bool(enable_nchw4); | |||
| DECLARE_bool(enable_chwn4); | |||
| DECLARE_bool(enable_nchw44); | |||
| DECLARE_bool(enable_nchw88); | |||
| DECLARE_bool(enable_nchw32); | |||
| DECLARE_bool(enable_nchw64); | |||
| DECLARE_bool(enable_nhwcd4); | |||
| DECLARE_bool(enable_nchw44_dot); | |||
| namespace lar { | |||
| /*! | |||
| * \brief: layout option for optimization | |||
| */ | |||
| class LayoutOption final : public OptionBase { | |||
| public: | |||
| //! check the validity of the layout option flags | |||
| static bool is_valid(); | |||
| //! create the option when exactly one layout flag is set | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| //! config the model, dispatch configuration for different model implement | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| //! get option name | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| //! Constructor | |||
| LayoutOption(); | |||
| //! configuration for different model implement | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| static OptLayoutType option_flag; | |||
| std::string m_option_name; | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,600 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/optimize_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "megbrain/gopt/inference.h" | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| #include "megbrain/tensorrt/tensorrt_engine_cache.h" | |||
| #endif | |||
| #include "lite/global.h" | |||
| #include "misc.h" | |||
| #include "models/model_lite.h" | |||
| #include "models/model_mdl.h" | |||
| #include "optimize_options.h" | |||
| ///////////////////////// fuse and preprocess optimize options /////////////// | |||
| namespace lar { | |||
| template <> | |||
| void FusePreprocessOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (enable_fuse_preprocess) { | |||
| LITE_WARN("enable fuse-preprocess optimization"); | |||
| model->get_config().options.fuse_preprocess = true; | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void FusePreprocessOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (enable_fuse_preprocess) { | |||
| mgb_log_warn("enable fuse-preprocess optimization"); | |||
| graph_option.graph_opt.enable_fuse_preprocess(); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| using namespace lar; | |||
| FusePreprocessOption::FusePreprocessOption() { | |||
| m_option_name = "fuse_preprocess"; | |||
| enable_fuse_preprocess = FLAGS_enable_fuse_preprocess; | |||
| } | |||
| bool FusePreprocessOption::is_valid() { | |||
| bool ret = FLAGS_enable_fuse_preprocess; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> FusePreprocessOption::create_option() { | |||
| static std::shared_ptr<FusePreprocessOption> option(new FusePreprocessOption); | |||
| if (FusePreprocessOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void FusePreprocessOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ///////////////////////// weight preprocess optimize options /////////////// | |||
| namespace lar { | |||
| template <> | |||
| void WeightPreprocessOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (weight_preprocess) { | |||
| LITE_WARN("enable weight-preprocess optimization"); | |||
| model->get_config().options.weight_preprocess = true; | |||
| //! FIXME: the algo searcher should enable weight preprocess for | |||
| //! opencl (the implementation below has some problem); | |||
| // #if MGB_OPENCL | |||
| // megdnn::opencl::algo_searcher::AlgoSearcherBase:: | |||
| // enable_weight_preprocess(); | |||
| // #endif | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void WeightPreprocessOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (weight_preprocess) { | |||
| mgb_log_warn("enable weight-preprocess optimization"); | |||
| graph_option.graph_opt.enable_weight_preprocess(); | |||
| //! FIXME: this implementation is not right | |||
| // #if MGB_OPENCL | |||
| // megdnn::opencl::algo_searcher::AlgoSearcherBase:: | |||
| // enable_weight_preprocess(); | |||
| // #endif | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| WeightPreprocessOption::WeightPreprocessOption() { | |||
| m_option_name = "weight_preprocess"; | |||
| weight_preprocess = FLAGS_weight_preprocess; | |||
| } | |||
| bool WeightPreprocessOption::is_valid() { | |||
| bool ret = FLAGS_weight_preprocess; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> WeightPreprocessOption::create_option() { | |||
| static std::shared_ptr<WeightPreprocessOption> option(new WeightPreprocessOption); | |||
| if (WeightPreprocessOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void WeightPreprocessOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ///// fuse conv bias and nonlinear activation opr optimize options //////// | |||
| namespace lar { | |||
| template <> | |||
| void FuseConvBiasNonlinearOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| LITE_MARK_USED_VAR(model); | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (enable_fuse_conv_bias_nonlinearity) { | |||
| LITE_THROW("fuse conv+bias+nonlinearity not supported in lite model"); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void FuseConvBiasNonlinearOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (enable_fuse_conv_bias_nonlinearity) { | |||
| mgb_log_warn("enable fuse conv+bias+nonlinearity optimization"); | |||
| graph_option.graph_opt.enable_fuse_conv_bias_nonlinearity(); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| FuseConvBiasNonlinearOption::FuseConvBiasNonlinearOption() { | |||
| m_option_name = "fuse_conv_bias_nonlinear"; | |||
| enable_fuse_conv_bias_nonlinearity = FLAGS_enable_fuse_conv_bias_nonlinearity; | |||
| } | |||
| bool FuseConvBiasNonlinearOption::is_valid() { | |||
| bool ret = FLAGS_enable_fuse_conv_bias_nonlinearity; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> FuseConvBiasNonlinearOption::create_option() { | |||
| static std::shared_ptr<FuseConvBiasNonlinearOption> option( | |||
| new FuseConvBiasNonlinearOption); | |||
| if (FuseConvBiasNonlinearOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void FuseConvBiasNonlinearOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| ///////////////////////// fuse conv bias with z optimize options /////////////// | |||
| namespace lar { | |||
| template <> | |||
| void FuseConvBiasElemwiseAddOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| LITE_MARK_USED_VAR(model); | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (enable_fuse_conv_bias_with_z) { | |||
| LITE_THROW( | |||
| "fuse conv+bias+z optimization not supported in lite " | |||
| "model"); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void FuseConvBiasElemwiseAddOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (enable_fuse_conv_bias_with_z) { | |||
| mgb_log_warn("enable fuse conv+bias+z optimization"); | |||
| graph_option.graph_opt.enable_fuse_conv_bias_with_z(); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| FuseConvBiasElemwiseAddOption::FuseConvBiasElemwiseAddOption() { | |||
| m_option_name = "fuse_conv_bias_z"; | |||
| enable_fuse_conv_bias_with_z = FLAGS_enable_fuse_conv_bias_with_z; | |||
| } | |||
| bool FuseConvBiasElemwiseAddOption::is_valid() { | |||
| bool ret = FLAGS_enable_fuse_conv_bias_with_z; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> FuseConvBiasElemwiseAddOption::create_option() { | |||
| static std::shared_ptr<FuseConvBiasElemwiseAddOption> option( | |||
| new FuseConvBiasElemwiseAddOption); | |||
| if (FuseConvBiasElemwiseAddOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void FuseConvBiasElemwiseAddOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
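| //! Illustrative invocations of the fuse options (MegDL models only, the | |||
| //! model path is a placeholder): | |||
| //!     load_and_run model.mge --enable_fuse_conv_bias_nonlinearity | |||
| //!     load_and_run model.mge --enable_fuse_conv_bias_with_z | |||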
| ///////////////////////// graph record options ///////////////////////// | |||
| namespace lar { | |||
| template <> | |||
| void GraphRecordOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& config_option = model->get_config().options; | |||
| if (const_shape) { | |||
| LITE_WARN("enable const var shape"); | |||
| config_option.const_shape = true; | |||
| } | |||
| if (fake_first) { | |||
| LITE_WARN("enable fake-first optimization"); | |||
| config_option.fake_next_exec = true; | |||
| } | |||
| if (no_sanity_check) { | |||
| LITE_WARN("disable var sanity check optimization"); | |||
| config_option.var_sanity_check_first_run = false; | |||
| } | |||
| if (m_record_comp_seq == 1) { | |||
| LITE_WARN("set record_comp_seq_level to 1"); | |||
| } | |||
| if (m_record_comp_seq == 2) { | |||
| mgb_assert( | |||
| no_sanity_check, | |||
| "--no-sanity-check should be set before " | |||
| "--record-comp-seq2"); | |||
| LITE_WARN("set record_comp_seq_level to 2"); | |||
| } | |||
| config_option.comp_node_seq_record_level = m_record_comp_seq; | |||
| } | |||
| } | |||
| template <> | |||
| void GraphRecordOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (const_shape) { | |||
| mgb_log_warn("enable const var shape"); | |||
| model->get_mdl_config().const_var_shape = true; | |||
| } | |||
| if (fake_first) { | |||
| mgb_log_warn("enable fake-first optimization"); | |||
| graph_option.fake_next_exec = true; | |||
| } | |||
| if (no_sanity_check) { | |||
| mgb_log_warn("disable var sanity check optimization"); | |||
| graph_option.var_sanity_check_first_run = false; | |||
| } | |||
| if (m_record_comp_seq == 1) { | |||
| mgb_log_warn("set record_comp_seq_level to 1"); | |||
| } | |||
| if (m_record_comp_seq == 2) { | |||
| mgb_assert( | |||
| no_sanity_check && !fake_first, | |||
| "--no-sanity-check should be set before " | |||
| "--record-comp-seq2 and --fake-first should not be set"); | |||
| mgb_log_warn("set record_comp_seq_level to 2"); | |||
| } | |||
| graph_option.comp_node_seq_record_level = m_record_comp_seq; | |||
| } | |||
| } | |||
| } // namespace lar | |||
| GraphRecordOption::GraphRecordOption() { | |||
| m_option_name = "graph_record"; | |||
| m_record_comp_seq = 0; | |||
| const_shape = FLAGS_const_shape; | |||
| fake_first = FLAGS_fake_first; | |||
| no_sanity_check = FLAGS_no_sanity_check; | |||
| if (FLAGS_record_comp_seq) { | |||
| m_record_comp_seq = 1; | |||
| } | |||
| if (FLAGS_record_comp_seq2) { | |||
| m_record_comp_seq = 2; | |||
| } | |||
| } | |||
| bool GraphRecordOption::is_valid() { | |||
| bool ret = FLAGS_const_shape; | |||
| ret = ret || FLAGS_fake_first; | |||
| ret = ret || FLAGS_no_sanity_check; | |||
| ret = ret || FLAGS_record_comp_seq; | |||
| ret = ret || FLAGS_record_comp_seq2; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> GraphRecordOption::create_option() { | |||
| static std::shared_ptr<GraphRecordOption> option(new GraphRecordOption); | |||
| if (GraphRecordOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void GraphRecordOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
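| //! Illustrative flag combinations (the model path is a placeholder): | |||
| //!     load_and_run model.mge --record_comp_seq | |||
| //!     load_and_run model.mge --no_sanity_check --record_comp_seq2 | |||
| //! level 2 recording requires no_sanity_check and, for MegDL models, must | |||
| //! not be combined with fake_first, as asserted above. | |||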
| ///////////////////////// memory optimize options ///////////////////////// | |||
| namespace lar { | |||
| template <> | |||
| void MemoryOptimizeOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| LITE_MARK_USED_VAR(model); | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (disable_mem_opt) { | |||
| LITE_THROW("lite model don't support disable memory optimization"); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| if (workspace_limit != SIZE_MAX) { | |||
| LITE_WARN("set workspace limit to %ld", workspace_limit); | |||
| lite::Runtime::set_network_algo_workspace_limit( | |||
| model->get_lite_network(), workspace_limit); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void MemoryOptimizeOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (disable_mem_opt) { | |||
| mgb_log_warn("disable memory optimization"); | |||
| graph_option.seq_opt.enable_mem_plan_opt = false; | |||
| graph_option.seq_opt.enable_mem_reuse_alloc = false; | |||
| } | |||
| if (workspace_limit < SIZE_MAX) { | |||
| mgb_log_warn("set workspace limit to %ld", workspace_limit); | |||
| auto output_spec = model->get_output_spec(); | |||
| mgb::SymbolVarArray vars; | |||
| for (auto i : output_spec) { | |||
| vars.push_back(i.first); | |||
| } | |||
| mgb::gopt::set_opr_algo_workspace_limit_inplace(vars, workspace_limit); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| MemoryOptimizeOption::MemoryOptimizeOption() { | |||
| m_option_name = "memory_optimize"; | |||
| disable_mem_opt = FLAGS_disable_mem_opt; | |||
| workspace_limit = FLAGS_workspace_limit; | |||
| } | |||
| bool MemoryOptimizeOption::is_valid() { | |||
| bool ret = FLAGS_disable_mem_opt; | |||
| ret = ret || FLAGS_workspace_limit < SIZE_MAX; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> MemoryOptimizeOption::create_option() { | |||
| static std::shared_ptr<MemoryOptimizeOption> option(new MemoryOptimizeOption); | |||
| if (MemoryOptimizeOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void MemoryOptimizeOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
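| //! Illustrative invocation (the byte count is a placeholder): | |||
| //!     load_and_run model.mge --workspace_limit 1048576 --disable_mem_opt | |||
| //! note that --disable_mem_opt is rejected for lite models, see above. | |||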
| ///////////////////////// other options for optimization ///////////////// | |||
| namespace lar { | |||
| template <> | |||
| void JITOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& config_option = model->get_config().options; | |||
| if (enable_jit) { | |||
| LITE_WARN("enable JIT (level 1)"); | |||
| config_option.jit_level = 1; | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void JITOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (enable_jit) { | |||
| mgb_log_warn("enable JIT (level 1)"); | |||
| graph_option.graph_opt.jit = 1; | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| JITOption::JITOption() { | |||
| m_option_name = "JIT"; | |||
| enable_jit = FLAGS_enable_jit; | |||
| } | |||
| bool JITOption::is_valid() { | |||
| bool ret = FLAGS_enable_jit; | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> JITOption::create_option() { | |||
| static std::shared_ptr<JITOption> option(new JITOption); | |||
| if (JITOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void JITOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
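| //! Illustrative invocation (requires an NVIDIA GPU, see the gflag below): | |||
| //!     load_and_run model.mge --enable_jit | |||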
| ///////////////////////// TensorRT options for optimization ///////////////// | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| namespace lar { | |||
| template <> | |||
| void TensorRTOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| if (!tensorrt_cache.empty()) { | |||
| LITE_WARN("set tensorrt cache as %s", tensorrt_cache.c_str()); | |||
| lite::set_tensor_rt_cache(tensorrt_cache); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| if (enable_tensorrt) { | |||
| LITE_WARN("enable TensorRT"); | |||
| lite::Runtime::use_tensorrt(model->get_lite_network()); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) { | |||
| if (!tensorrt_cache.empty()) { | |||
| lite::dump_tensor_rt_cache(); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void TensorRTOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto&& graph_option = model->get_mdl_config().comp_graph->options(); | |||
| if (enable_tensorrt) { | |||
| mgb_log_warn("using tensorRT"); | |||
| graph_option.graph_opt.tensorrt = true; | |||
| } | |||
| if (!tensorrt_cache.empty()) { | |||
| mgb_log_warn("use tensorrt cache: %s", tensorrt_cache.c_str()); | |||
| mgb::TensorRTEngineCache::enable_engine_cache(true); | |||
| mgb::TensorRTEngineCache::set_impl( | |||
| std::make_shared<mgb::TensorRTEngineCacheIO>( | |||
| tensorrt_cache.c_str())); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) { | |||
| if (!tensorrt_cache.empty()) { | |||
| if (mgb::TensorRTEngineCache::enable_engine_cache()) { | |||
| mgb::TensorRTEngineCache::inst().dump_cache(); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| TensorRTOption::TensorRTOption() { | |||
| m_option_name = "tensorRT"; | |||
| enable_tensorrt = FLAGS_tensorrt; | |||
| tensorrt_cache = FLAGS_tensorrt_cache; | |||
| } | |||
| bool TensorRTOption::is_valid() { | |||
| bool ret = FLAGS_tensorrt; | |||
| ret = ret || !FLAGS_tensorrt_cache.empty(); | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> TensorRTOption::create_option() { | |||
| static std::shared_ptr<TensorRTOption> option(new TensorRTOption); | |||
| if (TensorRTOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void TensorRTOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
| #endif | |||
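| //! Illustrative invocation (the cache path is a placeholder): | |||
| //!     load_and_run model.mge --tensorrt --tensorrt_cache ./trt.cache | |||
| //! the engine cache is written back after the model finishes running. | |||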
| ///////////////////////// fuse and preprocess optimize options /////////////// | |||
| DEFINE_bool( | |||
| enable_fuse_preprocess, false, | |||
| "Fusion astype | pad_channel | dimshuffle and etc opr from h2d opr"); | |||
| DEFINE_bool( | |||
| weight_preprocess, false, | |||
| "Execute operators with weight preprocess, which can optimize the " | |||
| "operator execution time with algo of winograd, im2col ,etc., but " | |||
| "it may consume more memory."); | |||
| DEFINE_bool( | |||
| enable_fuse_conv_bias_nonlinearity, false, | |||
| "whether to fuse conv+bias+nonlinearity"); | |||
| DEFINE_bool( | |||
| enable_fuse_conv_bias_with_z, false, | |||
| "fuse conv,bias (elemwise add),z(elemwise add) into one opr " | |||
| "(only support on GPU)"); | |||
| ///////////////////////// graph record & memory optimize options ///////////// | |||
| DEFINE_bool( | |||
| const_shape, false, | |||
| "set const_var_shape to reduce memory usage, since some static " | |||
| "inference data structures can be omitted"); | |||
| DEFINE_bool( | |||
| fake_first, false, | |||
| "Enable fake exec for the first run. In fake exec mode, some " | |||
| "initialization job would be done, but no actual computing is " | |||
| "performed."); | |||
| DEFINE_bool(no_sanity_check, false, "Disable var sanity check on the first run"); | |||
| DEFINE_bool( | |||
| record_comp_seq, false, | |||
| "Record the computing sequence, in level 1 . It reduces overhead of API" | |||
| "calls of some asynchronous computing devices"); | |||
| DEFINE_bool( | |||
| record_comp_seq2, false, | |||
| "Record the computing sequence, in level 2, the computing graph can be" | |||
| "destructed to reduce memory usage"); | |||
| DEFINE_bool(disable_mem_opt, false, "disable memory optimization"); | |||
| DEFINE_uint64(workspace_limit, SIZE_MAX, "set the upper bound of the workspace size"); | |||
| ///////////////////////// other options for optimization ///////////////// | |||
| DEFINE_bool( | |||
| enable_jit, false, | |||
| " Execute supported operators with JIT(now only support NVRTC). " | |||
| "Can only be used on Nvidia GPUs"); | |||
| #if MGB_ENABLE_ANDROID_NN | |||
| DEFINE_bool( | |||
| android_nn, false, | |||
| "Execute supported operators with Android NN. Can only be used " | |||
| "with --cpu."); | |||
| #endif | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| DEFINE_bool( | |||
| tensorrt, false, | |||
| " Execute supported operators with TensorRT. Can only be used on " | |||
| "Nvidia GPUs,i.e. comp node is xpu or gpu."); | |||
| DEFINE_string( | |||
| tensorrt_cache, "", | |||
| "Set the TensorRT engine cache path for serialized prebuilt " | |||
| "ICudaEngine"); | |||
| #endif | |||
| REGIST_OPTION_CREATOR(fuse_preprocess, lar::FusePreprocessOption::create_option); | |||
| REGIST_OPTION_CREATOR(weight_preprocess, lar::WeightPreprocessOption::create_option); | |||
| REGIST_OPTION_CREATOR( | |||
| fuse_conv_bias_nonlinear, lar::FuseConvBiasNonlinearOption::create_option); | |||
| REGIST_OPTION_CREATOR( | |||
| fuse_conv_bias_z, lar::FuseConvBiasElemwiseAddOption::create_option); | |||
| REGIST_OPTION_CREATOR(graph_record, lar::GraphRecordOption::create_option); | |||
| REGIST_OPTION_CREATOR(memory_optimize, lar::MemoryOptimizeOption::create_option); | |||
| REGIST_OPTION_CREATOR(JIT, lar::JITOption::create_option); | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| REGIST_OPTION_CREATOR(tensorRT, lar::TensorRTOption::create_option); | |||
| #endif | |||
| @@ -0,0 +1,207 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/optimize_options.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include "helpers/common.h" | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| DECLARE_bool(enable_fuse_preprocess); | |||
| DECLARE_bool(weight_preprocess); | |||
| DECLARE_bool(enable_fuse_conv_bias_nonlinearity); | |||
| DECLARE_bool(enable_fuse_conv_bias_with_z); | |||
| DECLARE_bool(const_shape); | |||
| DECLARE_bool(fake_first); | |||
| DECLARE_bool(no_sanity_check); | |||
| DECLARE_bool(record_comp_seq); | |||
| DECLARE_bool(record_comp_seq2); | |||
| DECLARE_bool(disable_mem_opt); | |||
| DECLARE_uint64(workspace_limit); | |||
| DECLARE_bool(enable_jit); | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| DECLARE_bool(tensorrt); | |||
| DECLARE_string(tensorrt_cache); | |||
| #endif | |||
| namespace lar { | |||
| ///////////////////////// fuse_preprocess optimize options ////////////// | |||
| class FusePreprocessOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| FusePreprocessOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| bool enable_fuse_preprocess; | |||
| }; | |||
| ///////////////////////// weight preprocess optimize options ////////////// | |||
| class WeightPreprocessOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| WeightPreprocessOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| bool weight_preprocess; | |||
| }; | |||
| /////////////// fuse_conv_bias_nonlinearity optimize options /////////////// | |||
| class FuseConvBiasNonlinearOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| FuseConvBiasNonlinearOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| bool enable_fuse_conv_bias_nonlinearity; | |||
| }; | |||
| ///////////////////////// fuse_conv_bias_with_z optimize options ////////////// | |||
| class FuseConvBiasElemwiseAddOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| FuseConvBiasElemwiseAddOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| bool enable_fuse_conv_bias_with_z; | |||
| }; | |||
| ///////////////////////// graph record options /////////////////////////// | |||
| class GraphRecordOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| GraphRecordOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| size_t m_record_comp_seq; | |||
| bool const_shape; | |||
| bool fake_first; | |||
| bool no_sanity_check; | |||
| }; | |||
| ///////////////////////// memory optimize options ///////////////////////// | |||
| class MemoryOptimizeOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| MemoryOptimizeOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| bool disable_mem_opt; | |||
| uint64_t workspace_limit; | |||
| }; | |||
| ///////////////////////// other options for optimization ///////////////// | |||
| class JITOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| JITOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| bool enable_jit; | |||
| }; | |||
| ///////////////////////// TensorRT options for optimization ///////////////// | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| class TensorRTOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| TensorRTOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| std::string m_option_name; | |||
| bool enable_tensorrt; | |||
| std::string tensorrt_cache; | |||
| }; | |||
| #endif | |||
| } // namespace lar | |||
| @@ -0,0 +1,87 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/option_base.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <functional> | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include "megbrain/common.h" | |||
| #include "helpers/common.h" | |||
| #include "models/model.h" | |||
| namespace lar { | |||
| /*! | |||
| * \brief: base class of options | |||
| */ | |||
| class OptionBase { | |||
| public: | |||
| //! configure the model at different runtime stages | |||
| virtual void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) = 0; | |||
| //! get dependent options | |||
| virtual std::vector<std::string> depend_option() const { return {}; }; | |||
| //! get option name | |||
| virtual std::string option_name() const = 0; | |||
| virtual ~OptionBase() = default; | |||
| }; | |||
| /*! | |||
| * \brief: singleton option factory that registers options before the main function | |||
| */ | |||
| class OptionFactory { | |||
| public: | |||
| using OptionCreator = std::function<std::shared_ptr<OptionBase>()>; | |||
| using OptionMap = std::unordered_map<std::string, OptionCreator>; | |||
| //! get the singleton option factory | |||
| static OptionFactory& get_Instance() { | |||
| static OptionFactory instance; | |||
| return instance; | |||
| } | |||
| //! register an option creator into the option map | |||
| void registe_options(std::string name, OptionCreator creator) { | |||
| if (option_creator_map.count(name) == 0) { | |||
| option_creator_map[name] = creator; | |||
| } | |||
| } | |||
| //! get creator map | |||
| OptionMap* get_option_creator_map() { return &option_creator_map; } | |||
| private: | |||
| OptionFactory(){}; | |||
| OptionMap option_creator_map; | |||
| }; | |||
| } // namespace lar | |||
| #define REGIST_OPTION_CREATOR(name_, creator_) \ | |||
| struct OptionRegister_##name_ { \ | |||
| OptionRegister_##name_() { \ | |||
| lar::OptionFactory::get_Instance().registe_options(#name_, creator_); \ | |||
| } \ | |||
| }; \ | |||
| OptionRegister_##name_ name_; | |||
| #define CONFIG_MODEL_FUN \ | |||
| if (model->type() == ModelType::LITE_MODEL) { \ | |||
| config_model_internel<ModelLite>( \ | |||
| runtime_param, std::static_pointer_cast<ModelLite>(model)); \ | |||
| } else if (model->type() == ModelType::MEGDL_MODEL) { \ | |||
| config_model_internel<ModelMdl>( \ | |||
| runtime_param, std::static_pointer_cast<ModelMdl>(model)); \ | |||
| } | |||
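| // A minimal sketch of a new option built on the helpers above (MyOption and | |||
| // FLAGS_my_flag are hypothetical names, shown only to illustrate the flow): | |||
| // | |||
| //     class MyOption final : public lar::OptionBase { | |||
| //     public: | |||
| //         static bool is_valid() { return FLAGS_my_flag; } | |||
| //         static std::shared_ptr<OptionBase> create_option(); | |||
| //         void config_model( | |||
| //                 RuntimeParam& runtime_param, | |||
| //                 std::shared_ptr<ModelBase> model) override { | |||
| //             CONFIG_MODEL_FUN; | |||
| //         } | |||
| //         std::string option_name() const override { return "my_option"; } | |||
| //     private: | |||
| //         template <typename ModelImpl> | |||
| //         void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>) {} | |||
| //     }; | |||
| //     REGIST_OPTION_CREATOR(my_option, lar::MyOption::create_option); | |||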
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,401 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/plugin_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "plugin_options.h" | |||
| #include "misc.h" | |||
| #include "models/model_lite.h" | |||
| #include "models/model_mdl.h" | |||
| ///////////////////// Plugin options/////////////////////////// | |||
| namespace lar { | |||
| template <> | |||
| void PluginOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| LITE_ASSERT(range == 0, "lite model doesn't support NumRangeChecker plugin"); | |||
| LITE_ASSERT( | |||
| !enable_check_dispatch, | |||
| "lite model doesn't support CPUDispatchChecker plugin"); | |||
| LITE_ASSERT( | |||
| var_value_check_str.empty(), | |||
| "lite model doesn't support VarValueChecker plugin"); | |||
| } | |||
| #if MGB_ENABLE_JSON | |||
| else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| if (!profile_path.empty()) { | |||
| if (!enable_profile_host) { | |||
| LITE_WARN("enable profiling"); | |||
| model->get_lite_network()->enable_profile_performance(profile_path); | |||
| } else { | |||
| LITE_WARN("enable profiling for host"); | |||
| model->get_lite_network()->enable_profile_performance(profile_path); | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| } | |||
| template <> | |||
| void PluginOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto config = model->get_mdl_config(); | |||
| if (range > 0) { | |||
| mgb_log_warn("enable number range check"); | |||
| model->set_num_range_checker(float(range)); | |||
| } | |||
| if (enable_check_dispatch) { | |||
| mgb_log_warn("enable cpu dispatch check"); | |||
| cpu_dispatch_checker = | |||
| std::make_unique<mgb::CPUDispatchChecker>(config.comp_graph.get()); | |||
| } | |||
| if (!var_value_check_str.empty()) { | |||
| mgb_log_warn("enable variable value check"); | |||
| size_t init_idx = 0, switch_interval; | |||
| auto sep = var_value_check_str.find(':'); | |||
| if (sep != std::string::npos) { | |||
| switch_interval = std::stoul(var_value_check_str.substr(0, sep)); | |||
| init_idx = std::stoul(var_value_check_str.substr(sep + 1)); | |||
| } else { | |||
| switch_interval = std::stoul(var_value_check_str); | |||
| } | |||
| var_value_checker = std::make_unique<mgb::VarValueChecker>( | |||
| config.comp_graph.get(), switch_interval, init_idx); | |||
| } | |||
| #if MGB_ENABLE_JSON | |||
| if (!profile_path.empty()) { | |||
| if (!enable_profile_host) { | |||
| mgb_log_warn("enable profiling"); | |||
| } else { | |||
| mgb_log_warn("enable profiling for host"); | |||
| } | |||
| model->set_profiler(); | |||
| } | |||
| #endif | |||
| } | |||
| else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) { | |||
| #if MGB_ENABLE_JSON | |||
| if (!profile_path.empty()) { | |||
| mgb_log_warn("filename %s", profile_path.c_str()); | |||
| if (model->get_profiler()) { | |||
| model->get_profiler() | |||
| ->to_json_full(model->get_async_func().get()) | |||
| ->writeto_fpath(profile_path); | |||
| mgb_log_warn("profiling result written to %s", profile_path.c_str()); | |||
| } | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| } // namespace lar | |||
| using namespace lar; | |||
| PluginOption::PluginOption() { | |||
| m_option_name = "plugin"; | |||
| range = FLAGS_range; | |||
| enable_check_dispatch = FLAGS_check_dispatch; | |||
| var_value_check_str = FLAGS_check_var_value; | |||
| #if MGB_ENABLE_JSON | |||
| enable_profile_host = false; | |||
| if (!FLAGS_profile.empty()) { | |||
| profile_path = FLAGS_profile; | |||
| } | |||
| if (!FLAGS_profile_host.empty()) { | |||
| enable_profile_host = true; | |||
| profile_path = FLAGS_profile_host; | |||
| } | |||
| #endif | |||
| } | |||
| bool PluginOption::is_valid() { | |||
| bool ret = FLAGS_check_dispatch; | |||
| ret = ret || FLAGS_range > 0; | |||
| ret = ret || !FLAGS_check_var_value.empty(); | |||
| #if MGB_ENABLE_JSON | |||
| ret = ret || !FLAGS_profile.empty(); | |||
| ret = ret || !FLAGS_profile_host.empty(); | |||
| #endif | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> PluginOption::create_option() { | |||
| static std::shared_ptr<PluginOption> option(new PluginOption); | |||
| if (PluginOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void PluginOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
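| //! Illustrative invocation: --check_var_value accepts "interval" or | |||
| //! "interval:init_idx", matching the parsing above, e.g. | |||
| //!     load_and_run model.mge --check_var_value 10:0 | |||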
| ///////////////////// Debug options/////////////////////////// | |||
| namespace lar { | |||
| template <> | |||
| void DebugOption::format_and_print( | |||
| const std::string& tablename, std::shared_ptr<ModelLite> model) { | |||
| auto table = mgb::TextTable(tablename); | |||
| auto network = model->get_lite_network(); | |||
| table.padding(1); | |||
| table.align(mgb::TextTable::Align::Mid).add("type").add("name").add("shape").eor(); | |||
| auto to_string = [&](lite::Layout& layout) { | |||
| std::string shape("{"); | |||
| for (size_t i = 0; i < layout.ndim; i++) { | |||
| if (i) | |||
| shape.append(","); | |||
| shape.append(std::to_string(layout.shapes[i])); | |||
| } | |||
| shape.append("}"); | |||
| return shape; | |||
| }; | |||
| auto input_name = network->get_all_input_name(); | |||
| for (auto& i : input_name) { | |||
| auto layout = network->get_io_tensor(i)->get_layout(); | |||
| table.align(mgb::TextTable::Align::Mid) | |||
| .add("INPUT") | |||
| .add(i) | |||
| .add(to_string(layout)) | |||
| .eor(); | |||
| } | |||
| auto output_name = network->get_all_output_name(); | |||
| for (auto& i : output_name) { | |||
| auto layout = network->get_io_tensor(i)->get_layout(); | |||
| table.align(mgb::TextTable::Align::Mid) | |||
| .add("OUTPUT") | |||
| .add(i) | |||
| .add(to_string(layout)) | |||
| .eor(); | |||
| } | |||
| std::stringstream ss; | |||
| ss << table; | |||
| printf("%s\n\n", ss.str().c_str()); | |||
| } | |||
| template <> | |||
| void DebugOption::format_and_print( | |||
| const std::string& tablename, std::shared_ptr<ModelMdl> model) { | |||
| auto table = mgb::TextTable(tablename); | |||
| table.padding(1); | |||
| table.align(mgb::TextTable::Align::Mid).add("type").add("name").add("shape").eor(); | |||
| for (auto&& i : model->get_mdl_load_result().tensor_map) { | |||
| table.align(mgb::TextTable::Align::Mid) | |||
| .add("INPUT") | |||
| .add(i.first) | |||
| .add(i.second->shape().to_string()) | |||
| .eor(); | |||
| } | |||
| for (auto&& i : model->get_mdl_load_result().output_var_list) { | |||
| table.align(mgb::TextTable::Align::Mid) | |||
| .add("OUTPUT") | |||
| .add(i.node()->name()) | |||
| .add(i.shape().to_string()) | |||
| .eor(); | |||
| } | |||
| std::stringstream ss; | |||
| ss << table; | |||
| printf("%s\n\n", ss.str().c_str()); | |||
| } | |||
| template <> | |||
| void DebugOption::config_model_internel<ModelLite>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| LITE_ASSERT( | |||
| !disable_assert_throw, "lite model don't support disable assert throw"); | |||
| #ifndef __IN_TEE_ENV__ | |||
| #if MGB_ENABLE_JSON | |||
| LITE_ASSERT( | |||
| static_mem_log_dir_path.empty(), | |||
| "lite model don't support static memory information export"); | |||
| #endif | |||
| #endif | |||
| if (enable_verbose) { | |||
| LITE_WARN("enable verbose"); | |||
| lite::set_log_level(LiteLogLevel::DEBUG); | |||
| } | |||
| #if __linux__ || __unix__ | |||
| if (enable_wait_gdb) { | |||
| printf("wait for gdb attach (pid=%d): ", getpid()); | |||
| getchar(); | |||
| } | |||
| #endif | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { | |||
| if (enable_display_model_info) { | |||
| LITE_WARN("enable display model information"); | |||
| format_and_print<ModelLite>("Runtime Model Info", model); | |||
| } | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) { | |||
| if (enable_display_model_info) { | |||
| format_and_print<ModelLite>("Runtime Model Info", model); | |||
| } | |||
| } | |||
| } | |||
| template <> | |||
| void DebugOption::config_model_internel<ModelMdl>( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| auto config = model->get_mdl_config(); | |||
| if (enable_verbose) { | |||
| mgb_log_warn("enable verbose"); | |||
| mgb::set_log_level(mgb::LogLevel::DEBUG); | |||
| } | |||
| #if __linux__ || __unix__ | |||
| if (enable_wait_gdb) { | |||
| printf("wait for gdb attach (pid=%d): ", getpid()); | |||
| getchar(); | |||
| } | |||
| #endif | |||
| } else if (runtime_param.stage == RunStage::AFTER_OUTSPEC_SET) { | |||
| if (enable_display_model_info) { | |||
| mgb_log_warn("enable display model information"); | |||
| format_and_print<ModelMdl>("Runtime Model Info", model); | |||
| } | |||
| if (disable_assert_throw) { | |||
| mgb_log_warn("disable assert throw"); | |||
| auto on_opr = [](mgb::cg::OperatorNodeBase* opr) { | |||
| if (opr->same_type<mgb::opr::AssertEqual>()) { | |||
| opr->cast_final<mgb::opr::AssertEqual>().disable_throw_on_error(); | |||
| } | |||
| }; | |||
| mgb::cg::DepOprIter iter{on_opr}; | |||
| for (auto&& i : model->get_output_spec()) { | |||
| iter.add(i.first.node()->owner_opr()); | |||
| } | |||
| } | |||
| //! FIXME: the static memory dump below doesn't work for cpu-only builds | |||
| //! (nothing is dumped); megbrain/sdk origin code will assert(m_recorded) | |||
| //! in EventImplHelper::finished(); | |||
| #ifndef __IN_TEE_ENV__ | |||
| #if MGB_ENABLE_JSON | |||
| if (!static_mem_log_dir_path.empty()) { | |||
| mgb_log_warn("enable get static memeory information"); | |||
| model->get_async_func()->get_static_memory_alloc_info( | |||
| static_mem_log_dir_path); | |||
| } | |||
| #endif | |||
| #endif | |||
| } else if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) { | |||
| if (enable_display_model_info) { | |||
| format_and_print<ModelMdl>("Runtime Model Info", model); | |||
| } | |||
| } | |||
| } | |||
| } // namespace lar | |||
| DebugOption::DebugOption() { | |||
| m_option_name = "debug"; | |||
| enable_display_model_info = FLAGS_model_info; | |||
| enable_verbose = FLAGS_verbose; | |||
| disable_assert_throw = FLAGS_disable_assert_throw; | |||
| #if __linux__ || __unix__ | |||
| enable_wait_gdb = FLAGS_wait_gdb; | |||
| #endif | |||
| #ifndef __IN_TEE_ENV__ | |||
| #if MGB_ENABLE_JSON | |||
| static_mem_log_dir_path = FLAGS_get_static_mem_info; | |||
| #endif | |||
| #endif | |||
| } | |||
| bool DebugOption::is_valid() { | |||
| bool ret = FLAGS_model_info; | |||
| ret = ret || FLAGS_verbose; | |||
| ret = ret || FLAGS_disable_assert_throw; | |||
| #if __linux__ || __unix__ | |||
| ret = ret || FLAGS_wait_gdb; | |||
| #endif | |||
| #ifndef __IN_TEE_ENV__ | |||
| #if MGB_ENABLE_JSON | |||
| ret = ret || !FLAGS_get_static_mem_info.empty(); | |||
| #endif | |||
| #endif | |||
| return ret; | |||
| } | |||
| std::shared_ptr<OptionBase> DebugOption::create_option() { | |||
| static std::shared_ptr<DebugOption> option(new DebugOption); | |||
| if (DebugOption::is_valid()) { | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| void DebugOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| CONFIG_MODEL_FUN; | |||
| } | |||
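| //! Illustrative invocations (paths are placeholders): | |||
| //!     load_and_run model.mge --model_info --verbose | |||
| //!     load_and_run model.mge --get_static_mem_info ./static_mem_logs | |||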
| ///////////////////// Plugin gflags/////////////////////////// | |||
| DEFINE_double( | |||
| range, 0, | |||
| "check whether absolute value of all numbers in computing graph " | |||
| "is in the given range"); | |||
| DEFINE_bool( | |||
| check_dispatch, false, | |||
| "check whether an operator call dispatch on cpu comp nodes"); | |||
| DEFINE_string( | |||
| check_var_value, "", | |||
| "--check-var-value [interval]|[interval:init_idx], Enable " | |||
| "VarValueChecker plugin. Refer to its doc for more details"); | |||
| #if MGB_ENABLE_JSON | |||
| DEFINE_string( | |||
| profile, "", | |||
| "Write profiling result to given file. The output file is in " | |||
| "JSON format"); | |||
| DEFINE_string(profile_host, "", "focus on host time profiling for some backends"); | |||
| #endif | |||
| ///////////////////// Debug gflags/////////////////////////// | |||
| DEFINE_bool( | |||
| model_info, false, | |||
| " Format and display model input/output tensor inforamtion"); | |||
| DEFINE_bool(verbose, false, "get more inforamtion for debug"); | |||
| DEFINE_bool(disable_assert_throw, false, "disable assert throw on error check"); | |||
| #if __linux__ || __unix__ | |||
| DEFINE_bool(wait_gdb, false, "print current process PID and wait for gdb attach"); | |||
| #endif | |||
| #ifndef __IN_TEE_ENV__ | |||
| #if MGB_ENABLE_JSON | |||
| DEFINE_string( | |||
| get_static_mem_info, "", | |||
| "Record the static computing graph's static memory information"); | |||
| #endif | |||
| #endif | |||
| REGIST_OPTION_CREATOR(plugin, lar::PluginOption::create_option); | |||
| REGIST_OPTION_CREATOR(debug, lar::DebugOption::create_option); | |||
| @@ -0,0 +1,105 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/plugin_options.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #if __linux__ || __unix__ | |||
| #include <unistd.h> | |||
| #endif | |||
| #include "megbrain/plugin/cpu_dispatch_checker.h" | |||
| #include "megbrain/plugin/var_value_checker.h" | |||
| #include "helpers/common.h" | |||
| #include "helpers/text_table.h" | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| DECLARE_bool(check_dispatch); | |||
| DECLARE_double(range); | |||
| DECLARE_string(check_var_value); | |||
| #if MGB_ENABLE_JSON | |||
| DECLARE_string(profile); | |||
| DECLARE_string(profile_host); | |||
| #endif | |||
| DECLARE_bool(model_info); | |||
| DECLARE_bool(verbose); | |||
| DECLARE_bool(disable_assert_throw); | |||
| #if __linux__ || __unix__ | |||
| DECLARE_bool(wait_gdb); | |||
| #endif | |||
| #ifndef __IN_TEE_ENV__ | |||
| #if MGB_ENABLE_JSON | |||
| DECLARE_string(get_static_mem_info); | |||
| #endif | |||
| #endif | |||
| namespace lar { | |||
| class PluginOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| PluginOption(); | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| double range; | |||
| bool enable_check_dispatch; | |||
| #if MGB_ENABLE_JSON | |||
| bool enable_profile_host; | |||
| std::string profile_path; | |||
| #endif | |||
| std::string var_value_check_str; | |||
| std::string m_option_name; | |||
| std::unique_ptr<mgb::VarValueChecker> var_value_checker; | |||
| std::unique_ptr<mgb::CPUDispatchChecker> cpu_dispatch_checker; | |||
| }; | |||
| class DebugOption final : public OptionBase { | |||
| public: | |||
| static bool is_valid(); | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| DebugOption(); | |||
| template <typename ModelImpl> | |||
| void format_and_print(const std::string&, std::shared_ptr<ModelImpl>){}; | |||
| template <typename ModelImpl> | |||
| void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>){}; | |||
| bool enable_display_model_info; | |||
| bool enable_verbose; | |||
| bool disable_assert_throw; | |||
| #if __linux__ || __unix__ | |||
| bool enable_wait_gdb; | |||
| #endif | |||
| #ifndef __IN_TEE_ENV__ | |||
| #if MGB_ENABLE_JSON | |||
| std::string static_mem_log_dir_path; | |||
| #endif | |||
| #endif | |||
| std::string m_option_name; | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,96 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/strategy_options.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "strategy_options.h" | |||
| #include "models/model_mdl.h" | |||
| using namespace lar; | |||
| DECLARE_bool(c_opr_lib_with_param); | |||
| StrategyOption::StrategyOption() { | |||
| m_option_name = "run_strategy"; | |||
| warmup_iter = FLAGS_warmup_iter; | |||
| run_iter = FLAGS_iter; | |||
| threads = FLAGS_thread; | |||
| } | |||
| std::shared_ptr<OptionBase> StrategyOption::create_option() { | |||
| static std::shared_ptr<StrategyOption> option(new StrategyOption); | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } | |||
| void StrategyOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) { | |||
| model->set_shared_mem(FLAGS_share_param_mem); | |||
| runtime_param.warmup_iter = warmup_iter; | |||
| runtime_param.run_iter = run_iter; | |||
| runtime_param.threads = threads; | |||
| runtime_param.testcase_num = 1; | |||
| } else if (runtime_param.stage == RunStage::BEFORE_OUTSPEC_SET) { | |||
| if (model->type() == ModelType::MEGDL_MODEL) { | |||
| auto model_ptr = std::static_pointer_cast<ModelMdl>(model); | |||
| auto num = model_ptr->get_testcase_num(); | |||
| if (num != 0) | |||
| runtime_param.testcase_num = num; | |||
| model_ptr->make_output_spec(); | |||
| } | |||
| } | |||
| } | |||
| TestcaseOption::TestcaseOption() { | |||
| m_option_name = "run_testcase"; | |||
| } | |||
| std::shared_ptr<OptionBase> TestcaseOption::create_option() { | |||
| static std::shared_ptr<TestcaseOption> option(new TestcaseOption); | |||
| return std::static_pointer_cast<OptionBase>(option); | |||
| } | |||
| void TestcaseOption::config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) { | |||
| if (model->type() == ModelType::MEGDL_MODEL) { | |||
| auto model_ptr = std::static_pointer_cast<ModelMdl>(model); | |||
| if (model_ptr->get_testcase_num() && !FLAGS_c_opr_lib_with_param) { | |||
| if (runtime_param.stage == RunStage::MODEL_RUNNING) { | |||
| auto load_result = model_ptr->get_mdl_load_result(); | |||
| auto input_tensor = model_ptr->get_test_input(); | |||
| auto loader = model_ptr->reset_loader(); | |||
| auto testcase = loader->load(model_ptr->get_mdl_config(), false); | |||
| mgb_assert(testcase.output_var_list.size() == input_tensor.size()); | |||
| for (size_t i = 0; i < input_tensor.size(); ++i) { | |||
| auto&& opr = | |||
| testcase.output_var_list[i] | |||
| .node() | |||
| ->owner_opr() | |||
| ->cast_final_safe<mgb::opr::SharedDeviceTensor>(); | |||
| input_tensor[i].second->copy_from( | |||
| mgb::HostTensorND::make_proxy(*opr.dev_data())); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| DEFINE_int32(iter, 10, "number of iterations for running the model"); | |||
| DEFINE_int32(warmup_iter, 1, "number of warm-up iterations before the timed run"); | |||
| DEFINE_int32( | |||
| thread, 1, | |||
| "number of threads used to run the model when <thread> is supported (NOTE: " | |||
| "this is not a device setting, just for load_and_run)"); | |||
| DEFINE_bool(share_param_mem, false, "load model from shared memory"); | |||
| REGIST_OPTION_CREATOR(run_strategy, lar::StrategyOption::create_option); | |||
| REGIST_OPTION_CREATOR(run_testcase, lar::TestcaseOption::create_option); | |||
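| //! Illustrative invocation (numbers are placeholders): | |||
| //!     load_and_run model.mge --warmup_iter 5 --iter 50 --thread 4 | |||
| //! i.e. 5 warm-up iterations, then 50 timed iterations on each of 4 threads. | |||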
| @@ -0,0 +1,68 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/options/strategy_options.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <gflags/gflags.h> | |||
| #include "models/model.h" | |||
| #include "option_base.h" | |||
| DECLARE_int32(iter); | |||
| DECLARE_int32(warmup_iter); | |||
| DECLARE_int32(thread); | |||
| DECLARE_bool(share_param_mem); | |||
| namespace lar { | |||
| /*! | |||
| * \brief: strategy option for running model | |||
| */ | |||
| class StrategyOption final : public OptionBase { | |||
| public: | |||
| //! create the option when it is used | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| //! config the model, dispatching configuration to different model implementations | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| //! get option name | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| //! Constructor | |||
| StrategyOption(); | |||
| //! configuration for different model implementations | |||
| std::string m_option_name; | |||
| size_t warmup_iter; //! number of warm-up iterations before running the model | |||
| size_t run_iter; //! number of iterations for running the model | |||
| size_t threads; //! number of threads for running the model (NOTE: it's | |||
| //! different from a multithread device) | |||
| }; | |||
| class TestcaseOption final : public OptionBase { | |||
| public: | |||
| //! create the option when it is used | |||
| static std::shared_ptr<OptionBase> create_option(); | |||
| //! config the model, dispatching configuration to different model implementations | |||
| void config_model( | |||
| RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override; | |||
| //! get option name | |||
| std::string option_name() const override { return m_option_name; }; | |||
| private: | |||
| //! Constructor | |||
| TestcaseOption(); | |||
| //! configuration for different model implementations | |||
| std::string m_option_name; | |||
| }; | |||
| } // namespace lar | |||
| @@ -0,0 +1,24 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/strategys/strategy.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "strategy.h" | |||
| #include <iostream> | |||
| using namespace lar; | |||
| std::shared_ptr<StrategyBase> StrategyBase::create_strategy(std::string model_path) { | |||
| if (FLAGS_fitting) { | |||
| return std::make_shared<FittingStrategy>(model_path); | |||
| } else { | |||
| return std::make_shared<NormalStrategy>(model_path); | |||
| } | |||
| } | |||
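| //! A minimal sketch of the intended call site (the surrounding main() is | |||
| //! assumed, not part of this file): | |||
| //!     auto strategy = lar::StrategyBase::create_strategy(model_path); | |||
| //!     strategy->run(); | |||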
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,63 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/strategys/strategy.h | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #pragma once | |||
| #include <gflags/gflags.h> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include "helpers/common.h" | |||
| #include "models/model.h" | |||
| #include "options/option_base.h" | |||
| DECLARE_bool(fitting); | |||
| namespace lar { | |||
| /*! | |||
| * \brief: load and run strategy base class | |||
| */ | |||
| class StrategyBase { | |||
| public: | |||
| static std::shared_ptr<StrategyBase> create_strategy(std::string model_path); | |||
| virtual void run() = 0; | |||
| virtual ~StrategyBase() = default; | |||
| RuntimeParam m_runtime_param; | |||
| std::unordered_map<std::string, std::shared_ptr<OptionBase>> m_options; | |||
| }; | |||
| /*! | |||
| * \brief: normal strategy for running | |||
| */ | |||
| class NormalStrategy : public StrategyBase { | |||
| public: | |||
| NormalStrategy(std::string model_path); | |||
| //! run model with runtime parameter | |||
| void run() override; | |||
| private: | |||
| //! run the model subline; invoked once per thread | |||
| void run_subline(); | |||
| std::string m_model_path; | |||
| }; | |||
| /*! | |||
| * \brief: Fitting strategy for running | |||
| */ | |||
| class FittingStrategy : public StrategyBase { | |||
| public: | |||
| FittingStrategy(std::string model_path); | |||
| void run() override; | |||
| }; | |||
| } // namespace lar | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -0,0 +1,24 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/strategys/strategy_fitting.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include "strategy.h" | |||
| using namespace lar; | |||
| FittingStrategy::FittingStrategy(std::string) { | |||
| mgb_assert(false, "this version doesn't support the fitting strategy"); | |||
| } | |||
| void FittingStrategy::run() { | |||
| mgb_assert(false, "this version doesn't support the fitting strategy"); | |||
| } | |||
| DEFINE_bool( | |||
| fitting, false, | |||
| "whether to use the fitting model, which will auto profile and get " | |||
| "the best option set!"); | |||
| @@ -0,0 +1,167 @@ | |||
| /** | |||
| * \file lite/load_and_run/src/strategys/strategy_normal.cpp | |||
| * | |||
| * This file is part of MegEngine, a deep learning framework developed by | |||
| * Megvii. | |||
| * | |||
| * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
| */ | |||
| #include <iostream> | |||
| #include <thread> | |||
| #include "megbrain/common.h" | |||
| #include "megbrain/utils/timer.h" | |||
| #include "megbrain/version.h" | |||
| #include "megdnn/version.h" | |||
| #include "misc.h" | |||
| #include "strategy.h" | |||
| using namespace lar; | |||
| NormalStrategy::NormalStrategy(std::string model_path) { | |||
| mgb::set_log_level(mgb::LogLevel::WARN); | |||
| lite::set_log_level(LiteLogLevel::WARN); | |||
| m_model_path = model_path; | |||
| auto option_creator_map = OptionFactory::get_Instance().get_option_creator_map(); | |||
| mgb_log_debug("option map size: %lu", option_creator_map->size()); | |||
| auto construct_option = [&](std::string name) -> void { | |||
| auto& creator = (*option_creator_map)[name]; | |||
| auto option = creator(); | |||
| if (option) { | |||
| m_options.insert({name, option}); | |||
| } | |||
| }; | |||
| for (auto& creator : *option_creator_map) { | |||
| auto name = creator.first; | |||
| if (m_options.count(name) == 0) { | |||
| construct_option(name); | |||
| } | |||
| } | |||
| } | |||
| void NormalStrategy::run_subline() { | |||
| auto model = ModelBase::create_model(m_model_path); | |||
| mgb_assert(model != nullptr, "create model failed!!"); | |||
| auto stage_config_model = [&]() { | |||
| for (auto& option : m_options) { | |||
| option.second->config_model(m_runtime_param, model); | |||
| } | |||
| }; | |||
| //! execute before load config | |||
| m_runtime_param.stage = RunStage::BEFORE_MODEL_LOAD; | |||
| stage_config_model(); | |||
| mgb::RealTimer timer; | |||
| model->load_model(); | |||
| printf("load model: %.3fms\n", timer.get_msecs_reset()); | |||
| //! after load configure | |||
| m_runtime_param.stage = RunStage::AFTER_MODEL_LOAD; | |||
| stage_config_model(); | |||
| m_runtime_param.stage = RunStage::BEFORE_OUTSPEC_SET; | |||
| stage_config_model(); | |||
| //! for the static memory information option | |||
| m_runtime_param.stage = RunStage::AFTER_OUTSPEC_SET; | |||
| stage_config_model(); | |||
| auto warm_up = [&]() { | |||
| auto warmup_num = m_runtime_param.warmup_iter; | |||
| for (size_t i = 0; i < warmup_num; i++) { | |||
| printf("=== prepare: %.3fms; going to warmup\n\n", timer.get_msecs_reset()); | |||
| model->run_model(); | |||
| model->wait(); | |||
| printf("warm up %lu %.3fms\n", i, timer.get_msecs_reset()); | |||
| m_runtime_param.stage = RunStage::AFTER_RUNNING_WAIT; | |||
| stage_config_model(); | |||
| } | |||
| }; | |||
| auto run_iter = [&](int idx) { | |||
| double time_sqrsum = 0, time_sum = 0, | |||
| min_time = std::numeric_limits<double>::max(), max_time = 0; | |||
| auto run_num = m_runtime_param.run_iter; | |||
| for (size_t i = 0; i < run_num; i++) { | |||
| timer.reset(); | |||
| model->run_model(); | |||
| auto exec_time = timer.get_msecs(); | |||
| model->wait(); | |||
| m_runtime_param.stage = RunStage::AFTER_RUNNING_WAIT; | |||
| stage_config_model(); | |||
| auto cur = timer.get_msecs(); | |||
| printf("iter %lu/%lu: %.3fms (exec=%.3fms)\n", i, run_num, cur, exec_time); | |||
| time_sum += cur; | |||
| time_sqrsum += cur * cur; | |||
| fflush(stdout); | |||
| min_time = std::min(min_time, cur); | |||
| max_time = std::max(max_time, cur); | |||
| } | |||
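| //! the sqrt term below is the sample standard deviation of the per-iter | |||
| //! time: sqrt((n * sum(t^2) - sum(t)^2) / (n * (n - 1))) | |||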
| printf("\n=== finished test #%u: time=%.3fms avg_time=%.3fms " | |||
| "sexec=%.3fms min=%.3fms max=%.3fms\n\n", | |||
| idx, time_sum, time_sum / run_num, | |||
| std::sqrt( | |||
| (time_sqrsum * run_num - time_sum * time_sum) / | |||
| (run_num * (run_num - 1))), | |||
| min_time, max_time); | |||
| return time_sum; | |||
| }; | |||
| //! model with testcase | |||
| size_t iter_num = m_runtime_param.testcase_num; | |||
| double tot_time = 0; | |||
| for (size_t idx = 0; idx < iter_num; idx++) { | |||
| //! config when running model | |||
| mgb_log_warn("run testcase: %zu ", idx); | |||
| m_runtime_param.stage = RunStage::MODEL_RUNNING; | |||
| stage_config_model(); | |||
| if (!idx) { | |||
| warm_up(); | |||
| } | |||
| tot_time += run_iter(idx); | |||
| m_runtime_param.stage = RunStage::AFTER_RUNNING_ITER; | |||
| stage_config_model(); | |||
| } | |||
| printf("=== total time: %.3fms\n", tot_time); | |||
| //! execute after run | |||
| m_runtime_param.stage = RunStage::AFTER_MODEL_RUNNING; | |||
| stage_config_model(); | |||
| }; | |||
| void NormalStrategy::run() { | |||
| auto v0 = mgb::get_version(); | |||
| auto v1 = megdnn::get_version(); | |||
| printf("megbrain/lite/load_and_run:\nusing MegBrain " | |||
| "%d.%d.%d(%d) and MegDNN %d.%d.%d\n", | |||
| v0.major, v0.minor, v0.patch, v0.is_dev, v1.major, v1.minor, v1.patch); | |||
| size_t thread_num = m_runtime_param.threads; | |||
| auto run_sub = [&]() { run_subline(); }; | |||
| if (thread_num == 1) { | |||
| run_sub(); | |||
| } else if (thread_num > 1) { | |||
| #if MGB_HAVE_THREAD | |||
| std::vector<std::thread> threads; | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| threads.emplace_back(run_sub); | |||
| } | |||
| for (auto&& i : threads) { | |||
| i.join(); | |||
| } | |||
| #else | |||
| mgb_log_error( | |||
| "%zu threads requested, but load_and_run was compiled " | |||
| "without <thread> support.", | |||
| thread_num); | |||
| #endif | |||
| } else { | |||
| mgb_assert(false, "--thread must input a positive number!!"); | |||
| } | |||
| //! execute before run | |||
| } | |||