diff --git a/ci/compatibility/fbs/V2-backup/dtype.fbs b/ci/compatibility/fbs/V2-backup/dtype.fbs new file mode 100644 index 00000000..a387d05c --- /dev/null +++ b/ci/compatibility/fbs/V2-backup/dtype.fbs @@ -0,0 +1,43 @@ +namespace mgb.serialization.fbs; + +// Keep in sync with dnn/include/megdnn/dtype.h +// Please only add new dtypes at the end of this list +enum DTypeEnum : byte { + Float32, + Uint8, + Int8, + Int16, + Int32, + IntB1, + IntB2, + IntB4, + Byte, + Float16, + UintB4, + Quantized8Asymm, + QuantizedS32, + QuantizedS8, + Quantized4Asymm, + QuantizedS4, + QuantizedS16, + BFloat16, + Bool, + Uint16, + QuantizedS1, +} + +table LinearQuantizationParam { + scale:float; + + // Won't be set for symmetric quantization types + zero_point:ubyte; +} + +union DTypeParam { + LinearQuantizationParam, +} + +table DType { + type:DTypeEnum; + param:DTypeParam; +} diff --git a/ci/compatibility/fbs/V2-backup/mgb_cpp_opr.fbs b/ci/compatibility/fbs/V2-backup/mgb_cpp_opr.fbs new file mode 100644 index 00000000..48dfe44c --- /dev/null +++ b/ci/compatibility/fbs/V2-backup/mgb_cpp_opr.fbs @@ -0,0 +1,62 @@ +include "dtype.fbs"; + +namespace mgb.serialization.fbs.param; + +struct PersistentDTypeScalar { + dtype:DTypeEnum; + storage:[ubyte:4]; +} + +table MGBAddUpdate { + alpha:PersistentDTypeScalar; + beta:PersistentDTypeScalar; + bias:PersistentDTypeScalar; +} + +table Host2DeviceCopy { + enable_value_infer:bool = true; + dump_default_value:bool = false; + allow_cpu_mem_fwd:bool = true; +} + +table Dimshuffle { + pattern:[int]; + ndim:uint; +} + +enum AxisDescMethod : byte { + ADD_1, + REMOVE, +} + +struct AxisDesc { + method:AxisDescMethod; + axis:int; +} + +table AxisAddRemove { + desc:[AxisDesc]; +} + +table MGBSleep { + device:bool = true; + host:bool = false; + seconds:double; +} + +struct IndexDescMaskItem { + axis:byte; + begin:bool; + end:bool; + step:bool; + idx:bool; +} + +table IndexDescMaskDump { + items:[IndexDescMaskItem]; +} + +table NMSKeep { + iou_thresh:float; + max_output:uint; +} diff --git a/ci/compatibility/fbs/V2-backup/mgb_opr_param_defs.fbs b/ci/compatibility/fbs/V2-backup/mgb_opr_param_defs.fbs new file mode 100644 index 00000000..f21634b2 --- /dev/null +++ b/ci/compatibility/fbs/V2-backup/mgb_opr_param_defs.fbs @@ -0,0 +1,237 @@ +// generated by gen_param_defs.py for c23d51f3c4f33119fd74f58f04d112ccea8f64f1249ab372300975ab7e710e9a +include "dtype.fbs"; +namespace mgb.serialization.fbs.param; + +/// mode of collective communication +enum CollectiveCommMode : uint { + /// reduce by sum to output computing node + REDUCE_SUM = 0, + /// copy input value to each output computing node + BROADCAST = 1, + /// each output comp node gets the concatenated value of all inputs + ALL_GATHER = 2, + /// reduce inputs by sum and each output gets one part of it + REDUCE_SCATTER_SUM = 3, + /// every output gets the sum of all inputs + ALL_REDUCE_SUM = 4, + /// every output gets the max of all inputs + ALL_REDUCE_MAX = 5, + /// every output gets the min of all inputs + ALL_REDUCE_MIN = 6, + /// every output gets the prod of all inputs + ALL_REDUCE_PROD = 7, + /// concat inputs to one node + GATHER = 8, + /// scatter input to each output computing node + SCATTER = 9, + /// scatter inputs and gather them on each computing node + ALL_TO_ALL = 10, +} + +/// mode for computing the gradient +enum CondExecMarkGradMode : uint { + /// normal gradient mode: sum all the activated components + SUM = 0, + /// use :attr:`CondExecMerge.SUM_COND_OUT` mode so oprs that depend on the + /// gradient opr would not 
be executed if the forward var is not used. + SUM_COND_OUT = 1, +} + +/// static inference option. **Note:** This is a workaround: since +/// currently static inference in MegBrain does not take conditional +/// execution into account, this option can be used to bypass static +/// inference errors. This is currently only used by automatically +/// generated gradient oprs. +enum CondExecMarkStaticInfer : uint { + /// enable both shape and value inference + SHAPE_VALUE = 0, + /// only enable shape inference (disable value inference) + SHAPE_ONLY = 1, + /// disable both shape and value inference + NONE = 2, +} + +enum CondExecMergeMode : uint { + /// copy the var whose mask is activated to the output, requiring that + /// exactly one branch is active + EXACT_ONE = 0, + /// like :attr:`EXACT_ONE` with the requirement that all branches have the + /// same shape, so shape inference can be easier + EXACT_ONE_SAME_SHAPE = 1, + /// sum all the active branches into output var; require all branches to + /// have the same shape. Extra shape vars are needed in this mod, so the + /// outputs can be initialized to zero when no input is active (and their + /// shapes are probably unknown). + SUM = 2, + /// like :attr:`SUM` but also add an ExecutionMask to the readers of output + /// vars, so they would be skipped if no branch is taken + SUM_COND_OUT = 3, +} + +/// how to compare predicate var with branch keys +enum CondExecPredMode : uint { + /// The outputs correspond to branch keys, and the one which equals + /// predicate would be activated. This behaves like a case-statement in many + /// languages. + CASE = 0, + /// like :attr:`CASE`, but add an extra output that would be activated if no + /// branch is matched + CASE_FALLBACK = 1, + /// One more outputs would be produced than the number of branch keys, + /// representing the interval in which the predicate var fits in. The + /// intervals are defined as :math:`(-\\infty, k_0), [k_0, k_1), \\ldots, + /// [k_{n-2}, k_{n-1}), [k_{n-1}, \infty)`. The keys must be given in + /// ascending order. + PIECEWISE = 2, +} + +enum CondExecPredLogicalMode : uint { + /// logical or + OR = 0, + /// logical and + AND = 1, + /// exclusive-or + XOR = 2, + /// not or(inputs) + NOR = 3, + /// not and(inputs) + NAND = 4, + /// not xor(inputs) + XNOR = 5, +} + +enum ExecutionPolicyStrategy : uint (bit_flags) { + /// use heuristic to choose the fastest algorithm + HEURISTIC = 0, + /// run possible algorithms on real device to find the best + PROFILE = 1, + /// when profile or heuristic algo selection it require the algosmust be + /// reproducible + REPRODUCIBLE = 2, + /// profile require algos are optmized to achieve fast-profile + OPTIMIZED = 3, +} + +enum ExecutionPolicyV0Strategy : uint { + /// use heuristic to choose the fastest algorithm + HEURISTIC = 0, + /// use heuristic to choose the fastest algorithm, and the chosen algorithm + /// is reproducible + HEURISTIC_REPRODUCIBLE = 1, + /// run possible algorithms on real device to find the best + PROFILE = 2, + /// the fastest of profile result that is also reproducible + PROFILE_REPRODUCIBLE = 3, + /// use profile result and heuristic to choose the fastest algorithm + PROFILE_HEURISTIC = 4, +} + +table DType { + dtype:DTypeEnum = Byte; +} + +table PersistentOutputStorage { + /// This is used for controlling memory sharing. Multiple + /// ``PersistentOutputStorage'' oprs with the same ``share_key'' would share + /// underlying tensor storage. 
Note that the value ``-1'' is treated + /// specially: storage of oprs with this key would be private and would not + /// be shared with any other opr. + share_key:int = -1; +} + +/// optinal axis: axis == -1 means no axis +table OptionalAxis { + axis:int = -1; +} + +/// optinal axis: axis == MAX_NDIM means no axis +table OptionalAxisV1 { + axis:int = 7; +} + +table ExecutionPolicyV0 { + strategy:ExecutionPolicyV0Strategy = HEURISTIC; + /// workspace limit in bytes + workspace_limit:ulong = 18446744073709551615; +} + +/// specify how to select an algorithm for an operator +table ExecutionPolicy { + strategy:ExecutionPolicyStrategy = 1; + /// workspace limit in bytes + workspace_limit:ulong = 18446744073709551615; +} + +table AssertEqual { + /// max allowed error; error is defined as the minimal of absolute and + /// relative error + maxerr:float = 0.0001; + /// whether to print maxerr to stdout during opr exec + verbose:bool = false; +} + +table FpgaConv { + need_output_quantize:bool = false; + need_output_threshold:bool = false; + stride:int = 1; + input_bit_width:int = 2; + output_bit_width:int = 2; + weight_bit_width:int = 2; + thres0:int = 0; + thres1:int = 1; + unpool_size:uint = 4; + direct_size:uint = 4; +} + +/// collective communication between multiple computing nodes on localhost +table CollectiveComm { + /// mode of collective communication + mode:CollectiveCommMode = REDUCE_SUM; +} + +/// HACK: The tag of this param def is actually used for another non-generated +/// param def SerializedDType, the sole purpose of this param def is to provide +/// a spare tag. Do not use. +table FakeSerializedDType { +} + +/// evaluate a predicate and branch keys to setup ExecutionMask objects with +/// associated predicate proxy vars (PPVs) +table CondExecPred { + /// how to compare predicate var with branch keys + mode:CondExecPredMode = CASE; + /// threshold for checking equality of float point values + eps:float = 0.0001; +} + +/// compute a logical function over a set of PPVs +table CondExecPredLogical { + mode:CondExecPredLogicalMode = OR; +} + +/// add ExecutionMask of the input PPV to this opr and readers of the outputs of +/// this opr +table CondExecMark { + /// mode for computing the gradient + grad_mode:CondExecMarkGradMode = SUM; + /// static inference option. **Note:** This is a workaround: since + /// currently static inference in MegBrain does not take conditional + /// execution into account, this option can be used to bypass static + /// inference errors. This is currently only used by automatically + /// generated gradient oprs. + static_infer:CondExecMarkStaticInfer = SHAPE_VALUE; +} + +/// merge multiple conditional execution branches +table CondExecMerge { + /// number of output vars (i.e. vars per branch) + nr_output:uint = 1; + mode:CondExecMergeMode = EXACT_ONE; +} + +/// opr Implements NVIDIA Optical Flow SDK. +table NvOf { + precision:uint = 1; +} + + diff --git a/ci/compatibility/fbs/V2-backup/opr_param_defs.fbs b/ci/compatibility/fbs/V2-backup/opr_param_defs.fbs new file mode 100644 index 00000000..1ef3ac1a --- /dev/null +++ b/ci/compatibility/fbs/V2-backup/opr_param_defs.fbs @@ -0,0 +1,1912 @@ +// generated by gen_param_defs.py for 53ca6252b5b9568f67b9767fb4fd0d2ef6b717b28a861692e9105d5f796a9472 +include "dtype.fbs"; +namespace mgb.serialization.fbs.param; + +enum ArgsortOrder : uint { + ASCENDING = 0, + DESCENDING = 1, +} + +enum BNFwdMode : uint { + /// Training phase. + TRAINING = 0, + /// Inference phase. 
+ INFERENCE = 1, +} + +enum BNParamDim : uint { + /// Dim of params (Sigma, Mu) is 1 x 1 x H x W + DIM_11HW = 0, + /// Dim of params (Sigma, Mu) is 1 x C x H x W + DIM_1CHW = 1, + /// Dim of params (Sigma, Mu) is 1 x C x 1 x 1 + DIM_1C11 = 2, + /// Dim of params (Sigma, Mu) is 1 x 1 x 1 x C + DIM_111C = 3, +} + +enum CondTakeMode : uint { + /// take if ``abs(data-val)<eps`` + EQ = 0, + /// take if ``abs(data-val)>=eps`` + NEQ = 1, + /// take if ``data<val`` + LT = 2, + /// take if ``data<=val`` + LEQ = 3, + /// take if ``data>val`` + GT = 4, + /// take if ``data>=val`` + GEQ = 5, +} + +enum Conv3DBiasNonlineMode : uint { + IDENTITY = 0, + RELU = 1, + SIGMOID = 2, +} + +enum ConvBiasV0NonlineMode : uint { + IDENTITY = 0, + RELU = 1, + SIGMOID = 2, + H_SWISH = 3, +} + +enum ConvPoolingMethod : uint { + WITH_TEXTURE_OBJ = 0, + WITH_SHARED_MEM = 1, +} + +enum ConvPoolingNonlineMode : uint { + IDENTITY = 0, + RELU = 1, + SIGMOID = 2, +} + +enum ConvPoolingPoolMode : uint { + AVERAGE = 0, + MAX_ = 1, +} + +/// convolution data/filter/output format; see :class:`RelayoutFormat` for more +/// details +enum ConvolutionFormat : uint { + NCHW = 0, + NHWC = 1, + NHWCD4 = 2, + NCHW4 = 3, + NCHW8 = 4, + NCHW32 = 5, + NCHW88 = 6, + NCHW44 = 7, + NCHW44_DOT = 8, + /// NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is + /// nchw32 layout + NCHW4_NCHW32 = 9, + /// NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is + /// nchw4 layout + NCHW32_NCHW4 = 10, + /// NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw + /// layout + NCHW4_NCHW = 11, + /// NHWC_NCHW means input tensors are nhwc layout, output tensor is nchw + /// layout + NHWC_NCHW = 12, + /// NHWC_NCHW4_IC_SMALL means input tensors are nhwc(c < 4) layout, output + /// tensor is nchw4 layout, padding c=4 + NHWC_NCHW4_IC_SMALL = 13, + /// NCHW_NCHW4_IC_SMALL means input tensors are nchw(c < 4) layout, output + /// tensor is nchw4 layout, padding c=4 + NCHW_NCHW4_IC_SMALL = 14, + /// CHWN4 is currently only used on Nvidia platform for fast implementation + /// of convolution using CUDA/SASS. The channels are splitted to groups of 4 + /// channels. + CHWN4 = 15, + /// NCHW64 is designed for convolution implementation to utilizing + /// TensorCore instructions for 4-bit integers on Nvidia platforms + NCHW64 = 16, + /// NCHW4_NHWC means input tensors are nchw4 layout, output tensor is nhwc + /// layout + NCHW4_NHWC = 17, +} + +enum Convolution3DDataType : uint { + /// input/output both float32/float16 + FLOAT = 0, + /// input/output both float16, the internal compute is float32 + FLOAT_IO16xC32 = 1, +} + +enum Convolution3DFormat : uint { + NCDHW = 0, + NDHWC = 1, +} + +enum Convolution3DMode : uint { + CROSS_CORRELATION = 0, + CONVOLUTION = 1, +} + +enum Convolution3DSparse : uint { + /// dense convolution: filter shape should be [oc, ic, spatial...] if format + /// is NCDHW, [oc, spatial..., ic] if format is NDHWC + DENSE = 0, + /// group convolution: filter shape should be [group, oc_per_group, + /// ic_per_group, spatial...]
if format is NCDHW, [group, oc_per_group, + /// spatial..., ic_per_group] if format is NDHWC + GROUP = 1, +} + +enum ConvolutionV0DataType : uint { + /// input/output both float32/float16 + FLOAT = 0, + INT8x8x16 = 1, + INT8x8x32 = 2, + /// input/output both float16, the internal compute is float32 + FLOAT_IO16xC32 = 3, + /// input QuantizedAsymm8, output QuantizedS32 + QUINT8x8x32 = 4, + /// input int8, output specified by tensor DType + INT8x8xX = 5, + /// input QuantizedAsymm4, output QuantizedS32 + QUINT4x4x32 = 6, +} + +/// convolution data/filter/output format; see :class:`RelayoutFormat` for more +/// details +enum ConvolutionV0Format : uint { + NCHW = 0, + NHWC = 1, + NHWCD4 = 2, + NCHW4 = 3, + NCHW8 = 4, + NCHW32 = 5, + NCHW88 = 6, + NCHW44 = 7, + NCHW44_DOT = 8, + /// NCHW layout with weights tranformed by winograd + NCHW_WINOGRAD = 9, + /// NCHW88 layout with weights tranformed by winograd + NCHW88_WINOGRAD = 10, + /// NCHW44 layout with weights tranformed by winograd + NCHW44_WINOGRAD = 11, + /// NCHW4_NCHW32 means input tensors are nchw4 layout, output tensor is + /// nchw32 layout + NCHW4_NCHW32 = 12, + /// NCHW32_NCHW4 means input tensors are nchw32 layout, output tensor is + /// nchw4 layout + NCHW32_NCHW4 = 13, + /// NCHW4_NCHW means input tensors are nchw4 layout, output tensor is nchw + /// layout + NCHW4_NCHW = 14, + /// NHWC_NCHW means input tensors are nhwc layout, output tensor is nchw + /// layout + NHWC_NCHW = 15, + /// NHWC_NCHW4_IC_SMALL means input tensors are nhwc(c < 4) layout, output + /// tensor is nchw4 layout, padding c=4 + NHWC_NCHW4_IC_SMALL = 16, + /// NCHW_NCHW4_IC_SMALL means input tensors are nchw(c < 4) layout, output + /// tensor is nchw4 layout, padding c=4 + NCHW_NCHW4_IC_SMALL = 17, + /// CHWN4 is currently only used on Nvidia platform for fast implementation + /// of convolution using CUDA/SASS. The channels are splitted to groups of 4 + /// channels. + CHWN4 = 18, + /// NCHW4_NHWC means input tensors are nchw4 layout, output tensor is nhwc + /// layout + NCHW4_NHWC = 19, +} + +enum ConvolutionV0Mode : uint { + CROSS_CORRELATION = 0, + CONVOLUTION = 1, +} + +enum ConvolutionV0Sparse : uint { + /// dense convolution: filter shape should be [oc, ic, spatial...] if format + /// is NCHW, [oc, spatial..., ic] if format is NHWC + DENSE = 0, + /// group convolution: filter shape should be [group, oc_per_group, + /// ic_per_group, spatial...] if format is NCHW, [group, oc_per_group, + /// spatial..., ic_per_group] if format is NHWC + GROUP = 1, +} + +/// Specifies special computation modes, e.g. different combinations of +/// intermediate result data types. +enum ConvolutionV1ComputeMode : uint { + /// No special requirements on the precision of intermediate results. + DEFAULT = 0, + /// Use Float32 accumulator and intermediate result. Only supported when + /// input and output is Float16. 
+ FLOAT32 = 1, +} + +enum CvtColorMode : uint { + RGB2GRAY = 0, + RGB2YUV = 1, + YUV2RGB = 2, + GRAY2RGB = 3, + RGBA2RGB = 4, + RGBA2BGR = 5, + RGBA2GRAY = 6, + RGB2BGR = 7, + BGR2GRAY = 8, + BGR2RGB = 9, + /// For historical reasons, referred to as YCC by opencv + YUV2GRAY_NV21 = 10, + YUV2RGB_NV21 = 11, + YUV2BGR_NV21 = 12, + YUV2GRAY_NV12 = 13, + YUV2RGB_NV12 = 14, + YUV2BGR_NV12 = 15, + YUV2GRAY_YV12 = 16, + YUV2RGB_YV12 = 17, + YUV2BGR_YV12 = 18, + YUV2GRAY_YU12 = 19, + YUV2RGB_YU12 = 20, + YUV2BGR_YU12 = 21, + YCrCb2RGB = 22, + YCrCb2BGR = 23, + /// BT601 yuv format, referred to as YUV by opencv + BT601_YUV2RGB_NV21 = 24, + BT601_YUV2BGR_NV21 = 25, + BT601_YUV2RGB_NV12 = 26, + BT601_YUV2BGR_NV12 = 27, + BT601_YUV2RGB_YV12 = 28, + BT601_YUV2BGR_YV12 = 29, + BT601_YUV2RGB_YU12 = 30, + BT601_YUV2BGR_YU12 = 31, +} + +enum DctChannelSelectV0FastImpl : uint { + NONE = 0, + FIX_32_MASK = 1, +} + +enum ElemwiseMode : uint { + /// unary: max(x, 0) + RELU = 0, + /// unary: abs(x) + ABS = 1, + /// unary: acos(x) + ACOS = 2, + /// unary: asin(x) + ASIN = 3, + /// unary: ceil(x) + CEIL = 4, + /// unary: cos(x) + COS = 5, + /// unary: exp(x) + EXP = 6, + /// unary: numerically stable exp(x)-1 + EXPM1 = 7, + /// unary: floor(x) + FLOOR = 8, + /// unary: natural logarithm, log(x) + LOG = 9, + /// unary: numerically stable log(x+1) + LOG1P = 10, + /// unary: -x + NEGATE = 11, + /// unary: 1/(1+exp(-x)) + SIGMOID = 12, + /// unary: sin(x) + SIN = 13, + /// unary: tanh(x) + TANH = 14, + /// binary: x > 0 ? y : -y + ABS_GRAD = 15, + /// binary: x + y + ADD = 16, + /// binary: floor(x / y) + FLOOR_DIV = 17, + /// binary: max(x, y) + MAX_ = 18, + /// binary: min(x, y) + MIN_ = 19, + /// binary: x % y or fmodf(x, y) + MOD = 20, + /// binary: x * y + MUL = 21, + /// binary: pow(x, y) + POW = 22, + /// binary: x * (1 - x) * y + SIGMOID_GRAD = 23, + /// binary: x - y + SUB = 24, + /// binary: (x > 0) * y + SWITCH_GT0 = 25, + /// binary: (1 - x * x) * y + TANH_GRAD = 26, + /// binary: x / y + TRUE_DIV = 27, + /// binary: numerically stable log(exp(x) + exp(y)) + LOG_SUM_EXP = 28, + /// binary: x < y + LT = 29, + /// binary: x <= y + LEQ = 30, + /// binary: x == y + EQ = 31, + /// bitwise binary: x << y. Note that result is undefined if y < 0 or y >= + /// bitwidth. Logical shift is performed for unsigned intergers, and + /// arithmetic shift for signed ones. + SHL = 32, + /// bitwise binary: x >> y; see SHL mode for more details + SHR = 33, + /// ternary: x <= y ? z : 0 + COND_LEQ_MOV = 34, + /// compute ``a * b + c`` where c must either have same layout as a or b, or + /// be a scalar + FUSE_MUL_ADD3 = 35, + /// compute ``a * A + b * B`` where a and b must have equal layout, and A + /// and B must have equal layout. In the inputs ``b`` and ``B`` can be + /// swapped + FUSE_MUL_ADD4 = 36, + /// binary: max(x+y, 0) + FUSE_ADD_RELU = 37, + /// binary: 1/(1+exp(-(x+y))) + FUSE_ADD_SIGMOID = 38, + /// binary: tanh(x+y) + FUSE_ADD_TANH = 39, + /// unary: rational approximation of tanh(x) + FAST_TANH = 40, + /// binary: grad of the rational approximation of tanh(x) + FAST_TANH_GRAD = 41, + /// unary: round(x), the nearest integer value to x, rounding halfway cases + /// away from zero. Float only. + ROUND = 42, + /// binary: rounded higher l bits of x * y, where l is the bit length of x. 
+ RMULH = 43, + /// binary: atan2(y,x) + ATAN2 = 44, + /// unary: erf(x) + ERF = 45, + /// unary: inverse function of erf(x) + ERFINV = 46, + /// unary: erfc(x) + ERFC = 47, + /// unary: inverse function of erfc(x) + ERFCINV = 48, + /// unary: x * clip(x + 3, 0, 6) / 6 + H_SWISH = 49, + /// binary: x < -3 ? 0 : (x > 3 ? y : (2 * x + 3) / 6 * y) + H_SWISH_GRAD = 50, + /// binary: hswish(x+y) + FUSE_ADD_H_SWISH = 51, + /// unary: !x + NOT = 52, + /// binary: x && y + AND = 53, + /// binary: x || y + OR = 54, + /// binary: x ^ y + XOR = 55, + /// unary: x / (1 + exp(-x)) + SILU = 56, + /// binary: grad(x / (1 + exp(-x)) + SILU_GRAD = 57, + /// unary: x Phi(x) + GELU = 58, + /// binary: grad(x Phi(x)) + GELU_GRAD = 59, +} + +enum ElemwiseMultiTypeMode : uint { + /// compute ``a * b + c`` requiring that ``a`` be int16 and ``b`` and ``c`` + /// int32, and the result is int32. This mode is optimized for the channel- + /// broadacsted case, i.e. ``a`` has shape (A, B, C) and ``b`` and ``c`` + /// have shape (1, C, 1) + FUSE_MUL_ADD3_INT16x32x32x32 = 0, + /// compuate ``a * b + c`` where the inputs ``a`` is an integer type ``b`` + /// and ``c`` are both ``float32``, the result is ``int8``. This is + /// currently only optimized for ``(1, x)`` broadcast for ``b`` and ``c``. + /// Computation is carried in floating points and results are rounded + /// towards zero with saturated cast to int. + FUSE_MUL_ADD3_IXxF32xF32xI8 = 1, + /// Compute ``a >> b``, round the result according to lower ``b`` bits of + /// ``a``` and make a saturating conversion to int8. Where ``a`` should be + /// an integer tensor and ``b`` should be an int8 scalar. + ROUND_SHR_SATURATE_IXxI8xI8 = 2, + /// Fused operation of an int16 elemwise add, an int16 rounding multiply + /// high and an int16 to int8 rounding right shift with saturation. + FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT16x16x16x8 = 3, + /// Fused operation of an int32 elemwise add, an int32 rounding multiply + /// high and an int32 to int8 rounding right shift with saturation. + FUSE_ADD_RMULH_ROUND_SHR_SATURATE_INT32x32x32x8 = 4, + /// Compute ``a >> b``, round the result according to lower ``b`` bits of + /// ``a``` and make a saturating conversion to int16. Where ``a`` should be + /// an integer tensor and ``b`` should be an int8 scalar. + ROUND_SHR_SATURATE_IXxI8xI16 = 5, + /// Fused elemwise add two quantized int8 with specifiedoutput quantized + /// dtype + QADD = 6, + /// Fused elemwise add two quantized int8 followed by ReLU and typecvt to + /// specified dtype + QFUSE_ADD_RELU = 7, + /// Fused elemwise multiply two quantized int8 with specifiedoutput + /// quantized dtype + QMUL = 8, + /// Fused elemwise min two quantized int8 with specifiedoutput quantized + /// dtype + QMIN = 9, + /// quantized: max(x, y), with specified output quantized dtype + QMAX = 10, + /// quantized: x - y + QSUB = 11, + /// quantized: x / y + QTRUE_DIV = 12, + /// quantized: sigmoid(x + y) + QFUSE_ADD_SIGMOID = 13, + /// quantized: tanh(x + y) + QFUSE_ADD_TANH = 14, + /// quantized: x > 0 ? x : 0 + QRELU = 15, + /// quantized: x > 0 ? 
x : -x + QABS = 16, + /// quantized: sigmoid(x) + QSIGMOID = 17, + /// quantized: exp(x) + QEXP = 18, + /// quantized: tanh(x) + QTANH = 19, + /// quantized: x * y + z + QFUSE_MUL_ADD3 = 20, + /// quantized: fast_tanh(x) + QFAST_TANH = 21, + /// quantized: -x + QNEGATE = 22, + /// quantized: acos(x) + QACOS = 23, + /// quantized: asin(x) + QASIN = 24, + /// quantized: ceil(x) + QCEIL = 25, + /// quantized: cos(x) + QCOS = 26, + /// quantized: expm1(x) + QEXPM1 = 27, + /// quantized: floor(x) + QFLOOR = 28, + /// quantized: log(x) + QLOG = 29, + /// quantized: log1p(x) + QLOG1P = 30, + /// quantized: sin(x) + QSIN = 31, + /// quantized: round(x) + QROUND = 32, + /// quantized: erf(x) + QERF = 33, + /// quantized: erfinv(x) + QERFINV = 34, + /// quantized: erfc(x) + QERFC = 35, + /// quantized: erfcinv(x) + QERFCINV = 36, + /// quantized: abs_grad + QABS_GRAD = 37, + /// quantized floor_div + QFLOOR_DIV = 38, + /// quantized mod + QMOD = 39, + /// quantized sigmoid_grad + QSIGMOID_GRAD = 40, + /// quantized switch_gt0 + QSWITCH_GT0 = 41, + /// quantized tanh_grad + QTANH_GRAD = 42, + /// quantized lt + QLT = 43, + /// quantized leq + QLEQ = 44, + /// quantized eq + QEQ = 45, + /// quantized pow + QPOW = 46, + /// quantized log_sum_exp + QLOG_SUM_EXP = 47, + /// quantized fast_tanh_grad + QFAST_TANH_GRAD = 48, + /// quantized atan2 + QATAN2 = 49, + /// quantized cond_leq_mov + QCOND_LEQ_MOV = 50, + /// quantized h_swish + QH_SWISH = 51, + /// quantized h_swish(x+y) + QFUSE_ADD_H_SWISH = 52, + /// quantized h_swish_grad + QH_SWISH_GRAD = 53, + /// compute ``a * b + c`` requiring that ``a`` be int16 and ``b`` and ``c`` + /// float32, and the result is float32. + FUSE_MUL_ADD3_INT16xF32xF32xF32 = 54, + /// compute ``a * b `` requiring that ``a`` be int16 and ``b`` float32, and + /// the result is float32. + MUL_INT16xF32xF32 = 55, + /// compute ``a * b + c`` requiring that ``a`` be uint8 and ``b`` and ``c`` + /// float32, and the result is float32. + FUSE_MUL_ADD3_UINT8xF32xF32xF32 = 56, +} + +enum MatrixMulFormat : uint { + /// Normal matrix mul: (M, K) x (K, N) = (M, N) + DEFAULT = 0, + /// Split 4 from M and K, better for neon compute:(M/4, K/4, 4(k), 4(m)) x + /// (K/4, N, 4(k)). if transposeA the layout is (K/4, M/4, 4(k), 4(m)) x + /// (K/4, N, 4(k)) + MK4 = 1, + /// Split 8 from M and K, better for neon compute:(M/8, K/8, 8(k), 8(m)) x + /// (K/8, N, 8(k)). if transposeA the layout is (K/8, M/8, 8(k), 8(m)) x + /// (K/8, N, 8(k)) + MK8 = 2, + /// Split 4 from M and K, better for neon dotprod:M/4, K/4, 4(m), 4(k)) x + /// (K/4, N, 4(k)). if transposeA the layout is (K/4, M/4, 4(m), 4(k)) x + /// (K/4, N, 4(k)) + MK4_DOT = 3, +} + +enum MatrixMulV0DataType : uint { + /// input/output both float32/float16 + FLOAT = 0, + INT8x8x16 = 1, + INT8x8x32 = 2, + /// input/output both float16, the internal compute is float32 + FLOAT_IO16xC32 = 3, + /// input QuantizedAsymm8, output QuantizedS32 + QUINT8x8x32 = 4, + /// input QuantizedAsymm4, output QuantizedS32 + QUINT4x4x32 = 5, +} + +/// Specifies special computation modes, e.g. different combinations of +/// intermediate result data types. +enum MatrixMulV1ComputeMode : uint { + /// No special requirements on the precision of intermediate results. + DEFAULT = 0, + /// Use Float32 accumulator and intermediate result. Only supported when + /// input and output is Float16. 
+ FLOAT32 = 1, +} + +enum PaddingPaddingMode : uint { + /// aaaaaa|abcdefgh|hhhhhhh + REPLICATE = 0, + /// fedcba|abcdefgh|hgfedcb + REFLECT = 1, + /// iiiiii|abcdefgh|iiiiiii + CONSTANT = 2, +} + +enum PoolingV0Mode : uint { + /// maximum value inside pooling window + MAX_ = 0, + /// arithmetic mean of all values inside pooling window. Padding values are + /// taken into account and are viewed as zero + AVERAGE = 1, + /// arithmetic mean of all values inside pooling window. No padding isused. + AVERAGE_COUNT_EXCLUDE_PADDING = 2, +} + +enum RNNCellNonlineMode : uint { + IDENTITY = 0, + RELU = 1, + TANH = 2, +} + +enum ROIAlignV0Mode : uint { + MAX_ = 0, + AVERAGE = 1, +} + +enum ROIPoolingMode : uint { + /// maximum value inside pooling window; pooling result would be 0 if + /// pooling window is empty + MAX_ = 0, + /// arithmetic mean of all values inside pooling window; pooling result + /// would be 0 if pooling window is empty + AVERAGE = 1, +} + +enum ReduceDataType : uint { + /// input/output are the same data type, and the internal computation type would be chosen by the input/output dtypes and the reduction mode. + /// Currently, ```DEFAULT``` mode means: + /// + /// +--------------------+-----------------------------------+-------------------+ + /// | Input/Output DType | Mode | Computation DType | + /// +====================+===================================+===================+ + /// | FLOAT32 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | FLOAT32 | + /// +--------------------+-----------------------------------+-------------------+ + /// | FLOAT16 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | FLOAT16 | + /// +--------------------+-----------------------------------+-------------------+ + /// | INT32 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | INT32 | + /// +--------------------+-----------------------------------+-------------------+ + /// | INT8 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | INT8 | + /// +--------------------+-----------------------------------+-------------------+ + /// | QuantizedS8 | MIN/MAX | QuantizedS8 | + /// +--------------------+-----------------------------------+-------------------+ + /// | QuantizedS8 | MEAN/SUM | QuantizedS32 | + /// +--------------------+-----------------------------------+-------------------+ + /// | Quantized8Asymm | MIN/MAX | Quantized8Asymm | + /// +--------------------+-----------------------------------+-------------------+ + /// | Quantized8Asymm | MEAN/SUM | QuantizedS32 | + /// +--------------------+-----------------------------------+-------------------+ + /// + /// + DEFAULT = 0, + /// Deprecated. This was replaced by FLOAT_O16xC32, and input's dtype + /// decided by actual input tensor. + FLOAT_IO16xC32 = 1, + /// compute/output both are float32 + FLOAT_O32xC32 = 2, + /// compute are float32, output float16 + FLOAT_O16xC32 = 3, + /// input quint8, compute and output are qint32 + QUINT_I8xO32 = 4, + /// input qint8, compute and output are qint32 + QINT_I8xO32 = 5, +} + +enum ReduceMode : uint { + SUM = 0, + /// sum of x * x for each element x + SUM_SQR = 1, + PRODUCT = 2, + MIN_ = 3, + MAX_ = 4, + MEAN = 5, +} + +enum ReduceV0Mode : uint { + SUM = 0, + /// sum of x * x for each element x + SUM_SQR = 1, + PRODUCT = 2, + MIN_ = 3, + MAX_ = 4, +} + +enum ReduceV1DataType : uint { + /// input/output are the same data type, and the internal computation type would be chosen by the input/output dtypes and the reduction mode. 
+ /// Currently, ```DEFAULT``` mode means: + /// + /// +--------------------+-----------------------------------+-------------------+ + /// | Input/Output DType | Mode | Computation DType | + /// +====================+===================================+===================+ + /// | FLOAT32 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | FLOAT32 | + /// +--------------------+-----------------------------------+-------------------+ + /// | FLOAT16 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | FLOAT16 | + /// +--------------------+-----------------------------------+-------------------+ + /// | INT32 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | INT32 | + /// +--------------------+-----------------------------------+-------------------+ + /// | INT8 | MIN/MAX/MEAN/SUM/SUM_SQR/PRODUCT | INT8 | + /// +--------------------+-----------------------------------+-------------------+ + /// | QuantizedS8 | MIN/MAX | QuantizedS8 | + /// +--------------------+-----------------------------------+-------------------+ + /// | QuantizedS8 | MEAN/SUM | QuantizedS32 | + /// +--------------------+-----------------------------------+-------------------+ + /// | Quantized8Asymm | MIN/MAX | Quantized8Asymm | + /// +--------------------+-----------------------------------+-------------------+ + /// | Quantized8Asymm | MEAN/SUM | QuantizedS32 | + /// +--------------------+-----------------------------------+-------------------+ + /// + /// + DEFAULT = 0, + /// Deprecated. This was replaced by FLOAT_O16xC32, and input's dtype + /// decided by actual input tensor. + FLOAT_IO16xC32 = 1, + /// compute/output both are float32 + FLOAT_O32xC32 = 2, + /// compute are float32, output float16 + FLOAT_O16xC32 = 3, + /// input quint8, compute and output are qint32 + QUINT_I8xO32 = 4, + /// input qint8, compute and output are qint32 + QINT_I8xO32 = 5, +} + +enum ReduceV1Mode : uint { + SUM = 0, + /// sum of x * x for each element x + SUM_SQR = 1, + PRODUCT = 2, + MIN_ = 3, + MAX_ = 4, + MEAN = 5, +} + +/// Relayout mode. +/// +/// **Naming conventions** +/// +/// 1. ``A_B`` means change from layout format ``A`` to ``B``. +/// 2. ``INTER_WEIGHT_xx`` means relayout the weight for faster processing by +/// :attr:`Convolution.Format.NHWCD4` convolutions. +/// 3. A suffix of ``I`` means ``Image2DPack4TensorFormat`` tensor format is used +/// for faster processing on GPUs. 
+/// +/// **Layout definitions** +/// +/// * ``NCHW`` layout: ``{N, C, H, W}`` +/// * ``NHWC`` layout: ``{N, H, W, C}`` +/// * ``NHWCD4`` layout: ``{N, H, (C + 3) / 4, W, 4}`` +/// * ``NHWCD4I`` layout: with ``align_axis = 2`` +/// * ``NCHW4`` layout: ``{N, C/4, H, W, 4}`` +/// * ``NCHW88`` layout: ``{N, C/8, H, W, 8}`` +/// * ``CHWN4`` layout: ``{C/4, H, W, N, 4}`` +/// * ``NCHW64`` layout: ``{N, C/64, H, W, 64}`` +/// +/// **Float weight transformation definitions** +/// +/// +---------------+---------------------------------+--------------------+--------------------------------------+------+ +/// | Sparsity Type | Input Layout | Input Req | Output Layout | Axis | +/// +===============+=================================+====================+======================================+======+ +/// | DENSE | ``{OC, IC, FH, FW}`` | ``OC % 4 == 0`` | ``{OC/4, FH, FW, IC, 4}`` | 3 | +/// +---------------+---------------------------------+--------------------+--------------------------------------+------+ +/// | GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 4 == 0`` | ``{GROUP, OCPG/4, FH, FW, ICPG, 4}`` | 4 | +/// | | | ``ICPG % 4 == 0`` | | | +/// +---------------+---------------------------------+--------------------+--------------------------------------+------+ +/// | CHAN | ``{GROUP, 1, 1, FH, FW}`` | ``GROUP % 4 == 0`` | ``{GROUP / 4, 1, FH ,FW, 4}`` | 1 | +/// +---------------+---------------------------------+--------------------+--------------------------------------+------+ +/// +/// **Float weight transformation nchw88 definitions** +/// +/// +---------------+---------------------------------+--------------------+--------------------------------------+ +/// | Sparsity Type | Input Layout | Input Req | Output Layout | +/// +===============+=================================+====================+======================================+ +/// | DENSE | ``{OC, IC, FH, FW}`` | ``OC % 8 == 0`` |``{OC/8, IC/8 ,FH, FW, 8(IC), 8(OC)}``| +/// | | | ``IC % 8 == 0`` | | +/// +---------------+---------------------------------+--------------------+--------------------------------------+ +/// | GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 8 == 0`` | ``{GROUP, OCPG/8, ICPG/8 FH, FW, | +/// | | | ``ICPG % 8 == 0`` | 8(ICPG), 8(OCPG)} `` | +/// +---------------+---------------------------------+--------------------+--------------------------------------+ +/// | CHAN | ``{GROUP, 1, 1, FH, FW}`` | ``GROUP % 8 == 0`` | ``{GROUP / 8, 1, FH ,FW, 8}`` | +/// +---------------+---------------------------------+--------------------+--------------------------------------+ +/// +/// **Int8(DOT) weight transformation definitions** +/// +/// +---------------+---------------------------------+--------------------+------------------------------------------+------+ +/// | Sparsity Type | Input Layout | Input Req | Output Layout | Axis | +/// +===============+=================================+====================+==========================================+======+ +/// | DENSE | ``{OC, IC, FH, FW}`` | ``OC % 4 == 0`` | ``{OC/4, FH, FW, IC/4, 4, 4}` | 3 | +/// +---------------+---------------------------------+--------------------+------------------------------------------+------+ +/// | GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 4 == 0`` | ``{GROUP, OCPG/4, FH, FW, ICPG/4, 4, 4}``| 4 | +/// | | | ``ICPG % 4 == 0`` | | | +/// +---------------+---------------------------------+--------------------+------------------------------------------+------+ +/// +/// Note: the axis column means the corresponding 
``align_axis`` for image format +/// when the ``I`` suffix is present. +/// +/// Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later opr by seting group and oc param with NCHW4_NCHW +/// +enum RelayoutFormatV0Mode : uint { + NHWC_NHWCD4 = 0, + NHWCD4_NHWC = 1, + NHWC_NHWCD4I = 2, + NCHW_NHWCD4 = 3, + NCHW_NHWCD4I = 4, + NHWCD4I_NCHW = 5, + NHWCD4_NCHW = 6, + INTER_WEIGHT_DENSE = 7, + INTER_WEIGHT_DENSEI = 8, + INTER_WEIGHT_GROUP = 9, + INTER_WEIGHT_GROUPI = 10, + INTER_WEIGHT_CHAN = 11, + INTER_WEIGHT_CHANI = 12, + INTER_WEIGHT_DENSEI_DOT = 13, + INTER_WEIGHT_GROUPI_DOT = 14, + NCHW4_CHWN4 = 15, + CHWN4_NCHW4 = 16, + NCHW_NCHW88_CONV_DENSE_WEIGHT = 17, + NCHW_NCHW88_CONV_CHAN_WEIGHT = 18, + NCHW_NCHW88_CONV_GROUP_WEIGHT = 19, + NCHW_NCHW88 = 20, + NCHW88_NCHW = 21, + NCHW_NCHW4_IC_SMALL = 22, + NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT = 23, + NCHW_NCHW4 = 24, + NCHW4_NCHW = 25, + NCHW_NCHW4_WEIGHT = 26, + NCHW_NCHW64 = 27, + NCHW64_NCHW = 28, + NCHW_NHWC = 29, + NHWC_NCHW = 30, + NHWCD4I_NHWC = 31, +} + +enum SeparableConvBorderMode : uint { + BORDER_REPLICATE = 0, + BORDER_REFLECT = 1, + BORDER_REFLECT_101 = 2, + BORDER_WRAP = 3, + BORDER_CONSTANT = 4, + BORDER_TRANSPARENT = 5, + BORDER_ISOLATED = 6, +} + +enum SeparableConv3DBorderMode : uint { + BORDER_REPLICATE = 0, + BORDER_REFLECT = 1, + BORDER_REFLECT_101 = 2, + BORDER_WRAP = 3, + BORDER_CONSTANT = 4, + BORDER_TRANSPARENT = 5, + BORDER_ISOLATED = 6, +} + +enum SpatialTfGridGeneratorMode : uint { + AFFINE = 0, +} + +enum SpatialTfSamplerMode : uint { + BILINEAR = 0, +} + +enum TopKMode : uint { + /// only the value of the k'th element would be computed + KTH_ONLY = 0, + /// all the top-k values and corresponding indices would be computed; no + /// order is guaranteed + VALUE_IDX_NOSORT = 1, + /// all the top-k values and corresponding indices sorted + VALUE_IDX_SORTED = 2, +} + +enum WarpPerspectiveV1BorderMode : uint { + /// aaaaaa|abcdefgh|hhhhhhh + REPLICATE = 0, + /// fedcba|abcdefgh|hgfedcb + REFLECT = 1, + /// gfedcb|abcdefgh|gfedcba + REFLECT_101 = 2, + /// cdefgh|abcdefgh|abcdefg + WRAP = 3, + /// iiiiii|abcdefgh|iiiiiii + CONSTANT = 4, + TRANSPARENT = 5, + ISOLATED = 6, +} + +enum WarpPerspectiveV1InterpolationMode : uint { + NEAREST = 0, + LINEAR = 1, + AREA = 2, + CUBIC = 3, + LANCZOS4 = 4, +} + +table Empty { +} + +table Axis { + axis:int = 0; +} + +table ConvolutionV0 { + mode:ConvolutionV0Mode = CROSS_CORRELATION; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + data_type:ConvolutionV0DataType = FLOAT; + sparse:ConvolutionV0Sparse = DENSE; + /// convolution data/filter/output format; see :class:`RelayoutFormat` for + /// more details + format:ConvolutionV0Format = NCHW; +} + +table ConvolutionV1 { + mode:ConvolutionV0Mode = CROSS_CORRELATION; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. 
size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionV0Format = NCHW; + /// Specifies special computation modes, e.g. different combinations of + /// intermediate result data types. + compute_mode:ConvolutionV1ComputeMode = DEFAULT; +} + +table Convolution { + mode:ConvolutionV0Mode = CROSS_CORRELATION; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + sparse:ConvolutionV0Sparse = DENSE; + /// convolution data/filter/output format; see :class:`RelayoutFormat` for + /// more details + format:ConvolutionFormat = NCHW; + compute_mode:ConvolutionV1ComputeMode = DEFAULT; +} + +table MaskPropagate { + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// kernel height + kernel_h:uint = 1; + /// kernel width + kernel_w:uint = 1; + /// dilate height + dilate_h:uint = 1; + /// dilate width + dilate_w:uint = 1; +} + +table ConvPooling { + method:ConvPoolingMethod = WITH_TEXTURE_OBJ; + convMode:ConvolutionV0Mode = CROSS_CORRELATION; + poolMode:ConvPoolingPoolMode = AVERAGE; + nonlineMode:ConvPoolingNonlineMode = IDENTITY; + pool_shape_h:uint = 1; + pool_shape_w:uint = 1; + pool_stride_h:uint = 1; + pool_stride_w:uint = 1; + pool_pad_h:uint = 0; + pool_pad_w:uint = 0; + conv_stride_h:uint = 1; + conv_stride_w:uint = 1; + conv_pad_h:uint = 0; + conv_pad_w:uint = 0; +} + +/// legacy conv_bias +table ConvBiasV0 { + nonlineMode:ConvBiasV0NonlineMode = IDENTITY; + mode:ConvolutionV0Mode = CROSS_CORRELATION; + pad_h:uint = 0; + pad_w:uint = 0; + stride_h:uint = 1; + stride_w:uint = 1; +} + +/// active(conv(x, w) + bias) +table ConvBiasV1 { + nonlineMode:ConvBiasV0NonlineMode = IDENTITY; + mode:ConvolutionV0Mode = CROSS_CORRELATION; + data_type:ConvolutionV0DataType = FLOAT; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionV0Format = NCHW; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. 
size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; +} + +/// active(conv(x, w) + bias) +table ConvBiasV2 { + nonlineMode:ConvBiasV0NonlineMode = IDENTITY; + mode:ConvolutionV0Mode = CROSS_CORRELATION; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionV0Format = NCHW; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + compute_mode:ConvolutionV1ComputeMode = DEFAULT; +} + +/// active(conv(x, w) + bias) +table ConvBiasV3 { + nonlineMode:ConvBiasV0NonlineMode = IDENTITY; + mode:ConvolutionV0Mode = CROSS_CORRELATION; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionV0Format = NCHW; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + /// detail meaning \see winograd in conv bias + output_block_size:uint = 0; + compute_mode:ConvolutionV1ComputeMode = DEFAULT; +} + +/// active(conv(x, w) + bias) +table ConvBias { + nonlineMode:ConvBiasV0NonlineMode = IDENTITY; + mode:ConvolutionV0Mode = CROSS_CORRELATION; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionFormat = NCHW; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. 
size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + compute_mode:ConvolutionV1ComputeMode = DEFAULT; +} + +table SeparableConv { + mode:ConvolutionV0Mode = CROSS_CORRELATION; + borderMode:SeparableConvBorderMode = BORDER_REPLICATE; + is_symm_kernel:bool = true; + pad_h:uint = 0; + pad_w:uint = 0; + stride_h:uint = 1; + stride_w:uint = 1; + ksize_h:uint = 3; + ksize_w:uint = 3; + anchor_h:uint = 1; + anchor_w:uint = 1; +} + +table Images2Neibs { + pad_h:uint = 0; + pad_w:uint = 0; + stride_h:uint = 1; + stride_w:uint = 1; + dilate_h:uint = 1; + dilate_w:uint = 1; + window_h:uint = 3; + window_w:uint = 3; +} + +table SlidingWindowTranspose { + out_h:uint = 0; + out_w:uint = 0; + pad_h:uint = 0; + pad_w:uint = 0; + stride_h:uint = 1; + stride_w:uint = 1; + dilate_h:uint = 1; + dilate_w:uint = 1; + window_h:uint = 3; + window_w:uint = 3; +} + +table PoolingV0 { + mode:PoolingV0Mode = MAX_; + pad_h:uint = 0; + pad_w:uint = 0; + stride_h:uint = 2; + stride_w:uint = 2; + window_h:uint = 2; + window_w:uint = 2; + format:ConvolutionV0Format = NCHW; +} + +table Pooling { + mode:PoolingV0Mode = MAX_; + pad_h:uint = 0; + pad_w:uint = 0; + stride_h:uint = 2; + stride_w:uint = 2; + window_h:uint = 2; + window_w:uint = 2; + format:ConvolutionFormat = NCHW; +} + +table Softmax { + axis:int = -1; +} + +table AdaptivePoolingV0 { + mode:PoolingV0Mode = MAX_; + format:ConvolutionV0Format = NCHW; +} + +table AdaptivePooling { + mode:PoolingV0Mode = MAX_; + format:ConvolutionFormat = NCHW; +} + +/// see ImageNet Classification with Deep Convolutional Neural Networks for +/// meaning of the fields +table LRN { + /// must be odd + n:uint = 5; + k:float = 2.; + alpha:float = 1e-4; + beta:float = 0.75; +} + +table BN { + param_dim:BNParamDim = DIM_11HW; + fwd_mode:BNFwdMode = TRAINING; + epsilon:double = 1e-4; + avg_factor:double = 1.; + scale:float = 1.; + bias:float = 0.; +} + +table ROIPooling { + mode:ROIPoolingMode = MAX_; + scale:float = 1.; +} + +table WarpPerspectiveV1 { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + bmode:WarpPerspectiveV1BorderMode = REPLICATE; + format:ConvolutionV0Format = NCHW; + /// used for CONSTANT bmode + border_val:float = .0; +} + +table WarpPerspective { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + bmode:WarpPerspectiveV1BorderMode = REPLICATE; + format:ConvolutionFormat = NCHW; + /// used for CONSTANT bmode + border_val:float = .0; +} + +table SpatialTfGridGenerator { + mode:SpatialTfGridGeneratorMode = AFFINE; +} + +table SpatialTfSampler { + mode:SpatialTfSamplerMode = BILINEAR; +} + +table AddUpdate { + alpha:float = 1.; + beta:float = 1.; + bias:float = 0.; +} + +table Elemwise { + mode:ElemwiseMode = RELU; +} + +table ElemwiseMultiType { + mode:ElemwiseMultiTypeMode = FUSE_MUL_ADD3_INT16x32x32x32; +} + +/// power with constant exponent +table PowC { + exp:float = 0; +} + +/// 2d discrete cosine transform +table DctChannelSelectV0 { + format:ConvolutionV0Format = NCHW; + fastImpl:DctChannelSelectV0FastImpl = NONE; + dct_block_size:int = 8; +} + +/// 2d discrete cosine transform +table DctChannelSelect { + format:ConvolutionFormat = NCHW; + fastImpl:DctChannelSelectV0FastImpl = NONE; + dct_block_size:int = 8; +} + +table MatrixMulV0 { + transposeA:bool = false; + transposeB:bool = false; + data_type:MatrixMulV0DataType = FLOAT; +} + +table MatrixMulV1 { + transposeA:bool = false; + transposeB:bool = false; + /// Specifies special computation modes, e.g. different combinations of + /// intermediate result data types. 
+ compute_mode:MatrixMulV1ComputeMode = DEFAULT; +} + +table MatrixMul { + transposeA:bool = false; + transposeB:bool = false; + compute_mode:MatrixMulV1ComputeMode = DEFAULT; + format:MatrixMulFormat = DEFAULT; +} + +table SVD { + /// Whether to compute the full-sized u and v or only the leading min(m, n) + /// singular vectors. Ignored if compute_uv is false. + full_matrices:bool = false; + /// Whether the left (u) and right (v) singular vectors will be computed and + /// outputted. + compute_uv:bool = true; +} + +/// legacy reduce +table ReduceV0 { + mode:ReduceV0Mode = SUM; + /// axis along which reduction is performed; if -1 is given, reduce to given + /// target shape (only used in megbrain) + axis:int = -1; +} + +/// reduce along given axis +table ReduceV1 { + mode:ReduceV1Mode = SUM; + /// axis along which reduction is performed; if -1 is given, reduce to given + /// target shape (only used in megbrain) + axis:int = -1; + data_type:ReduceV1DataType = DEFAULT; +} + +/// reduce along given axis +table Reduce { + mode:ReduceMode = SUM; + /// axis along which reduction is performed; if INT_MAX is given, reduce to + /// given target shape (only used in megbrain) + axis:int = 2147483647; + data_type:ReduceDataType = DEFAULT; +} + +/// calculate accumulated sum along given axis +table CumsumV0 { + /// axis along which cumsum is performed + axis:int = -1; + /// whether the current element is taken into account + exclusive:bool = true; + /// whether the cumsum is forward or backward + reverse:bool = false; +} + +/// calculate accumulated sum along given axis +table Cumsum { + /// axis along which cumsum is performed, default with INT_MAX + axis:int = 2147483647; + /// whether the current element is taken into account + exclusive:bool = true; + /// whether the cumsum is forward or backward + reverse:bool = false; +} + +table CondTake { + mode:CondTakeMode = EQ; + /// the value to be compared with; note that for integer data, val is also + /// converted to int + val:float = 0; + /// used for float equality comparison + eps:float = 1e-06; +} + +table Argsort { + order:ArgsortOrder = ASCENDING; +} + +table IndexingRemap { + /// Whether no two dst element maps to the same src element. Enabling this + /// option can accelerate gradient operator since atomic adding operations + /// could be avoided. + is_non_overlapping:bool = false; +} + +table Sleep { + /// time to sleep in seconds + time:float = 0; +} + +table Linspace { + /// Whether stop is included in the generated tensor + endpoint:bool = true; +} + +table LinspaceFull { + /// The first val. + start:double = 0; + /// The last val. + stop:double = 1; + /// Whether stop is included in the generated tensor + endpoint:bool = true; +} + +table Eye { + /// Index of the diagonal: 0 (the default) refers to the main diagonal, a + /// positive value refers to an upper diagonal, and a negative value to a + /// lower diagonal. + k:int = 0; + /// data type of output value + dtype:DTypeEnum = Float32; +} + +table Diag { + /// Index of the diagonal: 0 (the default) refers to the main diagonal, a + /// positive value refers to an upper diagonal, and a negative value to a + /// lower diagonal. + k:int = 0; +} + +table UniformRNGV0 { + seed:ulong = 0; +} + +table UniformRNG { + seed:ulong = 0; + /// The dtype of output Tensor. Only support Float32. 
+ dtype:DTypeEnum = Float32; +} + +table GaussianRNGV0 { + seed:ulong = 0; + mean:float = 0; + std:float = 1; +} + +table GaussianRNG { + seed:ulong = 0; + mean:float = 0; + std:float = 1; + /// The dtype of output Tensor. Only support Float32. + dtype:DTypeEnum = Float32; +} + +table GammaRNG { + seed:ulong = 0; +} + +table BetaRNG { + seed:ulong = 0; +} + +table PoissonRNG { + seed:ulong = 0; +} + +table PermutationRNG { + seed:ulong = 0; + /// The dtype of output Tensor. Int32, Int16 and Float32 are supported. + dtype:DTypeEnum = Int32; +} + +table ShuffleRNG { + seed:ulong = 0; +} + +table Flip { + vertical:bool = false; + horizontal:bool = false; +} + +table Rotate { + clockwise:bool = true; +} + +table ROICopy { + row_from:uint = 0; + row_to:uint = 0; + col_from:uint = 0; + col_to:uint = 0; +} + +table CvtColor { + mode:CvtColorMode = RGB2GRAY; +} + +table WarpAffineV0 { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + border_mode:WarpPerspectiveV1BorderMode = REPLICATE; + /// used for CONSTANT bmode + border_val:float = .0; +} + +table WarpAffineV1 { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + border_mode:WarpPerspectiveV1BorderMode = REPLICATE; + /// used for CONSTANT bmode + border_val:float = .0; + format:ConvolutionV0Format = NHWC; +} + +table WarpAffine { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + border_mode:WarpPerspectiveV1BorderMode = REPLICATE; + /// used for CONSTANT bmode + border_val:float = .0; + format:ConvolutionFormat = NHWC; +} + +table GaussianBlur { + border_mode:WarpPerspectiveV1BorderMode = REPLICATE; + kernel_height:uint = 0; + kernel_width:uint = 0; + sigma_x:float = 0.; + sigma_y:float = 0.; +} + +table ResizeV0 { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; +} + +table ResizeV1 { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + format:ConvolutionV0Format = NHWC; +} + +table Resize { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + format:ConvolutionFormat = NHWC; +} + +table RemapV0 { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + border_type:WarpPerspectiveV1BorderMode = REPLICATE; + format:ConvolutionV0Format = NHWC; + scalar:float = 0.; +} + +table Remap { + imode:WarpPerspectiveV1InterpolationMode = LINEAR; + border_type:WarpPerspectiveV1BorderMode = REPLICATE; + format:ConvolutionFormat = NHWC; + scalar:float = 0.; +} + +table Convolution3D { + mode:Convolution3DMode = CROSS_CORRELATION; + /// padding on one side on the first dimension + pad_d:uint = 0; + /// padding on one side on the second dimension + pad_h:uint = 0; + /// padding on one side on the third dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_d:uint = 1; + /// kernel stride on the second dimension + stride_h:uint = 1; + /// kernel stride on the third dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the first + /// dimension + dilate_d:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. 
size of each zero-padded kernel block) on the third + /// dimension + dilate_w:uint = 1; + sparse:Convolution3DSparse = DENSE; + data_type:Convolution3DDataType = FLOAT; + format:Convolution3DFormat = NCDHW; +} + +table Conv3DBias { + nonlineMode:Conv3DBiasNonlineMode = IDENTITY; + mode:Convolution3DMode = CROSS_CORRELATION; + pad_d:uint = 0; + pad_h:uint = 0; + pad_w:uint = 0; + stride_d:uint = 1; + stride_h:uint = 1; + stride_w:uint = 0; +} + +table SeparableConv3D { + mode:Convolution3DMode = CROSS_CORRELATION; + borderMode:SeparableConv3DBorderMode = BORDER_REPLICATE; + is_symm_kernel:bool = true; + pad_d:uint = 0; + pad_h:uint = 0; + pad_w:uint = 0; + stride_d:uint = 0; + stride_h:uint = 1; + stride_w:uint = 1; + ksize_d:uint = 0; + ksize_h:uint = 3; + ksize_w:uint = 3; + anchor_d:uint = 0; + anchor_h:uint = 1; + anchor_w:uint = 1; +} + +table TopK { + mode:TopKMode = KTH_ONLY; +} + +/// Change the tensor layout format +table RelayoutFormatV0 { + /// Relayout mode. + /// + /// **Naming conventions** + /// + /// 1. ``A_B`` means change from layout format ``A`` to ``B``. + /// 2. ``INTER_WEIGHT_xx`` means relayout the weight for faster processing by + /// :attr:`Convolution.Format.NHWCD4` convolutions. + /// 3. A suffix of ``I`` means ``Image2DPack4TensorFormat`` tensor format is used + /// for faster processing on GPUs. + /// + /// **Layout definitions** + /// + /// * ``NCHW`` layout: ``{N, C, H, W}`` + /// * ``NHWC`` layout: ``{N, H, W, C}`` + /// * ``NHWCD4`` layout: ``{N, H, (C + 3) / 4, W, 4}`` + /// * ``NHWCD4I`` layout: with ``align_axis = 2`` + /// * ``NCHW4`` layout: ``{N, C/4, H, W, 4}`` + /// * ``NCHW88`` layout: ``{N, C/8, H, W, 8}`` + /// * ``CHWN4`` layout: ``{C/4, H, W, N, 4}`` + /// * ``NCHW64`` layout: ``{N, C/64, H, W, 64}`` + /// + /// **Float weight transformation definitions** + /// + /// +---------------+---------------------------------+--------------------+--------------------------------------+------+ + /// | Sparsity Type | Input Layout | Input Req | Output Layout | Axis | + /// +===============+=================================+====================+======================================+======+ + /// | DENSE | ``{OC, IC, FH, FW}`` | ``OC % 4 == 0`` | ``{OC/4, FH, FW, IC, 4}`` | 3 | + /// +---------------+---------------------------------+--------------------+--------------------------------------+------+ + /// | GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 4 == 0`` | ``{GROUP, OCPG/4, FH, FW, ICPG, 4}`` | 4 | + /// | | | ``ICPG % 4 == 0`` | | | + /// +---------------+---------------------------------+--------------------+--------------------------------------+------+ + /// | CHAN | ``{GROUP, 1, 1, FH, FW}`` | ``GROUP % 4 == 0`` | ``{GROUP / 4, 1, FH ,FW, 4}`` | 1 | + /// +---------------+---------------------------------+--------------------+--------------------------------------+------+ + /// + /// **Float weight transformation nchw88 definitions** + /// + /// +---------------+---------------------------------+--------------------+--------------------------------------+ + /// | Sparsity Type | Input Layout | Input Req | Output Layout | + /// +===============+=================================+====================+======================================+ + /// | DENSE | ``{OC, IC, FH, FW}`` | ``OC % 8 == 0`` |``{OC/8, IC/8 ,FH, FW, 8(IC), 8(OC)}``| + /// | | | ``IC % 8 == 0`` | | + /// +---------------+---------------------------------+--------------------+--------------------------------------+ + /// | GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 8 
== 0`` | ``{GROUP, OCPG/8, ICPG/8 FH, FW, | + /// | | | ``ICPG % 8 == 0`` | 8(ICPG), 8(OCPG)} `` | + /// +---------------+---------------------------------+--------------------+--------------------------------------+ + /// | CHAN | ``{GROUP, 1, 1, FH, FW}`` | ``GROUP % 8 == 0`` | ``{GROUP / 8, 1, FH ,FW, 8}`` | + /// +---------------+---------------------------------+--------------------+--------------------------------------+ + /// + /// **Int8(DOT) weight transformation definitions** + /// + /// +---------------+---------------------------------+--------------------+------------------------------------------+------+ + /// | Sparsity Type | Input Layout | Input Req | Output Layout | Axis | + /// +===============+=================================+====================+==========================================+======+ + /// | DENSE | ``{OC, IC, FH, FW}`` | ``OC % 4 == 0`` | ``{OC/4, FH, FW, IC/4, 4, 4}` | 3 | + /// +---------------+---------------------------------+--------------------+------------------------------------------+------+ + /// | GROUP | ``{GROUP, OCPG, ICPG, FH, FW}`` | ``OCPG % 4 == 0`` | ``{GROUP, OCPG/4, FH, FW, ICPG/4, 4, 4}``| 4 | + /// | | | ``ICPG % 4 == 0`` | | | + /// +---------------+---------------------------------+--------------------+------------------------------------------+------+ + /// + /// Note: the axis column means the corresponding ``align_axis`` for image format + /// when the ``I`` suffix is present. + /// + /// Note: NCHW_NCHW4_WEIGHT will auto pad oc and ic, you should remove oc in later opr by seting group and oc param with NCHW4_NCHW + /// + mode:RelayoutFormatV0Mode = NHWC_NHWCD4; +} + +/// Change the tensor layout format +table RelayoutFormat { + mode:RelayoutFormatV0Mode = NHWC_NHWCD4; + oc:uint = 0; + group:uint = 1; +} + +table SeparableFilterV0 { + format:ConvolutionV0Format = NCHW; + borderMode:WarpPerspectiveV1BorderMode = REPLICATE; + is_symm_kernel:bool = true; + ksize_h:uint = 3; + ksize_w:uint = 3; + anchor_h:uint = 1; + anchor_w:uint = 1; +} + +table SeparableFilter { + format:ConvolutionFormat = NCHW; + borderMode:WarpPerspectiveV1BorderMode = REPLICATE; + is_symm_kernel:bool = true; + ksize_h:uint = 3; + ksize_w:uint = 3; + anchor_h:uint = 1; + anchor_w:uint = 1; +} + +/// Local share convolution +table LocalShareV0 { + mode:ConvolutionV0Mode = CROSS_CORRELATION; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + /// spatial groups on the first dimension + spatial_groups_h:uint = 1; + /// spatial groups on the second dimension + spatial_groups_w:uint = 1; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionV0Format = NCHW; + computeMode:ConvolutionV1ComputeMode = DEFAULT; +} + +/// Local share convolution +table LocalShare { + mode:ConvolutionV0Mode = CROSS_CORRELATION; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. 
size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + /// spatial groups on the first dimension + spatial_groups_h:uint = 1; + /// spatial groups on the second dimension + spatial_groups_w:uint = 1; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionFormat = NCHW; + computeMode:ConvolutionV1ComputeMode = DEFAULT; +} + +table ROIAlignV0 { + mode:ROIAlignV0Mode = MAX_; + format:ConvolutionV0Format = NCHW; + spatial_scale:float = 1.0; + offset:float = 0.0; + pooled_height:uint = 1; + pooled_width:uint = 1; + sample_height:uint = 2; + sample_width:uint = 2; +} + +table ROIAlign { + mode:ROIAlignV0Mode = MAX_; + format:ConvolutionFormat = NCHW; + spatial_scale:float = 1.0; + offset:float = 0.0; + pooled_height:uint = 1; + pooled_width:uint = 1; + sample_height:uint = 2; + sample_width:uint = 2; +} + +table Correlation { + format:ConvolutionV0Format = NCHW; + kernel_size:uint = 1; + max_displacement:uint = 1; + stride1:uint = 1; + stride2:uint = 1; + pad_size:uint = 0; + is_multiply:bool = true; +} + +table DeformablePSROIPooling { + no_trans:bool = true; + spatial_scale:float = 1; + trans_std:float = 1; + /// height of pooling output + pooled_h:uint = 1; + /// width of pooling output + pooled_w:uint = 1; + /// size of each deformable part + part_size:uint = 1; + /// sample count of each bbox + sample_per_part:uint = 1; +} + +/// Batch convolution (unshare weights on the batch dimension) +table BatchConvBiasV0 { + nonlineMode:ConvBiasV0NonlineMode = IDENTITY; + mode:ConvolutionV0Mode = CROSS_CORRELATION; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionV0Format = NCHW; + compute_mode:ConvolutionV1ComputeMode = DEFAULT; +} + +/// Batch convolution (unshare weights on the batch dimension) +table BatchConvBias { + nonlineMode:ConvBiasV0NonlineMode = IDENTITY; + mode:ConvolutionV0Mode = CROSS_CORRELATION; + /// padding on one side on the first dimension + pad_h:uint = 0; + /// padding on one side on the second dimension + pad_w:uint = 0; + /// kernel stride on the first dimension + stride_h:uint = 1; + /// kernel stride on the second dimension + stride_w:uint = 1; + /// dilation (i.e. size of each zero-padded kernel block) on the second + /// dimension + dilate_h:uint = 1; + /// dilation (i.e. 
size of each zero-padded kernel block) on the second + /// dimension + dilate_w:uint = 1; + sparse:ConvolutionV0Sparse = DENSE; + format:ConvolutionFormat = NCHW; + compute_mode:ConvolutionV1ComputeMode = DEFAULT; +} + +table FakeQuant { + qmin:int = -2147483648; + qmax:int = 2147483647; +} + +table TQT { + qmin:int = -2147483648; + qmax:int = 2147483647; +} + +table LSQ { + qmin:int = -2147483648; + qmax:int = 2147483647; +} + +table Fill { + value:float = 0; +} + +table CheckNonFinite { + scale:float = 1.0; +} + +table Padding { + /// offset in dim 0 + front_offset_dim0:uint = 0; + /// offset in dim 1 + front_offset_dim1:uint = 0; + /// offset in dim 2 + front_offset_dim2:uint = 0; + /// offset in dim 3 + front_offset_dim3:uint = 0; + /// offset in dim 4 + front_offset_dim4:uint = 0; + /// offset in dim 5 + front_offset_dim5:uint = 0; + /// offset in dim 6 + front_offset_dim6:uint = 0; + /// back offset in dim0 + back_offset_dim0:uint = 0; + /// back offset in dim1 + back_offset_dim1:uint = 0; + /// back offset in dim2 + back_offset_dim2:uint = 0; + /// back offset in dim3 + back_offset_dim3:uint = 0; + /// back offset in dim4 + back_offset_dim4:uint = 0; + /// back offset in dim5 + back_offset_dim5:uint = 0; + /// back offset in dim6 + back_offset_dim6:uint = 0; + /// param of padding opr + padding_val:float = 0; + padding_mode:PaddingPaddingMode = CONSTANT; +} + +table LayerNorm { + affine:bool = true; + eps:float = 1e-5; + normalized_dim:ulong = 1; + normalized_size:ulong = 1; +} + +table Dropout { + drop_prob:float = 0; + seed:ulong = 0; +} + +table RNNCell { + nonlineMode:RNNCellNonlineMode = IDENTITY; +} + +table RNN { + /// Number of recurrent layers + num_layers:uint = 1; + /// If becomes a bidirectional RNN + bidirectional:bool = false; + /// If the layer use bias weights b_ih and b_hh + bias:bool = true; + /// The number of features in the hidden state + hidden_size:uint = 128; + /// If introduce a Dropout layer on the outputs of each RNN layer + dropout:float = 0.; + nonlineMode:RNNCellNonlineMode = IDENTITY; + fwd_mode:BNFwdMode = TRAINING; +} + +table LSTM { + /// Number of recurrent layers + num_layers:uint = 1; + /// If becomes a bidirectional LSTM + bidirectional:bool = false; + /// If the layer use bias weights b_ih and b_hh + bias:bool = true; + /// The number of features in the hidden state + hidden_size:uint = 128; + /// If use LSTM with projections of corresponding size + proj_size:uint = 0; + /// If introduce a Dropout layer on the outputs of each LSTM layer + dropout:float = 0.; + fwd_mode:BNFwdMode = TRAINING; +} + + diff --git a/ci/compatibility/fbs/V2-backup/schema_v2.fbs b/ci/compatibility/fbs/V2-backup/schema_v2.fbs new file mode 100644 index 00000000..7bbb847e --- /dev/null +++ b/ci/compatibility/fbs/V2-backup/schema_v2.fbs @@ -0,0 +1,228 @@ +include "dtype.fbs"; +include "opr_param_defs.fbs"; +include "mgb_opr_param_defs.fbs"; +include "mgb_cpp_opr.fbs"; + +namespace mgb.serialization.fbs.v2; + +file_identifier "mge2"; + +table CompNode { + logical_locator:string; +} + +table DefaultTensorFormat{} + +table Image2DPackedTensorFormat{ + align_axis: ubyte; +} + +table LowbitsAlignedTensorFormat{ + size_nbits: ubyte; + align_size_in_bits: ubyte; +} + +/// The Tensor Format +union TensorFormat { + DefaultTensorFormat = 1, + Image2DPackedTensorFormat = 2, + LowbitsAlignedTensorFormat = 3, +} + +/// Opaque byte buffer defined by operator implementation +table Blob { + data:[ubyte]; +} + +table Tensor { + name:string; + shape:[uint]; + comp_node:CompNode; + 
dtype:DType; + format:TensorFormat; + /// The tensor raw data + data:[ubyte]; +} + +table Reserved0 {} +table DeprecatedParam {} + +union OperatorParam { + param.Empty = 1, + param.Axis = 2, + param.Convolution = 3, + param.MaskPropagate = 4, + param.ConvPooling = 5, + param.ConvBias = 6, + param.SeparableConv = 7, + param.Images2Neibs = 8, + param.Pooling = 9, + param.LRN = 10, + param.BN = 11, + param.ROIPooling = 12, + param.WarpPerspective = 13, + param.SpatialTfGridGenerator = 14, + param.SpatialTfSampler = 15, + param.MGBAddUpdate = 16, + param.Elemwise = 17, + param.ElemwiseMultiType = 18, + param.PowC = 19, + param.MatrixMul = 20, + //Reserved for param.Winograd = 21, + DeprecatedParam = 21, + param.SVD = 22, + param.Reduce = 23, + param.Cumsum = 24, + param.CondTake = 25, + param.Argsort = 26, + param.IndexingRemap = 27, + param.MGBSleep = 28, + param.Linspace = 29, + param.LinspaceFull = 30, + param.Eye = 31, + param.UniformRNG = 32, + param.GaussianRNG = 33, + param.Flip = 34, + param.Rotate = 35, + param.ROICopy = 36, + param.CvtColor = 37, + param.WarpAffine = 38, + param.GaussianBlur = 39, + param.Resize = 40, + param.Convolution3D = 41, + param.Conv3DBias = 42, + param.SeparableConv3D = 43, + param.TopK = 44, + param.RelayoutFormat = 45, + param.SeparableFilter = 46, + param.LocalShare = 47, + param.ROIAlign = 48, + param.DeformablePSROIPooling = 49, + param.BatchConvBias = 50, + param.DType = 51, + param.PersistentOutputStorage = 52, + param.OptionalAxis = 53, + param.OptionalAxisV1 = 54, + param.ExecutionPolicy = 55, + param.AssertEqual = 56, + param.FpgaConv = 57, + param.CollectiveComm = 58, + param.CondExecPred = 59, + param.CondExecPredLogical = 60, + param.CondExecMark = 61, + param.CondExecMerge = 62, + param.Host2DeviceCopy = 63, + param.Dimshuffle = 64, + param.AxisAddRemove = 65, + param.IndexDescMaskDump = 66, + DType = 67, + param.Remap = 68, + param.NMSKeep = 69, + param.AdaptivePooling = 70, + param.NvOf = 71, + param.DctChannelSelect = 72, + param.FakeQuant = 73, + param.TQT = 74, + param.Correlation = 75, + param.LSQ = 76, + param.GammaRNG = 77, + param.PoissonRNG = 78, + param.PermutationRNG = 79, + param.BetaRNG = 80, + param.SlidingWindowTranspose = 81, + param.Padding = 82, + param.ShuffleRNG = 83, + param.CheckNonFinite = 84, + param.LayerNorm = 85, + param.Dropout = 86, + param.RNNCell = 87, + param.RNN = 88, + param.LSTM = 89, + param.Softmax = 90, + param.Diag = 91, +} + +table Operator { + /// the Operator type id + type:string; + /// sometimes the type string may not exist, so a type_id is stored as well + type_id:ulong; + name:string; + + /// Operator parameter + param:OperatorParam; + /// an Operator may need to save more than one OperatorParam + additional_params:[OperatorParam]; + + /// IDs of the input tensors in the middle_tensors of a model + inputs:[uint]; + + /// IDs of the output tensors in the middle_tensors of a model + outputs:[uint]; + + comp_node:[CompNode]; + output_dtype:DType; + + /// the constant values of the Operator, stored in tensor format + tensors:[Tensor]; + + /// operator version; as MegEngine develops, an operator may gain multiple versions + opr_version:uint; + + /// the order of the Operator in the graph + priority:int = 0; + + /// custom data; an Operator may want to save big, opaque byte buffers here.
+ custom_data:[Blob]; +} + +table Metadata { + is_valid:bool; + graph_modified:bool; + optimize_options:ulong; + user_info:string; +} + +table MiddleTensor { + name:string; + shape:[uint]; + comp_node:CompNode; + dtype:DType; + format:TensorFormat; +} + +table OutputVar { + /// the ID of the middle tensor in the graph, in the same ID space as Operator inputs + compact_id:uint; + original_id:uint; +} + +table OutputAlias { + id:uint; + name:string; +} + +table Model { + /// the MegEngine version used when serializing the model + mge_version:uint; + + /// model version; currently supported versions: + /// version v1: the original fbs serialization version + /// version v2: supports backward compatibility and limited forward compatibility + model_version:uint; + + oprs:[Operator]; + + /// the tensors produced and consumed by the Operators, excluding the model + /// input and output tensors + middle_tensors:[MiddleTensor]; + + output_vars_idx:[OutputVar]; + output_alias:[OutputAlias]; + + nr_shared_tensor:uint; + /// the Metadata that stores custom data and some flags + metadata:Metadata; +} + +root_type Model; diff --git a/imperative/CMakeLists.txt b/imperative/CMakeLists.txt index 37e6b86e..c943e6fb 100644 --- a/imperative/CMakeLists.txt +++ b/imperative/CMakeLists.txt @@ -66,7 +66,8 @@ target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json) target_include_directories( ${MODULE_NAME} PUBLIC src/include - PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${CPP_REDIS_INCLUDES}) + PRIVATE ${PROJECT_SOURCE_DIR}/third_party/boost_subset/boost ${PYTHON_INCLUDE_DIRS} + ${NUMPY_INCLUDE_DIR} ${CPP_REDIS_INCLUDES}) target_link_libraries(${MODULE_NAME} PRIVATE mgb_opdef_inc) target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter)
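
The RelayoutFormatV0 docs in mgb_opr_param_defs.fbs above define the NHWCD4 layout as ``{N, H, (C + 3) / 4, W, 4}``. A minimal sketch of that shape formula (plain C++17, not tied to any MegEngine API; only the arithmetic from the doc comment is assumed):

#include <array>
#include <cstddef>
#include <cstdio>

// Map an NCHW shape to the NHWCD4 layout {N, H, (C + 3) / 4, W, 4}
// described in the RelayoutFormatV0 doc comment.
std::array<std::size_t, 5> nchw_to_nhwcd4(const std::array<std::size_t, 4>& nchw) {
    const std::size_t n = nchw[0], c = nchw[1], h = nchw[2], w = nchw[3];
    return {n, h, (c + 3) / 4, w, 4};
}

int main() {
    auto shape = nchw_to_nhwcd4({1, 6, 32, 32});
    // prints "1 32 2 32 4": the 6 channels are padded up to 2 groups of 4
    std::printf("%zu %zu %zu %zu %zu\n", shape[0], shape[1], shape[2], shape[3], shape[4]);
}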
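
To show how the schema_v2.fbs tables fit together when consumed, here is a rough C++ sketch that verifies a buffer against the "mge2" file identifier and resolves Operator inputs through Model.middle_tensors. It assumes a header generated by flatc from schema_v2.fbs; the header name schema_v2_generated.h and the GetModel/VerifyModelBuffer/ModelBufferHasIdentifier helpers follow flatc's code-generation conventions and are not part of this diff:

#include <cstdint>
#include <cstdio>
#include <flatbuffers/flatbuffers.h>
#include "schema_v2_generated.h"  // hypothetical: produced by `flatc --cpp schema_v2.fbs`

using namespace mgb::serialization::fbs::v2;

// Walk a serialized Model buffer and print each Operator together with the
// names of its inputs, looked up in Model.middle_tensors as the schema describes.
bool dump_model(const uint8_t* buf, size_t size) {
    flatbuffers::Verifier verifier(buf, size);
    if (!VerifyModelBuffer(verifier) || !ModelBufferHasIdentifier(buf))
        return false;  // not a valid "mge2" V2 model buffer

    const Model* model = GetModel(buf);
    std::printf("mge_version=%u model_version=%u\n", model->mge_version(),
                model->model_version());
    if (!model->oprs())
        return true;
    for (const Operator* opr : *model->oprs()) {
        std::printf("opr %s\n", opr->type() ? opr->type()->c_str() : "<unknown>");
        if (!opr->inputs() || !model->middle_tensors())
            continue;
        for (uint32_t idx : *opr->inputs()) {
            // Operator.inputs holds indices into Model.middle_tensors.
            const MiddleTensor* t = model->middle_tensors()->Get(idx);
            std::printf("  input: %s\n", t->name() ? t->name()->c_str() : "<anonymous>");
        }
    }
    return true;
}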