|
|
@@ -0,0 +1,237 @@ |
|
|
|
|
|
// generated by gen_param_defs.py for c23d51f3c4f33119fd74f58f04d112ccea8f64f1249ab372300975ab7e710e9a |
|
|
|
|
|
include "dtype.fbs"; |
|
|
|
|
|
namespace mgb.serialization.fbs.param; |
|
|
|
|
|
|
|
|
|
|
|
/// Mode of collective communication between computing nodes.
enum CollectiveCommMode : uint {
    /// reduce by sum to output computing node
    REDUCE_SUM = 0,
    /// copy input value to each output computing node
    BROADCAST = 1,
    /// each output comp node gets the concatenated value of all inputs
    ALL_GATHER = 2,
    /// reduce inputs by sum and each output gets one part of it
    REDUCE_SCATTER_SUM = 3,
    /// every output gets the sum of all inputs
    ALL_REDUCE_SUM = 4,
    /// every output gets the max of all inputs
    ALL_REDUCE_MAX = 5,
    /// every output gets the min of all inputs
    ALL_REDUCE_MIN = 6,
    /// every output gets the prod of all inputs
    ALL_REDUCE_PROD = 7,
    /// concat inputs to one node
    GATHER = 8,
    /// scatter input to each output computing node
    SCATTER = 9,
    /// scatter inputs and gather them on each computing node
    ALL_TO_ALL = 10,
}
|
|
|
|
|
|
|
|
|
|
|
/// Mode for computing the gradient of a conditionally-executed var.
enum CondExecMarkGradMode : uint {
    /// normal gradient mode: sum all the activated components
    SUM = 0,
    /// use :attr:`CondExecMerge.SUM_COND_OUT` mode so oprs that depend on the
    /// gradient opr would not be executed if the forward var is not used.
    SUM_COND_OUT = 1,
}
|
|
|
|
|
|
|
|
|
|
|
/// Static inference option. **Note:** This is a workaround: since
/// currently static inference in MegBrain does not take conditional
/// execution into account, this option can be used to bypass static
/// inference errors. This is currently only used by automatically
/// generated gradient oprs.
enum CondExecMarkStaticInfer : uint {
    /// enable both shape and value inference
    SHAPE_VALUE = 0,
    /// only enable shape inference (disable value inference)
    SHAPE_ONLY = 1,
    /// disable both shape and value inference
    NONE = 2,
}
|
|
|
|
|
|
|
|
|
|
|
/// How multiple conditional-execution branches are merged into output vars
/// (see :attr:`CondExecMerge`).
enum CondExecMergeMode : uint {
    /// copy the var whose mask is activated to the output, requiring that
    /// exactly one branch is active
    EXACT_ONE = 0,
    /// like :attr:`EXACT_ONE` with the requirement that all branches have the
    /// same shape, so shape inference can be easier
    EXACT_ONE_SAME_SHAPE = 1,
    /// sum all the active branches into output var; require all branches to
    /// have the same shape. Extra shape vars are needed in this mode, so the
    /// outputs can be initialized to zero when no input is active (and their
    /// shapes are probably unknown).
    SUM = 2,
    /// like :attr:`SUM` but also add an ExecutionMask to the readers of output
    /// vars, so they would be skipped if no branch is taken
    SUM_COND_OUT = 3,
}
|
|
|
|
|
|
|
|
|
|
|
/// How to compare the predicate var with branch keys.
enum CondExecPredMode : uint {
    /// The outputs correspond to branch keys, and the one which equals
    /// predicate would be activated. This behaves like a case-statement in many
    /// languages.
    CASE = 0,
    /// like :attr:`CASE`, but add an extra output that would be activated if no
    /// branch is matched
    CASE_FALLBACK = 1,
    /// One more output would be produced than the number of branch keys,
    /// representing the interval in which the predicate var fits in. The
    /// intervals are defined as :math:`(-\infty, k_0), [k_0, k_1), \ldots,
    /// [k_{n-2}, k_{n-1}), [k_{n-1}, \infty)`. The keys must be given in
    /// ascending order.
    PIECEWISE = 2,
}
|
|
|
|
|
|
|
|
|
|
|
/// Logical function computed over a set of predicate proxy vars
/// (see :attr:`CondExecPredLogical`).
enum CondExecPredLogicalMode : uint {
    /// logical or
    OR = 0,
    /// logical and
    AND = 1,
    /// exclusive-or
    XOR = 2,
    /// not or(inputs)
    NOR = 3,
    /// not and(inputs)
    NAND = 4,
    /// not xor(inputs)
    XNOR = 5,
}
|
|
|
|
|
|
|
|
|
|
|
/// Strategy flags for algorithm selection. NOTE(review): `bit_flags` makes
/// each member a bit index (HEURISTIC = 1<<0, PROFILE = 1<<1, ...), so
/// members may be combined — confirm against FlatBuffers bit_flags semantics.
enum ExecutionPolicyStrategy : uint (bit_flags) {
    /// use heuristic to choose the fastest algorithm
    HEURISTIC = 0,
    /// run possible algorithms on real device to find the best
    PROFILE = 1,
    /// when doing profile or heuristic algo selection, require the algorithms
    /// to be reproducible
    REPRODUCIBLE = 2,
    /// during profiling, require algorithms optimized for fast profiling
    OPTIMIZED = 3,
}
|
|
|
|
|
|
|
|
|
|
|
/// Legacy (V0) algorithm-selection strategy: a single exclusive choice,
/// unlike the bit-flag based :attr:`ExecutionPolicyStrategy`.
enum ExecutionPolicyV0Strategy : uint {
    /// use heuristic to choose the fastest algorithm
    HEURISTIC = 0,
    /// use heuristic to choose the fastest algorithm, and the chosen algorithm
    /// is reproducible
    HEURISTIC_REPRODUCIBLE = 1,
    /// run possible algorithms on real device to find the best
    PROFILE = 2,
    /// the fastest of profile result that is also reproducible
    PROFILE_REPRODUCIBLE = 3,
    /// use profile result and heuristic to choose the fastest algorithm
    PROFILE_HEURISTIC = 4,
}
|
|
|
|
|
|
|
|
|
|
|
/// Wraps a single :attr:`DTypeEnum` value (declared in the included
/// ``dtype.fbs``) as an opr parameter.
table DType {
    dtype:DTypeEnum = Byte;
}
|
|
|
|
|
|
|
|
|
|
|
table PersistentOutputStorage {
    /// This is used for controlling memory sharing. Multiple
    /// ``PersistentOutputStorage`` oprs with the same ``share_key`` would share
    /// underlying tensor storage. Note that the value ``-1`` is treated
    /// specially: storage of oprs with this key would be private and would not
    /// be shared with any other opr.
    share_key:int = -1;
}
|
|
|
|
|
|
|
|
|
|
|
/// optional axis: axis == -1 means no axis
table OptionalAxis {
    axis:int = -1;
}
|
|
|
|
|
|
|
|
|
|
|
/// optional axis: axis == MAX_NDIM means no axis
table OptionalAxisV1 {
    /// default 7 is MAX_NDIM, i.e. "no axis" — TODO confirm MAX_NDIM == 7
    /// against the generator's definition
    axis:int = 7;
}
|
|
|
|
|
|
|
|
|
|
|
/// Legacy (V0) algorithm-selection policy; superseded by
/// :attr:`ExecutionPolicy`.
table ExecutionPolicyV0 {
    strategy:ExecutionPolicyV0Strategy = HEURISTIC;
    /// workspace limit in bytes; default is UINT64_MAX, i.e. effectively
    /// unlimited
    workspace_limit:ulong = 18446744073709551615;
}
|
|
|
|
|
|
|
|
|
|
|
/// specify how to select an algorithm for an operator
table ExecutionPolicy {
    /// default 1 is the HEURISTIC bit of the bit_flags enum
    /// :attr:`ExecutionPolicyStrategy`
    strategy:ExecutionPolicyStrategy = 1;
    /// workspace limit in bytes; default is UINT64_MAX, i.e. effectively
    /// unlimited
    workspace_limit:ulong = 18446744073709551615;
}
|
|
|
|
|
|
|
|
|
|
|
table AssertEqual {
    /// max allowed error; error is defined as the minimal of absolute and
    /// relative error
    maxerr:float = 0.0001;
    /// whether to print maxerr to stdout during opr exec
    verbose:bool = false;
}
|
|
|
|
|
|
|
|
|
|
|
/// Parameters of an FPGA convolution opr. NOTE(review): field semantics are
/// not documented by the generator; descriptions below are inferred from the
/// names — confirm against the opr implementation.
table FpgaConv {
    need_output_quantize:bool = false;
    need_output_threshold:bool = false;
    /// presumably the convolution stride
    stride:int = 1;
    input_bit_width:int = 2;
    output_bit_width:int = 2;
    weight_bit_width:int = 2;
    /// thres0/thres1: presumably thresholds used when need_output_threshold
    /// is set — verify against the opr implementation
    thres0:int = 0;
    thres1:int = 1;
    unpool_size:uint = 4;
    direct_size:uint = 4;
}
|
|
|
|
|
|
|
|
|
|
|
/// collective communication between multiple computing nodes on localhost
table CollectiveComm {
    /// mode of collective communication
    mode:CollectiveCommMode = REDUCE_SUM;
}
|
|
|
|
|
|
|
|
|
|
|
/// HACK: The tag of this param def is actually used for another non-generated
/// param def SerializedDType; the sole purpose of this param def is to provide
/// a spare tag. Do not use.
table FakeSerializedDType {
}
|
|
|
|
|
|
|
|
|
|
|
/// evaluate a predicate and branch keys to setup ExecutionMask objects with
/// associated predicate proxy vars (PPVs)
table CondExecPred {
    /// how to compare predicate var with branch keys
    mode:CondExecPredMode = CASE;
    /// threshold for checking equality of floating-point values
    eps:float = 0.0001;
}
|
|
|
|
|
|
|
|
|
|
|
/// compute a logical function over a set of PPVs
table CondExecPredLogical {
    /// the logical function to apply (or/and/xor and their negations)
    mode:CondExecPredLogicalMode = OR;
}
|
|
|
|
|
|
|
|
|
|
|
/// add ExecutionMask of the input PPV to this opr and readers of the outputs of
/// this opr
table CondExecMark {
    /// mode for computing the gradient
    grad_mode:CondExecMarkGradMode = SUM;
    /// static inference option. **Note:** This is a workaround: since
    /// currently static inference in MegBrain does not take conditional
    /// execution into account, this option can be used to bypass static
    /// inference errors. This is currently only used by automatically
    /// generated gradient oprs.
    static_infer:CondExecMarkStaticInfer = SHAPE_VALUE;
}
|
|
|
|
|
|
|
|
|
|
|
/// merge multiple conditional execution branches
table CondExecMerge {
    /// number of output vars (i.e. vars per branch)
    nr_output:uint = 1;
    /// how the active branches are combined into the outputs
    mode:CondExecMergeMode = EXACT_ONE;
}
|
|
|
|
|
|
|
|
|
|
|
/// opr implements NVIDIA Optical Flow SDK
table NvOf {
    /// NOTE(review): presumably selects the SDK's output grid/perf precision
    /// level; meaning of the value 1 is not shown here — confirm against the
    /// NvOf opr implementation
    precision:uint = 1;
}
|
|
|
|
|
|
|
|
|
|
|
|