#!/usr/bin/env python3
# coding: utf-8
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""operator dsl function:maxpool"""
import math

import akg
import akg.tvm
from akg.tvm.hybrid import script
from akg.utils import validation_check as vc_util, custom_tiling as ct_util, kernel_exec as utils
from akg.utils.dsl_create import cal_pad_shapes_by_strategy
from akg.utils.format_transform import get_shape
from akg.utils import dynamic_shape as ds

# Hand-tuned tiling ("dim") settings for maxpool.
# Each key is the string form of the tuple
# (input shape, kernel, stride, pad strategy, dtype), which is how
# maxpool_set_dim_func builds its lookup key. Each value is handed unchanged to
# ct_util.set_dims when the key matches.
# NOTE(review): the inner 4-tuples presumably follow the layout expected by
# ct_util.set_dims (band index, axis index, then two tile sizes) -- confirm
# against custom_tiling before editing entries.
maxpool_set_dim_map = {
    str(((32, 4, 112, 112, 16), (3, 3), (2, 2), (0, 1, 0, 1), "float16")):
        ((0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (0, 3, 0, 1), (0, 4, 0, 1),
         (1, 0, 1, 1), (1, 1, 1, 1), (1, 2, 1, 1), (1, 3, 0, 1), (1, 4, 0, 1)),
    str(((32, 4, 112, 112, 16), (3, 3), (2, 2), 'SAME', "float16")):
        ((0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (0, 3, 0, 1), (0, 4, 0, 1),
         (1, 0, 1, 1), (1, 1, 1, 1), (1, 2, 1, 1), (1, 3, 0, 1), (1, 4, 0, 1)),
    str(((32, 4, 112, 112, 16), (3, 3), (2, 2), (0, 1, 0, 1), "float32")):
        ((0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (0, 3, 0, 1), (0, 4, 0, 1),
         (1, 0, 1, 1), (1, 1, 1, 1), (1, 2, 1, 1), (1, 3, 0, 1), (1, 4, 0, 1)),
    str(((32, 4, 112, 112, 16), (3, 3), (2, 2), 'SAME', "float32")):
        ((0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (0, 3, 0, 1), (0, 4, 0, 1),
         (1, 0, 1, 1), (1, 1, 1, 1), (1, 2, 1, 1), (1, 3, 0, 1), (1, 4, 0, 1)),
    str(((32, 6, 55, 55, 16), (3, 3), (2, 2), 'VALID', "float16")):
        ((0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 3, 1), (0, 3, 3, 1), (0, 4, 0, 1), (0, 5, 0, 1),
         (1, 0, 1, 1), (1, 1, 1, 1), (1, 2, 3, 1), (1, 3, 3, 1), (1, 4, 1, 1), (1, 5, 0, 1)),
    str(((32, 6, 55, 55, 16), (3, 3), (2, 2), 'VALID', "float32")):
        ((0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 3, 1), (0, 3, 3, 1), (0, 4, 0, 1), (0, 5, 0, 1),
         (1, 0, 1, 1), (1, 1, 1, 1), (1, 2, 3, 1), (1, 3, 3, 1), (1, 4, 1, 1), (1, 5, 0, 1)),
}


# Per-configuration build attribute overrides for maxpool, keyed the same way
# as maxpool_set_dim_map. maxpool_set_dim_func layers a matching entry on top
# of its default attributes.
maxpool_set_attr_map = {
    str(((32, 4, 112, 112, 16), (3, 3), (2, 2), 'SAME', "float16")): {
        "merge_outer_loop_for_multicore": 1,
    },
    str(((32, 4, 112, 112, 16), (3, 3), (2, 2), 'SAME', "float32")): {
        "merge_outer_loop_for_multicore": 1,
    },
}

# Module-level attribute dict that maxpool_set_dim_func clears and refills on
# every call; maxpool returns this same object to its caller.
attr_map = dict()


def maxpool_set_dim_func(data, kernel, stride, pad):
    """
    Set dim info with maxpool_set_dim_map and refresh the shared attr_map.

    The lookup key is the string form of
    (input shape, kernel, stride, pad, dtype). kernel and stride are converted
    to tuples, a list pad is converted to a tuple and a string pad is
    upper-cased, so that list arguments and lower-case strategy names hit the
    same tuned entries as their tuple / upper-case equivalents (this mirrors
    maxpool_with_argmax_set_dim_func).

    Side effect: the module-level attr_map is cleared and refilled in place
    with the default attributes plus any override found in
    maxpool_set_attr_map for the key.

    Args:
        data: object exposing ``shape`` and ``dtype`` (a tvm tensor in practice).
        kernel (Union[list, tuple]): pooling window sizes.
        stride (Union[list, tuple]): window strides.
        pad (Union[str, list, tuple]): padding strategy.

    Returns:
        tuple, (dim info, hash_key). The dim info is the result of
        ct_util.set_dims for a known key and "" otherwise.
    """
    # Normalise the pieces of the key: the tables are keyed on tuples and
    # upper-case strategy strings, while callers may legitimately pass lists.
    if isinstance(pad, list):
        pad = tuple(pad)
    elif isinstance(pad, str):
        pad = pad.upper()
    hash_key = str((tuple(data.shape), tuple(kernel), tuple(stride), pad, data.dtype))

    default_attr_map = {
        "pragma_reschedule": 1,
        "pragma_reorder_schedule": True,
        "pragma_opt_for_dsa": 1,
        "pragma_disable_loop_reversal": 1,
        "loop_partition_unroll": False,
    }
    # attr_map is mutated in place (never rebound), so every holder of the
    # dict observes the refreshed contents.
    attr_map.clear()
    attr_map.update(default_attr_map)
    attr_map.update(maxpool_set_attr_map.get(hash_key, {}))

    tuned_dims = maxpool_set_dim_map.get(hash_key)
    if tuned_dims is None:
        return "", hash_key
    return ct_util.set_dims(tuned_dims), hash_key


def maxpool_param_check(kernel, stride, pad):
    """
    Validate the rank of the maxpool parameters.

    Args:
        kernel: sequence that must hold exactly two entries.
        stride: sequence that must hold exactly two entries.
        pad: sequence of two entries, or of four entries where the first two
             are equal and the last two are equal.

    Raises:
        ValueError: if any of the three arguments violates the rules above.
    """
    if len(kernel) != 2:
        raise ValueError("Only support 2-dim kernel!")
    if len(stride) != 2:
        raise ValueError("Only support 2-dim stride!")

    pad_rank = len(pad)
    if pad_rank == 2:
        return
    # A 4-entry pad is accepted only when it is pairwise symmetric.
    if pad_rank != 4 or pad[0] != pad[1] or pad[2] != pad[3]:
        raise ValueError("Only support 2-dim pad, or 4-dim with 2 equal values!")

@vc_util.check_input_type(akg.tvm.tensor.Tensor, (list, tuple),
                          (list, tuple), (list, tuple))
def old_maxpool(data, kernel, stride, pad):
    """
    Old implement for maxpool.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16 or float32, "NC1HWC0"
                                  format (N: batch, C1: channel, H: height, W:
                                  width, C0: block size)
        kernel (Union[list, tuple]): List or tuple with two int number as
                                     window sizes of H and W.
        stride (Union[list, tuple]): List or tuple with two int number as
                                     stride sizes of H and W.
        pad (Union[list, tuple]): List or tuple with two int number as
                                  pad sizes of H and W, or with four int
                                  numbers (h, h, w, w) whose first two and
                                  last two entries are equal.

    Returns:
        tvm.tensor.Tensor, result of maxpool operator.

    Raises:
        ValueError: if kernel, stride or pad fail maxpool_param_check.
    """
    shape = get_shape(data)
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)

    maxpool_param_check(kernel, stride, pad)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    if len(pad) == 2:
        pad_height, pad_width = pad
    else:
        # 4-entry form is (h, h, w, w); symmetry was verified above.
        pad_height, pad_width = pad[0], pad[2]

    in_n, in_c1, in_h, in_w, in_c0 = shape

    out_h = int(math.floor((in_h + 2 * pad_height - kernel_h)
                           / float(stride_h)) + 1)
    out_w = int(math.floor((in_w + 2 * pad_width - kernel_w)
                           / float(stride_w)) + 1)

    # Decide on padding from the resolved H/W pads. Testing pad[0]/pad[1]
    # would look only at the H pair of a 4-entry pad and skip the padding
    # stage for (0, 0, w, w), while out_w above already assumes a padded width.
    if pad_height != 0 or pad_width != 0:
        pad_shape = (in_n, in_c1, in_h + 2 * pad_height, in_w + 2 * pad_width, in_c0)

        # NOTE(review): the border is filled with 0.0, not the dtype minimum,
        # so a window whose real inputs are all negative yields 0 at the edge.
        pad2d = akg.tvm.compute(pad_shape,
                                lambda n, c1, h, w, c0:
                                akg.tvm.const(0.0, dtype=dtype),
                                name="pad2d"
                                )
        # Keep the zero tensor on the border and copy the input elsewhere.
        pad2d = akg.tvm.compute(pad_shape,
                                lambda n, c1, h, w, c0:
                                akg.tvm.if_then_else(
                                    akg.tvm.any(
                                        h < pad_height,
                                        h > in_h + pad_height - 1,
                                        w < pad_width,
                                        w > in_w + pad_width - 1
                                    ),
                                    pad2d[n, c1, h, w, c0],
                                    data[n, c1, h - pad_height, w - pad_width, c0],
                                ),
                                name="pad2d"
                                )
    else:
        pad2d = data

    # Reduction axes walking the pooling window.
    axis_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="ah")
    axis_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="aw")

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)

    res_value = akg.tvm.compute(out_shape,
                                lambda n, c1, h, w, c0:
                                akg.tvm.max(
                                    pad2d[n, c1, h * stride_h + axis_kernel_h,
                                          w * stride_w + axis_kernel_w, c0],
                                    axis=[axis_kernel_h, axis_kernel_w]
                                ),
                                name="res_value")
    return res_value


def maxpool_manual_schedule(shape, kernel, stride, padding, dtype, attrs=None, polyhedral=False):
    """
    Build maxpool with a hand-written schedule.

    Creates a placeholder of the given shape/dtype, an optional zero-padded
    copy of it, and a max reduction over the pooling window; stages the
    computation through "local.UB" caches, splits every output axis by the
    factors in attrs['tile'], then builds the kernel for target "cce" and
    dumps its source via utils.create_code.

    Args:
        shape: input shape, checked as 5-dim "NC1HWC0".
        kernel: two window sizes (H, W).
        stride: two strides (H, W).
        padding: two pads (H, W), or four pads whose first two and last two
                 entries are equal (only entries 0 and 2 are used).
        dtype: element type, checked against DtypeForDavinci.ALL_FLOAT.
        attrs (dict): build attributes; must provide 'tile' holding one
                      split factor per output axis.
        polyhedral (bool): forwarded to akg.build.

    Returns:
        The module returned by akg.build.

    Raises:
        ValueError: if kernel, stride or padding fail maxpool_param_check.
        Exception: if attrs is None.
        RuntimeError: if the number of tiling factors differs from the output rank.
    """
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)

    maxpool_param_check(kernel, stride, padding)

    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    batch_size, in_c1, input_h, input_w, in_c0 = data.shape

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    # maxpool_param_check only lets lengths 2 and 4 through, so one branch always runs.
    if len(padding) == 2:
        pad_h, pad_w = padding
    elif len(padding) == 4:
        pad_h, pad_w = padding[0], padding[2]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

    # padding operation: border elements become 0.0, the interior copies the input
    if pad_h != 0 or pad_w != 0:
        pad_shape = (batch_size, in_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, in_c0)

        padded_input = akg.tvm.compute(pad_shape,
                                       lambda n, c1, h, w, c0:
                                       akg.tvm.if_then_else(
                                           akg.tvm.any(
                                               h > input_h + pad_h - 1,
                                               h < pad_h,
                                               w > input_w + pad_w - 1,
                                               w < pad_w,
                                           ),
                                           akg.tvm.const(0.0, dtype=dtype),
                                           data[n, c1, h - pad_h, w - pad_w, c0],
                                       ),
                                       name="padded_input")
    else:
        padded_input = data

    # reduce iterators over the pooling window
    it_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="iterator_reduction_height")
    it_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="iterator_reduction_width")

    out_shape = (batch_size, in_c1, out_size_h, out_size_w, in_c0)

    res = akg.tvm.compute(out_shape,
                          lambda n, c1, h, w, c0:
                          akg.tvm.max(
                              padded_input[n, c1, (h * stride_h + it_kernel_h), (w * stride_w + it_kernel_w), c0],
                              axis=[it_kernel_h, it_kernel_w]
                          ),
                          name="maxpool_not_hybrid")

    s = akg.tvm.create_schedule([res.op])

    # Re-fetch the padded stage from the op's inputs. In the unpadded case the
    # name is rebound to res, but it is only read again inside padded branches.
    if pad_w != 0 or pad_h != 0:
        padded_input = res.op.input_tensors[0]
    else:
        padded_input = res

    # cache reads and writes
    # after this cache write: reference to res_ub to change the reduction axis
    res_ub = s.cache_write(res, "local.UB")
    # The input cache feeds the padding stage when there is one, else the reduction.
    if pad_w != 0 or pad_h != 0:
        data_ub = s.cache_read(data, "local.UB", [padded_input])
    else:
        data_ub = s.cache_read(data, "local.UB", [res_ub])

    # get tiling attributes
    # NOTE(review): attrs is validated only here, after the compute and
    # schedule objects have already been created.
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    if len(tiling_factors) != len(res.shape):
        raise RuntimeError("tiling factors mismatch out shape")
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[res_ub].split(res_ub.op.axis[index], factor))

    # get iterators: each split yields an (outer, inner) pair per output axis
    iterator_b_outer = split_iterators[0][0]
    iterator_b_inner = split_iterators[0][1]
    iterator_c1_outer = split_iterators[1][0]
    iterator_c1_inner = split_iterators[1][1]
    iterator_h_outer = split_iterators[2][0]
    iterator_h_inner = split_iterators[2][1]
    iterator_w_outer = split_iterators[3][0]
    iterator_w_inner = split_iterators[3][1]
    iterator_c0_outer = split_iterators[4][0]
    iterator_c0_inner = split_iterators[4][1]
    # reduction axis
    iterator_reduce_h = res_ub.op.reduce_axis[0]
    iterator_reduce_w = res_ub.op.reduce_axis[1]

    # move caches: nest the UB stages under the loops that consume them
    s[res_ub].compute_at(s[res], res.op.axis[0])
    s[data_ub].compute_at(s[res_ub], iterator_c1_outer)

    if pad_w != 0 or pad_h != 0:
        s[padded_input].compute_at(s[res_ub], iterator_c1_outer)
        s[padded_input].set_scope("local.UB")

    # reorder computation: the window reduction runs before the C0 loops
    s[res_ub].reorder(iterator_b_outer, iterator_b_inner, iterator_c1_outer, iterator_c1_inner, iterator_h_outer,
                      iterator_h_inner, iterator_w_outer, iterator_w_inner, iterator_reduce_h, iterator_reduce_w,
                      iterator_c0_outer, iterator_c0_inner)

    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res], "cce", name="maxpool_manual_schedule", attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        # NOTE(review): the dumped source uses a different name
        # ("maxpool_ad_manual_schedule") than the built kernel -- confirm this is intended.
        kernel_name = "maxpool_ad_manual_schedule"
        utils.create_code(kernel_name, './', source_code)
    return mod

def pad_strategy_check(strategy):
    """
    Validate the type of a padding strategy.

    Args:
        strategy: accepted when it is a string, or a list/tuple holding
                  exactly four entries.

    Raises:
        ValueError: for any other value.
    """
    if isinstance(strategy, str):
        return
    if isinstance(strategy, (list, tuple)) and len(strategy) == 4:
        return
    raise ValueError("Only support string or list/tuple of 4 int numbers!")

@vc_util.check_input_type(akg.tvm.tensor.Tensor, (list, tuple), (list, tuple),
                          (list, tuple, str))
def maxpool(data, kernel, stride, strategy):
    """
    Performs the max pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID',
            'SAME' or instance of list(four int numbers for 'CONSTANTS' strategy).
            Support **Strategies** is same as avgpool.

    Returns:
        tuple, (output, attrs): output is the tvm.tensor.Tensor result of max
        pooling and attrs is the build attribute dict, including the 'dim'
        entry set here.

    Raises:
        ValueError: if strategy is neither a string nor a list/tuple of four entries.
    """
    # NOTE(review): attrs aliases the module-level attr_map, which
    # maxpool_set_dim_func clears and refills in place; the dict returned to
    # the caller is therefore overwritten by the next call.
    attrs = attr_map
    attrs['dim'] = maxpool_set_dim_func(data, kernel, stride, strategy)[0]

    shape = get_shape(data)
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.check_shape(kernel, 2, "Kernel")
    vc_util.check_shape(stride, 2, "Stride")

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    in_n, in_c1, in_h, in_w, in_c0 = shape

    # Only the first H pad (ph_h) and the first W pad (pw_h) are used below.
    [ph_h, _, pw_h, _], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    # NOTE(review): attr_map is rebuilt from tables that hold no "dynamic" key
    # in the visible part of this file, so the dynamic path looks unreachable
    # as written -- confirm how it is meant to be enabled.
    if attrs.get("dynamic") is True:
        # dynamic shape: although we can represent out_h and out_w using input shapes, they are too complicated
        out_h = akg.tvm.var("OUT_H")
        out_w = akg.tvm.var("OUT_W")

    # Dynamic-shape variant: dimensions arrive as arguments, while the kernel,
    # stride and pad values are captured from the enclosing scope. The names
    # used inside must keep matching these locals because of capture=locals().
    # zero_ and one_ are accepted but not referenced in the body.
    @script(capture=locals())
    def dynamic_max_pool_hybrid_0(zero_, one_, min_value_, x_, in_n, in_c1, in_h, in_w, in_c0, out_h, out_w):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)

        for n in range(in_n):
            for c1 in range(in_c1):
                # Head: output row 0 is initialised and reduced on its own
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                # skip window positions that fall into the padding
                                if ph_h <= kh <= in_h + ph_h - 1 and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h, ow * stride_w + kw - pw_h, c0])
                # Tail: output rows 1 .. out_h - 1
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    # skip window positions that fall into the padding
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1\
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(output[n, c1, oh + 1, ow, c0],
                                                                            x_[n, c1, (oh + 1) * stride_h
                                                                               + kh - ph_h, ow * stride_w
                                                                               + kw - pw_h, c0])

        return output

    # static shape's hybrid: same loop nest as the dynamic variant, with every
    # dimension captured from the enclosing scope instead of passed in.
    # zero_ and one_ are accepted but not referenced in the body.
    @script(capture=locals())
    def static_max_pool_hybrid_0(zero_, one_, min_value_, x_):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)

        for n in range(in_n):
            for c1 in range(in_c1):
                # Head: output row 0 is initialised and reduced on its own
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                # skip window positions that fall into the padding
                                if ph_h <= kh <= in_h + ph_h - 1 and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h, ow * stride_w + kw - pw_h, c0])
                # Tail: output rows 1 .. out_h - 1
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    # skip window positions that fall into the padding
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1 \
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(output[n, c1, oh + 1, ow, c0],
                                                                            x_[n, c1, (oh + 1) * stride_h
                                                                               + kh - ph_h, ow * stride_w
                                                                               + kw - pw_h, c0])

        return output

    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    # Initial value of the running max: the most negative finite value of the
    # dtype (-65504 for float16, otherwise the float32 lower bound).
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16'
                              else -340282346638528859811704183484516925440.0, dtype=dtype)
    if attrs.get("dynamic") is True:
        output = dynamic_max_pool_hybrid_0(zero, one, min_value, data,
                                           in_n, in_c1, in_h, in_w, in_c0, out_h, out_w)
    else:
        output = static_max_pool_hybrid_0(zero, one, min_value, data)

    return output, attrs


# Hand-tuned tiling ("dim") settings for the argmax variant of maxpool.
# Keys are the string form of (input shape, kernel, stride, pad, dtype) as
# built by maxpool_with_argmax_set_dim_func; a matching value is passed to
# ct_util.set_dims.
# NOTE(review): some entries carry the string "H" where other entries have an
# integer axis index; presumably an axis label understood by ct_util.set_dims
# -- confirm before editing.
maxpool_with_argmax_set_dim_map = {
    str(((32, 4, 112, 112, 16), (3, 3), (2, 2), 'SAME', "float16")):
        ((0, 0, 1, 1), (0, 1, 1, 1), (0, "H", 3, 1), (0, 3, 56, 1), (0, 4, 5, 1), (0, 5, 16, 1)),
    str(((32, 4, 112, 112, 16), (3, 3), (2, 2), (1, 1, 1, 1), "float16")):
        ((0, 0, 1, 1), (0, 1, 1, 1), (0, "H", 3, 1), (0, 3, 56, 1), (0, 4, 5, 1), (0, 5, 16, 1)),
    str(((1, 1, 28, 28, 16), (2, 2), (2, 2), 'VALID', "float16")):
        ((0, 0, 14, 1), (0, 1, 14, 1), (0, 4, 3, 1), (0, 5, 16, 1)),
    # str('((I0, I1, I2, I3, 16), (3, 3), (2, 2), (1, 1, 1, 1), \'float16\')'):
    #     ((0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 3, 1), (0, 3, 3, 1), (0, 4, 3, 1),
    #      (1, 0, 1, 1), (1, 1, 1, 1), (1, 2, 3, 1), (1, 3, 3, 1), (1, "H", 3, 1)),
}

# Per-configuration attribute overrides for the argmax variant (currently empty).
maxpool_with_argmax_set_attr_map = {
}
# Module-level attribute dict that maxpool_with_argmax_set_dim_func clears and
# refills in place on every call.
attr_map_v2 = dict()


def maxpool_with_argmax_tiling_strategy(data, kernel, stride, pad):
    """
    Build the axis tiling constraints for maxpool with argmax.

    Constraints are produced only when data is 5-dim with a C0 extent of 16;
    otherwise the returned list is empty. Batch and C1 each get a factor-1
    constraint (and consume an axis index) only when they are symbolic or
    larger than 1.

    Args:
        data: input tensor; its shape must unpack into five extents.
        kernel: pooling window sizes, forwarded to cal_pad_shapes_by_strategy.
        stride: window strides, forwarded to cal_pad_shapes_by_strategy.
        pad: padding strategy, forwarded to cal_pad_shapes_by_strategy.

    Returns:
        list, the accumulated constraints.
    """
    batch, c1, fm_h, fm_w, c0 = data.shape
    _, [out_h, _] = cal_pad_shapes_by_strategy(get_shape(data), kernel, stride, pad)
    strategy = list()
    if not (data.ndim == 5 and c0.value == 16):
        return strategy

    var_type = akg.tvm.expr.Var
    # Cut H into pieces of 3 for symbolic or large (>= 50 x 50) feature maps,
    # otherwise keep the whole output height in one tile.
    cut_small = isinstance(fm_h, var_type) or (fm_h.value >= 50 and fm_w.value >= 50)
    h_cut = 3 if cut_small else out_h

    dim_ind = 0
    for extent in (batch, c1):
        if isinstance(extent, var_type) or extent.value > 1:
            strategy += ct_util.create_constraint_on_axis(values=1,
                                                          constraints=ct_util.TileConstraint.FACTOR,
                                                          axis=dim_ind)
            dim_ind = dim_ind + 1

    # (value, constraint kind, axis offset from the H axis), in emission order.
    trailing_rules = (
        (h_cut, ct_util.TileConstraint.FACTOR, 0),
        ("H", ct_util.TileConstraint.SET_AXIS_INFO, 0),
        ("FULL", ct_util.TileConstraint.MAX, 1),
        (5, ct_util.TileConstraint.FACTOR, 2),
        (16, ct_util.TileConstraint.FACTOR, 3),
    )
    for value, kind, offset in trailing_rules:
        strategy += ct_util.create_constraint_on_axis(values=value,
                                                      constraints=kind,
                                                      axis=dim_ind + offset)
    return strategy

def maxpool_with_argmax_dynamic_tensor_strategy(data, im2col, mask):
    """
    Build per-position tensor tiling constraints for im2col and mask.

    Constraints are produced only when data is 5-dim with a C0 extent of 16;
    otherwise the returned list is empty. Seven positions (0..6) are
    constrained on each tensor, im2col first and mask second.

    Args:
        data: input tensor; its shape must unpack into five extents.
        im2col: tensor constrained first.
        mask: tensor constrained second.

    Returns:
        list, the accumulated constraints.
    """
    _, _, _, _, c0 = data.shape
    strategy = list()
    if not (data.ndim == 5 and c0.value == 16):
        return strategy

    unit = (1, ct_util.TileConstraint.FACTOR)
    full = ("FULL", ct_util.TileConstraint.MAX)
    # One (value, constraint kind) rule per tensor position, in order.
    per_tensor_rules = (
        (im2col, (unit, unit, full, full, unit, full, full)),
        (mask, (unit, unit, unit, unit, full, full, full)),
    )
    for target, rules in per_tensor_rules:
        for pos, (value, kind) in enumerate(rules):
            strategy += ct_util.create_constraint_on_tensor(tensor=target,
                                                            values=value,
                                                            constraints=kind,
                                                            tensor_pos=pos)
    return strategy

def maxpool_with_argmax_custom_tiling_strategy(data):
    """
    Build the two-band axis tiling constraints for maxpool with argmax.

    Constraints are produced only when data is 5-dim with a C0 extent of 16;
    otherwise the returned list is empty. Band 1 is emitted first: batch and
    C1 get a factor-1 constraint (and consume an axis index) only when they
    are symbolic or larger than 1, followed by five fixed rules. Band 0 then
    gets eight fixed rules on axes 0..7.

    Args:
        data: input tensor; its shape must unpack into five extents.

    Returns:
        list, the accumulated constraints.
    """
    batch, c1, _, _, c0 = data.shape
    strategy = list()
    if not (data.ndim == 5 and c0.value == 16):
        return strategy

    unit = (1, ct_util.TileConstraint.FACTOR)
    full = ("FULL", ct_util.TileConstraint.MAX)

    def constrain(band, axis, rule):
        # Expand one (value, constraint kind) rule into a constraint list.
        value, kind = rule
        return ct_util.create_constraint_on_axis(values=value,
                                                 constraints=kind,
                                                 band=band,
                                                 axis=axis)

    # Band 1: optional batch / C1 axes, then the fixed trailing axes.
    axis = 0
    for extent in (batch, c1):
        if isinstance(extent, akg.tvm.expr.Var) or extent.value > 1:
            strategy += constrain(1, axis, unit)
            axis = axis + 1
    for rule in (unit, full, full, full, full):
        strategy += constrain(1, axis, rule)
        axis = axis + 1

    # Band 0: eight axes with a fixed rule each.
    band0_rules = (unit, unit, full, full, unit, full, full, full)
    for axis, rule in enumerate(band0_rules):
        strategy += constrain(0, axis, rule)
    return strategy


def get_attrs():
    """
    Build the default build attributes used by the static-shape maxpool.

    Returns:
        dict, a freshly created mapping from attribute name to its default
        value, so callers may modify it freely.
    """
    return {
        # schedule related pragmas
        "pragma_reschedule": 1,
        "pragma_opt_for_dsa": 1,
        "pragma_reorder_schedule": True,
        # loop partition switches
        "enable_pre_poly_loop_partition": False,
        "enable_post_poly_loop_partition": False,
        # remaining pass switches
        "disable_cse": True,
        "enable_bk_optimize": False,
        "enable_to_three_address": False,
    }

def get_dynamic_attrs():
    """
    Build the default build attributes used by the dynamic-shape maxpool.

    Returns:
        dict, a freshly created mapping from attribute name to its default
        value, so callers may modify it freely.
    """
    return {
        # schedule related pragmas
        "pragma_reschedule": 1,
        "pragma_opt_for_dsa": 1,
        "pragma_reorder_schedule": True,
        # loop partition switches
        "enable_pre_poly_loop_partition": False,
        "enable_post_poly_loop_partition": False,
        # remaining pass switches
        "disable_cse": True,
        "enable_bk_optimize": False,
        "enable_double_buffer": False,
        "enable_hoist_cond_write": False,
        "extent_to_cond": False,
        # multicore settings
        "merge_outer_loop_for_multicore": 1,
        "multicore_loop_max_depth": 2,
        "enable_sink_allocate": True,
    }

def maxpool_with_argmax_set_dim_func(data, kernel, stride, pad):
    """
    Look up the preset dim info and attrs for one maxpool configuration.

    The lookup key is the string form of
    (data shape, kernel, stride, pad, data dtype). A list pad is converted to
    a tuple and a string pad is upper-cased before the key is built.

    As a side effect the module-level attr_map_v2 is emptied and refilled with
    the defaults from get_attrs(), overlaid with the entry of
    maxpool_with_argmax_set_attr_map for this key when there is one.

    Args:
        data: input tensor; only its shape and dtype are read here.
        kernel: pooling window size.
        stride: pooling window stride.
        pad: padding strategy name or explicit padding values.

    Returns:
        tuple of (dim info, key string). The dim info is the result of
        ct_util.set_dims for the matching preset, or "" when no preset exists.
    """
    if isinstance(pad, list):
        pad = tuple(pad)
    elif isinstance(pad, str):
        pad = pad.upper()
    hash_key = str((tuple(data.shape), tuple(kernel), tuple(stride), pad, data.dtype))

    # attr_map_v2 is only mutated in place, never rebound.
    attr_map_v2.clear()
    attr_map_v2.update(get_attrs())
    attr_map_v2.update(maxpool_with_argmax_set_attr_map.get(hash_key, {}))

    if hash_key not in maxpool_with_argmax_set_dim_map:
        return "", hash_key
    return ct_util.set_dims(maxpool_with_argmax_set_dim_map[hash_key]), hash_key

def maxpool_value(index):
    """
    Unwrap a constant integer expression into its plain value.

    Args:
        index: a shape element or index expression.

    Returns:
        index.value when index is an akg.tvm.expr.IntImm, otherwise index
        itself unchanged.
    """
    # The former debug print of type(index) was removed: it wrote to stdout
    # on every call made while building the compute.
    if isinstance(index, akg.tvm.expr.IntImm):
        return index.value
    return index

def img2col(input_img, col_shape, filter_h, filter_w, pad, stride, min_value, tag=None):
    """
    Build the im2col compute that expands a 5D feature map into pooling windows.

    Args:
        input_img: 5D input tensor; dims 2 and 3 are read as feature map
            height and width.
        col_shape: shape of the result. It must have 7 dims, indexed as
            (n, c1, kernel h, kernel w, out h, out w, c0).
        filter_h: pooling window height.
        filter_w: pooling window width.
        pad: four values (top, bottom, left, right).
        stride: two values (stride_h, stride_w).
        min_value: value used for positions that fall into the padding area.
        tag: tag of the compute op, 'im2col_row_major' when None.

    Returns:
        Tensor of shape col_shape named 'im2col_row_major'.
    """
    def _window_element(*indices):
        _, _, fmap_h, fmap_w, _ = input_img.shape
        n_idx, c1_idx, kh_idx, kw_idx, oh_idx, ow_idx, c0_idx = indices
        stride_h, stride_w = stride
        pad_top, pad_bottom, pad_left, pad_right = pad

        # Position inside the padded feature map.
        row = oh_idx * stride_h + kh_idx
        col = ow_idx * stride_w + kw_idx

        in_padding = akg.tvm.any(
            row < pad_top,
            row > maxpool_value(fmap_h) + pad_top - 1,
            col < pad_left,
            col > maxpool_value(fmap_w) + pad_left - 1)
        element = akg.tvm.if_then_else(
            in_padding,
            min_value,
            input_img(n_idx, c1_idx, row - pad_top, col - pad_left, c0_idx))

        dilation_h = dilation_w = 1
        repeat_mode = jmp_offset = 1
        return akg.lang.cce.load_im2col_c1_buf(
            element,
            pad_top, pad_bottom, pad_left, pad_right, fmap_h, fmap_w,
            stride_h, stride_w, filter_h, filter_w, dilation_h, dilation_w,
            repeat_mode, jmp_offset)

    conv_pragmas = {
        'pragma_conv_kernel_h': filter_h,
        'pragma_conv_kernel_w': filter_w,
        'pragma_conv_padding_top': pad[0],
        'pragma_conv_padding_bottom': pad[1],
        'pragma_conv_padding_left': pad[2],
        'pragma_conv_padding_right': pad[3],
        'pragma_conv_stride_h': stride[0],
        'pragma_conv_stride_w': stride[1],
        'pragma_conv_dilation_h': 1,
        'pragma_conv_dilation_w': 1,
        'pragma_conv_fm_h': input_img.shape[2],
        'pragma_conv_fm_w': input_img.shape[3],
        'pragma_conv_h_cut': (3 - 1) * stride[0] + filter_h,
        'pragma_conv_w_cut': input_img.shape[3]
    }
    op_tag = 'im2col_row_major' if tag is None else tag
    return akg.tvm.compute(col_shape, _window_element, name='im2col_row_major',
                           tag=op_tag, attrs=conv_pragmas)

@ct_util.reg_set_dim_func(maxpool_with_argmax_set_dim_func)
@vc_util.check_input_type(akg.tvm.tensor.Tensor, (list, tuple), (list, tuple), (list, tuple, str))
def maxpool_with_argmax_dynamic(data, kernel, stride, strategy):
    """
    Performs the max pooling on the input datas, dynamic-shape variant.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.
        The mask returned by this variant is filled with zeros.

    Args:
        data (tvm.tensor.Tensor): Input tensor in NC1HWC0 format. Its dtype is
            checked against vc_util.DtypeForDavinci.FLOAT16.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID','SAME' or
            instance of list(four int numbers, as 'CONSTANTS' strategy).
            Support **Strategies** is the same as avgpool.

    Returns:
        tuple of three items:
        output (tvm.tensor.Tensor), max of every pooling window, with shape
        (N, C1, out_h, out_w, C0);
        mask_first_max (tvm.tensor.Tensor), zero tensor with shape
        (N, C1, kernel_h, kernel_w, out_h, out_w, C0);
        attrs (dict), build attributes including "custom_tiling",
        "dynamic_shape" and, when a preset exists for this configuration, "dim".
    """
    attrs = get_dynamic_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride, strategy)[0]
    # The call above refreshed attr_map_v2; overlay it on the dynamic defaults.
    attrs.update(attr_map_v2)
    if dim_info != "":
        attrs['dim'] = dim_info
    attrs["enable_feature_library"] = True
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    # Padding positions are filled with the lowest finite value of the dtype
    # so that they never win the max reduction.
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16'
                              else -340282346638528859811704183484516925440.0, dtype=dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)

    fmap_img2col_ub = img2col(data, fmap_img2col_shape_ub, kernel_h, kernel_w,
                              pad, stride, min_value, tag='')

    # Reduce the two kernel axes of the im2col tensor to get the pooled value.
    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(out_shape,
                             lambda n, c1, oh, ow, c0:
                             akg.tvm.max(
                                 fmap_img2col_ub[n, c1, reduce_axis_h,
                                                 reduce_axis_w, oh, ow, c0],
                                 axis=[reduce_axis_h, reduce_axis_w]),
                             name="pooling_max")

    # The mask has the same shape as the im2col tensor and holds only zeros.
    mask_first_max = akg.tvm.compute(fmap_img2col_shape_ub, lambda *indice: zero, name="mask_first_max")

    attrs["custom_tiling"] = maxpool_with_argmax_dynamic_tensor_strategy(
        data, fmap_img2col_ub, mask_first_max)
    # presumably limits output axes 2 and 3 (H, W) to 64 each - TODO confirm
    attrs["dynamic_shape"] = ds.set_dynamic_shape_limit_for_tensor(output, [64, 64], [2, 3])
    return output, mask_first_max, attrs


@ct_util.reg_set_dim_func(maxpool_with_argmax_set_dim_func)
@vc_util.check_input_type(akg.tvm.tensor.Tensor, (list, tuple), (list, tuple), (list, tuple, str))
def maxpool_with_argmax(data, kernel, stride, strategy):
    """
    Performs the max pooling on the input datas and builds a mask tensor.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Input tensor in NC1HWC0 format. Its dtype is
            checked against vc_util.DtypeForDavinci.FLOAT16.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID','SAME' or
            instance of list(four int numbers, as 'CONSTANTS' strategy).
            Support **Strategies** is the same as avgpool.

    Returns:
        tuple of three items:
        output (tvm.tensor.Tensor), max of every pooling window, with shape
        (N, C1, out_h, out_w, C0);
        mask_first_max (tvm.tensor.Tensor), result of hybrid_first_max applied
        to the pooling mask, with shape
        (N, C1, kernel_h, kernel_w, out_h, out_w, C0);
        attrs (dict), build attributes including "custom_tiling" and, when a
        preset exists for this configuration, "dim".
    """
    attrs = get_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride, strategy)[0]
    # The call above refreshed attr_map_v2; overlay it on the defaults.
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    attrs["custom_tiling"] = maxpool_with_argmax_tiling_strategy(data, kernel, stride, strategy)
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')

    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    # Padding positions are filled with the lowest finite value of the dtype
    # so that they never win the max reduction.
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16'
                              else -340282346638528859811704183484516925440.0, dtype=dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)

    fmap_img2col_ub = img2col(data, fmap_img2col_shape_ub, kernel_h, kernel_w,
                              pad, stride, min_value, tag='')

    # Reduce the two kernel axes of the im2col tensor to get the pooled value.
    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(out_shape,
                             lambda n, c1, oh, ow, c0:
                             akg.tvm.max(
                                 fmap_img2col_ub[n, c1, reduce_axis_h,
                                                 reduce_axis_w, oh, ow, c0],
                                 axis=[reduce_axis_h, reduce_axis_w]),
                             name="pooling_max")

    # pooling_mask is one where the window element is not smaller than the
    # pooled max and zero elsewhere, so several positions of one window may be
    # marked when values tie.
    pooling_mask = akg.tvm.compute(fmap_img2col_shape_ub,
                                   lambda n, c1, kh, kw, oh, ow, c0:
                                   akg.tvm.if_then_else(
                                       fmap_img2col_ub[n, c1, kh, kw, oh, ow, c0]
                                       < output[n, c1, oh, ow, c0], zero, one),
                                   name="pooling_mask")

    # mask_flag and mask_init are two separate copies of the (kh=0, kw=0)
    # slice of pooling_mask: the hybrid kernel below updates the first one
    # (flag_) in place and only reads the second one (flag2_).
    mask_flag = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_flag")

    mask_init = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_init")

    # Hybrid kernel. For every (n, c1) it first copies flag2_ into the
    # (kh=0, kw=0) slice of the result, then walks all (kh, kw) positions and
    # stores max(mask_ - flag_, zero_) while accumulating the stored value into
    # flag_. A position therefore stays at one only while flag_ is still zero;
    # presumably this keeps just the first max of each window - TODO confirm.
    # The one_ parameter is not used inside the kernel.
    # NOTE(review): read as plain sequential Python, the (kh=0, kw=0) slice
    # written from flag2_ by the first loop nest is written again by the second
    # loop nest, which also starts at kh=0, kw=0 - confirm this is the intended
    # behaviour after lowering.
    @script(capture=locals())
    def hybrid_first_max(mask_, flag_, flag2_, zero_, one_):
        output_ = allocate((in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0), mask_.dtype, 'local')
        for n_i in range(in_n):
            for c1_i in range(in_c1):
                for oh_i in range(out_h):
                    for ow_i in range(out_w):
                        for c0_i in range(in_c0):
                            output_[n_i, c1_i, 0, 0, oh_i, ow_i, c0_i] = flag2_[n_i, c1_i, oh_i, ow_i, c0_i]
                for kh_i in range(kernel_h):
                    for kw_i in range(kernel_w):
                        for oh_i in range(out_h):
                            for ow_i in range(out_w):
                                for c0_i in range(in_c0):
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        mask_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] -\
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i]
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        max(output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i], zero_)
                                    flag_[n_i, c1_i, oh_i, ow_i, c0_i] =\
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i] +\
                                        output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i]
        return output_

    mask_first_max = hybrid_first_max(pooling_mask, mask_flag, mask_init, zero, one)
    return output, mask_first_max, attrs