# Copyright 2020 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """load tensor and combine tensor""" import numpy as np from mindspore.common.tensor import Tensor from ..communication.management import get_rank, get_group_size def _get_tensor_strategy(dev_mat, tensor_map): """ Get split strategy by device arrangement and tensor map. Args: dev_mat (list): The device matrix. tensor_map (list): The map relation between tensor and devices. Returns: List, the split strategy with the same size of np_tensor. """ tensor_strategy = [] for dim in tensor_map: if dim == -1: tensor_strategy.append(1) else: tensor_strategy.append(dev_mat[-dim-1]) return tensor_strategy def _get_tensor_slice_index(device_arrangement, tensor_strategy, tensor_map, rank_index): """ Get the tensor slice index for the local device. Args: device_arrangement (list): The device matrix. tensor_strategy (list): The split strategy with the same size of np_tensor. tensor_map (list): The map relation between tensor and devices. rank_index (int): The rank of local device. Returns: Integer, the index of the local device for tensor slices. """ device_coordinate = _rank_to_coordinate(rank_index, device_arrangement) device_coordinate_new = _convert_to_new_device_coordinate(device_coordinate, tensor_map) tensor_slice_index = _coordinate_to_rank(device_coordinate_new, tensor_strategy) return tensor_slice_index def _rank_to_coordinate(rank_index, device_arrangement): """ Convert rank index to device coordinate. Args: rank_index (int): The index of the local device. device_arrangement (list): The device matrix. Returns: List, the coordinate for local device in the device matrix """ dim_len = len(device_arrangement) device_coordinate = np.zeros(dim_len) for i in range(dim_len): size = device_arrangement[dim_len - 1 - i] device_coordinate[dim_len - 1 - i] = rank_index % size rank_index = int(rank_index / size) return device_coordinate def _coordinate_to_rank(device_coordinate, device_arrangement): """ Convert device coordinate to rank index. Args: device_coordinate (list): The coordinate for local device in the device matrix. device_arrangement (list): The device matrix. Returns: Integer, the index of the local device for tensor slices. """ rank_index = 0 size = 1 for i in range(len(device_coordinate)): rank_index += size * device_coordinate[len(device_coordinate) - 1 - i] size *= device_arrangement[len(device_coordinate) - 1 - i] return rank_index def _convert_to_new_device_coordinate(device_coordinate, tensor_map): """ Convert device_coordinate according to the tensor map. Args: device_coordinate (list): The coordinate for local device in the device matrix. tensor_map (list): The map relation between tensor and devices. Returns: List, the converted coordinate. """ device_coordinate_new = [] for i in range(len(tensor_map)): if tensor_map[len(tensor_map) - 1 - i] != -1: device_coordinate_new.insert(0, device_coordinate[len(device_coordinate) - 1 - tensor_map[len(tensor_map) - 1 - i]]) else: device_coordinate_new.insert(0, 0) return device_coordinate_new def _chunk_tensor(np_tensor, strategy, depth): """ Recursive function to chunk tensor. Args: np_tensor (NDarray): The matrix to be split. strategy (list): The split strategy with the same size of np_tensor. depth (int): Recursion depth. Returns: NDarray, the splited matrix. Raises: ValueError: If np_tensor can not be split by strategy. """ output = [] axis = len(np_tensor.shape) - depth if np_tensor.shape[axis] % strategy[0] != 0: raise ValueError("np_tensor can not be split by strategy!") ret = list(np.split(np_tensor, strategy[0], axis)) if depth == 1: return ret for ret_ in ret: output.extend( _chunk_tensor(ret_, strategy[len(strategy) - depth + 1:len(strategy)], depth - 1)) return output def _chunk_tensor_by_strategy(np_tensor, strategy): """ Split the input by strategy. Args: np_tensor (NDarray): The matrix to be split. strategy (list): The split strategy with the same size of np_tensor. Returns: NDarray, the splited matrix. Raises: TypeError: If np_tensor is not ndarray ValueError: If the length of np_tensor does not match the length of strategy. """ if not isinstance(np_tensor, np.ndarray): raise TypeError("np_tensor should be ndarray!") if len(strategy) != len(np_tensor.shape): raise ValueError("The length of np_tensor does not match the length of strategy!") return _chunk_tensor(np_tensor, strategy, len(strategy)) def _get_slice_index(dev_mat, tensor_map): """ Get the slice index for current slice. Args: dev_mat (list): The device matrix of devices. tensor_map (list): The split strategy of tensor. Returns: Integer, the slice index for slice on this device. """ rank = get_rank() tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map) tensor_slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank) return tensor_slice_index def _load_tensor(tensor, dev_mat, tensor_map): """ Get the tensor slice of the local device by the device matrix and the tensor map Args: tensor (Tensor): The tensor to be split. dev_mat (list): The device matrix of devices. tensor_map (list): The split strategy of tensor. Returns: numpy.array, the sliced array. Examples: >>> tensor = Tensor(np.ones([32, 32])) >>> dev_mat = [2, 4] >>> tensor_map = [1, -1] >>> tensor_slice = _load_tensor(tensor, dev_mat, tensor_map) """ rank = get_rank() tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map) tensor_slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank) np_tensor = tensor.asnumpy() np_tensor_list = _chunk_tensor_by_strategy(np_tensor, tensor_strategy) np_tensor_slice = np_tensor_list[int(tensor_slice_index)] return np_tensor_slice def _load_tensor_by_layout(tensor, layout): """ Load tensor by layout. Args: tensor (Tensor): The input tensor. layout (list): The tensor layout in auto parallel. Returns: Tensor, the sliced tensor. Raises: TypeError: If layout is not list. ValueError: If the length of layout is not 3. """ if not isinstance(layout, tuple): raise TypeError("The layout should be tuple! layout is {}".format(layout)) if len(layout) < 6: raise ValueError("The length of layout must be larger than 5! layout is {}".format(layout)) dev_mat = layout[0] tensor_map = layout[1] uniform_split = layout[4] group = layout[5] if uniform_split == 0: raise RuntimeError("The load tensor only support uniform split now") if tensor.size() == 1: return tensor tensor_slice = _load_tensor(tensor, dev_mat, tensor_map) if group: # get a totally shard tensor slice for parallel optimizer rank = get_rank(group) size = get_group_size(group) tensor_slice = np.split(tensor_slice, size)[rank] return Tensor(tensor_slice) def _reshape_param_data(param_data, dev_mat, tensor_map): """ Combine param slice by the device matrix and the tensor map, used in model parallel scenario. Args: param_data (Tensor): The tensor to be reshaped, generated from all the device from AllGatherParamNet. dev_mat (list): The device matrix of devices. tensor_map (list): The split strategy of tensor. Returns: Tensor, the combined tensor which with the whole data value. Examples: >>> param_data = _allgather_param_net(param_data) >>> dev_mat = [2, 2] >>> tensor_map = [1, 0] >>> tensor = _reshape_param_data(tensor_slices, dev_mat, tensor_map) """ device_count = 1 for dim in dev_mat: device_count *= dim tensor_slices = np.split(param_data.asnumpy(), device_count, axis=0) tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map) # get the actual number of slices,as: different devices may load the same slice slice_count = 1 for dim in tensor_strategy: slice_count *= dim # reorder slices and remove duplicates based on device matrix and tensor_map tensor_slices_new = list(range(slice_count)) for i in range(device_count): slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, i) tensor_slices_new[int(slice_index)] = np.array(tensor_slices[i]) # combine slices to generate complete parameter dim_len = len(tensor_strategy) for i in range(dim_len): ele_count = int(len(tensor_slices_new) / tensor_strategy[dim_len - 1 - i]) tensor_slices_new_inner = [] for j in range(ele_count): new_tensor = tensor_slices_new[j * tensor_strategy[dim_len - 1 - i]] for l in range(j * tensor_strategy[dim_len - 1 - i] + 1, (j + 1) * tensor_strategy[dim_len - 1 - i]): new_tensor = np.concatenate((new_tensor, tensor_slices_new[l]), axis=dim_len - 1 - i) tensor_slices_new_inner.insert(len(tensor_slices_new_inner), np.array(new_tensor)) tensor_slices_new = tensor_slices_new_inner return Tensor(tensor_slices_new[0]) def _reshape_param_data_with_weight(param_data, dev_mat, field_size): """ Combine param slice by the device matrix, used in model parallel scenario. Args: param_data (Tensor): The tensor to be reshaped and rearrangement, generated from all the device from AllGatherParamNet. dev_mat (list): The device matrix of devices. Returns: Tensor, the combined tensor which with the whole data value. Examples: >>> param_data = _allgather_param_net(param_data) >>> dev_mat = [2, 2] >>> field_size = [39] >>> tensor = _reshape_param_data_with_weight(param_data, dev_mat, field_size) """ device_count = 1 for dim in dev_mat: device_count *= dim tensor_slices = np.split(param_data.asnumpy(), device_count, axis=0) tensor_slices_col = [] for i in range(len(tensor_slices[0][0])): tensor_slices_new = np.array(tensor_slices[0][:, i]).reshape(field_size, -1) for j in range(1, device_count): tensor_slices_new = np.concatenate((tensor_slices_new,\ np.array(tensor_slices[j][:, i]).reshape(field_size, -1)), axis=1) tensor_slices_col.append(tensor_slices_new) new_tensor = np.array(tensor_slices_col[0]).reshape(-1, 1) for i in range(1, len(tensor_slices_col)): new_tensor = np.concatenate((new_tensor, np.array(tensor_slices_col[i]).reshape(-1, 1)), axis=1) return Tensor(new_tensor)