/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "dataset/engine/gnn/graph.h"

#include <algorithm>
#include <chrono>
#include <cmath>
#include <functional>
#include <iterator>
#include <numeric>
#include <random>
#include <unordered_set>
#include <utility>

#include "dataset/core/tensor_shape.h"
#include "dataset/util/random.h"

namespace mindspore {
namespace dataset {
namespace gnn {

Graph::Graph(std::string dataset_file, int32_t num_workers)
    : dataset_file_(dataset_file), num_workers_(num_workers), rnd_(GetRandomDevice()), random_walk_(this) {
  rnd_.seed(GetSeed());
  MS_LOG(INFO) << "num_workers:" << num_workers;
}

// Returns a 1-D int32 tensor holding the ids of every node of the given type.
// Fails if node_type is unknown to this graph.
Status Graph::GetAllNodes(NodeType node_type, std::shared_ptr<Tensor> *out) {
  auto itr = node_type_map_.find(node_type);
  if (itr == node_type_map_.end()) {
    std::string err_msg = "Invalid node type:" + std::to_string(node_type);
    RETURN_STATUS_UNEXPECTED(err_msg);
  } else {
    RETURN_IF_NOT_OK(CreateTensorByVector<NodeIdType>({itr->second}, DataType(DataType::DE_INT32), out));
  }
  return Status::OK();
}

// Builds a 2-D tensor (m x n) from a rectangular vector-of-vectors, then squeezes
// degenerate dimensions. All inner vectors must have the same length.
template <typename T>
Status Graph::CreateTensorByVector(const std::vector<std::vector<T>> &data, DataType type,
                                   std::shared_ptr<Tensor> *out) {
  if (!type.IsCompatible<T>()) {
    RETURN_STATUS_UNEXPECTED("Data type not compatible");
  }
  if (data.empty()) {
    RETURN_STATUS_UNEXPECTED("Input data is empty");
  }
  std::shared_ptr<Tensor> tensor;
  size_t m = data.size();
  size_t n = data[0].size();
  RETURN_IF_NOT_OK(Tensor::CreateTensor(
    &tensor, TensorImpl::kFlexible, TensorShape({static_cast<dsize_t>(m), static_cast<dsize_t>(n)}), type, nullptr));
  auto ptr = tensor->begin<T>();
  for (const auto &id_m : data) {
    CHECK_FAIL_RETURN_UNEXPECTED(id_m.size() == n, "Each member of the vector has a different size");
    for (const auto &id_n : id_m) {
      *ptr = id_n;
      ++ptr;
    }
  }
  tensor->Squeeze();
  *out = std::move(tensor);
  return Status::OK();
}

// Pads every inner vector with default_value until it reaches max_size, making
// the outer vector rectangular. Fails if any inner vector already exceeds max_size.
template <typename T>
Status Graph::ComplementVector(std::vector<std::vector<T>> *data, size_t max_size, T default_value) {
  if (!data || data->empty()) {
    RETURN_STATUS_UNEXPECTED("Input data is empty");
  }
  for (std::vector<T> &vec : *data) {
    size_t size = vec.size();
    if (size > max_size) {
      RETURN_STATUS_UNEXPECTED("The max_size parameter is abnormal");
    } else {
      for (size_t i = 0; i < (max_size - size); ++i) {
        vec.push_back(default_value);
      }
    }
  }
  return Status::OK();
}

// Returns a 1-D int32 tensor holding the ids of every edge of the given type.
Status Graph::GetAllEdges(EdgeType edge_type, std::shared_ptr<Tensor> *out) {
  auto itr = edge_type_map_.find(edge_type);
  if (itr == edge_type_map_.end()) {
    std::string err_msg = "Invalid edge type:" + std::to_string(edge_type);
    RETURN_STATUS_UNEXPECTED(err_msg);
  } else {
    RETURN_IF_NOT_OK(CreateTensorByVector<EdgeIdType>({itr->second}, DataType(DataType::DE_INT32), out));
  }
  return Status::OK();
}

// For each edge id, emits the (source, destination) node-id pair as one tensor row.
Status Graph::GetNodesFromEdges(const std::vector<EdgeIdType> &edge_list, std::shared_ptr<Tensor> *out) {
  if (edge_list.empty()) {
    RETURN_STATUS_UNEXPECTED("Input edge_list is empty");
  }
  std::vector<std::vector<NodeIdType>> node_list;
  node_list.reserve(edge_list.size());
  for (const auto &edge_id : edge_list) {
    auto itr = edge_id_map_.find(edge_id);
    if (itr == edge_id_map_.end()) {
      std::string err_msg = "Invalid edge id:" + std::to_string(edge_id);
      RETURN_STATUS_UNEXPECTED(err_msg);
    } else {
      std::pair<std::shared_ptr<Node>, std::shared_ptr<Node>> nodes;
      RETURN_IF_NOT_OK(itr->second->GetNode(&nodes));
      node_list.push_back({nodes.first->id(), nodes.second->id()});
    }
  }
  RETURN_IF_NOT_OK(CreateTensorByVector<NodeIdType>(node_list, DataType(DataType::DE_INT32), out));
  return Status::OK();
}

// For each node in node_list, gathers ALL neighbors of neighbor_type. Rows are
// padded with kDefaultNodeId up to the longest neighbor list so the result is rectangular.
Status Graph::GetAllNeighbors(const std::vector<NodeIdType> &node_list, NodeType neighbor_type,
                              std::shared_ptr<Tensor> *out) {
  if (node_list.empty()) {
    RETURN_STATUS_UNEXPECTED("Input node_list is empty.");
  }
  if (node_type_map_.find(neighbor_type) == node_type_map_.end()) {
    std::string err_msg = "Invalid neighbor type:" + std::to_string(neighbor_type);
    RETURN_STATUS_UNEXPECTED(err_msg);
  }
  std::vector<std::vector<NodeIdType>> neighbors;
  size_t max_neighbor_num = 0;
  neighbors.resize(node_list.size());
  for (size_t i = 0; i < node_list.size(); ++i) {
    std::shared_ptr<Node> node;
    RETURN_IF_NOT_OK(GetNodeByNodeId(node_list[i], &node));
    RETURN_IF_NOT_OK(node->GetAllNeighbors(neighbor_type, &neighbors[i]));
    max_neighbor_num = max_neighbor_num > neighbors[i].size() ? max_neighbor_num : neighbors[i].size();
  }
  RETURN_IF_NOT_OK(ComplementVector<NodeIdType>(&neighbors, max_neighbor_num, kDefaultNodeId));
  RETURN_IF_NOT_OK(CreateTensorByVector<NodeIdType>(neighbors, DataType(DataType::DE_INT32), out));
  return Status::OK();
}

// Validates that samples_num lies in [1, total number of nodes in the graph].
Status Graph::CheckSamplesNum(NodeIdType samples_num) {
  NodeIdType all_nodes_number =
    std::accumulate(node_type_map_.begin(), node_type_map_.end(), 0,
                    [](NodeIdType t1, const auto &t2) -> NodeIdType { return t1 + t2.second.size(); });
  if ((samples_num < 1) || (samples_num > all_nodes_number)) {
    std::string err_msg = "Wrong samples number, should be between 1 and " + std::to_string(all_nodes_number) +
                          ", got " + std::to_string(samples_num);
    RETURN_STATUS_UNEXPECTED(err_msg);
  }
  return Status::OK();
}

// Multi-hop neighbor sampling: for each start node, hop i samples neighbor_nums[i]
// neighbors of type neighbor_types[i] from every node produced by hop i-1.
// kDefaultNodeId placeholders propagate: a missing node yields a full row of
// kDefaultNodeId children. Each output row is [start, hop1..., hop2..., ...].
Status Graph::GetSampledNeighbors(const std::vector<NodeIdType> &node_list,
                                  const std::vector<NodeIdType> &neighbor_nums,
                                  const std::vector<NodeType> &neighbor_types, std::shared_ptr<Tensor> *out) {
  CHECK_FAIL_RETURN_UNEXPECTED(!node_list.empty(), "Input node_list is empty.");
  CHECK_FAIL_RETURN_UNEXPECTED(neighbor_nums.size() == neighbor_types.size(),
                               "The sizes of neighbor_nums and neighbor_types are inconsistent.");
  for (const auto &num : neighbor_nums) {
    RETURN_IF_NOT_OK(CheckSamplesNum(num));
  }
  for (const auto &type : neighbor_types) {
    if (node_type_map_.find(type) == node_type_map_.end()) {
      std::string err_msg = "Invalid neighbor type:" + std::to_string(type);
      RETURN_STATUS_UNEXPECTED(err_msg);
    }
  }
  std::vector<std::vector<NodeIdType>> neighbors_vec(node_list.size());
  for (size_t node_idx = 0; node_idx < node_list.size(); ++node_idx) {
    // Validates the start node exists before walking hops.
    std::shared_ptr<Node> input_node;
    RETURN_IF_NOT_OK(GetNodeByNodeId(node_list[node_idx], &input_node));
    neighbors_vec[node_idx].emplace_back(node_list[node_idx]);
    std::vector<NodeIdType> input_list = {node_list[node_idx]};
    for (size_t i = 0; i < neighbor_nums.size(); ++i) {
      std::vector<NodeIdType> neighbors;
      neighbors.reserve(input_list.size() * neighbor_nums[i]);
      for (const auto &node_id : input_list) {
        if (node_id == kDefaultNodeId) {
          // Placeholder node: expand into a full block of placeholders to keep
          // the output rectangular.
          for (int32_t j = 0; j < neighbor_nums[i]; ++j) {
            neighbors.emplace_back(kDefaultNodeId);
          }
        } else {
          std::shared_ptr<Node> node;
          RETURN_IF_NOT_OK(GetNodeByNodeId(node_id, &node));
          std::vector<NodeIdType> sampled;
          RETURN_IF_NOT_OK(node->GetSampledNeighbors(neighbor_types[i], neighbor_nums[i], &sampled));
          neighbors.insert(neighbors.end(), sampled.begin(), sampled.end());
        }
      }
      neighbors_vec[node_idx].insert(neighbors_vec[node_idx].end(), neighbors.begin(), neighbors.end());
      input_list = std::move(neighbors);
    }
  }
  RETURN_IF_NOT_OK(CreateTensorByVector<NodeIdType>(neighbors_vec, DataType(DataType::DE_INT32), out));
  return Status::OK();
}

// Appends up to samples_num randomly chosen elements of data (skipping any in
// exclude_data) to out_samples. Stops once out_samples holds samples_num entries
// in total; NOTE pre-existing entries in out_samples count toward that limit.
Status Graph::NegativeSample(const std::vector<NodeIdType> &data, const std::unordered_set<NodeIdType> &exclude_data,
                             int32_t samples_num, std::vector<NodeIdType> *out_samples) {
  CHECK_FAIL_RETURN_UNEXPECTED(!data.empty(), "Input data is empty.");
  std::vector<size_t> shuffled_id(data.size());
  std::iota(shuffled_id.begin(), shuffled_id.end(), 0);
  std::shuffle(shuffled_id.begin(), shuffled_id.end(), rnd_);
  for (const auto &index : shuffled_id) {
    if (exclude_data.find(data[index]) != exclude_data.end()) {
      continue;
    }
    out_samples->emplace_back(data[index]);
    // Explicit cast avoids a signed/unsigned comparison; samples_num is
    // non-negative at every call site.
    if (out_samples->size() >= static_cast<size_t>(samples_num)) {
      break;
    }
  }
  return Status::OK();
}

// For each node, samples samples_num nodes of neg_neighbor_type that are NOT
// among its neighbors. Each output row starts with the node's own id. When no
// negative candidates exist, the row is filled with kDefaultNodeId.
Status Graph::GetNegSampledNeighbors(const std::vector<NodeIdType> &node_list, NodeIdType samples_num,
                                     NodeType neg_neighbor_type, std::shared_ptr<Tensor> *out) {
  CHECK_FAIL_RETURN_UNEXPECTED(!node_list.empty(), "Input node_list is empty.");
  RETURN_IF_NOT_OK(CheckSamplesNum(samples_num));
  if (node_type_map_.find(neg_neighbor_type) == node_type_map_.end()) {
    std::string err_msg = "Invalid neighbor type:" + std::to_string(neg_neighbor_type);
    RETURN_STATUS_UNEXPECTED(err_msg);
  }
  std::vector<std::vector<NodeIdType>> neighbors_vec;
  neighbors_vec.resize(node_list.size());
  for (size_t node_idx = 0; node_idx < node_list.size(); ++node_idx) {
    std::shared_ptr<Node> node;
    RETURN_IF_NOT_OK(GetNodeByNodeId(node_list[node_idx], &node));
    std::vector<NodeIdType> neighbors;
    RETURN_IF_NOT_OK(node->GetAllNeighbors(neg_neighbor_type, &neighbors));
    std::unordered_set<NodeIdType> exclude_node;
    std::transform(neighbors.begin(), neighbors.end(),
                   std::insert_iterator<std::unordered_set<NodeIdType>>(exclude_node, exclude_node.begin()),
                   [](const NodeIdType node) { return node; });
    auto itr = node_type_map_.find(neg_neighbor_type);
    if (itr == node_type_map_.end()) {
      std::string err_msg = "Invalid node type:" + std::to_string(neg_neighbor_type);
      RETURN_STATUS_UNEXPECTED(err_msg);
    } else {
      neighbors_vec[node_idx].emplace_back(node->id());
      if (itr->second.size() > exclude_node.size()) {
        // Re-sample (with a fresh shuffle) until the row holds the node id plus
        // samples_num negatives. NOTE(review): successive rounds may pick
        // duplicate negatives since already-sampled ids are not excluded.
        while (neighbors_vec[node_idx].size() < static_cast<size_t>(samples_num + 1)) {
          RETURN_IF_NOT_OK(NegativeSample(itr->second, exclude_node,
                                          samples_num - neighbors_vec[node_idx].size(), &neighbors_vec[node_idx]));
        }
      } else {
        MS_LOG(DEBUG) << "There are no negative neighbors. node_id:" << node->id()
                      << " neg_neighbor_type:" << neg_neighbor_type;
        // If there are no negative neighbors, they are filled with kDefaultNodeId
        for (int32_t i = 0; i < samples_num; ++i) {
          neighbors_vec[node_idx].emplace_back(kDefaultNodeId);
        }
      }
    }
  }
  RETURN_IF_NOT_OK(CreateTensorByVector<NodeIdType>(neighbors_vec, DataType(DataType::DE_INT32), out));
  return Status::OK();
}

// Runs node2vec-style random walks from node_list along meta_path and returns
// all walks stacked into one tensor.
Status Graph::RandomWalk(const std::vector<NodeIdType> &node_list, const std::vector<NodeType> &meta_path,
                         float step_home_param, float step_away_param, NodeIdType default_node,
                         std::shared_ptr<Tensor> *out) {
  RETURN_IF_NOT_OK(random_walk_.Build(node_list, meta_path, step_home_param, step_away_param, default_node));
  std::vector<std::vector<NodeIdType>> walks;
  RETURN_IF_NOT_OK(random_walk_.SimulateWalk(&walks));
  RETURN_IF_NOT_OK(CreateTensorByVector<NodeIdType>({walks}, DataType(DataType::DE_INT32), out));
  return Status::OK();
}

// Looks up the fallback feature used when a node lacks feature_type.
Status Graph::GetNodeDefaultFeature(FeatureType feature_type, std::shared_ptr<Feature> *out_feature) {
  auto itr = default_feature_map_.find(feature_type);
  if (itr == default_feature_map_.end()) {
    std::string err_msg = "Invalid feature type:" + std::to_string(feature_type);
    RETURN_STATUS_UNEXPECTED(err_msg);
  } else {
    *out_feature = itr->second;
  }
  return Status::OK();
}

// For every requested feature type, gathers that feature for every node id in
// `nodes` into one tensor (one tensor per feature type in the output row).
// Missing features and kDefaultNodeId entries fall back to the default feature.
Status Graph::GetNodeFeature(const std::shared_ptr<Tensor> &nodes, const std::vector<FeatureType> &feature_types,
                             TensorRow *out) {
  if (!nodes || nodes->Size() == 0) {
    RETURN_STATUS_UNEXPECTED("Input nodes is empty");
  }
  CHECK_FAIL_RETURN_UNEXPECTED(!feature_types.empty(), "Input feature_types is empty");
  TensorRow tensors;
  for (const auto &f_type : feature_types) {
    std::shared_ptr<Feature> default_feature;
    // If no feature can be obtained, fill in the default value
    RETURN_IF_NOT_OK(GetNodeDefaultFeature(f_type, &default_feature));
    TensorShape shape(default_feature->Value()->shape());
    auto shape_vec = nodes->shape().AsVector();
    dsize_t size = std::accumulate(shape_vec.begin(), shape_vec.end(), 1, std::multiplies<dsize_t>());
    shape = shape.PrependDim(size);
    std::shared_ptr<Tensor> fea_tensor;
    RETURN_IF_NOT_OK(
      Tensor::CreateTensor(&fea_tensor, TensorImpl::kFlexible, shape, default_feature->Value()->type(), nullptr));
    dsize_t index = 0;
    for (auto node_itr = nodes->begin<NodeIdType>(); node_itr != nodes->end<NodeIdType>(); ++node_itr) {
      std::shared_ptr<Feature> feature;
      if (*node_itr == kDefaultNodeId) {
        feature = default_feature;
      } else {
        std::shared_ptr<Node> node;
        RETURN_IF_NOT_OK(GetNodeByNodeId(*node_itr, &node));
        if (!node->GetFeatures(f_type, &feature).IsOk()) {
          feature = default_feature;
        }
      }
      RETURN_IF_NOT_OK(fea_tensor->InsertTensor({index}, feature->Value()));
      index++;
    }
    // Reshape to (input node shape) + (feature shape), then drop degenerate dims.
    TensorShape reshape(nodes->shape());
    for (auto s : default_feature->Value()->shape().AsVector()) {
      reshape = reshape.AppendDim(s);
    }
    RETURN_IF_NOT_OK(fea_tensor->Reshape(reshape));
    fea_tensor->Squeeze();
    tensors.push_back(fea_tensor);
  }
  *out = std::move(tensors);
  return Status::OK();
}

// TODO(review): edge-feature lookup is not implemented yet; this is a stub that
// succeeds without producing output.
Status Graph::GetEdgeFeature(const std::shared_ptr<Tensor> &edges, const std::vector<FeatureType> &feature_types,
                             TensorRow *out) {
  return Status::OK();
}

Status Graph::Init() {
  RETURN_IF_NOT_OK(LoadNodeAndEdge());
  return Status::OK();
}

// Fills meta_info with sorted, de-duplicated summaries of the graph:
// node/edge types, per-type counts, and node/edge feature types.
Status Graph::GetMetaInfo(MetaInfo *meta_info) {
  meta_info->node_type.resize(node_type_map_.size());
  std::transform(node_type_map_.begin(), node_type_map_.end(), meta_info->node_type.begin(),
                 [](const auto &itr) { return itr.first; });
  std::sort(meta_info->node_type.begin(), meta_info->node_type.end());

  meta_info->edge_type.resize(edge_type_map_.size());
  std::transform(edge_type_map_.begin(), edge_type_map_.end(), meta_info->edge_type.begin(),
                 [](const auto &itr) { return itr.first; });
  std::sort(meta_info->edge_type.begin(), meta_info->edge_type.end());

  for (const auto &node : node_type_map_) {
    meta_info->node_num[node.first] = node.second.size();
  }
  for (const auto &edge : edge_type_map_) {
    meta_info->edge_num[edge.first] = edge.second.size();
  }

  for (const auto &node_feature : node_feature_map_) {
    for (auto type : node_feature.second) {
      meta_info->node_feature_type.emplace_back(type);
    }
  }
  std::sort(meta_info->node_feature_type.begin(), meta_info->node_feature_type.end());
  auto unique_node = std::unique(meta_info->node_feature_type.begin(), meta_info->node_feature_type.end());
  meta_info->node_feature_type.erase(unique_node, meta_info->node_feature_type.end());

  for (const auto &edge_feature : edge_feature_map_) {
    for (const auto &type : edge_feature.second) {
      meta_info->edge_feature_type.emplace_back(type);
    }
  }
  std::sort(meta_info->edge_feature_type.begin(), meta_info->edge_feature_type.end());
  auto unique_edge = std::unique(meta_info->edge_feature_type.begin(), meta_info->edge_feature_type.end());
  meta_info->edge_feature_type.erase(unique_edge, meta_info->edge_feature_type.end());
  return Status::OK();
}

// Exposes GetMetaInfo to Python as a dict.
Status Graph::GraphInfo(py::dict *out) {
  MetaInfo meta_info;
  RETURN_IF_NOT_OK(GetMetaInfo(&meta_info));
  (*out)["node_type"] = py::cast(meta_info.node_type);
  (*out)["edge_type"] = py::cast(meta_info.edge_type);
  (*out)["node_num"] = py::cast(meta_info.node_num);
  (*out)["edge_num"] = py::cast(meta_info.edge_num);
  (*out)["node_feature_type"] = py::cast(meta_info.node_feature_type);
  (*out)["edge_feature_type"] = py::cast(meta_info.edge_feature_type);
  return Status::OK();
}

Status Graph::LoadNodeAndEdge() {
  GraphLoader gl(dataset_file_, num_workers_);
  // ask graph_loader to load everything into memory
  RETURN_IF_NOT_OK(gl.InitAndLoad());
  // get all maps
  RETURN_IF_NOT_OK(gl.GetNodesAndEdges(&node_id_map_, &edge_id_map_, &node_type_map_, &edge_type_map_,
                                       &node_feature_map_, &edge_feature_map_, &default_feature_map_));
  return Status::OK();
}

Status Graph::GetNodeByNodeId(NodeIdType id, std::shared_ptr<Node> *node) {
  auto itr = node_id_map_.find(id);
  if (itr == node_id_map_.end()) {
    std::string err_msg = "Invalid node id:" + std::to_string(id);
    RETURN_STATUS_UNEXPECTED(err_msg);
  } else {
    *node = itr->second;
  }
  return Status::OK();
}

Graph::RandomWalkBase::RandomWalkBase(Graph *graph)
    : graph_(graph), step_home_param_(1.0), step_away_param_(1.0), default_node_(-1), num_walks_(1), num_workers_(1) {}

// Validates and stores the walk configuration. step_home_param (p) and
// step_away_param (q) are the node2vec return/in-out parameters.
Status Graph::RandomWalkBase::Build(const std::vector<NodeIdType> &node_list, const std::vector<NodeType> &meta_path,
                                    float step_home_param, float step_away_param, const NodeIdType default_node,
                                    int32_t num_walks, int32_t num_workers) {
  node_list_ = node_list;
  if (meta_path.empty() || meta_path.size() > kMaxNumWalks) {
    std::string err_msg = "Failed, meta path required between 1 and " + std::to_string(kMaxNumWalks) +
                          ". The size of input path is " + std::to_string(meta_path.size());
    RETURN_STATUS_UNEXPECTED(err_msg);
  }
  meta_path_ = meta_path;
  if (step_home_param < kGnnEpsilon || step_away_param < kGnnEpsilon) {
    std::string err_msg = "Failed, step_home_param and step_away_param required greater than " +
                          std::to_string(kGnnEpsilon) + ". step_home_param: " + std::to_string(step_home_param) +
                          ", step_away_param: " + std::to_string(step_away_param);
    RETURN_STATUS_UNEXPECTED(err_msg);
  }
  step_home_param_ = step_home_param;
  step_away_param_ = step_away_param;
  default_node_ = default_node;
  num_walks_ = num_walks;
  num_workers_ = num_workers;
  return Status::OK();
}

// Simulates one node2vec walk of length meta_path_.size() starting at start_node.
// If a node has no neighbors of the required type the walk stops early and is
// padded with default_node_.
Status Graph::RandomWalkBase::Node2vecWalk(const NodeIdType &start_node, std::vector<NodeIdType> *walk_path) {
  auto walk = std::vector<NodeIdType>(1, start_node);  // walk is a vector
  // walk simulate
  while (walk.size() - 1 < meta_path_.size()) {
    // current node
    auto cur_node_id = walk.back();
    std::shared_ptr<Node> cur_node;
    RETURN_IF_NOT_OK(graph_->GetNodeByNodeId(cur_node_id, &cur_node));

    // current neighbors
    std::vector<NodeIdType> cur_neighbors;
    RETURN_IF_NOT_OK(cur_node->GetAllNeighbors(meta_path_[walk.size() - 1], &cur_neighbors, true));
    std::sort(cur_neighbors.begin(), cur_neighbors.end());

    // break if no neighbors
    if (cur_neighbors.empty()) {
      break;
    }

    // walk by the first node, then by the previous 2 nodes
    std::shared_ptr<StochasticIndex> stochastic_index;
    if (walk.size() == 1) {
      RETURN_IF_NOT_OK(GetNodeProbability(cur_node_id, meta_path_[0], &stochastic_index));
    } else {
      NodeIdType prev_node_id = walk[walk.size() - 2];
      RETURN_IF_NOT_OK(GetEdgeProbability(prev_node_id, cur_node_id, walk.size() - 2, &stochastic_index));
    }
    NodeIdType next_node_id = cur_neighbors[WalkToNextNode(*stochastic_index)];
    walk.push_back(next_node_id);
  }

  while (walk.size() - 1 < meta_path_.size()) {
    walk.push_back(default_node_);
  }

  *walk_path = std::move(walk);
  return Status::OK();
}

// Repeatedly simulate random walks from each node, num_walks_ passes over a
// freshly shuffled node order each time.
Status Graph::RandomWalkBase::SimulateWalk(std::vector<std::vector<NodeIdType>> *walks) {
  std::vector<uint32_t> permutation(node_list_.size());
  std::iota(permutation.begin(), permutation.end(), 0);
  for (int32_t i = 0; i < num_walks_; i++) {
    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
    std::shuffle(permutation.begin(), permutation.end(), std::default_random_engine(seed));
    for (const auto &i_perm : permutation) {
      std::vector<NodeIdType> walk;
      RETURN_IF_NOT_OK(Node2vecWalk(node_list_[i_perm], &walk));
      walks->push_back(walk);
    }
  }
  return Status::OK();
}

// Builds the alias table for the FIRST step of a walk: all neighbors of node_id
// are equally likely.
Status Graph::RandomWalkBase::GetNodeProbability(const NodeIdType &node_id, const NodeType &node_type,
                                                 std::shared_ptr<StochasticIndex> *node_probability) {
  // Generate alias nodes
  std::shared_ptr<Node> node;
  // BUGFIX: the Status returned here was previously discarded; a failed lookup
  // would have dereferenced a null node below.
  RETURN_IF_NOT_OK(graph_->GetNodeByNodeId(node_id, &node));
  std::vector<NodeIdType> neighbors;
  RETURN_IF_NOT_OK(node->GetAllNeighbors(node_type, &neighbors, true));
  std::sort(neighbors.begin(), neighbors.end());
  auto non_normalized_probability = std::vector<float>(neighbors.size(), 1.0);
  *node_probability =
    std::make_shared<StochasticIndex>(GenerateProbability(Normalize(non_normalized_probability)));
  return Status::OK();
}

// Builds the alias table for a subsequent step, biased by the node2vec
// parameters: 1/p for returning to src, 1 for common neighbors, 1/q otherwise.
Status Graph::RandomWalkBase::GetEdgeProbability(const NodeIdType &src, const NodeIdType &dst, uint32_t meta_path_index,
                                                 std::shared_ptr<StochasticIndex> *edge_probability) {
  // Get the alias edge setup lists for a given edge.
  std::shared_ptr<Node> src_node;
  // BUGFIX: Status was previously discarded (see GetNodeProbability).
  RETURN_IF_NOT_OK(graph_->GetNodeByNodeId(src, &src_node));
  std::vector<NodeIdType> src_neighbors;
  RETURN_IF_NOT_OK(src_node->GetAllNeighbors(meta_path_[meta_path_index], &src_neighbors, true));

  std::shared_ptr<Node> dst_node;
  RETURN_IF_NOT_OK(graph_->GetNodeByNodeId(dst, &dst_node));
  std::vector<NodeIdType> dst_neighbors;
  RETURN_IF_NOT_OK(dst_node->GetAllNeighbors(meta_path_[meta_path_index + 1], &dst_neighbors, true));

  std::sort(dst_neighbors.begin(), dst_neighbors.end());
  std::vector<float> non_normalized_probability;
  for (const auto &dst_nbr : dst_neighbors) {
    if (dst_nbr == src) {
      non_normalized_probability.push_back(1.0 / step_home_param_);  // replace 1.0 with G[dst][dst_nbr]['weight']
      continue;
    }
    auto it = std::find(src_neighbors.begin(), src_neighbors.end(), dst_nbr);
    if (it != src_neighbors.end()) {
      // stay close, this node connect both src and dst
      non_normalized_probability.push_back(1.0);  // replace 1.0 with G[dst][dst_nbr]['weight']
    } else {
      // step far away
      non_normalized_probability.push_back(1.0 / step_away_param_);  // replace 1.0 with G[dst][dst_nbr]['weight']
    }
  }

  *edge_probability =
    std::make_shared<StochasticIndex>(GenerateProbability(Normalize(non_normalized_probability)));
  return Status::OK();
}

// Builds an alias-method sampling table (Walker's alias method) from a
// normalized probability vector, with a small random jitter on each weight.
StochasticIndex Graph::RandomWalkBase::GenerateProbability(const std::vector<float> &probability) {
  uint32_t K = probability.size();
  std::vector<uint32_t> switch_to_large_index(K, 0);
  std::vector<float> weight(K, .0);
  std::vector<uint32_t> smaller;
  std::vector<uint32_t> larger;
  auto random_device = GetRandomDevice();
  std::uniform_real_distribution<> distribution(-kGnnEpsilon, kGnnEpsilon);
  float accumulate_threshold = 0.0;
  for (uint32_t i = 0; i < K; i++) {
    float threshold_one = distribution(random_device);
    accumulate_threshold += threshold_one;
    // The last bucket absorbs the accumulated jitter so the total stays K.
    weight[i] = i < K - 1 ? probability[i] * K + threshold_one : probability[i] * K - accumulate_threshold;
    weight[i] < 1.0 ? smaller.push_back(i) : larger.push_back(i);
  }

  while ((!smaller.empty()) && (!larger.empty())) {
    uint32_t small = smaller.back();
    smaller.pop_back();
    uint32_t large = larger.back();
    larger.pop_back();
    switch_to_large_index[small] = large;
    weight[large] = weight[large] + weight[small] - 1.0;
    weight[large] < 1.0 ? smaller.push_back(large) : larger.push_back(large);
  }
  return StochasticIndex(switch_to_large_index, weight);
}

// Draws one index from an alias table: pick a bucket uniformly, then either
// keep it or switch to its alias depending on the bucket weight.
uint32_t Graph::RandomWalkBase::WalkToNextNode(const StochasticIndex &stochastic_index) {
  auto switch_to_large_index = stochastic_index.first;
  auto weight = stochastic_index.second;
  const uint32_t size_of_index = switch_to_large_index.size();

  auto random_device = GetRandomDevice();
  std::uniform_real_distribution<> distribution(0.0, 1.0);

  // Generate random integer between [0, K)
  uint32_t random_idx = std::floor(distribution(random_device) * size_of_index);

  if (distribution(random_device) < weight[random_idx]) {
    return random_idx;
  }
  return switch_to_large_index[random_idx];
}

// Normalizes a probability vector so it sums to 1. A near-zero sum is treated
// as 1 to avoid division by zero.
template <typename T>
std::vector<float> Graph::RandomWalkBase::Normalize(const std::vector<T> &non_normalized_probability) {
  // BUGFIX: the accumulate init value was the int literal 0, which truncated
  // the fractional 1/p and 1/q weights at every addition; use a float init.
  float sum_probability =
    std::accumulate(non_normalized_probability.begin(), non_normalized_probability.end(), 0.0f);
  if (sum_probability < kGnnEpsilon) {
    sum_probability = 1.0;
  }
  std::vector<float> normalized_probability;
  std::transform(non_normalized_probability.begin(), non_normalized_probability.end(),
                 std::back_inserter(normalized_probability),
                 [&](T value) -> float { return value / sum_probability; });
  return normalized_probability;
}
}  // namespace gnn
}  // namespace dataset
}  // namespace mindspore