!2789 support pubmed dataset

Merge pull request !2789 from heleiwang/support_pubmed
5 years ago · c96046d94b
--- a/model_zoo/utils/graph_to_mindrecord/pubmed/init.py
+++ b/model_zoo/utils/graph_to_mindrecord/pubmed/init.py
--- a/model_zoo/utils/graph_to_mindrecord/pubmed/mr_api.py
+++ b/model_zoo/utils/graph_to_mindrecord/pubmed/mr_api.py
@@ -0,0 +1,105 @@
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """
 User-defined API for MindRecord GNN writer.
 """
 import os

 import pickle as pkl
 import numpy as np
 import scipy.sparse as sp

 # Parse the arguments passed in through the 'graph_api_args' environment
 # variable (presumably exported by writer.py from its --graph_api_args
 # command line option -- verify against the caller).
 #     args delimiter is ':'
 # NOTE: importing this module raises KeyError when the variable is not set.
 args = os.environ['graph_api_args'].split(':')
 # First argument: directory that holds the "ind.<dataset_str>.*" input files.
 PUBMED_PATH = args[0]
 # Dataset tag used to build the input file names.
 dataset_str = 'pubmed'

 # profile:  (num_features, feature_data_types, feature_shapes)
 # Two node features: a float32 one and an int32 one (presumably 'feature_1'
 # and 'feature_2' as yielded by yield_nodes; -1 presumably marks a variable
 # length -- confirm against the writer).
 node_profile = (2, ["float32", "int32"], [[-1], [-1]])
 # Edges carry no features.
 edge_profile = (0, [], [])


 def _normalize_cora_features(features):
    """
    Row-normalize a feature matrix.

    Every row is divided by its sum; rows whose sum is zero are left as all
    zeros. Despite the name, nothing in here is specific to Cora.

    Args:
        features: 2-D matrix that supports ``sum(1)`` and can be the right
            operand of a scipy sparse ``dot`` (yield_nodes passes a scipy
            LIL matrix).

    Returns:
        The row-scaled matrix produced by ``scipy.sparse.diags(...).dot``.
    """
    row_sum = np.array(features.sum(1))
    # 1 / 0 is expected for empty rows: silence the divide-by-zero warning
    # and zero the resulting inf right below so those rows stay all-zero.
    with np.errstate(divide='ignore'):
        r_inv = np.power(row_sum * 1.0, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    return sp.diags(r_inv).dot(features)


 def _parse_index_file(filename):
    """Parse index file.

    Args:
        filename (str): Path of a text file holding one integer per line.

    Returns:
        list[int]: The integers in file order; blank lines are skipped.

    Raises:
        ValueError: If a non-blank line is not a valid integer.
    """
    # A context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(filename) as index_file:
        return [int(line) for line in index_file if line.strip()]


 def yield_nodes(task_id=0):
    """
    Generate node data

    Reads the pickled ``ind.<dataset_str>.{tx,ty,allx,ally}`` files and the
    ``ind.<dataset_str>.test.index`` file from PUBMED_PATH, stacks the
    ``all*`` parts on top of the ``t*`` parts, moves the test rows to the
    positions listed in the index file, row-normalizes the features and
    emits one row per node.

    Args:
        task_id (int): Task identifier; only used in the log message.

    Yields:
        data (dict): data row which is dict with the keys 'id' (row number),
            'type' (always 0), 'feature_1' (normalized feature row as a
            list) and 'feature_2' (position of the first 1 in the label
            row).

    Raises:
        ValueError: If a label row contains no 1.
    """
    print("Node task is {}".format(task_id))

    # SECURITY: unpickling can execute arbitrary code; only point PUBMED_PATH
    # at dataset files obtained from a trusted source.
    objects = []
    for name in ('tx', 'ty', 'allx', 'ally'):
        with open("{}/ind.{}.{}".format(PUBMED_PATH, dataset_str, name), 'rb') as f:
            objects.append(pkl.load(f, encoding='latin1'))
    tx, ty, allx, ally = objects

    test_idx_reorder = _parse_index_file(
        "{}/ind.{}.test.index".format(PUBMED_PATH, dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    # The stacked test rows sit in index-file order; copy each one to the row
    # named by its index (assumes the test indices are exactly the rows that
    # tx/ty occupy after stacking -- TODO confirm for other datasets).
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # toarray() is the documented dense conversion; the `.A` shorthand only
    # exists on sparse matrices, not on scipy sparse arrays.
    features = _normalize_cora_features(features).toarray()

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    for i, label in enumerate(labels):
        yield {'id': i, 'type': 0, 'feature_1': features[i].tolist(),
               'feature_2': label.tolist().index(1)}
    # Only reached once the consumer has exhausted the generator.
    print('Processed {} lines for nodes.'.format(len(labels)))


 def yield_edges(task_id=0):
    """
    Generate edge data

    Loads the pickled ``ind.<dataset_str>.graph`` mapping from PUBMED_PATH
    and emits one row per (source, destination) entry, numbering the rows
    consecutively from 0.

    Args:
        task_id (int): Task identifier; only used in the log message.

    Yields:
        data (dict): data row which is dict with the keys 'id', 'src_id',
            'dst_id' and 'type' (always 0).
    """
    print("Edge task is {}".format(task_id))
    # SECURITY: unpickling can execute arbitrary code; only point PUBMED_PATH
    # at dataset files obtained from a trusted source.
    # The file is closed as soon as the graph is in memory, so the handle is
    # not held open while the consumer iterates (or if it stops early).
    with open("{}/ind.{}.graph".format(PUBMED_PATH, dataset_str), 'rb') as f:
        graph = pkl.load(f, encoding='latin1')

    line_count = 0
    for i in graph:
        for dst_id in graph[i]:
            edge = {'id': line_count,
                    'src_id': i, 'dst_id': dst_id, 'type': 0}
            line_count += 1
            yield edge
    # Only reached once the consumer has exhausted the generator.
    print('Processed {} lines for edges.'.format(line_count))
--- a/model_zoo/utils/graph_to_mindrecord/write_pubmed.sh
+++ b/model_zoo/utils/graph_to_mindrecord/write_pubmed.sh
@@ -0,0 +1,12 @@
 #!/bin/bash
 # Run writer.py with the "pubmed" script to convert the dataset found in
 # SRC_PATH into MindRecord files under MINDRECORD_PATH.
 SRC_PATH=/tmp/pubmed/dataset
 MINDRECORD_PATH=/tmp/pubmed/mindrecord

 # Make sure the output directory exists, then clear any previous output.
 # "${VAR:?}" aborts the command instead of expanding to "/*" should the
 # variable ever end up empty.
 mkdir -p "$MINDRECORD_PATH"
 rm -f "${MINDRECORD_PATH:?}"/*

 python writer.py --mindrecord_script pubmed \
 --mindrecord_file "$MINDRECORD_PATH/pubmed_mr" \
 --mindrecord_partitions 1 \
 --mindrecord_header_size_by_bit 18 \
 --mindrecord_page_size_by_bit 20 \
 --graph_api_args "$SRC_PATH"