jjfraaa
/
AutoGL

"""
Test file for nas on node classification

AUTOGL_BACKEND=pyg python test/nas/node_classification.py
AUTOGL_BACKEND=dgl python test/nas/node_classification.py

TODO: make it a unit test file to test all the possible combinations
"""

import os
import logging

logging.basicConfig(level=logging.INFO)

from autogl.backend import DependentBackend

if DependentBackend.is_dgl():
    from autogl.module.model.dgl import BaseAutoModel
    from dgl.data import CoraGraphDataset
elif DependentBackend.is_pyg():
    from torch_geometric.datasets import Planetoid
    from autogl.module.model.pyg import BaseAutoModel
from autogl.datasets import build_dataset_from_name
import torch
from torch import nn
import torch.nn.functional as F
#from autogl.module.nas.algorithm.agnn_rl import AGNNRL
from autogl.module.nas.backend import bk_feat, bk_label
from autogl.module.nas.algorithm import Darts, RL, GraphNasRL, Enas, RandomSearch,Spos
from autogl.module.nas.estimator import BaseEstimator
from autogl.module.train.evaluation import Acc
import numpy as np
from autogl.solver.utils import set_seed
from autogl.module.nas.space import BaseSpace
import typing as _typ
from nas_bench_graph import light_read, gnn_list, gnn_list_proteins, Arch
import pandas as pd
import argparse
import os
import os.path as osp

# Define the search space in NAS-bench-graph
class StrModule(nn.Module):
    def __init__(self, lambd):
        super().__init__()
        self.name = lambd

    def forward(self, *args, **kwargs):
        return self.name

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, self.name)

class BenchSpace(BaseSpace):
    def __init__(
        self,
        hidden_dim: _typ.Optional[int] = 64,
        layer_number: _typ.Optional[int] = 2,
        dropout: _typ.Optional[float] = 0.9,
        input_dim: _typ.Optional[int] = None,
        output_dim: _typ.Optional[int] = None,
        ops_type = 0
    ):
        super().__init__()
        self.layer_number = layer_number
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.ops_type=ops_type

    def instantiate(
        self,
        hidden_dim: _typ.Optional[int] = None,
        layer_number: _typ.Optional[int] = None,
        dropout: _typ.Optional[float] = None,
        input_dim: _typ.Optional[int] = None,
        output_dim: _typ.Optional[int] = None,
        ops_type=None
    ):
        super().instantiate()
        self.dropout = dropout or self.dropout
        self.hidden_dim = hidden_dim or self.hidden_dim
        self.layer_number = layer_number or self.layer_number
        self.input_dim = input_dim or self.input_dim
        self.output_dim = output_dim or self.output_dim
        self.ops_type = ops_type or self.ops_type
        self.ops = [gnn_list,gnn_list_proteins][self.ops_type]
        for layer in range(4):
            setattr(self,f"in{layer}",self.setInputChoice(layer,n_candidates=layer+1,n_chosen=1,return_mask=False,key=f"in{layer}"))
            setattr(self,f"op{layer}",self.setLayerChoice(layer,list(map(lambda x:StrModule(x),self.ops)),key=f"op{layer}"))
        self.dummy=nn.Linear(1,1)

    def forward(self, bench):
        lks = [getattr(self, "in" + str(i)).selected for i in range(4)]
        ops = [getattr(self, "op" + str(i)).name for i in range(4)]
        arch = Arch(lks, ops)
        h = arch.valid_hash()
        if h == "88888" or h==88888:
            return 0
        return bench[h]['perf']

    def parse_model(self, selection, device) -> BaseAutoModel:
        return self.wrap().fix(selection)

# Define a new estimator which directly get performance from NAS-bench-graph instead of training the model
class BenchEstimator(BaseEstimator):
    def __init__(self, data_name, loss_f="nll_loss", evaluation=[Acc()]):
        super().__init__(loss_f, evaluation)
        self.evaluation = evaluation
        self.bench=light_read(data_name)
    def infer(self, model: BaseSpace, dataset, mask="train"):
        perf=model(self.bench)
        return [perf],0

# Run NAS with NAS-bench-graph
def run(data_name='cora',algo='graphnas',num_epochs=50,ctrl_steps_aggregate=20,log_dir='./logs/tmp'):
    print("Testing backend: {}".format("dgl" if DependentBackend.is_dgl() else "pyg"))
    if DependentBackend.is_dgl():
        from autogl.datasets.utils.conversion._to_dgl_dataset import to_dgl_dataset as convert_dataset
    else:
        from autogl.datasets.utils.conversion._to_pyg_dataset import to_pyg_dataset as convert_dataset

    di=2
    do=2
    dataset=None

    ops_type=data_name=='proteins'

    space = BenchSpace().cuda()
    space.instantiate(input_dim=di, output_dim=do,ops_type=ops_type)
    esti = BenchEstimator(data_name)
    if algo=='graphnas':
        algo = GraphNasRL(num_epochs=num_epochs,ctrl_steps_aggregate=ctrl_steps_aggregate)
    elif algo=='agnn':
        algo = AGNNRL(guide_type=1,num_epochs=num_epochs,ctrl_steps_aggregate=ctrl_steps_aggregate)
    else:
        assert False,f'Not implemented algo {algo}'
    model = algo.search(space, dataset, esti)
    result=esti.infer(model._model,None)[0][0]

    os.makedirs(log_dir,exist_ok=True)
    with open(osp.join(log_dir,f'log.txt'),'w') as f:
        f.write(str(result))

    import json
    archs=algo.allhist
    json.dump(archs,open(osp.join(log_dir,f'archs.json'),'w'))

    arch_strs=[str(x[1]) for x in archs]
    print(f'number of archs: {len(arch_strs)} ; number of unique archs : {len(set(arch_strs))}')   

    scores=[-x[0] for x in archs] # accs
    idxs=np.argsort(scores) # increasing order
    with open(osp.join(log_dir,f'idx.txt'),'w') as f:
        f.write(str(idxs))
    return result

# Run NAS with NAS-bench-graph for all provided datasets
def run_all():
    data_names='arxiv citeseer computers cora cs photo physics proteins pubmed'.split()
    algos='graphnas agnn'.split()
    results=[]
    for data_name in data_names:
        for algo in algos:
            print(f'data {data_name} algo {algo}')
            # metric=run(data_name,algo,2,2)
            if data_name=='proteins':
                metric=run(data_name,algo,8,5)
            else:
                metric=run(data_name,algo,50,10)
            results.append([data_name,algo,metric])
    return results

if __name__ == "__main__":
    # results=run_all()
    # df=pd.DataFrame(results,columns='data algo v'.split()).pivot_table(values='v',index='algo',columns='data')
    # print(df.to_string())

    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, default='cora', help='datasets')
    parser.add_argument('--algo', type=str, default='graphnas')
    parser.add_argument('--log_dir', type=str, default='./logs/')

    args = parser.parse_args()
    dname=args.data
    algo=args.algo
    log_dir= os.path.join(args.log_dir,f'{dname,algo}')
    if dname=='proteins':
        # 40 archs in total
        num_epochs=8
        ctrl_steps_aggregate=5
    else:
        # 500 archs in total
        num_epochs=50
        ctrl_steps_aggregate=10
    result=run(dname,algo,num_epochs,ctrl_steps_aggregate,log_dir)