In [None]:
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# Class labels, one per sample.
Y = iris.target
# Keep only the first two feature columns so the decision regions can be
# drawn in a plane (a natively two-dimensional dataset would not need this).
X = iris.data[:, 0:2]


def my_kernel(X, Y):
    """
    Custom kernel: a linear kernel weighted by a fixed diagonal matrix.

                     (2  0)
        k(X, Y) = X  (    ) Y.T
                     (0  1)

    The first feature column is weighted by 2 and the second by 1, so both
    X and Y must have exactly two columns. For 2-D inputs the result has one
    row per row of X and one column per row of Y.
    """
    weights = np.diag([2.0, 1.0])
    weighted_X = np.dot(X, weights)
    return np.dot(weighted_X, Y.T)


# Spacing between neighbouring points of the evaluation mesh.
h = .02

# Create an SVM classifier that uses the custom kernel and fit our data.
clf = svm.SVC(kernel=my_kernel)
clf.fit(X, Y)

# Plot the decision boundary: every point of the mesh
# [x_min, x_max] x [y_min, y_max] is coloured by its predicted class.
# The plotted area extends one unit beyond the data on every side.
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1
x_axis = np.arange(x_min, x_max, h)
y_axis = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_axis, y_axis)

# Classify all mesh points at once, then bring the flat prediction vector
# back to the shape of the mesh for plotting.
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = clf.predict(grid_points).reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Draw the training samples on top, coloured by their true class.
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')
plt.title('3-Class classification using Support Vector Machine with custom'
          ' kernel')
plt.axis('tight')
plt.show()

In [69]:
# Author: Elisabetta Ghisu

"""
- This script takes a kernel matrix as input
and returns the classification or regression performance
- The kernel matrix can be calculated using any of the graph kernel approaches
- The criteria used for prediction are SVM for classification and kernel ridge regression for regression
- For prediction we divide the data into training, validation and test sets. For each split, we first train on the training data,
then evaluate the performance on the validation set. We choose the optimal parameters on the validation set and finally
report the corresponding performance on the test set. If more than one split is performed, the final results
correspond to the average of the performances on the test sets.

@references
    https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py
"""

print(__doc__)

import sys
import pathlib
sys.path.insert(0, "../py-graph/")

import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.kernel_ridge import KernelRidge # 0.17
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn import svm

from kernels.spkernel import spkernel
from utils.graphfiles import loadDataset

# Read the dataset and its target values from disk.
print('\n Loading dataset from file...')
dataset_file = "/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds"
dataset, targets = loadDataset(dataset_file)
# Keep the targets as a NumPy array so they can be indexed with a
# permutation of sample indices later on.
y = np.array(targets)
print(y)

# Obtain the kernel matrix: reuse the cached copy if the cache file exists,
# otherwise compute it with spkernel() and store it for later runs.
kernel_file_path = 'kernelmatrix.ds'
path = pathlib.Path(kernel_file_path)
if path.is_file():
    print('\n Loading the matrix from file...')
    # ndmin=2 keeps the result two-dimensional even for a single-row file.
    Kmatrix = np.loadtxt(kernel_file_path, ndmin=2)
    # The cache file name does not depend on the dataset, so a file left
    # over from another dataset would otherwise be used silently. The code
    # below indexes both axes of the matrix with sample indices, hence it
    # must be square with one row/column per sample.
    expected_shape = (len(dataset), len(dataset))
    if Kmatrix.shape != expected_shape:
        raise ValueError(
            'Cached kernel matrix in %s has shape %s but the dataset requires %s; '
            'delete or rename the file to recompute it.'
            % (kernel_file_path, Kmatrix.shape, expected_shape))
    print(Kmatrix)
else:
    print('\n Calculating kernel matrix, this could take a while...')
    Kmatrix = spkernel(dataset)
    print(Kmatrix)
    print('Saving kernel matrix to file...')
    np.savetxt(kernel_file_path, Kmatrix)

# Experiment configuration.
# Seed Python's `random` module. NOTE(review): the visible code draws its
# permutations from NumPy, which is seeded separately for every split.
random.seed(20)

model_type = 'regression'  # regression or classification problem
print('\n --- This is a %s problem ---' % model_type)

datasize = len(dataset)  # number of entries in the dataset
trials = 100             # number of hyperparameter values tried per split
splits = 10              # number of splits of the data

# Evenly spaced candidate values for the kernel ridge regularisation
# parameter; alpha corresponds to (2*C)^-1 in other linear models such as
# LogisticRegression.
alpha_grid = np.linspace(0.01, 100, num=trials)
# Grid kept for reference (currently unused):
# C_grid = np.linspace(0.0001, 10, num = trials)


"""
-  Here starts the main program
-  First we permute the data, then for each split we evaluate corresponding performances
-  In the end, the performances are averaged over the test sets
"""

# Model selection and evaluation over several random splits of the data.
# For every split: permute the samples, cut them into training / validation /
# test subsets, fit one kernel ridge model per candidate alpha on the training
# subset, pick the alpha with the lowest validation RMSE and record the test
# RMSE obtained with that same alpha. The per-split results are averaged at
# the end.

# Best validation score of each split and the test score obtained with the
# same hyperparameter. Filled once per split and averaged after the loop.
val_split = []
test_split = []

# The subset sizes depend only on the dataset size, so compute them once.
# Note: the percentages can be set up by the user.
num_train_val = int((datasize * 90) / 100)   # 90% (of entire dataset) for training and validation
num_test = datasize - num_train_val          # 10% (of entire dataset) for test
num_train = int((num_train_val * 90) / 100)  # 90% (of train + val) for training
num_val = num_train_val - num_train          # 10% (of train + val) for validation

# For each split of the data
for j in range(10, 10 + splits):
    print('\n Starting split %d...' % j)

    # Seed NumPy with the split number so that each split is reproducible
    random_state = int(j)
    np.random.seed(random_state)
    idx_perm = np.random.permutation(datasize)

    # Apply the same permutation to the targets and to both axes of the
    # kernel matrix (columns first, then rows)
    y_perm = y[idx_perm]
    Kmatrix_perm = Kmatrix[:, idx_perm]
    Kmatrix_perm = Kmatrix_perm[idx_perm, :]

    # Split the kernel matrix. Rows select the samples of the respective
    # subset; columns are always restricted to the training samples.
    Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]
    Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]
    Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]

    # Split the targets
    y_train = y_perm[0:num_train]
    y_val = y_perm[num_train:(num_train + num_val)]
    y_test = y_perm[(num_train + num_val):datasize]

    # Record the performance for each parameter trial respectively on
    # validation and test set
    perf_all_val = []
    perf_all_test = []

    # For regression use the Kernel Ridge method
    if model_type == 'regression':
        # Normalization step (for real valued targets only): standardise the
        # training targets with statistics of the training subset alone.
        print('\n Normalizing output y...')
        y_train_mean = np.mean(y_train)
        y_train_std = np.std(y_train)
        y_train = (y_train - y_train_mean) / float(y_train_std)

        # For each parameter trial
        for alpha in alpha_grid:
            # Fit the kernel ridge model
            KR = KernelRidge(kernel='precomputed', alpha=alpha)
            KR.fit(Kmatrix_train, y_train)

            # Predict on the validation and test set
            y_pred = KR.predict(Kmatrix_val)
            y_pred_test = KR.predict(Kmatrix_test)

            # Undo the normalization: the model was trained on standardised
            # targets, while y_val and y_test are on the original scale
            y_pred = y_pred * float(y_train_std) + y_train_mean
            y_pred_test = y_pred_test * float(y_train_std) + y_train_mean

            # Root mean squared error on validation
            rmse = np.sqrt(mean_squared_error(y_val, y_pred))
            perf_all_val.append(rmse)

            # Root mean squared error on test
            rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
            perf_all_test.append(rmse_test)

        # --- FIND THE OPTIMAL PARAMETERS --- #
        # Get the optimal parameter on validation (argmin of the RMSE)
        min_idx = np.argmin(perf_all_val)
        alpha_opt = alpha_grid[min_idx]

        # Performance corresponding to the optimal parameter on validation
        perf_val_opt = perf_all_val[min_idx]

        # Corresponding performance on test for the same parameter
        perf_test_opt = perf_all_test[min_idx]

        print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))
        print('The best performance on the validation set is: %3f' % perf_val_opt)
        print('The corresponding performance on test set is: %3f' % perf_test_opt)

        # Keep this split's result for the final average
        val_split.append(perf_val_opt)
        test_split.append(perf_test_opt)

# Average the results over all splits (skipped when nothing was recorded)
if val_split:
    val_mean = np.mean(val_split)
    val_std = np.std(val_split)
    test_mean = np.mean(test_split)
    test_std = np.std(test_split)

    print('\n Mean performance on val set: %3f' % val_mean)
    print('With standard deviation: %3f' % val_std)
    print('\n Mean performance on test set: %3f' % test_mean)
    print('With standard deviation: %3f' % test_std)

# # we create an instance of SVM and fit out data.
# clf = svm.SVC(kernel = 'precomputed')
# clf.fit(Kmatrix, )

# # predict on validation and test
# y_pred = clf.predict(K_val)
# y_pred_test = clf.predict(K_test)

# # accuracy on validation set
# acc = accuracy_score(y_val, y_pred)
# perf_all_val.append(acc)

# # accuracy on test set
# acc_test = accuracy_score(y_test, y_pred_test)
# perf_all_test.append(acc_test)

# # print "The performance on the validation set is: %3f" % acc
# # print "The performance on the test set is: %3f" % acc_test



# # Plot the decision boundary. For that, we will assign a color to each
# # point in the mesh [x_min, x_max]x[y_min, y_max].
# x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
# y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
# xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# # Put the result into a color plot
# Z = Z.reshape(xx.shape)
# plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# # Plot also the training points
# plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')
# plt.title('3-Class classification using Support Vector Machine with custom'
#           ' kernel')
# plt.axis('tight')
# plt.show()


- This script take as input a kernel matrix
and returns the classification or regression performance
- The kernel matrix can be calculated using any of the graph kernels approaches
- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression
- For predition we divide the data in training, validation and test. For each split, we first train on the train data, 
then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally
provide the corresponding performance on the test set. If more than one split is performed, the final results 
correspond to the average of the performances on the test sets. 

@references
    https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py


 Loading dataset from file...
[ -23.7   14.    37.3  109.7   10.8   39.    42.    66.6  135.   148.5
   40.    34.6   32.    63.    53.5   67.    64.4   84.7   95.5   92.
  

The best performance is for trial 99 with parameter alpha = 100.000000
The best performance on the validation set is: 35.836403
The corresponding performance on test set is: 37.447219
Starting split 37...

 Normalizing output y...
The best performance is for trial 47 with parameter alpha = 47.480000
The best performance on the validation set is: 31.172116
The corresponding performance on test set is: 39.504962
Starting split 38...

 Normalizing output y...
The best performance is for trial 36 with parameter alpha = 36.370000
The best performance on the validation set is: 40.025101
The corresponding performance on test set is: 41.314650
Starting split 39...

 Normalizing output y...
The best performance is for trial 99 with parameter alpha = 100.000000
The best performance on the validation set is: 28.474810
The corresponding performance on test set is: 38.093995
Starting split 40...

 Normalizing output y...
The best performance is for trial 99 with parameter alpha = 100.000000
The bes

The best performance is for trial 99 with parameter alpha = 100.000000
The best performance on the validation set is: 31.903823
The corresponding performance on test set is: 32.937886
Starting split 73...

 Normalizing output y...
The best performance is for trial 19 with parameter alpha = 19.200000
The best performance on the validation set is: 40.825941
The corresponding performance on test set is: 38.535950
Starting split 74...

 Normalizing output y...
The best performance is for trial 99 with parameter alpha = 100.000000
The best performance on the validation set is: 34.181621
The corresponding performance on test set is: 34.089714
Starting split 75...

 Normalizing output y...
The best performance is for trial 39 with parameter alpha = 39.400000
The best performance on the validation set is: 40.264289
The corresponding performance on test set is: 47.412526
Starting split 76...

 Normalizing output y...
The best performance is for trial 27 with parameter alpha = 27.280000
The best

The best performance is for trial 4 with parameter alpha = 4.050000
The best performance on the validation set is: 39.485731
The corresponding performance on test set is: 50.146953


In [None]:
# Scratch cell: a plain arithmetic expression that uses none of the names
# defined in the cells above.
4841564986 / 3