# NOTE(review): this script had been collapsed onto a single line beginning
# with `#`, so R treated all of it as one comment and executed nothing. The
# statements that are complete are restored below as live code, one per line.

# Wild type sequence provided in the "Dataset Description":
wtseq <- 'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK'

# Read testing set sequences and pH:
test <- read.csv('../input/novozymes-enzyme-stability-prediction/test.csv')

# Add mutation information to testing set:
# NOTE(review): the expression below is cut off in the middle of the
# deletion-branch condition (everything after `nchar(seq)` is missing), so it
# is kept verbatim as a comment rather than completed by guesswork. Restore the
# missing tail from the original script before re-enabling it.
#
# What the visible part does: for each entry of test$protein_sequence it
# returns a 4-element character vector (type, resid, wt, mut); the vectors are
# row-bound and assigned to the new columns 'type', 'resid', 'wt', 'mut'.
#   - wild type:    c('WT', -1, '_', '_') when the sequence equals wtseq
#   - substitution: same length as wtseq; `i` is the first position where the
#                   two sequences differ, reported with the wild-type and
#                   mutant residues at that position
#
# test[,c('type','resid','wt','mut')] <- do.call(rbind,lapply(test$protein_sequence,function(seq){
#   # case 1 = wild type:
#   if(seq==wtseq){
#     return(c('WT',-1,'_','_'))
#   # case 2 = substitution:
#   } else if(nchar(seq)==nchar(wtseq)){
#     i <- mapply(function(x,y) which(x!=y)[1], strsplit(seq,""), strsplit(wtseq,""))
#     return(c('SUB',i,substr(wtseq,i,i),substr(seq,i,i)))
#   # case 3 = deletion:
#   } else if(nchar(seq)