|
1234567891011121314151617181920212223 |
# Wild type sequence provided in the "Dataset Description":
wtseq <- "VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK"

# Read testing set sequences and pH:
test <- read.csv("../input/novozymes-enzyme-stability-prediction/test.csv")

# Classify a single sequence relative to the wild type.
#
# Arguments:
#   seq: one sequence as a length-1 character string.
#   wt:  the wild type sequence as a length-1 character string.
#
# Returns a character vector c(type, resid, wt, mut):
#   "WT"  - identical to the wild type; resid is "-1", wt and mut are "_".
#   "SUB" - same length; resid is the first differing position, wt/mut are
#           the wild type and mutant residues at that position.
#   "DEL" - shorter than the wild type; resid is the first position where the
#           sequence departs from the wild type, wt is the wild type residue
#           there and mut is "_".
#   "INS" - longer than the wild type; resid is the first position where the
#           sequence departs from the wild type, wt is "_" and mut is the
#           sequence residue there.
# Only the first difference is reported; later differences are ignored.
classify_mutation <- function(seq, wt) {
  if (seq == wt) {
    return(c("WT", -1, "_", "_"))
  }

  seq_chars <- strsplit(seq, "")[[1]]
  wt_chars <- strsplit(wt, "")[[1]]

  # Compare only the positions both sequences have.
  n_common <- min(length(seq_chars), length(wt_chars))
  common <- seq_len(n_common)
  i <- which(seq_chars[common] != wt_chars[common])[1]
  if (is.na(i)) {
    # The shared prefix is identical, so the change sits immediately after it
    # (e.g. deletion of the last wild type residue, or a trailing insertion).
    i <- n_common + 1L
  }

  if (length(seq_chars) == length(wt_chars)) {
    c("SUB", i, wt_chars[i], seq_chars[i])
  } else if (length(seq_chars) < length(wt_chars)) {
    c("DEL", i, wt_chars[i], "_")
  } else {
    # Without this branch a longer sequence would yield NULL, which rbind()
    # drops, leaving fewer rows than `test` has.
    c("INS", i, "_", seq_chars[i])
  }
}

# Add mutation information to testing set:
# one row per sequence, bound into a character matrix with four columns.
mutation_info <- do.call(
  rbind,
  lapply(as.character(test$protein_sequence), classify_mutation, wt = wtseq)
)
test[, c("type", "resid", "wt", "mut")] <- mutation_info
head(test)
|