You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

1.ipynb 1.1 kB

1234567891011121314151617181920212223
  # Wild type sequence provided in the "Dataset Description":
  wtseq <- 'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK'

  # Read testing set sequences and pH:
  test <- read.csv('../input/novozymes-enzyme-stability-prediction/test.csv')

  # Position of the first character at which two strings differ, comparing only
  # their common-length prefix. When the shorter string is a prefix of the
  # longer one (or both are equal), returns the position just past that prefix,
  # so a change at the very end of a sequence is still located instead of
  # yielding NA.
  first_mismatch <- function(a, b) {
    n_common <- min(nchar(a), nchar(b))
    a_chars <- strsplit(substr(a, 1L, n_common), "")[[1L]]
    b_chars <- strsplit(substr(b, 1L, n_common), "")[[1L]]
    differing <- which(a_chars != b_chars)
    if (length(differing) > 0L) differing[1L] else n_common + 1L
  }

  # Describe how one sequence differs from the wild type.
  #
  # mutant: a single sequence string.
  # wt:     the wild type sequence string.
  #
  # Returns a character vector of length 4: c(type, resid, wt, mut), where
  #   type  is "WT", "SUB", "DEL" or "INS",
  #   resid is the 1-based position of the first difference ("-1" for "WT"),
  #   wt    is the wild type residue at that position ("_" if not applicable),
  #   mut   is the mutant residue at that position ("_" if not applicable).
  # Only the first differing position is reported.
  annotate_mutation <- function(mutant, wt) {
    # case 1 = wild type:
    if (mutant == wt) {
      return(c("WT", -1, "_", "_"))
    }
    i <- first_mismatch(mutant, wt)
    if (nchar(mutant) == nchar(wt)) {
      # case 2 = substitution:
      c("SUB", i, substr(wt, i, i), substr(mutant, i, i))
    } else if (nchar(mutant) < nchar(wt)) {
      # case 3 = deletion (including a deletion at the last residue):
      c("DEL", i, substr(wt, i, i), "_")
    } else {
      # case 4 = sequence longer than the wild type. Every input must yield
      # exactly one row, otherwise the result no longer lines up with `test`.
      c("INS", i, "_", substr(mutant, i, i))
    }
  }

  # Add mutation information to testing set:
  # vapply() enforces exactly four character values per sequence; the result is
  # a 4 x n matrix, so it is transposed to one row per sequence. All four
  # columns are character, since the values are combined in one character
  # vector per row.
  mutation_info <- vapply(
    as.character(test$protein_sequence),
    annotate_mutation,
    character(4L),
    wt = wtseq,
    USE.NAMES = FALSE
  )
  test[, c('type', 'resid', 'wt', 'mut')] <- t(mutation_info)
  head(test)