--- title: "DEB RNA HFX S30" output: pdf_document --- # Load required packages ```{r include=T, echo=F, warning=FALSE} library(corrplot) library(stringr) library(tibble) library(tidyverse) library(kableExtra) library(rmarkdown) library(magrittr) library(ggpubr) library(gridExtra) library(knitr) library(cowplot) library(tinytex) library(readxl) library(Hmisc) library(caret) library(ggcorrplot) library(pander) library(devtools) library(Peptides) library(RColorBrewer) ``` # Loading RNP:Xl data into the R environment I included the manually evaluated datasets for my ASMS talk to see if machine learning can help me with identifying a possible score cutoff or other metric to help reduce the numbers of entries I have to go through after RNP:xl output. The datasets include a merged version of both the S30 and S100 fraction that were initially considered individually: *Ecoli DEB RNA ## Data wrangling First, we need to add an extra column to each dataset, indicating which cross-linker was used in the experiment. ```{r, echo=F, warning=FALSE, tidy=T, message=FALSE} complete_list_DEB_RNA <- read_excel("AWulf_261119_machine_learning_Ecoli_DEB_RNA.xlsx") names(complete_list_DEB_RNA) <- gsub("\\.", "_", names(complete_list_DEB_RNA)) complete_list_DEB_RNA$XLinker <- "DEB" complete_list_DEB_RNA$Nucleotides <- "RNA" ``` Then, we need to combine the dataframes. ```{r, echo=F, warning=FALSE, tidy=T, message=FALSE} Grand_total_list <- complete_list_DEB_RNA ``` ## First filtering step Remove any entry that does not contain an RNA adduct and fits to a target better than a decoy. ```{r echo = F} data1<-Grand_total_list[Grand_total_list$`RNPxl_RNA`!="none",] %>% filter(target_decoy == 'target') ``` ## Compute additional metrices (columns) ### Peplength ```{r echo = F} #add extra column with the length of the peptide# data1$prepPep<-gsub("(Carbamyl)", "", as.character(data1$sequence)) data1$prepPep <- gsub("(Oxidation)", "", as.character(data1$prepPep)) data1$prepPep <- gsub("(Phospho)", "", as.character(data1$prepPep)) data1$prepPep <- gsub("\\(|\\)", "", data1$prepPep) data1$prepPep <- gsub("\\..*","",data1$prepPep) data1$Pepseq <- data1$prepPep data1$Peplength<-nchar(data1$Pepseq, type = "chars") ``` ### classification of amino acids Create new column with peptide length and classify identified amino acids in peptides as follows: 1. charged amino acids: R, K, D, E 2. polar amino acids: Q, N, H, S, T, Y, C, W 3. hydrophobic amino acids: A, I, L, M, V, P, G ```{r echo = F} #add extra column with classified amino acid composition of the sequence. Charged = R, K, D, E polar = Q, N, H, S, # T, Y, C, W hydrophobic = A, I, L, M, F , V, P, G# data1$aa_class_charged<-str_count(data1$Pepseq, paste("R|K|D|E", collapse = "|")) data1$aa_class_polar<-str_count(data1$Pepseq, paste("Q|N|H|S|T|Y|C|W", collapse = "|")) data1$aa_class_hydrophobic<-str_count(data1$Pepseq, paste("A|I|L|M|F|V|P|G", collapse = "|")) ``` ### Peptide pI (isoelectric point) Compute the isolelectric point for each peptide in each entry. ```{r, echo=F, warning=FALSE, tidy=T, message=FALSE} data1$pI <- round(pI(data1$Pepseq),2) ``` ### aI (aliphatic index) Compute the aliphatic Index for each peptide in each entry. See the following paper for more info: *Thermostability and aliphatic index of globular proteins.* *J Biochem. 1980 Dec;88(6):1895-8.* "A statistical analysis shows that the alphatic index, which is defined as the relative volume of a protein occupied by aliphatic side schains (alanine, valine, isoleucine, and leucine), of proteins of thermophilic bacteria is significantly higher than that of ordinary proteins. The index may be regarded as a positive factor for the increase of thermostability of globular proteins." In general, an aI of greater 92.6 is particulary stable thermally as found in thermophiles. Now, since we are investigating tryptic peptides, I could only find one paper on that matter. *GETTING INTIMATE WITH TRYPSIN, THE LEADING PROTEASE IN PROTEOMICS* *Published online 15 June 2013 in Wiley Online Library (wileyonlinelibrary.com). DOI 10.1002/mas.21376* According to this paper, human proteins result in 61 tryptic peptides on average. ```{r, echo=F, warning=FALSE, tidy=T, message=FALSE} data1$aIndex <- round(aIndex(data1$Pepseq),2) ``` ### proposed cross-linking amino acid ```{r, echo=F, warning=FALSE, tidy=T, message=FALSE} data1$XL_aa<-gsub("[:A-Z:]", "", data1$RNPxl_best_localization) ``` ### Hydrophibicity index ```{r, echo=F, warning=FALSE, tidy=T, message=FALSE} data1$Hydro <- round(hydrophobicity(data1$Pepseq, scale = "KyteDoolittle"),2) ``` ### Molecular weight of peptide *Note: returns the average masses, not the monoisotopic masses!* ```{r, echo=F, warning=FALSE, tidy=T, message=FALSE} data1$mw <- round(mw(data1$Pepseq, monoisotopic = F),2) ``` # Machine learning ## Split dataset Seperate different species of nucleotides (RNA, DNA) and cross-linker (DEB, DEB). ```{r echo = F, fig.align="center"} true_hits<-filter(data1, Curated == "1") true_hits_RNA<-filter(true_hits, Nucleotides == "RNA") RNA_DEB<-filter(true_hits_RNA, XLinker == "DEB") ``` # Setting up the dataset Harmonize the number of trues and falses and dropping all categorical columns.50% T and %0 F ```{r, echo=T} set.seed(42) drops_general<-c("precursor_purity", "RNPxl_a_ion_score") ml_DEB_RNA<-data1 %>% filter(XLinker == "DEB", Curated == "0", Nucleotides == "RNA") %>% sample_n(459) %>% rbind(RNA_DEB) rows_DEB_RNA <- sample(nrow(ml_DEB_RNA)) ml_DEB_RNA <- ml_DEB_RNA[rows_DEB_RNA, ] %>% dplyr::select_if(is.numeric) ml_DEB_RNA<-ml_DEB_RNA[, !(colnames(ml_DEB_RNA) %in% drops_general)] ``` # Splitting into train and test data I will use 80% of the data to train, and 20% to test the model ```{r, echo=T} ml_DEB_RNA$Curated<-as.factor(ml_DEB_RNA$Curated) ml_DEB_RNA$Curated<-factor(ml_DEB_RNA$Curated, levels = c("0", "1"), labels = c("False", "True")) ml_DEB_RNA_train <- ml_DEB_RNA[1:734, ] ml_DEB_RNA_test <- ml_DEB_RNA[735:918, ] ``` # KNN K nearest neighboor classification based on numeric variables, min-max normalized. ```{r, echo=T} library(class) library(gmodels) knn_drop <- c("Curated", "isotope_error", "RNPxl_a_ion_score", "RNPxl_precursor_purity") normalize <- function(x) { return((x-min(x))/(max(x)-min(x))) } ml_DEB_RNA_test_knn<-ml_DEB_RNA_test[,!(names(ml_DEB_RNA_test) %in% knn_drop)] ml_DEB_RNA_test_knn<-as.data.frame(lapply(ml_DEB_RNA_test_knn[1:39], normalize)) ml_DEB_RNA_test_knn_labels<-ml_DEB_RNA_test[1:184, 5] ml_DEB_RNA_train_knn<-ml_DEB_RNA_train[,!(names(ml_DEB_RNA_train) %in% knn_drop)] ml_DEB_RNA_train_knn<-as.data.frame(lapply(ml_DEB_RNA_train_knn[1:39], normalize)) ml_DEB_RNA_train_knn_labels<-as.data.frame(ml_DEB_RNA_train[1:734, 5]) ``` ## Evaluate Model Performance DEB RNA Here, I chose 5 neighbors: ```{r, echo=T} DEB_RNA_knn_pred <- knn(train = ml_DEB_RNA_train_knn, test = ml_DEB_RNA_test_knn, cl = ml_DEB_RNA_train_knn_labels$Curated, k =5 ) CrossTable(x = ml_DEB_RNA_test_knn_labels$Curated, y = DEB_RNA_knn_pred, prop.chisq = F) ``` # Decision trees and classification rules ## C5.0 ```{r, echo=T} library(C50) C5_drop <- c("isotope_error", "RNPxl_a_ion_score", "precursor_purity") ml_DEB_RNA_train_C5<-ml_DEB_RNA_train[,!(names(ml_DEB_RNA_train) %in% C5_drop)] ml_DEB_RNA_test_C5<-ml_DEB_RNA_test[,!(names(ml_DEB_RNA_test) %in% C5_drop)] ``` ## DEB RNA ### with boosting and penalty for false positives penalize False Positives 4 times more than false negatives. ```{r, echo=T} matrix_dimensions<- list(c("True", "False"), c("True", "False")) names(matrix_dimensions) <- c("predicted", "actual") error_cost <- matrix(c(0,1,4,0), nrow = 2, dimnames = matrix_dimensions) C5_model_DEB_RNA_boost_pen<-C5.0(x=ml_DEB_RNA_train_C5[-5], y=ml_DEB_RNA_train_C5$Curated, costs = error_cost, trials = 10) summary(C5_model_DEB_RNA_boost_pen) C5_DEB_RNA_predict_boost_pen<-predict(C5_model_DEB_RNA_boost_pen, ml_DEB_RNA_test_C5) CrossTable(ml_DEB_RNA_test_C5$Curated, C5_DEB_RNA_predict_boost_pen, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c("actual TRUE", "predicted TRUE")) ``` # Classifyers ## DEB RNA ### RIPPER ```{r, echo=T, tidy=TRUE} library(rJava) library(RWeka) RIPPER_model_DEB_RNA<-JRip(Curated ~ ., data =ml_DEB_RNA_train_C5) RIPPER_model_DEB_RNA summary(RIPPER_model_DEB_RNA) RIPPER_DEB_RNA_predict<-predict(RIPPER_model_DEB_RNA, ml_DEB_RNA_test_C5) CrossTable(ml_DEB_RNA_test_C5$Curated, RIPPER_DEB_RNA_predict, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c("actual TRUE", "predicted TRUE")) ``` # DEB RNA HFX, charge state 2-7 Now, let us test KNN, C5, and RIPPER models on unknown data: Ecoli DEB HFX runs. Combine the triplicates into one giant dataframe first! ## Data setup DEB RNA HFX charge state 2-7 ```{r, echo=T, tidy=TRUE} A_Wulf_020819_080819_Ecoli_DEB_S30_rep1_XL_table<-read_delim("A_Wulf_020819_080819_Ecoli_DEB_S30_rep1_XL_table.csv", delim = "\t") A_Wulf_020819_080819_Ecoli_DEB_S30_rep2_XL_table<-read_delim("A_Wulf_020819_080819_Ecoli_DEB_S30_rep2_XL_table.csv", delim = "\t") A_Wulf_020819_080819_Ecoli_DEB_S30_rep3_XL_table<-read_delim("A_Wulf_020819_080819_Ecoli_DEB_S30_rep3_XL_table.csv", delim = "\t") DEB_RNA_S30_HFX_total_reps<- rbind(A_Wulf_020819_080819_Ecoli_DEB_S30_rep1_XL_table, A_Wulf_020819_080819_Ecoli_DEB_S30_rep2_XL_table, A_Wulf_020819_080819_Ecoli_DEB_S30_rep3_XL_table) DEB_RNA_HFX_S30_KNN<-DEB_RNA_S30_HFX_total_reps names(DEB_RNA_HFX_S30_KNN) <- gsub("\\.", "_", names(DEB_RNA_HFX_S30_KNN)) names(DEB_RNA_HFX_S30_KNN) <- gsub("\\:", "_", names(DEB_RNA_HFX_S30_KNN)) names(DEB_RNA_HFX_S30_KNN) <- gsub("\\ ", "_", names(DEB_RNA_HFX_S30_KNN)) names(DEB_RNA_HFX_S30_KNN) <- gsub("\\|", "_", names(DEB_RNA_HFX_S30_KNN)) DEB_RNA_HFX_S30_KNN$XLinker <- "DEB" DEB_RNA_HFX_S30_KNN$Nucleotides <- "RNA" DEB_RNA_HFX_S30_KNN$Sample <- "S30" DEB_RNA_HFX_S30_KNN<-DEB_RNA_HFX_S30_KNN[DEB_RNA_HFX_S30_KNN$RNPxl_RNA!="none",] %>% filter(target_decoy == 'target') DEB_RNA_HFX_S30_KNN$prepPep<-gsub("(Carbamyl)", "", as.character(DEB_RNA_HFX_S30_KNN$sequence)) DEB_RNA_HFX_S30_KNN$prepPep <- gsub("(Oxidation)", "", as.character(DEB_RNA_HFX_S30_KNN$prepPep)) DEB_RNA_HFX_S30_KNN$prepPep <- gsub("(Phospho)", "", as.character(DEB_RNA_HFX_S30_KNN$prepPep)) DEB_RNA_HFX_S30_KNN$prepPep <- gsub("\\(|\\)", "", DEB_RNA_HFX_S30_KNN$prepPep) DEB_RNA_HFX_S30_KNN$prepPep <- gsub("\\..*","",DEB_RNA_HFX_S30_KNN$prepPep) DEB_RNA_HFX_S30_KNN$Pepseq <- DEB_RNA_HFX_S30_KNN$prepPep DEB_RNA_HFX_S30_KNN$Peplength<-nchar(DEB_RNA_HFX_S30_KNN$Pepseq, type = "chars") DEB_RNA_HFX_S30_KNN$aa_class_charged<-str_count(DEB_RNA_HFX_S30_KNN$Pepseq, paste("R|K|D|E", collapse = "|")) DEB_RNA_HFX_S30_KNN$aa_class_polar<-str_count(DEB_RNA_HFX_S30_KNN$Pepseq, paste("Q|N|H|S|T|Y|C|W", collapse = "|")) DEB_RNA_HFX_S30_KNN$aa_class_hydrophobic<-str_count(DEB_RNA_HFX_S30_KNN$Pepseq, paste("A|I|L|M|F|V|P|G", collapse = "|")) DEB_RNA_HFX_S30_KNN$pI <- round(pI(DEB_RNA_HFX_S30_KNN$Pepseq),2) DEB_RNA_HFX_S30_KNN$aIndex <- round(aIndex(DEB_RNA_HFX_S30_KNN$Pepseq),2) DEB_RNA_HFX_S30_KNN$XL_aa<-gsub("[:A-Z:]", "", DEB_RNA_HFX_S30_KNN$RNPxl_best_localization) DEB_RNA_HFX_S30_KNN$Hydro <- round(hydrophobicity(DEB_RNA_HFX_S30_KNN$Pepseq, scale = "KyteDoolittle"),2) DEB_RNA_HFX_S30_KNN$mw <- round(mw(DEB_RNA_HFX_S30_KNN$Pepseq, monoisotopic = F),2) DEB_RNA_HFX_S30_KNN$cv <- c("35_45") names(DEB_RNA_HFX_S30_KNN)[names(DEB_RNA_HFX_S30_KNN) == "precursor_m/z"] <- "precursor" names(DEB_RNA_HFX_S30_KNN)[names(DEB_RNA_HFX_S30_KNN) == "precursor_error_(_ppm_)"] <- "precursor_error_ppm" DEB_RNA_HFX_S30_KNN<-DEB_RNA_HFX_S30_KNN %>% filter(RNPxl_isPhospho == "0") DEB_RNA_HFX_S30_KNN_model<-DEB_RNA_HFX_S30_KNN %>% select(colnames(ml_DEB_RNA_train_knn)) ``` ## Normalizing ```{r, echo=T, tidy=TRUE} DEB_RNA_HFX_S30_KNN_model_norm<-DEB_RNA_HFX_S30_KNN_model%>% select_if(is.numeric) DEB_RNA_HFX_S30_KNN_model_norm<-as.data.frame(lapply(DEB_RNA_HFX_S30_KNN_model_norm, normalize)) DEB_RNA_HFX_S30_KNN_model_norm[is.na(DEB_RNA_HFX_S30_KNN_model_norm)] <- 0 ``` ## Classifying DEB RNA HFX data ### KNN5 ```{r, echo=T, tidy=TRUE} DEB_RNA_HFX_S30_KNN_pred <- knn(train = ml_DEB_RNA_train_knn, test = DEB_RNA_HFX_S30_KNN_model_norm, cl = ml_DEB_RNA_train_knn_labels$Curated, k =5) DEB_RNA_HFX_S30_KNN_model$Prediction<-DEB_RNA_HFX_S30_KNN_pred table(DEB_RNA_HFX_S30_KNN_model$Prediction) DEB_RNA_HFX_S30_KNN$Prediction_KNN<-DEB_RNA_HFX_S30_KNN_pred ``` ### C5.0 with boosting and penalty for false positives penalize False Positives 4 times more than false negatives. ```{r, echo=T} matrix_dimensions<- list(c("True", "False"), c("True", "False")) names(matrix_dimensions) <- c("predicted", "actual") error_cost <- matrix(c(0,1,4,0), nrow = 2, dimnames = matrix_dimensions) ml_DEB_RNA_HFX_test_C5<-DEB_RNA_HFX_S30_KNN %>% select(colnames(ml_DEB_RNA_train)) C5_DEB_RNA_HFX_predict_boost_pen<-predict(C5_model_DEB_RNA_boost_pen, ml_DEB_RNA_HFX_test_C5) ml_DEB_RNA_HFX_test_C5$Prediction<-C5_DEB_RNA_HFX_predict_boost_pen table(ml_DEB_RNA_HFX_test_C5$Prediction) DEB_RNA_HFX_S30_KNN$Prediction_C5<-C5_DEB_RNA_HFX_predict_boost_pen ``` ### RIPPER ```{r, echo=T, tidy=TRUE} RIPPER_DEB_RNA_HFX_predict<-predict(RIPPER_model_DEB_RNA, ml_DEB_RNA_HFX_test_C5) ml_DEB_RNA_HFX_test_C5$Prediction_RIPPER<-RIPPER_DEB_RNA_HFX_predict table(ml_DEB_RNA_HFX_test_C5$Prediction_RIPPER) DEB_RNA_HFX_S30_KNN$Prediction_RIPPER<-RIPPER_DEB_RNA_HFX_predict ``` ## collapsed Proteins ### S30 HFX charge state 2-7 KNN ```{r, echo=T} d<-DEB_RNA_HFX_S30_KNN %>% filter(Prediction_KNN == "True") %>% select(accessions) %>% distinct() nrow(d) ``` ### S30 HFX charge state 2-7 C5.0 ```{r, echo=T} nrow(DEB_RNA_HFX_S30_KNN %>% filter(Prediction_C5 == "True") %>% select(accessions) %>% distinct()) ``` ### S30 HFX charge state 2-7 RIPPER ```{r, echo=T} nrow(DEB_RNA_HFX_S30_KNN %>% filter(Prediction_RIPPER == "True") %>% select(accessions) %>% distinct()) ``` ### Export KNN predictions ```{r, echo=T} HFX_DEB_RNA_S30_KNN_predictions <- DEB_RNA_HFX_S30_KNN %>% filter(Prediction_KNN == "True") write_csv(HFX_DEB_RNA_S30_KNN_predictions, "HFX_DEB_RNA_S30_KNN_predictions.csv") table(HFX_DEB_RNA_S30_KNN_predictions$accessions) %>% as.data.frame() %>% arrange(desc(Freq)) %>% head(10) ```