# NF1 / CDKN2A co-deficiency dependency screen (DepMap RNAi + CRISPR) ----
#
# 1. Pull the DepMap tables from ExperimentHub.
# 2. Find cell lines that carry a deleterious mutation call in BOTH NF1 and
#    CDKN2A (the "cell lines of interest", ICL).
# 3. For the RNAi and the CRISPR screens separately, compare each gene's
#    dependency score in those lines against the gene's mean dependency over
#    all cell lines, and keep the 500 most negative differences.
# 4. Draw a couple of exploratory plots.

# Setup ----

# All result files are written below this directory.
project_dir <- "C:/Users/helen/OneDrive/Desktop/Directed Research Spring 2020/R-UROP"
setwd(project_dir)

# Genes whose all-cell-line mean dependency is below this value are dropped
# from the ranking (presumably to exclude genes that are dependencies in most
# lines, not just the lines of interest -- confirm the intended threshold).
baseline_cutoff <- -0.4
# Number of top-ranked rows saved per screen.
n_top <- 500

# Install dependencies only when they are missing, so that re-running the
# script does not reinstall everything each time.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("depmap", quietly = TRUE)) {
  # NOTE(review): the original script switched to the Bioconductor devel
  # branch before installing depmap; kept as-is, confirm it is still needed.
  BiocManager::install(version = "devel")
  BiocManager::install("depmap")
}
for (pkg in c("dplyr", "utf8", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# dplyr and ggplot2 are attached explicitly up front: the pipelines below use
# `%>%` and ggplot() and must not depend on another package attaching them.
library("dplyr")
library("ggplot2")
library("depmap")
library("ExperimentHub")

# Load the datasets ----

eh <- ExperimentHub()
query(eh, "depmap")

rnai <- eh[["EH2260"]]
mutationCalls <- eh[["EH3457"]]
metadata <- eh[["EH3458"]]
crispr <- eh[["EH2261"]]
TMP <- eh[["EH3456"]]

# Explore the mutation calls ----

# First row, to see what one mutation call looks like.
data1 <- mutationCalls[1, ]
# Column names.
colnames(mutationCalls)
# Values taken by the columns used for filtering.
unique(mutationCalls$var_class)
unique(mutationCalls$is_deleterious)

# Cell lines deficient in both NF1 and CDKN2A ----

# dplyr::filter() drops rows where the condition is NA, so a missing
# `is_deleterious` can no longer produce an all-NA row (and an NA depmap_id)
# the way `[` subsetting with an NA index can. as.logical() accepts both a
# logical column and the string "TRUE" the original comparison was written for.
NF1null <- dplyr::filter(
  mutationCalls,
  gene_name == "NF1",
  as.logical(is_deleterious)
)
CDKN2Anull <- dplyr::filter(
  mutationCalls,
  gene_name == "CDKN2A",
  as.logical(is_deleterious)
)

# One row per cell line within each gene.
NF1null_no_dups <- NF1null[!duplicated(NF1null$depmap_id), ]
CDKN2Anull_no_dups <- CDKN2Anull[!duplicated(CDKN2Anull$depmap_id), ]

# Stack both tables. Because each table now holds a cell line at most once,
# a depmap_id that occurs twice in the stacked table is mutated in both genes.
nulltable <- rbind(NF1null_no_dups, CDKN2Anull_no_dups)
duplicated(nulltable$depmap_id)
Duplicates <- nulltable[
  duplicated(nulltable$depmap_id) |
    duplicated(nulltable$depmap_id, fromLast = TRUE),
]
write.csv(Duplicates, file = "~/CellLines.csv", row.names = TRUE)

# Cell lines of interest: the unique depmap ids of the doubly-mutated lines.
InterestingCellLines <- data.frame(Duplicates$depmap_id)
ICL <- unique(Duplicates$depmap_id)
write.csv(ICL, file = file.path(project_dir, "ICL.csv"), row.names = TRUE)

# RNAi screen ----

colnames(rnai)

# Keep only the cell lines of interest. `%in%` tests set membership; `==`
# against a multi-element vector recycles element-wise and returns the wrong
# rows.
rnai1 <- dplyr::filter(rnai, depmap_id %in% ICL)
rnai2 <- dplyr::select(rnai1, cell_line, gene_name, dependency)
rnai3 <- dplyr::arrange(rnai2, dependency)

# Mean dependency of every gene over ALL cell lines (missing scores removed).
# TODO: the original plan was a per-gene z-score; only the raw difference
# from the all-line mean is computed so far.
b <- dplyr::select(rnai, cell_line, gene_name, dependency)
prex <- dplyr::filter(b, !is.na(dependency))
y <- aggregate(dependency ~ gene_name, data = prex, FUN = mean)
# Give the mean a unique name so it does not clash with `dependency` in the
# join below.
y <- dplyr::rename(y, dependency_full = dependency)

head(rnai3)
head(y)

# Attach the all-line mean to each (cell line, gene) score.
z <- dplyr::full_join(rnai3, y, by = "gene_name")
head(z)
# diff < 0: the gene is a stronger dependency in the lines of interest than
# on average.
z1 <- dplyr::mutate(z, diff = dependency - dependency_full)
head(z1)
z2 <- dplyr::arrange(z1, diff)
head(z2)
# Keep genes whose all-line mean dependency is at least `baseline_cutoff`.
z3 <- dplyr::filter(z2, dependency_full >= baseline_cutoff)
head(z3)

# Save the `n_top` rows with the most negative difference.
Top_Targets_NF1_CDKN2A <- head(z3, n_top)
write.csv(
  Top_Targets_NF1_CDKN2A,
  file = file.path(project_dir, "Top_Targets_NF1_CDKN2A500.csv"),
  row.names = TRUE
)

colnames(metadata)

# CRISPR screen ----

# Same procedure as for RNAi, applied to the CRISPR dependency scores.
ICLb <- data.frame(ICL)
crispr1 <- dplyr::filter(crispr, depmap_id %in% ICLb$ICL)
crispr2 <- dplyr::select(crispr1, cell_line, gene_name, dependency)
crispr3 <- dplyr::arrange(crispr2, dependency)

# Mean dependency of every gene over all cell lines (missing scores removed).
h <- dplyr::select(crispr, cell_line, gene_name, dependency)
prei <- dplyr::filter(h, !is.na(dependency))
i <- aggregate(dependency ~ gene_name, data = prei, FUN = mean)
j <- dplyr::rename(i, dependency_full = dependency)

head(crispr3)
head(j)

k <- dplyr::full_join(crispr3, j, by = "gene_name")
head(k)
k1 <- dplyr::mutate(k, diff = dependency - dependency_full)
head(k1)
k2 <- dplyr::arrange(k1, diff)
head(k2)
k3 <- dplyr::filter(k2, dependency_full >= baseline_cutoff)
head(k3)

Top_Targets_NF1_CDKN2A_CRISPR <- head(k3, n_top)
write.csv(
  Top_Targets_NF1_CDKN2A_CRISPR,
  file = file.path(project_dir, "Top_Targets_NF1_CDKN2A_CRISPR500.csv"),
  row.names = TRUE
)

# Genes found by both screens ----

# RNAi top-target rows whose gene also appears among the CRISPR top targets.
# NOTE(review): this step was marked as "not functioning" in the original;
# the likely cause was the `==` filter in the RNAi section -- re-check.
combined <- dplyr::filter(
  Top_Targets_NF1_CDKN2A,
  gene_name %in% Top_Targets_NF1_CDKN2A_CRISPR$gene_name
)

# Plots ----

# Expression of DNAJC13 by lineage.
metadata %>%
  dplyr::select(depmap_id, lineage) %>%
  dplyr::full_join(TMP, by = "depmap_id") %>%
  dplyr::filter(gene_name == "DNAJC13") %>%
  ggplot(aes(x = lineage, y = rna_expression, fill = lineage)) +
  geom_boxplot(outlier.alpha = 0.1) +
  ggtitle("Boxplot of expression values for gene DNAJC13 by lineage") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

head(TMP)

# RNAi dependency scores for AAMP; the dotted red line marks the mean RNAi
# dependency over all genes and cell lines.
rnai %>%
  dplyr::select(gene, gene_name, dependency) %>%
  dplyr::filter(gene_name == "AAMP") %>%
  ggplot(aes(x = dependency)) +
  geom_histogram() +
  geom_vline(
    xintercept = mean(rnai$dependency, na.rm = TRUE),
    linetype = "dotted",
    color = "red"
  ) +
  ggtitle("Histogram of dependency scores for gene AAMP")

# Small sample of the CRISPR table, exported for inspection.
crisprsample <- head(crispr, 10)
write.csv(
  crisprsample,
  file = file.path(project_dir, "crisprsample.csv"),
  row.names = TRUE
)