# Project setup ----
# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because the relative behaviour of the script may depend on it, but the script
# will stop here on any machine where this folder does not exist.
setwd("C:/Users/helen/OneDrive/Desktop/Directed Research Spring 2020/R-UROP")

# Install dependencies only when they are missing, so that re-running the
# script does not reinstall (or try to up/downgrade) packages every time.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

if (!requireNamespace("depmap", quietly = TRUE)) {
  # The following initializes usage of Bioc devel
  # NOTE(review): the devel branch was presumably needed for the ExperimentHub
  # records used below; confirm whether a release version is sufficient now.
  BiocManager::install(version = "devel")
  BiocManager::install("depmap")
}

if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
if (!requireNamespace("utf8", quietly = TRUE)) {
  install.packages("utf8")
}

# dplyr verbs (filter, select, arrange, rename, full_join, %>%) are used
# throughout the script, so attach it explicitly instead of relying on another
# package attaching it as a side effect. It is attached first so that packages
# loaded afterwards keep precedence on the search path.
library("dplyr")
library("depmap")
library("ExperimentHub")

# Connect to ExperimentHub and list the records related to depmap
eh <- ExperimentHub()

query(eh, "depmap")

# Fetch the tables used in this analysis from ExperimentHub by record ID
rnai <- eh[["EH2260"]]
mutationCalls <- eh[["EH3457"]]
metadata <- eh[["EH3458"]]
crispr <- eh[["EH2261"]]
TMP <- eh[["EH3456"]]

# Keep the first mutation call as an example record to inspect
data1 <- mutationCalls[1L, ]

# List the available columns in the mutation calls table
colnames(mutationCalls)

# Inspect which distinct values occur in the columns used for filtering below
unique(mutationCalls[["var_class"]])
unique(mutationCalls[["is_deleterious"]])
# Find cell lines with deleterious calls in both NF1 and CDKN2A ----

# `%in%` is used instead of `==` because it never returns NA. With `==`, a
# missing gene_name or is_deleterious value produces an NA in the row index,
# which `[` turns into an all-NA row; such rows would show up in both tables
# and be reported below as a shared "cell line" with depmap_id NA.
# Matching against "TRUE" works whether is_deleterious is logical or character.
deleterious <- mutationCalls$is_deleterious %in% "TRUE"

# Deleterious NF1 mutation calls
NF1null <- mutationCalls[mutationCalls$gene_name %in% "NF1" & deleterious, ]
# Deleterious CDKN2A mutation calls
CDKN2Anull <- mutationCalls[mutationCalls$gene_name %in% "CDKN2A" & deleterious, ]

# Keep one row per cell line within each gene
NF1null_no_dups <- NF1null[!duplicated(NF1null$depmap_id), ]
CDKN2Anull_no_dups <- CDKN2Anull[!duplicated(CDKN2Anull$depmap_id), ]

# Stack both tables; a depmap_id that now occurs twice is present in both
nulltable <- rbind(NF1null_no_dups, CDKN2Anull_no_dups)
# Flag the repeated IDs (printed for inspection when run interactively)
duplicated(nulltable$depmap_id)
# Keep both rows (the NF1 row and the CDKN2A row) of every repeated ID
Duplicates <- nulltable[duplicated(nulltable$depmap_id) |
                          duplicated(nulltable$depmap_id, fromLast = TRUE), ]

write.csv(Duplicates, file = "~/CellLines.csv", row.names = TRUE )

# Extract the IDs of the cell lines of interest
InterestingCellLines <- data.frame(Duplicates$depmap_id)

# Drop repeated IDs. Subsetting a one-column data frame with `[rows, ]` drops
# to a plain vector, so ICL is a vector of depmap IDs rather than a data frame.
ICL <- InterestingCellLines[!duplicated(InterestingCellLines$Duplicates.depmap_id), ]
# Export ICL
write.csv(ICL, file = "C:/Users/helen/OneDrive/Desktop/Directed Research Spring 2020/R-UROP/ICL.csv", row.names = TRUE )

# RNAi data for the cell lines of interest ----

# Check the column names of the RNAi table
colnames(rnai)

# Keep only rows belonging to the cell lines of interest.
# `%in%` tests membership in ICL; `==` would compare element by element and
# recycle ICL along the column, matching rows only by positional coincidence.
rnai1 <- filter(rnai, depmap_id %in% ICL)
# Keep only the variables of interest
rnai2 <- select(rnai1, cell_line, gene_name, dependency)
# Sort by dependency score, ascending
rnai3 <- arrange(rnai2, dependency)

# RNAi: compare the selected cell lines with the per-gene mean of all lines ----
# What is computed here is the plain difference between a cell line's
# dependency score and the gene's mean score over the full RNAi table
# (not a z-score).

# Mean dependency per gene over the whole RNAi table, ignoring missing scores
b <- rnai %>% select(cell_line, gene_name, dependency)
prex <- b %>% filter(!is.na(dependency))
y <- aggregate(prex[, 3], by = list(prex$gene_name), FUN = mean)

# Give the aggregate columns meaningful names: the grouping column becomes the
# join key, and the mean gets a name distinct from the per-line score
y <- rename(y, gene_name = Group.1, dependency_full = dependency)
head(rnai3) # check the column headings on both sides of the join
head(y)

# Attach the per-gene mean to every row of the selected cell lines
z <- full_join(rnai3, y, by = "gene_name")
head(z)

# Difference between the cell line's score and the gene's overall mean
z1 <- cbind(z, diff = with(z, dependency - dependency_full))
head(z1)

# Most negative difference first
z2 <- z1 %>% arrange(diff)
head(z2)

# Keep only genes whose overall mean dependency is at least -0.4
z3 <- z2 %>% filter(dependency_full >= -0.4)
head(z3)

# Save the first 500 rows of the ranked table
Top_Targets_NF1_CDKN2A <- head(z3, n = 500)

write.csv(Top_Targets_NF1_CDKN2A, file = "C:/Users/helen/OneDrive/Desktop/Directed Research Spring 2020/R-UROP/Top_Targets_NF1_CDKN2A500.csv", row.names = TRUE )

# Check the column names of the metadata table
colnames(metadata)

# CRISPR data for the cell lines of interest ----

# Wrap ICL in a data frame; its single column is named ICL
ICLb <- data.frame(ICL)
# Keep only rows belonging to the cell lines of interest
crispr1 <- crispr %>% filter(depmap_id %in% ICLb[["ICL"]])

# Keep only the variables of interest
crispr2 <- crispr1 %>% select(cell_line, gene_name, dependency)
# Sort by dependency score, ascending
crispr3 <- crispr2 %>% arrange(dependency)

# CRISPR: compare the selected cell lines with the per-gene mean of all lines ----
# What is computed here is the plain difference between a cell line's
# dependency score and the gene's mean score over the full CRISPR table
# (not a z-score).

# Mean dependency per gene over the whole CRISPR table, ignoring missing scores
h <- crispr %>% select(cell_line, gene_name, dependency)
prei <- h %>% filter(!is.na(dependency))
i <- aggregate(prei[, 3], by = list(prei$gene_name), FUN = mean)

# Give the aggregate columns meaningful names: the grouping column becomes the
# join key, and the mean gets a name distinct from the per-line score
j <- rename(i, gene_name = Group.1, dependency_full = dependency)
head(crispr3) # check the column headings on both sides of the join
head(j)

# Attach the per-gene mean to every row of the selected cell lines
k <- full_join(crispr3, j, by = "gene_name")
head(k)

# Difference between the cell line's score and the gene's overall mean
k1 <- cbind(k, diff = with(k, dependency - dependency_full))
head(k1)

# Most negative difference first
k2 <- k1 %>% arrange(diff)
head(k2)

# Keep only genes whose overall mean dependency is at least -0.4
k3 <- k2 %>% filter(dependency_full >= -0.4)
head(k3)

# Save the first 500 rows of the ranked table
Top_Targets_NF1_CDKN2A_CRISPR <- head(k3, n = 500)

write.csv(Top_Targets_NF1_CDKN2A_CRISPR, file = "C:/Users/helen/OneDrive/Desktop/Directed Research Spring 2020/R-UROP/Top_Targets_NF1_CDKN2A_CRISPR500.csv", row.names = TRUE )

# Goal: keep the RNAi top-target rows whose gene also appears in the CRISPR
# top-target list.
# NOTE(review): the original author marked this step as currently not
# functioning; the expression is a plain membership filter, so check the two
# input tables if the result looks wrong.
combined <- Top_Targets_NF1_CDKN2A %>%
  filter(gene_name %in% Top_Targets_NF1_CDKN2A_CRISPR$gene_name)

# Plots ----
# ggplot2 must be attached before the first ggplot() call; it was previously
# loaded only after the first plot, which made that plot fail with
# "could not find function ggplot" in a fresh session.
library(ggplot2)

# Boxplot of DNAJC13 expression per lineage: lineage comes from the metadata
# table and is joined to the expression table (TMP) by depmap_id
metadata %>%
  dplyr::select(depmap_id, lineage) %>%
  dplyr::full_join(TMP, by = "depmap_id") %>%
  dplyr::filter(gene_name == "DNAJC13") %>%
  ggplot(aes(x = lineage, y = rna_expression, fill = lineage)) +
  geom_boxplot(outlier.alpha = 0.1) +
  ggtitle("Boxplot of expression values for gene DNAJC13 by lineage") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")
head(TMP)

# Some graphing to look at the top gene, AAMP.
# The dotted red line marks the mean dependency over the whole RNAi table
# (all genes), not the mean for AAMP alone.
rnai %>%
  dplyr::select(gene, gene_name, dependency) %>%
  dplyr::filter(gene_name == "AAMP") %>%
  ggplot(aes(x = dependency)) +
  geom_histogram() +
  geom_vline(xintercept = mean(rnai$dependency, na.rm = TRUE),
             linetype = "dotted", color = "red") +
  ggtitle("Histogram of dependency scores for gene AAMP")

# Export the first 10 rows of the CRISPR table as a small sample file
crisprsample <- crispr %>% head(n = 10)
write.csv(
  crisprsample,
  file = "C:/Users/helen/OneDrive/Desktop/Directed Research Spring 2020/R-UROP/crisprsample.csv",
  row.names = TRUE
)