From 84911fd8b786b1c0c1ade88cb3dc29cb8fcffe38 Mon Sep 17 00:00:00 2001
From: jlamanna9 <41341179+jlamanna9@users.noreply.github.com>
Date: Tue, 16 Jun 2020 14:49:07 -0400
Subject: [PATCH] Create 4_CopyNumberEstimation

---
 4_CopyNumberEstimation | 203 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 203 insertions(+)
 create mode 100644 4_CopyNumberEstimation

diff --git a/4_CopyNumberEstimation b/4_CopyNumberEstimation
new file mode 100644
index 0000000..ce34978
--- /dev/null
+++ b/4_CopyNumberEstimation
@@ -0,0 +1,203 @@
+Code adapted from Wierman et al. (Wierman, M. B., Burbulis, I. E., Chronister, W. D., Bekiranov, S. & McConnell, M. J. in Genomic Mosaicism in Neurons and Other Cell Types (eds. Frade, J. M. & Gage, F. H.) 109–131 (Springer New York, 2017)).
+
+#Set arguments (directory and *SORT.cov file)
+#args <- commandArgs(trailing=T)
+directory <-"./"#(args[1])
+infile <- "trimmed_1A_S1_L001_R1_001_sorted_rmdup_onebasereads_winIntersect_500kb_SORT.cov"#(args[1])
+
+splitfname <- strsplit(infile,"\\.")[[1]]
+if(length(splitfname)==2){
+  header <- splitfname[1]
+} else if(length(splitfname)>2){
+  header <- splitfname[1]
+  for(i in 2:(length(splitfname)-1)){
+    namepiece <-splitfname[i]
+    header <-paste(header, namepiece, sep=".")
+  }
+}
+
+splitsampname <- strsplit(header,"_")[[1]]
+if(length(splitsampname)==2){
+  sampname <- splitsampname[1]
+} else if(length(splitsampname)>2){
+  sampname <- splitsampname[1]
+  for(i in 2:(length(splitsampname)-2)){
+    sampnamepiece <-splitsampname[i]
+    sampname <-paste(sampname, sampnamepiece, sep="_")
+  }
+}
+outfile <-paste(header,"norm.num", sep="_")
+outLong <- paste(header,"norm.num.long",sep='_')
+outShort <- paste(header,"norm.num.short",sep='_')
+outPlot <- paste(header,"PLOT.png",sep='_')
+outfile <-paste(directory,outfile, sep='/')
+outLong <- paste(directory,outLong,sep='/')
+outShort <- paste(directory,outShort,sep='/')
+outPlot <- paste(directory,outPlot,sep='/')
+
+#source("http://bioconductor.org/biocLite.R")
+#biocLite("DNAcopy")
+
+
+#Our (slightly modified) version of a function by Sacha Epskamp for checking for the presence of a package and installing if absent.
+#http://stackoverflow.com/questions/9341635/how-can-i-check-for-installed-r-packages-before-running-install-packages
+pkgTestBC <- function(x){
+	if(!require(x,character.only=TRUE)){
+		source("http://bioconductor.org/biocLite.R")
+		biocLite(x)
+		if(!require(x,character.only=TRUE)) stop("Package not found")
+	}
+}
+
+pkgTest <- function(x){
+	if(!require(x,character.only=TRUE)){
+		install.packages(x)
+		if(!require(x,character.only=TRUE)) stop("Package not found")
+	}
+}
+
+pkgTestBC("DNAcopy")
+#pkgTest("sqldf")
+pkgTest("ggplot2")
+
+#Load packages
+library(DNAcopy)
+library(MASS) #MASS pkg usually built-in
+#library(sqldf) #Package to use SQL-style databases (for loading big tables).
+library(ggplot2)
+
+#Set working directory and read in file.
+#setwd(directory)
+
+fullfilepath <-paste(directory,infile,sep='/')
+f <- file(fullfilepath)
+cov.data <- read.table(fullfilepath, header=FALSE, sep="")
+#cov.data <- sqldf("select * from f", dbname = tempfile(), file.format = list(header = F, row.names = F, sep='\t'))
+
+print(ncol(cov.data))
+#cov.data <- cbind(cov.data,0,0)
+print(ncol(cov.data))
+
+#names(cov.data)<-c("Index","Chrom","Start","End","GCfrac","Nfrac","Count")
+
+names(cov.data)<-c("Chrom","Start","End","GCfrac","Nfrac","Count")
+
+cov.data<-subset(cov.data, select=c("Chrom","Start","End","GCfrac","Nfrac","Count"))
+
+#Compute total reads and mean per window and append to file.
+meanperwindow <- mean(cov.data$Count)
+totalsampreads <-sum(cov.data$Count)
+meantotoutline <-paste(sampname,meanperwindow,totalsampreads, sep="\t")
+rawmeanfilename <- "All_counts_means.txt"
+meanfilename <-paste(directory,rawmeanfilename,sep='')
+write(meantotoutline, file=meanfilename, append=TRUE)
+
+#Store chromosomes as numeric
+#Note:  This automatically converts X to 23 and Y to 24.
+#cov.data$Chrom <-as.numeric(cov.data$Chrom)
+
+#Eliminate sex chromosomes
+#cov.data<-subset(cov.data, Chrom!='X')
+#cov.data<-subset(cov.data, Chrom!='Y')
+
+#binSize <- c(0,34:45,47,49,53,100)
+#binSize <- binSize/100
+binSize <- c(0,34:45,47,49,53,100)
+binSize <- binSize/100
+
+#Assign each row of the coverage file to a bin and add the bin id to the data frame.
+binid<-vector()
+for(i in 1:nrow(cov.data)){
+  thebin <-0
+  gc <- cov.data$GCfrac[i]
+  for(j in 1:(length(binSize)-1)){
+    low <-binSize[j]
+    high <- binSize[j+1]
+    if(low <= gc){
+      if(gc < high){
+        thebin <- j
+        binid<-c(binid,thebin)  
+      }
+    }
+  }
+}
+
+#Adds the bin id column to the data frame
+cov.data$bin <- binid
+
+#Compute total number of reads
+totreads <- sum(cov.data$Count)
+millreads <- totreads/1000000
+
+head(cov.data)
+print(totreads)
+print(millreads)
+
+#Loop through list of the bin ids.
+#Remove outliers from each bin.
+cov.data.final<-{}
+for(k in 1:16){
+   temptable <- subset(cov.data, bin==k)
+   countmad <-mad(temptable$Count)
+   countmean <-mean(temptable$Count)
+   countmax <- countmean + (countmad * 4)
+   countmin <- countmean - (countmad * 4)
+   indexlist <-vector()
+   for(m in 1:length(temptable$Count)){
+     thecount <- temptable$Count[m]
+     if(thecount < countmin){
+       indexlist <- c(indexlist,m)
+     }
+     else if(thecount > countmax){
+       indexlist <- c(indexlist,m)
+     }
+   }
+   if(length(indexlist)>0){
+    temptable.noout <- temptable[-indexlist,]
+   }
+   else{
+     temptable.noout <-temptable
+   }
+   temptable.noout<-subset(temptable.noout, Chrom!=23)
+   temptable.noout<-subset(temptable.noout, Chrom!=24)
+#   binmedian <-median(temptable$Count)
+   binmedian <-median(temptable.noout$Count)
+   thestats <-fitdistr(temptable.noout$Count, "normal")
+   binmean <- thestats$estimate[1]
+   binsd <- thestats$estimate[2]
+   zvals <-vector()
+   pvals <-vector()
+   l2fvals <-vector()
+   rpkmvals <-vector()
+   estvals <-vector()
+   for(n in 1:length(temptable$Count)){
+     thecount <- temptable$Count[n]
+#     countz <- (binmean - thecount) / (binsd/sqrt(length(temptable.noout$Count)))
+     countz <- (thecount - binmean) / (binsd)     
+#     countp <- dnorm(countz)
+     countp <- 2*pnorm(-abs(countz))
+     countl2f <- log2(thecount/binmean)
+#     countrpkm <- thecount / 500 / millreads
+     countrpkm <- thecount / millreads
+     countest <- 2*(thecount / binmedian)
+     zvals <-c(zvals, countz)
+     pvals <-c(pvals, countp)                                                                              
+     l2fvals <-c(l2fvals, countl2f)
+     rpkmvals <-c(rpkmvals, countrpkm)
+     estvals <-c(estvals, countest)
+   }
+   temptable$zscore <- zvals
+   temptable$pvalue <- pvals
+   temptable$Log2_foldchange <- l2fvals
+   temptable$rpkm <- rpkmvals
+   temptable$est_copy <- estvals
+   cov.data.final <-rbind(cov.data.final, temptable)
+}
+
+head(cov.data.final)
+
+#Sort the data frame by the chromosome, then the start position
+cov.data.final <-cov.data.final[with(cov.data.final, order(Chrom, Start)), ]
+#Write the table to a file.
+write.table(cov.data.final, file=outfile, row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE)
+#write.table(cov.data.final, file="trimmed_1A_S1_L001_R1_001_sorted_rmdup_onebasereads_winIntersect_500kb_SORT_NORM.txt", row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE)
-- 
GitLab