##--------------------------------------------------------------------------- ## Modified on May 14, 2009; main changes include: ## (1) Disable the reduced-dimension (RD) analysis. If interseted in the ## reduced-dimension analysis, please use an improved version available at ## http://www4.stat.ncsu.edu/~jytzeng/Softwares/Hap-Clustering/R/ ## Reference: ## Tzeng, J.Y., Wang, C.H., Kao, J.T., and Hsiao, C.K. (2006). ## Regression-based association analysis with clustered qqqhaplotypes ## using genotypes. The American Journal of Human Genetics, ## 78:231-242. ## Tzeng, J.Y. (2005). Evolutionary-based Grouping of Haplotypes in ## Association Analysis. Genetic Epidemiology, 28:220-231. ## (2) Simplify the input of function "QSHStest.fun" and hence allow more flexible ## filename of haplotype frequencies ## (3) Example I is for single region analysis, and Example II is for multiple ## regions with genomic control ##--------------------------------------------------------------------------- ##----------------------------------------------------------------------------------------------- ##----------------------------------------------------------------------------------------------- ##----------------------------------------------------------------------------------------------- ## TO USE: ##----------------------------------------------------------------- ## Step 1: prepare haplotype freq table in the following format ##----------------------------------------------------------------- ## format: R-by-2 matrix; the 1st column is haplotype and the 2nd column is the haplotype frequency ## Filename: hapfreq.cs1 ## 4851495149 0.046409639 ## 4851495150 0.004000000 ## 4851515149 0.004000000 ## 4950495149 0.012000000 ## . ## . ## ## Filename hapfreq.cn1 ## 4951485149 0.004000000 ## 4951495049 0.005204819 ## 4951495148 0.008000000 ## . ## . ## ##----------------------------------------------------------------- ## Step 2: save file QSHS.r and source("QSHS.r"). See examples below ## for details. ##----------------------------------------------------------------- ## QSHS.r includes two main funcitons, "QSHStest.fun" and "getresult.fun", ## to perform the haplotype similarity test ## (1) QSHStest.fun ---- calculate the quadratic statistics of haplotype ## similarity (QSHS; i.e., Pi-A-Pi in the paper) ## (2) getresult.fun ---- use genomic control to adjust for confounders (e.g., ## relatedness or subpopulation structure) and return ## the adjusted "Z" value ## ## The arguments of "QSHStest.fun" include ## . hcs : an L-by-2 matrix for haplotype frequencies in cases, where ## Column 1 is the haplotype names and Column 2 is the haplotype frequencies ## . hcn : an L-by-2 matrix for haplotype frequencies in controls, where ## Column 1 is the haplotype names and Column 2 is the haplotype frequencies ## . loci : number of loci per haplotype region ## . nn : sample size of case haplotypes ## . mm : sample size of control haplotypes ## ## The arguments of "getresult.fun" include ## . qshs.result : a 6-by-K matrix output from QSHStest.fun with ## K = no. of haplotype regions ## . type : 1= match statistic ## 2= length statistic ## 3= count statistic ##----------------------------------------------------------------------------------------------- ##----------------------------------------------------------------------------------------------- ##----------------------------------------------------------------------------------------------- ##----------------------------------------------------------------------------------------------- ## Example I (one single hap region only; in this case getresult.fun will output the 3 types of ## QSHS Z statistics regardless of the value of "type") ##----------------------------------------------------------------------------------------------- ## (1) copy QSHS.r (the source code) and two example datafiles: hapfreq.cs and hapfreq.cn ## (2) in R, type: ## > source("QSHS.r") ## > Hcs = read.table("hapfreq.cs", colClasses = c("character","numeric")) ## > Hcn = read.table("hapfreq.cn", colClasses = c("character","numeric")) ## > qshs = QSHStest.fun(hcs=Hcs, hcn=Hcn, loci=5, nn=200, mm=200) ## > qshs ## ## mtch len ct var.mtch var.len var.ct ## ## 0.0028984255 0.1326394059 0.1860585201 0.0001046128 0.0055902168 0.0067585282 ## ## getresult.fun(qshs) ## ## Z pvalue ## ## mtch 0.2833803 0.38844265 ## ## len 1.7740189 0.03803001 ## ## ct 2.2632029 0.01181159 ## ##-------------------------------------------------------------------------------------- ## Example II (multiple regions + Genomic Control to detect mutation): ##-------------------------------------------------------------------------------------- ## (1) copy QSHS.r (the source code) and the example datafiles: hapfreq.cs1 to hapfreq.cs3 ## hapfreq.cn1 to hapfreq.cn3 ## (2) in R, type: ## ## > source("QSHS.r") ## > Hcs<-lapply(1:3, function(fileIndex){read.table(paste("hapfreq.cs", fileIndex, sep=""), colClasses = c("character","numeric"))}) ## > Hcn<-lapply(1:3, function(fileIndex){read.table(paste("hapfreq.cn", fileIndex, sep=""), colClasses = c("character","numeric"))}) ## > qshs<-sapply(1:3,function(region){QSHStest.fun(hcs=Hcs[[region]], hcn=Hcn[[region]],loci=5, nn=200, mm=200)}) ## ##------------------------------------------------------------------ ## ## (region 1) (region 2) (region 3) ## ## [,1] [,2] [,3] ## ## mtch 0.0028984255 0.0207122579 0.0229081600 ## ## len 0.1326394059 0.1974848741 0.3562220800 ## ## ct 0.1860585201 0.3175514681 0.3080070400 ## ## var.mtch 0.0001046128 0.0004466133 0.0001304666 ## ## var.len 0.0055902168 0.0107979180 0.0203082416 ## ## var.ct 0.0067585282 0.0120631875 0.0206738161 ## ##---------------------------------------------------------------- ## ## > getresult.fun(qshs,type=3) ##<--count statistics ## ## $mu.med [1] 0.3080070 ## ## $tau [1] 1.132744 ## ## $Z [1] -1.30953995 0.07671621 0.00000000 ## ## $pvalue [1] 0.9048242 0.4694247 0.5000000 ## ## $No. of outliers [1] 0 ## ## $Regions with outliers [1] NA ## ####--------------------------------------------- #### Calculate Pi-A-Pi statistics (Full-Dim) ####--------------------------------------------- qshs.fun<-function(p, matA){P<-as.matrix(p); t(P) %*% matA %*% P} getQSHS.fun<-function(PP=Hfreq, hap=hap, loci=loci){ pcs<-PP$Pcs; pcn<-PP$Pcn RR<-length(pcs) digit<-nchar(hap[1])/loci MM<-matrix(0, ncol=loci, nrow=RR) for(i in 1:loci){ MM[,i]<-substring(hap, (i-1)*digit+1,i*digit)} index.ij<-cbind(rep((1:RR), each=RR), rep((1:RR), times=RR)) chk<-t(apply(index.ij, 1, function(ij, mat=MM){i<-ij[1];j<-ij[2];as.numeric(mat[i,]==mat[j,])})) ##---matAct--- tri<-apply(chk,1,sum) matAct<-matrix(tri, ncol=RR, nrow=RR) ## qshs.fun<-function(p, matA){P<-as.matrix(p); t(P) %*% matA %*% P} ct<-qshs.fun(pcs, matAct)-qshs.fun(pcn, matAct) ##---matAlen--- sum1<-rep(0, length(tri)); for(i in 1:length(tri)) { tmpsum=0; maxsum=0 if(tri[i]==loci){sum1[i]=loci} else if(tri[i]==0){sum1[i] = 0} else { tmpsum=chk[i,1] if(tmpsum>maxsum){maxsum=tmpsum} for(j in 2:loci) { if(chk[i,j]==0){tmpsum=0} else if (chk[i,j]==1 & chk[i,(j-1)]==0){tmpsum=1} else if (chk[i,j]==1 & chk[i,(j-1)]==1){tmpsum=1+tmpsum} if(tmpsum>maxsum){maxsum=tmpsum} } sum1[i] = maxsum } } matAlen<-matrix(sum1, nrow=RR, ncol=RR) len<-qshs.fun(pcs, matAlen)-qshs.fun(pcn, matAlen) ##---matAmtch--- matAmtch<-diag(rep(1, RR)) mtch<-qshs.fun(pcs, matAmtch)-qshs.fun(pcn, matAmtch) return(list(mtch=mtch,len=len, ct=ct, matAmtch=matAmtch, matAlen=matAlen,matAct= matAct)) } ####------- #### VAR ####------- get.power30.fun<-function(i=1,matE=E,vecP=p.pool,matA=A){sum(matA[i,i]*vecP[i]*matE[-i,-i])} exctVAR.QSHS.fun<-function(p.pool=p.pool,n=50,m=50, A=matA){ P.pool<-as.matrix(p.pool) L <-length(p.pool) ## A <-Smat[[i]] ##---power4--- power4<-(t(P.pool) %*% A %*% P.pool)^2 ##---power3--- ## 3-part0: d0A <-A diag(d0A)<-0 dP.pool <-diag(p.pool) E <-dP.pool%*%d0A%*%dP.pool power30 <-2*sum(apply(as.matrix(1:L),1,get.power30.fun, matE=E, vecP=p.pool, matA=A)) ## get.power30.fun<-function(i=1,matE=E,vecP=p.pool,matA=A){sum(matA[i,i]*vecP[i]*matE[-i,-i])} ## 3-part1: tmp <-dP.pool %*% A %*% dP.pool %*% A %*% dP.pool diag(tmp)<-0 power31 <-4 * sum(tmp) ## 3-part2: power32<-4 * sum( diag( d0A%*%dP.pool %*% ( matrix(rep(diag(A),L), ncol=L, byrow=T) ) %*% (dP.pool)^2 ) ) ## power32<-2 * sum(t(P.pool) %*% d0A %*% P.pool %*% t(P.pool)) ## 3-part3: dA <-as.matrix(diag(A)) dAdA <-dA %*% t(dA) power33<-2 * t(P.pool) %*% dAdA %*% (P.pool^2) # 0.36 ## 3-part4: power34<-4 * t(P.pool) %*% (A^2) %*% (P.pool^2) # 0.192 power3 <-(power30 + power31 + power32 + power33 + power34) ##---power2--- power21<-4 * t(dA*P.pool) %*% A %*% P.pool # 1.36 power22<-t(P.pool) %*% dAdA %*% P.pool # 1 power23<-2 * t(P.pool) %*% (A^2) %*% P.pool # 0.52 power2 <-(power21 + power22 + power23) ##---power1--- power1 <- t( (dA)^2) %*% P.pool ## for testing: ## p.pool<-c(0.2,0.3,0.5) ## A<-matrix(c(3,1,2, 1,3,4, 2,4,3),ncol=3, nrow=3) ##-------------------------------------------- B.bp <-(sum(diag( A %*% (diag(p.pool)-P.pool%*%t(P.pool)) ))/n + t(P.pool) %*% A %*% P.pool)^2 B.cn <-(sum(diag( A %*% (diag(p.pool)-P.pool%*%t(P.pool)) ))/m + t(P.pool) %*% A %*% P.pool)^2 ##-------------------------------------------- var.exct <-(power4* ( exp(log(m-1)+log(m-2)+log(m-3)-3*log(m)) + exp(log(n-1)+log(n-2)+log(n-3)-3*log(n)) )+ power3* (exp(log(m-1)+log(m-2)-3*log(m)) + exp(log(n-1)+log(n-2)-3*log(n)) ) + power2* (exp(log(m-1)-3*log(m)) + exp(log(n-1)-3*log(n)) ) + power1* (exp(-3*log(m)) + exp(-3*log(n)) ) - B.bp - B.cn ) ## var.exct <-(power4* ( ## exp(log(n-1)+log(n-2)+log(n-3)-3*log(n)) ## )+ ## power3* ( ## exp(log(n-1)+log(n-2)-3*log(n)) ## ) + ## power2* ( exp(log(n-1)-3*log(n)) ) + ## power1* ( exp(-3*log(n)) ## ) - B.bp ) return(var.exct)} getVAR.fun<-function(PP=pcscn, nn=nn, mm=mm, matAmtch=matAmtch,matAlen=matAlen,matAct=matAct){ var.ct <-exctVAR.QSHS.fun(PP, nn,mm, matAct) var.mtch<-exctVAR.QSHS.fun(PP, nn,mm, matAmtch) var.len <-exctVAR.QSHS.fun(PP, nn,mm, matAlen) return(list(var.mtch=var.mtch, var.len=var.len, var.ct=var.ct)) } ####=================================== #### main function (1): QSHStest.fun ####=================================== QSHStest.fun<-function(hcs=Hcs, hcn=Hcn,loci=5, nn=200, mm=200) { ##---------------------------- ## get table of Pcs and Pcn ##---------------------------- hap<-sort(unique(c(hcs$V1, hcn$V1))) Hfreq <-data.frame(matrix(0, nrow=length(hap), ncol=2)); row.names(Hfreq)<-hap;names(Hfreq)<-c("Pcs","Pcn") Hfreq[match(hcs$V1, hap),1]<-hcs$V2 Hfreq[match(hcn$V1, hap),2]<-hcn$V2 ##--------------------------------------------- ## calculate 3 types of Pi-A-Pi statistics (FD) ##--------------------------------------------- out.qshs<-getQSHS.fun(PP=Hfreq, hap=hap, loci=loci) ## "mtch" "len" "ct" "matAmtch" "matAlen" "matAct" Pcscn<-Hfreq$Pcs*(nn/(nn+mm)) + Hfreq$Pcn*(mm/(nn+mm)) var.qshs<-getVAR.fun(PP=Pcscn, nn=nn,mm=mm, matAmtch=out.qshs$matAmtch, matAlen=out.qshs$matAlen, matAct=out.qshs$matAct) ## "var.mtch" "var.len" "var.ct" ## Z.mtch <- out.qshs$mtch/sqrt(var.qshs$var.mtch) ## Z.len <- out.qshs$len /sqrt(var.qshs$var.len) ## Z.ct <- out.qshs$ct /sqrt(var.qshs$var.ct) return(c( "mtch" =out.qshs$mtch, "len" =out.qshs$len, "ct" =out.qshs$ct, "var.mtch"=var.qshs$var.mtch, "var.len" =var.qshs$var.len, "var.ct" =var.qshs$var.ct)) } ####-------------------------------------- #### Genomic Control to identify outliers ####-------------------------------------- ##--------------------------------------------------- ## estimate TAU by Ustat ##--------------------------------------------------- Ukg.fun<-function(kg, Dk=Dk, VVar=Var){ kk<-kg[1]; gg<-kg[2] UUkg<-(Dk[kk]-Dk[gg])/sqrt(VVar[kk]+VVar[gg]) return(UUkg)} ##--------------------------------------------------- ## finding cut point by controlling FDR ##--------------------------------------------------- findJ.fun<-function(ZZk){ pk <-1-pnorm(ZZk) nn <-length(pk) pk.sort<-pk[order(pk)] jan <-(1:nn)*0.05/nn out <-(1:nn)[pk.sort<=jan] if(length(out)==0){ return(NA)} else{return(max((1:nn)[pk.sort<=jan]))} } ##-------------- ## get outliers ##-------------- judgeplot<-function(z.i, ZZk=Zk, ZZ.cut=Z.cut){ plot( seq(1:length(z.i)), z.i, type="n",xlab="region",ylab="Z-stat", ylim=c(min(ZZk),max(ZZk))) text(seq(1:length(z.i)),z.i,seq(1:length(z.i))) abline(h=ZZ.cut,lty=2) return( "outlier"=(1:length(z.i))[z.i>ZZ.cut])} getoutlier.fun<-function(Dk,Var){ k <-length(Dk) KK <-rep((1:(k-1)),((k-1):1)) GG <-unlist(apply(as.matrix(2:k) ,1,seq,to=k,by=1) ) Index.Ukg<-cbind(KK,GG); rm(KK,GG) Ukg <-apply(Index.Ukg,1,Ukg.fun, Dk=Dk,VVar=Var) tau <-median(abs(Ukg))/0.65 mu.med <-median(Dk) Zk <-(Dk-mu.med)/(tau*sqrt(Var)) J <-findJ.fun(Zk) if(is.na(J)){Z.cut<-NA}else{ Z.cut<-qnorm(1-J*0.05/length(Zk))} if(is.na(Z.cut)) {Z.cut<-max(Zk+2)} pvalue<-pnorm(Zk, lower.tail=F) ## postscript("CountStat.ps", horizontal=F) qqnorm(Zk);abline(c(0,1)) outlier<- judgeplot(Zk, ZZk=Zk, ZZ.cut=Z.cut) ## dev.off() if(is.na(J)){J<-0} if(length(outlier)==0){outlier<-NA} return(list(mu.mud=mu.med, tau=tau, "Z"=Zk, "pvalue"=pvalue, "No. of outliers"=J, "Regions with outliers"=outlier)) } ####============================================ #### main function (1): getresult.fun ####============================================ getresult.fun<-function(qshs.result, type=3){ ##-------------------------------------------------------------- ## (1) type: 1=match stat, 2=length stat, 3=count stat ## (2) NOTE that if only 1 region (and hence qshs.result is a vector, ## will return the Z-stat of 3 types of Pi-A-Pi stat ##-------------------------------------------------------------- ## "qshs.result" must have the following format: (i.e. region is stored by column) ##-------------------------------------------------- ## region 1 region 2 region 3 ## mtch 0.0028984255 0.0207122579 0.0229081600 ## len 0.1326394059 0.1974848741 0.3562220800 ## ct 0.1860585201 0.3175514681 0.3080070400 ## var.mtch 0.0001046128 0.0004466133 0.0001304666 ## var.len 0.0055902168 0.0107979180 0.0203082416 ## var.ct 0.0067585282 0.0120631875 0.0206738161 ##-------------------------------------------------- tmp <-ncol(qshs.result) if(is.null(tmp) || tmp==1) { Z.mtch <-qshs.result[1]/sqrt(qshs.result[ 4]); Z.len <-qshs.result[2]/sqrt(qshs.result[ 5]); Z.ct <-qshs.result[3]/sqrt(qshs.result[ 6]); Z <-c(Z.mtch, Z.len, Z.ct) names(Z)<-c("mtch", "len", "ct") pvalue <-pnorm(Z, lower.tail=F) return(cbind(Z, pvalue)) }else{ index <-seq(type, nrow(qshs.result), 3); out<-qshs.result[index,] par(mfrow=c(2,length(index)/2)) Dk <-out[1,] Var <-out[2,] result <-getoutlier.fun(Dk,Var) } return(result) }