##----------------------------------------------------------------- ## Step 1: prepare haplotype freq table in the following format ##----------------------------------------------------------------- ## format: R-by-2 table, column = haplotype HapFreq ## Filename: hapfreq.cs1 ## 4851495149 0.046409639 ## 4851495150 0.004000000 ## 4851515149 0.004000000 ## 4950495149 0.012000000 ## . ## . ## ## Filename hapfreq.cn1 ## 4951485149 0.004000000 ## 4951495049 0.005204819 ## 4951495148 0.008000000 ## . ## . ## ##----------------------------------------------------------------- ## Step 2: save file QSHS.FD.RD.r ##----------------------------------------------------------------- ## source("QSHS.FD.RD.r") ## ## two main funcitons : (1) QSHStest.fun ---- calculate the quadratic statistics of haplotype ## similarity (QSHS; i.e., Pi-A-Pi in the paper) ## (2) getresult.fun ---- use genomic control to adjust for confounders (e.g., ## relatedness or subpopulation structure) and return ## the adjusted "Z" value ## ## ------------------------------------------------------------------ ## (1) Arguments of QSHStest.fun ## ------------------------------------------------------------------ ## . csfile, cnfile: filenames for hapfreq of cases and controls ## . fileIndex : specify a number if there are more than one hap regions needed to be analyzed. ## E.g., there are 100 regions. First save file names as "hapfreq.cs1", ## "hapfreq.cs2" to "hapfreq.cs100" etc, then fileIndex=1,2,...100. ## To run, type ## ## ## for the 3rd hap region only: ## QSHStest.fun(csfile="hapfreq.cs", cnfile="hapfreq.cn", fileIndex=3, loci=5, ## nn=200, mm=200, cc.RD=0.2, Rstar=NULL) ## ## or ## ## ## for hap region 1 to 100 ## apply(as.matrix(1:100),1,QSHStest.fun, csfile="hapfreq.cs", cnfile="hapfreq.cn",loci=5, ## nn=200, mm=200, cc.RD=0.2, Rstar=NULL) ## ## . loci : number of loci per haplotype region ## . nn : sample size of case haplotypes ## . mm : sample size of control haplotypes ## . cc.RD : cut-off criteria for reduced-dim (RD) ## (i.e. reserve categories with Pi >= cc.RD * Pi_{max}) ## . Rstar : directly specify the number of categories to reserve. The top Rstar categories ## would be reserved. NOTE: ## if perform Full-dim analysis only, then assign cc.RD=Rstar=NULL ## if perform RD with cc.RD*Pi_{max} cut-off, then assign cc.RD=(say)0.2, and Rstar=NULL ## if perform RD with a particular number of categories to reserve, then ## Rstar=(say)8, and cc.RD=NULL ## ## ------------------------------------------------------------------ ## (2) Arguments of getresult.fun ## ------------------------------------------------------------------ ## . qshs.result : a 6-by-K or 12-by-K matrix output from QSHStest.fun ## it's 6-by-K if only full-dim analysis is performed; ## it's 12-by-K if both full-dim and reduced-dim analyses; ## here K=no. of haplotype regions ## . type : 1= match statistic ## 2= length statistic ## 3= count statistic ## ##----------------------------------------------------------------------------------------------- ## Example I (multiple regions + Genomic Control to detect mutation): ##-------------------------------------------------------------------------------------- ## (1) copy QSHS.FD.RD.r (the source code) and the example datafiles: hapfreq.cs1 to hapfreq.cs3 ## hapfreq.cn1 to hapfreq.cn3 ## (2) in R, type: ## ## > source("QSHS.FD.RD.r") ## > qshs<-apply(as.matrix(1:3),1,QSHStest.fun, csfile="hapfreq.cs", cnfile="hapfreq.cn", ## loci=5, nn=200, mm=200, cc.RD=0.2, Rstar=NULL) ## Warning messages: ## 1: NAs introduced by coercion ## 2: NAs introduced by coercion ## 3: NAs introduced by coercion ## 4: NAs introduced by coercion ## 5: NAs introduced by coercion ## 6: NAs introduced by coercion ## ##------------------------------------------------------------------ ## ## region 1 region 2 region 3 ## ## mtch 0.0028984255 0.0207122579 0.0229081600 ## ## len 0.1326394059 0.1974848741 0.3562220800 ## ## ct 0.1860585201 0.3175514681 0.3080070400 ## ## var.mtch 0.0001046128 0.0004466133 0.0001304666 ## ## var.len 0.0055902168 0.0107979180 0.0203082416 ## ## var.ct 0.0067585282 0.0120631875 0.0206738161 ## ## mtch.rd 0.0115989489 0.1119812336 0.0007430400 ## ## len.rd 0.2201941367 0.4551034970 0.4632320000 ## ## ct.rd 0.2561867774 0.4551034970 0.4792140800 ## ## var.rd.mtch 0.0001794867 0.0012357990 0.0002094831 ## ## var.rd.len 0.0228871567 0.0350438867 0.0484044302 ## ## var.rd.ct 0.0307437873 0.0350438867 0.0507183490 ## ##---------------------------------------------------------------- ## ## > getresult.fun(qshs,type=3) ##<--count statistics ## ## $result.FD ## ## $result.FD$mu.med [1] 0.3080070 ## ## $result.FD$tau [1] 1.132744 ## ## $result.FD$Zk [1] -1.30953995 0.07671621 0.00000000 ## ## $result.FD$pvalue [1] 0.9048242 0.4694247 0.5000000 ## ## $result.FD$J [1] 0 ## ## $result.FD$outlier NA ## ## ## ## ## ## $result.RD ## ## $result.RD$mu.med [1] 0.4551035 ## ## $result.RD$tau [1] 1.193124 ## ## $result.RD$Zk [1] -0.95083890 0.00000000 0.08973038 ## ## $result.RD$pvalue [1] 0.8291569 0.5000000 0.4642507 ## ## $result.RD$J [1] 0 ## ## $result.RD$outlier NA ## ##----------------------------------------------------------------------------------------------- ## Example II (one single hap region only; in this case getresult.fun will output the 3 types of ## QSHS Z statistics regardless of the value of "type") ##----------------------------------------------------------------------------------------------- ## (1) copy QSHS.FD.RD.r (the source code) and two example datafiles: hapfreq.cs and hapfreq.cn ## (2) in R, type: ## > source("QSHS.FD.RD.r") ## > qshs<- QSHStest.fun(csfile="hapfreq.cs", cnfile="hapfreq.cn", fileIndex=NULL, loci=5, nn=200, mm=200, ## cc.RD=0.2, Rstar=NULL) ## (run Full-Dim and Reduced-Dim methods with cutoff = 0.2 Pi_{max}) ## Warning messages: ## 1: NAs introduced by coercion ## 2: NAs introduced by coercion ## ## mtch len ct var.mtch var.len var.ct ## ## 0.0028984255 0.1326394059 0.1860585201 0.0001046128 0.0055902168 0.0067585282 ## ## mtch.rd len.rd ct.rd var.rd.mtch var.rd.len var.rd.ct ## ## 0.0115989489 0.2201941367 0.2561867774 0.0001794867 0.0228871567 0.0307437873 ## ## getresult.fun(qshs) ## ## Z pvalue ## ## mtch 0.2833803 0.38844265 ## ## len 1.7740189 0.03803001 ## ## ct 2.2632029 0.01181159 ## ## rd.mtch 0.8657698 0.19330820 ## ## rd.len 1.4554920 0.07276655 ## ## rd.ct 1.4610936 0.07199488 ## ##-------------------------------------------------------------------------------------- ####--------------------------------------------- #### Calculate Pi-A-Pi statistics (Full-Dim) ####--------------------------------------------- qshs.fun<-function(p, matA){P<-as.matrix(p); t(P) %*% matA %*% P} getQSHS.fun<-function(PP=Hfreq, hap=hap, loci=loci){ pcs<-PP$Pcs; pcn<-PP$Pcn RR<-length(pcs) digit<-nchar(hap[1])/loci MM<-matrix(0, ncol=loci, nrow=RR) for(i in 1:loci){ MM[,i]<-substring(hap, (i-1)*digit+1,i*digit)} index.ij<-cbind(rep((1:RR), each=RR), rep((1:RR), times=RR)) chk<-t(apply(index.ij, 1, function(ij, mat=MM){i<-ij[1];j<-ij[2];as.numeric(mat[i,]==mat[j,])})) ##---matAct--- tri<-apply(chk,1,sum) matAct<-matrix(tri, ncol=RR, nrow=RR) ## qshs.fun<-function(p, matA){P<-as.matrix(p); t(P) %*% matA %*% P} ct<-qshs.fun(pcs, matAct)-qshs.fun(pcn, matAct) ##---matAlen--- sum1<-rep(0, length(tri)); for(i in 1:length(tri)) { tmpsum=0; maxsum=0 if(tri[i]==loci){sum1[i]=loci} else if(tri[i]==0){sum1[i] = 0} else { tmpsum=chk[i,1] if(tmpsum>maxsum){maxsum=tmpsum} for(j in 2:loci) { if(chk[i,j]==0){tmpsum=0} else if (chk[i,j]==1 & chk[i,(j-1)]==0){tmpsum=1} else if (chk[i,j]==1 & chk[i,(j-1)]==1){tmpsum=1+tmpsum} if(tmpsum>maxsum){maxsum=tmpsum} } sum1[i] = maxsum } } matAlen<-matrix(sum1, nrow=RR, ncol=RR) len<-qshs.fun(pcs, matAlen)-qshs.fun(pcn, matAlen) ##---matAmtch--- matAmtch<-diag(rep(1, RR)) mtch<-qshs.fun(pcs, matAmtch)-qshs.fun(pcn, matAmtch) return(mtch,len, ct, matAmtch, matAlen, matAct) } ####------- #### VAR ####------- get.power30.fun<-function(i=1,matE=E,vecP=p.pool,matA=A){sum(matA[i,i]*vecP[i]*matE[-i,-i])} exctVAR.QSHS.fun<-function(p.pool=p.pool,n=50,m=50, A=matA){ P.pool<-as.matrix(p.pool) L <-length(p.pool) ## A <-Smat[[i]] ##---power4--- power4<-(t(P.pool) %*% A %*% P.pool)^2 ##---power3--- ## 3-part0: d0A <-A diag(d0A)<-0 dP.pool <-diag(p.pool) E <-dP.pool%*%d0A%*%dP.pool power30 <-2*sum(apply(as.matrix(1:L),1,get.power30.fun, matE=E, vecP=p.pool, matA=A)) ## get.power30.fun<-function(i=1,matE=E,vecP=p.pool,matA=A){sum(matA[i,i]*vecP[i]*matE[-i,-i])} ## 3-part1: tmp <-dP.pool %*% A %*% dP.pool %*% A %*% dP.pool diag(tmp)<-0 power31 <-4 * sum(tmp) ## 3-part2: power32<-4 * sum( diag( d0A%*%dP.pool %*% ( matrix(rep(diag(A),L), ncol=L, byrow=T) ) %*% (dP.pool)^2 ) ) ## power32<-2 * sum(t(P.pool) %*% d0A %*% P.pool %*% t(P.pool)) ## 3-part3: dA <-as.matrix(diag(A)) dAdA <-dA %*% t(dA) power33<-2 * t(P.pool) %*% dAdA %*% (P.pool^2) # 0.36 ## 3-part4: power34<-4 * t(P.pool) %*% (A^2) %*% (P.pool^2) # 0.192 power3 <-(power30 + power31 + power32 + power33 + power34) ##---power2--- power21<-4 * t(dA*P.pool) %*% A %*% P.pool # 1.36 power22<-t(P.pool) %*% dAdA %*% P.pool # 1 power23<-2 * t(P.pool) %*% (A^2) %*% P.pool # 0.52 power2 <-(power21 + power22 + power23) ##---power1--- power1 <- t( (dA)^2) %*% P.pool ## for testing: ## p.pool<-c(0.2,0.3,0.5) ## A<-matrix(c(3,1,2, 1,3,4, 2,4,3),ncol=3, nrow=3) ##-------------------------------------------- B.bp <-(sum(diag( A %*% (diag(p.pool)-P.pool%*%t(P.pool)) ))/n + t(P.pool) %*% A %*% P.pool)^2 B.cn <-(sum(diag( A %*% (diag(p.pool)-P.pool%*%t(P.pool)) ))/m + t(P.pool) %*% A %*% P.pool)^2 ##-------------------------------------------- var.exct <-(power4* ( exp(log(m-1)+log(m-2)+log(m-3)-3*log(m)) + exp(log(n-1)+log(n-2)+log(n-3)-3*log(n)) )+ power3* (exp(log(m-1)+log(m-2)-3*log(m)) + exp(log(n-1)+log(n-2)-3*log(n)) ) + power2* (exp(log(m-1)-3*log(m)) + exp(log(n-1)-3*log(n)) ) + power1* (exp(-3*log(m)) + exp(-3*log(n)) ) - B.bp - B.cn ) ## var.exct <-(power4* ( ## exp(log(n-1)+log(n-2)+log(n-3)-3*log(n)) ## )+ ## power3* ( ## exp(log(n-1)+log(n-2)-3*log(n)) ## ) + ## power2* ( exp(log(n-1)-3*log(n)) ) + ## power1* ( exp(-3*log(n)) ## ) - B.bp ) return(var.exct)} getVAR.fun<-function(PP=pcscn, nn=nn, mm=mm, matAmtch=matAmtch,matAlen=matAlen,matAct=matAct){ var.ct <-exctVAR.QSHS.fun(PP, nn,mm, matAct) var.mtch<-exctVAR.QSHS.fun(PP, nn,mm, matAmtch) var.len <-exctVAR.QSHS.fun(PP, nn,mm, matAlen) return(var.mtch, var.len, var.ct) } ####---------------------------- #### Reduced-dim Analysis ####---------------------------- ##--------------------------------------------------------- ## decide the hap catogeries to be reserved (the Main dim) ##--------------------------------------------------------- getnewdim.fun<-function(vecp.pool=pcscn, hap=hap, Rstar=Rstar){ orgP <-vecp.pool names(orgP)<-hap sortP <-rev(sort(orgP)) max.ind <-max( (1:length(sortP))[!is.na(match(sortP, sortP[Rstar]))] ) RR <-max.ind ## to prevent a tie in the cutoff main.freq <-sortP[1:RR] main.type <-names(main.freq) newP.pool <-c(main.freq, (1-sum(main.freq))) names(newP.pool)[RR+1]<-"gggggggg" return(list(main.type=main.type, main.freq=main.freq, newP.pool=newP.pool)) } ##----------------------------------------------------- ## regroup the rest categories into "Main" or "Other" ##----------------------------------------------------- regroup.fun<-function(oldP=Hfreq$Pcs, hap=hap, main.list=newdim, loci=loci){ main.type <-main.list$main.type main.freq <-main.list$main.freq Rstar <-length(main.type) ## Rstar = RR-1 main.freq.here<-rep(0, Rstar+1) names(main.freq.here)<-c(main.type, "gggggggg") main.freq.here[match(hap, main.type)]<-oldP rest.freq.here<-oldP[is.na(match(hap, main.type))] rest.type <-hap[is.na(match(hap, main.type))] nn.rest <-length(rest.freq.here) if(nn.rest>0) { digit<-nchar(hap[1])/loci ale.rest<-matrix(0, ncol=loci, nrow=nn.rest) ale.main<-matrix(0, ncol=loci, nrow=Rstar) for(i in 1:loci){ ale.rest[,i]<-as.numeric(substring(rest.type, (i-1)*digit+1,i*digit))} for(i in 1:loci){ ale.main[,i]<-as.numeric(substring(main.type, (i-1)*digit+1,i*digit))} tmp <-apply(as.matrix(1:nn.rest), 1, count.regroup.fun, RRR=(Rstar+1), loci=loci, rest.freq=rest.freq.here, main.type=main.type, main.freq=main.freq, ale.rest =ale.rest, ale.main=ale.main) regroup<-main.freq.here+ apply(tmp, 1,sum) } else{ regroup<-main.freq.here } return(regroup) } count.regroup.fun<-function(j, RRR=RR, loci=loci, rest.freq=rest.freq.here, main.type=main.type, main.freq=main.freq, ale.rest=ale.rest, ale.main=ale.main){ regr<-rep(0, RRR) jnk<-0; for(i in 1:loci){jnk<-jnk+abs(ale.rest[j,i]-ale.main[,i])} close.main<-main.type[jnk==1] len.close <-length(close.main) if(len.close>0) { pos <-match(close.main, main.type) tmp <-max(main.freq[pos]) final.main <-close.main[main.freq[pos]==tmp] final.pos <-match(final.main, main.type) llength <-length(final.pos) regr[final.pos]<-rest.freq[j]/llength } else {regr[RRR]<-rest.freq[j]} return(regr) } ##-------------------------------------------------- ## calculate Rstar from the inputed cut-off ##------------------------------------------------- get.Rstar.fun<-function(cc,pcscn=pcscn){ length(pcscn[pcscn>max(pcscn)*cc | pcscn==max(pcscn)*cc])} ##------------------------------------------- ## calculate Pi-A-Pi stat (reduced-dim) ##------------------------------------------ getQSHS.rd.fun<-function(pcs, pcn, hap=hap, loci=loci){ RR<-length(pcs) digit<-nchar(hap[1])/loci MM<-matrix(0, ncol=loci, nrow=RR) for(i in 1:loci){ MM[,i]<-substring(hap, (i-1)*digit+1,i*digit)} index.ij<-cbind(rep((1:RR), each=RR), rep((1:RR), times=RR)) chk<-t(apply(index.ij, 1, function(ij, mat=MM){i<-ij[1];j<-ij[2];as.numeric(mat[i,]==mat[j,])})) ##---matAct--- tri<-apply(chk,1,sum) matAct<-matrix(tri, ncol=RR, nrow=RR) ## qshs.fun<-function(p, matA){P<-as.matrix(p); t(P) %*% matA %*% P} if(is.na(sum(as.numeric(hap[RR])))){ matAct[RR,]<-0; matAct[,RR]<-0 } ct<-qshs.fun(pcs, matAct)-qshs.fun(pcn, matAct) ##---matAlen--- sum1<-rep(0, length(tri)); for(i in 1:length(tri)) { tmpsum=0; maxsum=0 if(tri[i]==loci){sum1[i]=loci} else if(tri[i]==0){sum1[i] = 0} else { tmpsum=chk[i,1] if(tmpsum>maxsum){maxsum=tmpsum} for(j in 2:loci) { if(chk[i,j]==0){tmpsum=0} else if (chk[i,j]==1 & chk[i,(j-1)]==0){tmpsum=1} else if (chk[i,j]==1 & chk[i,(j-1)]==1){tmpsum=1+tmpsum} if(tmpsum>maxsum){maxsum=tmpsum} } sum1[i] = maxsum } } matAlen <-matrix(sum1, nrow=RR, ncol=RR) matAmtch<-diag(rep(1, RR)) if(is.na(sum(as.numeric(hap[RR])))){ matAlen[RR,]<-0; matAlen[,RR]<-0; matAmtch[RR,RR]<-0 } len<-qshs.fun(pcs, matAlen)-qshs.fun(pcn, matAlen) mtch<-qshs.fun(pcs, matAmtch)-qshs.fun(pcn, matAmtch) return(mtch,len, ct, matAmtch, matAlen, matAct) } ####=================================== #### main function (1): QSHStest.fun ####=================================== QSHStest.fun<-function(csfile="hapfreq.cs", cnfile="hapfreq.cn",fileIndex=NULL, loci=5, nn=200, mm=200, cc.RD=0.2, Rstar=NULL) { ## if cc.RD=NULL and Rstar=NULL then only performance Full dim analysis ##---------------------------- ## get table of Pcs and Pcn ##---------------------------- hcs<-read.table(paste(csfile,fileIndex,sep=""), colClasses = c("character","numeric")) hcn<-read.table(paste(cnfile,fileIndex,sep=""), colClasses = c("character","numeric")) hap<-sort(unique(c(hcs$V1, hcn$V1))) Hfreq <-data.frame(matrix(0, nrow=length(hap), ncol=2)); row.names(Hfreq)<-hap;names(Hfreq)<-c("Pcs","Pcn") Hfreq[match(hcs$V1, hap),1]<-hcs$V2 Hfreq[match(hcn$V1, hap),2]<-hcn$V2 ##--------------------------------------------- ## calculate 3 types of Pi-A-Pi statistics (FD) ##--------------------------------------------- out.qshs<-getQSHS.fun(PP=Hfreq, hap=hap, loci=loci) ## "mtch" "len" "ct" "matAmtch" "matAlen" "matAct" Pcscn<-Hfreq$Pcs*(nn/(nn+mm)) + Hfreq$Pcn*(mm/(nn+mm)) var.qshs<-getVAR.fun(PP=Pcscn, nn=nn,mm=mm, matAmtch=out.qshs$matAmtch, matAlen=out.qshs$matAlen, matAct=out.qshs$matAct) ## "var.mtch" "var.len" "var.ct" ## Z.mtch <- out.qshs$mtch/sqrt(var.qshs$var.mtch) ## Z.len <- out.qshs$len /sqrt(var.qshs$var.len) ## Z.ct <- out.qshs$ct /sqrt(var.qshs$var.ct) if((is.null(cc.RD)+is.null(Rstar))<2) { ##--------------------------------------------- ## dimension reduction ##--------------------------------------------- if(is.null(cc.RD)){Rstar<-Rstar}else{Rstar<-get.Rstar.fun(cc=cc.RD, Pcscn)} if(Rstar>0){ newdim <-getnewdim.fun(Pcscn, hap, Rstar=Rstar) pcs.rd <-regroup.fun(oldP=Hfreq$Pcs, hap=hap, main.list=newdim, loci=loci) pcn.rd <-regroup.fun(oldP=Hfreq$Pcn, hap=hap, main.list=newdim, loci=loci) pcscn.rd<-regroup.fun(oldP=Pcscn, hap=hap, main.list=newdim, loci=loci) qshs.rd <-getQSHS.rd.fun(pcs.rd,pcn.rd, hap=c(newdim$main.type, "gggggggg"),loci=loci) ## "mtch" "len" "ct" "matAmtch" "matAlen" "matAct" var.rd.pool<-getVAR.fun(pcscn.rd, nn=nn, mm=mm, matAmtch=qshs.rd$matAmtch,matAlen=qshs.rd$matAlen,matAct=qshs.rd$matAct) ##"var.mtch" "var.len" "var.ct" ## Z.rd.mtch<-qshs.rd$mtch/sqrt(var.rd.pool$var.mtch) ## Z.rd.len <-qshs.rd$len/sqrt(var.rd.pool$var.len) ## Z.rd.ct <-qshs.rd$ct/sqrt(var.rd.pool$var.ct) } return(c( "mtch" =out.qshs$mtch, "len" =out.qshs$len, "ct" =out.qshs$ct, "var.mtch"=var.qshs$var.mtch, "var.len" =var.qshs$var.len, "var.ct" =var.qshs$var.ct, "mtch.rd" =qshs.rd$mtch, "len.rd" =qshs.rd$len, "ct.rd" =qshs.rd$ct, "var.rd.mtch"=var.rd.pool$var.mtch, "var.rd.len"=var.rd.pool$var.len, "var.rd.ct"=var.rd.pool$var.ct)) } else { return(c( "mtch" =out.qshs$mtch, "len" =out.qshs$len, "ct" =out.qshs$ct, "var.mtch"=var.qshs$var.mtch, "var.len" =var.qshs$var.len, "var.ct" =var.qshs$var.ct)) } } ####-------------------------------------- #### Genomic Control to identify outliers ####-------------------------------------- ##--------------------------------------------------- ## estimate TAU by Ustat ##--------------------------------------------------- Ukg.fun<-function(kg, Dk=Dk, VVar=Var){ kk<-kg[1]; gg<-kg[2] UUkg<-(Dk[kk]-Dk[gg])/sqrt(VVar[kk]+VVar[gg]) return(UUkg)} ##--------------------------------------------------- ## finding cut point by controlling FDR ##--------------------------------------------------- findJ.fun<-function(ZZk){ pk <-1-pnorm(ZZk) nn <-length(pk) pk.sort<-pk[order(pk)] jan <-(1:nn)*0.05/nn out <-(1:nn)[pk.sort<=jan] if(length(out)==0){ return(NA)} else{return(max((1:nn)[pk.sort<=jan]))} } ##-------------- ## get outliers ##-------------- judgeplot<-function(z.i, ZZk=Zk, ZZ.cut=Z.cut){ plot( seq(1:length(z.i)), z.i, type="n",xlab="region",ylab="Z-stat", ylim=c(min(ZZk),max(ZZk))) text(seq(1:length(z.i)),z.i,seq(1:length(z.i))) abline(h=ZZ.cut,lty=2) return( "outlier"=(1:length(z.i))[z.i>ZZ.cut])} getoutlier.fun<-function(Dk,Var){ k <-length(Dk) KK <-rep((1:(k-1)),((k-1):1)) GG <-unlist(apply(as.matrix(2:k) ,1,seq,to=k,by=1) ) Index.Ukg<-cbind(KK,GG); rm(KK,GG) Ukg <-apply(Index.Ukg,1,Ukg.fun, Dk=Dk,VVar=Var) tau <-median(abs(Ukg))/0.65 mu.med <-median(Dk) Zk <-(Dk-mu.med)/(tau*sqrt(Var)) J <-findJ.fun(Zk) if(is.na(J)){Z.cut<-NA}else{ Z.cut<-qnorm(1-J*0.05/length(Zk))} if(is.na(Z.cut)) {Z.cut<-max(Zk+2)} pvalue<-pnorm(Zk, lower.tail=F) ## postscript("CountStat.ps", horizontal=F) qqnorm(Zk);abline(c(0,1)) outlier<- judgeplot(Zk, ZZk=Zk, ZZ.cut=Z.cut) ## dev.off() if(is.na(J)){J<-0} if(length(outlier)==0){outlier<-NA} return(mu.med, tau, "Z"=Zk, "pvalue"=pvalue, "No. of outliers"=J, "Regions with outliers"=outlier) } ####============================================ #### main function (1): getresult.fun ####============================================ getresult.fun<-function(qshs.result, type=3){ ##-------------------------------------------------------------- ## (1) type: 1=match stat, 2=length stat, 3=count stat ## (2) NOTE that if only 1 region (and hence qshs.result is a vector, ## will return the Z-stat of 3 types of Pi-A-Pi stat ##-------------------------------------------------------------- ## "qshs.result" must have the following format: (i.e. region is stored by column) ##-------------------------------------------------- ## region 1 region 2 region 3 ## mtch 0.0028984255 0.0207122579 0.0229081600 ## len 0.1326394059 0.1974848741 0.3562220800 ## ct 0.1860585201 0.3175514681 0.3080070400 ## var.mtch 0.0001046128 0.0004466133 0.0001304666 ## var.len 0.0055902168 0.0107979180 0.0203082416 ## var.ct 0.0067585282 0.0120631875 0.0206738161 ## mtch.rd 0.0115989489 0.1119812336 0.0007430400 ## len.rd 0.2201941367 0.4551034970 0.4632320000 ## ct.rd 0.2561867774 0.4551034970 0.4792140800 ## var.rd.mtch 0.0001794867 0.0012357990 0.0002094831 ## var.rd.len 0.0228871567 0.0350438867 0.0484044302 ## var.rd.ct 0.0307437873 0.0350438867 0.0507183490 ##-------------------------------------------------- tmp <-ncol(qshs.result) if(is.null(tmp) || tmp==1) { Z.mtch <-qshs.result[1]/sqrt(qshs.result[ 4]); Z.len <-qshs.result[2]/sqrt(qshs.result[ 5]); Z.ct <-qshs.result[3]/sqrt(qshs.result[ 6]); Z.rd.mtch<-qshs.result[7]/sqrt(qshs.result[10]); Z.rd.len <-qshs.result[8]/sqrt(qshs.result[11]); Z.rd.ct <-qshs.result[9]/sqrt(qshs.result[12]); Z <-c(Z.mtch, Z.len, Z.ct, Z.rd.mtch, Z.rd.len, Z.rd.ct); names(Z)<-c("mtch", "len", "ct", "rd.mtch", "rd.len", "rd.ct"); pvalue <-pnorm(Z, lower.tail=F) return(cbind(Z, pvalue)) }else{ index <-seq(type, nrow(qshs.result), 3); out<-qshs.result[index,] par(mfrow=c(2,length(index)/2)) Dk <-out[1,] Var <-out[2,] result.FD <-getoutlier.fun(Dk,Var) if(length(index)>2) { Dk.rd <-out[3,]; Var.rd<-out[4,] result.RD<- getoutlier.fun(Dk.rd,Var.rd) return(result.FD, result.RD) } else {return(result.FD)} } }