#---------------------------------------------------------------------------------------------- # Functions implementing methods in "Survival Prediction of Diffuse Large-B-Cell Lymphoma # Based on Both Clinical and Gene Expression Information" # # Author: Lexin Li # # Email: li@stat.ncsu.edu # # Last updated: September 7, 2005 # #---------------------------------------------------------------------------------------------- #---------------------------------------------------------------------------------------------- # supporting functions #---------------------------------------------------------------------------------------------- # normalize a vector norm<-function(v) v/sqrt(sum(v^2)) # center a vector center<-function(v) v - mean(v) # covariance matrix cov.x<-function(X) { Xc<-apply(X, 2, center) t(Xc) %*% Xc / nrow(Xc) } # square-root of a positive definite symmetric matrix mat.sqrt<-function(A) { if(nrow(A) != ncol(A)) stop("the matrix must be symmetric") ei<-eigen(A) ans<-ei$vectors %*% diag(sqrt(ei$values)) %*% t(ei$vectors) return(ans) } # square-root-inverse of a positive definite symmetric matrix mat.sqrt.inv<-function(A) { if(nrow(A) != ncol(A)) stop("the matrix must be symmetric") ei<-eigen(A) ans<-ei$vectors %*% diag(1 / sqrt(ei$values)) %*% t(ei$vectors) return(ans) } # angle between two spaces angles<-function(B1, B2) { if(ncol(B1) >= ncol(B2)) { B<-B1; B.hat<-B2 } else { B<-B2; B.hat<-B1 } P1<-B %*% solve(t(B) %*% B) %*% t(B) if(ncol(B.hat) == 1) { nume<-as.vector(t(B.hat) %*% P1 %*% B.hat) deno<-as.vector(t(B.hat) %*% B.hat) ratio<-nume / deno } else { BtB<-t(B.hat) %*% B.hat ei<-eigen(BtB) BtB2<-ei$vectors %*% diag(1/sqrt(ei$values)) %*% t(ei$vectors) M<-BtB2 %*% t(B.hat) %*% P1 %*% B.hat %*% BtB2 ratio<-abs(eigen(M)$values[nrow(M)]) } ans<-acos(sqrt(ratio))/pi * 180 if(ans > 90) ans<-180 - ans return(ans) } # accumlative percentage of variance for principal components apv.pc<-function(data.x, gindex=seq(1, ncol(data.x)), pc.center=F, pc.scale=F, plot=F) { # data X0<-data.x[, gindex] # pc pc.out<-prcomp(X0, center=pc.center, scale=pc.scale) # accumulative percentage of variance pc.var<-pc.out$sdev^2 total.var<-sum(pc.var) perc.var<-NULL for(i in 1:length(pc.var)) perc.var<-c(perc.var, sum(pc.var[1:i])/total.var) if(plot) plot(seq(1, length(pc.var)), perc.var) ans<-perc.var return(ans) } #---------------------------------------------------------------------------------------------- # sir functions #---------------------------------------------------------------------------------------------- # double slice function dr.slices.double<-function(y, nslices) { u<-sort(unique(y[,2])) h<-nslices ind<-sizes<-0 count.slices<-0 for (j in 1:length(u)) { pos<-which(y[,2]==u[j]) s2<-dr.slice.1d(y[pos, 1], h) ind[pos]<-count.slices + s2$slice.indicator sizes[(count.slices + 1) : (count.slices + s2$nslices)]<-s2$slice.sizes count.slices<-count.slices + s2$nslices } list(slice.indicator=ind, nslices=count.slices, slice.sizes=sizes) } # slice functions from Sandy Weisberg's dr package dr.slices <- function(y,nslices) { p <- if (is.matrix(y)) dim(y)[2] else 1 h <- if (length(nslices) == p) nslices else rep(ceiling(nslices^(1/p)),p) a <- dr.slice.1d( if(is.matrix(y)) y[,1] else y, h[1]) if (p > 1){ for (col in 2:p) { ns <- 0 for (j in unique(a$slice.indicator)) { b <- dr.slice.1d(y[a$slice.indicator==j,col],h[col]) a$slice.indicator[a$slice.indicator==j] <- a$slice.indicator[a$slice.indicator==j] + 10^(p-1)*b$slice.indicator ns <- ns + b$nslices } a$nslices <- ns } #recode unique values to 1:nslices and fix up slice sizes v <- unique(a$slice.indicator) L <- NULL for (i in 1:length(v)) { sel <- a$slice.indicator==v[i] a$slice.indicator[sel] <- i L <- c(L,length(a$slice.indicator[sel])) } a$slice.sizes <- L } a } dr.slice.1d <- function(y,h) { z<-unique(y) if (length(z) >= h) dr.slice2(y,h) else dr.slice1(y,length(z),sort(z)) } dr.slice1 <- function(y,h,u){ z <- sizes <- 0 for (j in 1:length(u)) { temp <- which(y==u[j]) z[temp] <- j sizes[j] <- length(temp) } list(slice.indicator=z, nslices=length(u), slice.sizes=sizes) } dr.slice2<-function(y,h) { or <- order(y) n <- length(y) m<-floor(n/h) r<-n-m*h start<-sp<-ans<-0 j<-1 while((start+m) 0) { e<-sort(ei$values) st<-df<-pv<-0 for(m in 0:dmax) { st[m+1]<-n*sum(e[seq(1,p-m)]) df[m+1]<-(sum(hw) - m - C)*(p - m) pv[m+1]<-1 - pchisq(st[m+1],df[m+1]) } asy.test<-data.frame(cbind(st,df,pv)) rr<-paste(0:dmax,"D vs >= ",1:(dmax+1),"D",sep="") dimnames(asy.test)<-list(rr,c("Stat","df","p-value")) } # output ans<-list(b.est=b.est) if(dmax > 0) ans<-list(b.est=b.est, asy.test=asy.test) ans<-return(ans) } # survival sir ssir<-function(X, y, nslices, d=1, dmax=4) { # parameters n<-nrow(X) p<-ncol(X) Sigma.x<-cov.x(X) Sigma.x.inv2<-mat.sqrt.inv(Sigma.x) # kernel matrix Z<-apply(X, 2, center) %*% Sigma.x.inv2 out.M<-M.sir(Z, y, nslices) Mp<-out.M$M nslices<-out.M$nslices # estimate ei<-eigen(Mp) v<-Sigma.x.inv2 %*% ei$vectors[, 1:d] if(d == 1) v<-matrix(v, ncol=1) b.est<-apply(v, 2, norm) # dimension test (normal simplification) if(dmax > 0) { e<-sort(ei$values) st<-df<-pv<-0 for(m in 0:dmax) { st[m+1]<-n*sum(e[seq(1,p-m)]) df[m+1]<-(nslices - m - 1)*(p - m) pv[m+1]<-1 - pchisq(st[m+1],df[m+1]) } asy.test<-data.frame(cbind(st,df,pv)) rr<-paste(0:dmax,"D vs >= ",1:(dmax+1),"D",sep="") dimnames(asy.test)<-list(rr,c("Stat","df","p-value")) } # output ans<-list(b.est=b.est) if(dmax > 0) ans<-list(b.est=b.est, asy.test=asy.test) return(ans) } #---------------------------------------------------------------------------------------------- # survival functions #---------------------------------------------------------------------------------------------- surv.plot<-function(scores, data.y, num.strata=3, sep.pos=NULL, xmax=max(stime), plot=T) { stime<-data.y[,1] status<-data.y[,2] if(length(unique(scores)) <= num.strata) { sep.pos<-unique(scores) group<-rep(0, length(scores)) for(i in 1:length(sep.pos)) group[scores == sep.pos[i]]<-i } else { if(is.null(sep.pos)) { probs<-seq(1, num.strata) / num.strata sep.pos<-quantile(scores, probs=probs) group<-rep(1, length(scores)) for(i in 1:(num.strata-1)) group[scores >= sep.pos[i]]<-i+1 } else { group<-rep(1, length(scores)) for(i in 1:length(sep.pos)) group[scores >= sep.pos[i]]<-i+1 } } test.out<-survdiff(Surv(stime, status) ~ group) test.p.value<-1 - pchisq(test.out$chisq, num.strata-1) if(plot) { out<-plot(survfit(Surv(stime, status) ~ strata(group)), xlab="Time to death", ylab="Death-free survival", xmax=xmax) } #ans<-list(test=test.out, x=out$x, y=out$y) ans<-test.p.value return(ans) } auc<-function(scores, data.y, max.time=10) { ans<-NULL for(i in 1:max.time) { roc<-roc.KM.calc(data.y[,1], data.y[,2], scores, predict.time=i, span=1) ans<-c(ans, roc.area(roc)) } return(ans) } # K-fold cv cv<-function(data.x.tr, data.w.tr, data.y.tr, q, K=10, pc.center=T, pc.scale=F, nslices=4, max.time=10) { n<-nrow(data.x.tr) ngp<-floor(n / K) slice<-dr.slices(seq(1,n), ngp) auc.ssir<-auc.psir<-NULL for(i in 1:slice$nslices) { CVset<-seq(1, n)[slice$slice.indicator == i] x.tr<-data.x.tr[-CVset,]; w.tr<-data.w.tr[-CVset]; y.tr<-data.y.tr[-CVset,] x.te<-data.x.tr[CVset,]; w.te<-data.w.tr[CVset]; y.te<-data.y.tr[CVset,] out.i<-score.mdls(x.tr, w.tr, y.tr, x.te, w.te, y.te, q, pc.center, pc.scale, nslices) scores.ssir<-out.i$scores.te2 scores.psir<-out.i$scores.te3 auc.ssir.i<-auc(scores.ssir, y.te, max.time=max.time) auc.psir.i<-auc(scores.psir, y.te, max.time=max.time) auc.ssir<-rbind(auc.ssir, auc.ssir.i) auc.psir<-rbind(auc.psir, auc.psir.i) } # mean auc over K-fold auc.ssir.m<-apply(auc.ssir, 2, mean, na.rm=T) auc.psir.m<-apply(auc.psir, 2, mean, na.rm=T) ans<-list(auc.ssir=auc.ssir.m, auc.psir=auc.psir.m) return(ans) } score.mdls<-function(data.x.tr, data.w.tr, data.y.tr, data.x.te, data.w.te, data.y.te, q, pc.center=T, pc.scale=F, nslices=4) { # pc (q selected components) pc.out<-prcomp(data.x.tr, center=pc.center, scale=pc.scale) X<-pc.out$x[, 1:q] rotation<-pc.out$rotation colm<-as.vector(apply(data.x.tr, 2, mean)) cols<-as.vector(apply(data.x.tr, 2, sd)) y<-data.y.tr W<-factor(unname(data.w.tr)) # sdr out.ssir<-ssir(X, y, nslices, d=1, dmax=4) out.psir<-pssir.eqv(X, W, y, nslices, d=1, dmax=4) # extracted components stime<-y[,1] status<-y[,2] bs1<-as.vector(X %*% out.ssir$b.est[,1]) bp1<-as.vector(X %*% out.psir$b.est[,1]) bp1.2<-bp1^2 # survival fit1<-coxph(Surv(stime, status)~W, method="breslow", x=TRUE) fit2<-coxph(Surv(stime, status)~bs1, method="breslow", x=TRUE) fit3<-coxph(Surv(stime, status)~W + bp1, method="breslow", x=TRUE) # scores for training data scores.tr1<-as.vector(predict(fit1, type="lp")) scores.tr2<-as.vector(predict(fit2, type="lp")) scores.tr3<-as.vector(predict(fit3, type="lp")) # scores for testing data X0.test<-data.x.te if(pc.center) X0.test<-t(apply(X0.test, 1, "-", colm)) if(pc.scale) X0.test<-t(apply(X0.test, 1, "/", cols)) X.test<-(X0.test %*% rotation)[, 1:q] bs1<-as.vector(X.test %*% out.ssir$b.est[,1]) bp1<-as.vector(X.test %*% out.psir$b.est[,1]) bp1.2<-bp1^2 W.all<-factor(unname(c(data.w.te, data.w.tr))) W.test<-model.matrix(~W.all)[(1:length(data.w.te)),-1] newdata<-cbind(bs1, bp1, bp1.2, W.test) if(nlevels(W.all) == 2) { scores.te1<-as.vector(newdata[,4] * fit1$coef) - sum(fit1$coef * fit1$means) scores.te2<-as.vector(newdata[,1] * fit2$coef) - sum(fit2$coef * fit2$means) scores.te3<-as.vector(newdata[,c(4,2)] %*% fit3$coef) - sum(fit3$coef * fit3$means) } if(nlevels(W.all) == 3) { scores.te1<-as.vector(newdata[,c(4,5)] %*% fit1$coef) - sum(fit1$coef * fit1$means) scores.te2<-as.vector(newdata[,1] * fit2$coef) - sum(fit2$coef * fit2$means) scores.te3<-as.vector(newdata[,c(4,5,2)] %*% fit3$coef) - sum(fit3$coef * fit3$means) } # return ans<-list(scores.tr1=scores.tr1, scores.tr2=scores.tr2, scores.tr3=scores.tr3, scores.te1=scores.te1, scores.te2=scores.te2, scores.te3=scores.te3) return(ans) } score.pc<-function(data.x.tr, data.w.tr, data.y.tr, data.x.te, data.w.te, data.y.te, q, pc.center=T, pc.scale=F) { # pc (q selected components) pc.out<-prcomp(data.x.tr, center=pc.center, scale=pc.scale) X<-pc.out$x[, 1:q] rotation<-pc.out$rotation colm<-as.vector(apply(data.x.tr, 2, mean)) cols<-as.vector(apply(data.x.tr, 2, sd)) y<-data.y.tr W<-factor(unname(data.w.tr)) # extracted components stime<-y[,1] status<-y[,2] # survival fit1<-coxph(Surv(stime, status)~X, method="breslow", x=TRUE) fit2<-coxph(Surv(stime, status)~X + W, method="breslow", x=TRUE) # scores for training data scores.tr1<-as.vector(predict(fit1, type="lp")) scores.tr2<-as.vector(predict(fit2, type="lp")) # scores for testing data X0.test<-data.x.te if(pc.center) X0.test<-t(apply(X0.test, 1, "-", colm)) if(pc.scale) X0.test<-t(apply(X0.test, 1, "/", cols)) X.test<-(X0.test %*% rotation)[, 1:q] W.all<-factor(unname(c(data.w.te, data.w.tr))) W.test<-model.matrix(~W.all)[(1:length(data.w.te)),-1] newdata<-cbind(X.test, W.test) scores.te1<-as.vector(newdata[,seq(1,q)] %*% fit1$coef) - sum(fit1$coef * fit1$means) scores.te2<-as.vector(newdata[,] %*% fit2$coef) - sum(fit2$coef * fit2$means) # return ans<-list(scores.tr1=scores.tr1, scores.tr2=scores.tr2, scores.te1=scores.te1, scores.te2=scores.te2) return(ans) }