# MAFpseudocov.r # Pseudocovariate Correction for MAF # Raymond Walters, 2011 ####################################### # # Description: # Defines the function to correct the variable # importance of SNPs to reduce the impact # of MAF. # # INPUT: # - imps.M: matrix of importances for SNPs to be corrected. # Assumes each column corresponds to 1 SNP, and each row # corresponds to 1 LD subset. Missing values assumed to be # coded NA (eg. non-numeric). # - snpmafs.V: vector of observed MAFs for each of the SNPs in # imps.M. Length of snpmafs.V should match number of columns # in imps.M. # - psimps.M: matrix of importances for pseudocovariates. # Assumes each column corresponds to 1 pseudocovariate, # and each row corresponds to 1 LD subset. Should not # have any missing data. # - psmafs.V: vector of MAFs used to create the pseudocovariates. # Length of psmafs.V should match number of columns in psimps.M. # - digits: scalar indicating the precision of the pseudocovariates. # For example, if the pseudocovariates differ to 3 significant # digits [ex. psmafs.V <- seq(.01,.5,.001)], then should set # digits=3. # # OUTPUT: # - 1 by p matrix with dimensions matching imps.M containing # the corrected aggregated variable importances. Column names # from imps.M will be retained if present. # ####################################### pcovcor <- function(imps.M, snpmafs.V, psimps.M, psmafs.V,digits=3) { # check dimensions for imps match if(!(ncol(imps.M)==length(snpmafs.V))){ stop("Dimensions of importances and MAFs for SNPs do not match") } # check dimensions for covariates match if(!(ncol(psimps.M)==length(psmafs.V))){ stop("Dimensions of importances and MAFs for pseudocovariates do not match") } # initialize output dat.M <- matrix(NA,nrow(imps.M),ncol(imps.M)) try(colnames(dat.M) <- colnames(imps.M),silent=T) # fit loess curve to pseudocovariates x.M <- data.frame(imp=matrix(colMeans(psimps.M),ncol=1),maf=psmafs.V) lfit <- loess(imp~maf,data=x.M,family="symmetric") # estimate sd at pseudocovariate sd.V <- apply(psimps.M,2,sd) # correct the given importances for(i in 1:ncol(imps.M)){ # get predicted value from loess curve maf <- matrix(as.numeric(snpmafs.V[i]),1,1) colnames(maf) <- "maf" predimp <- predict(lfit,maf) # find closest pseudocovar to get sd estimate ind <- which(round(psmafs.V,digits)==round(snpmafs.V[i],digits)) localsd <- sd.V[ind] # error if multiple matches if(length(localsd)>1){ stop(paste("Multiple matches to observed MAF at column",i,"; incorrect number of significant digits?",sep="")) } # record corrected imp dat.M[,i] <- (imps.M[,i]-predimp)/localsd } # aggregate out.M <- matrix(colMeans(dat.M,na.rm=T),nrow=1) try(colnames(out.M) <- colnames(dat.M)) # output results return(out.M) } # eof