# snp selection to disrupt LD # Gitta Lubke and Raymond Walters, 2011 ########################################### # # INPUT: # snp.M - matrix of snp data with columns properly ordered # k - the size of the windows (also determines the number of data sets produced # bl - the size of blocks to permute data within # OUTPUT # selmat.M - k x k matrix; each column contains index numbers for SNPs in a data set # ########################################### snp_sel <- function(snp.M,k,bl) { n <- ncol(snp.M) snp.M[snp.M==-9]<-NA ################################# # Randomly permute columns within small blocks. This should help # randomize the snps that get grouped for the "first selection" # while maintaining the distances between snps for windows ################################# bc <- n%/%bl # count of blocks present in data perm <- NULL for(j in 1:bc){ perm <- c(perm,sample((bl*(j-1)+1):(bl*j),bl,replace=F)) } if(n%%bl>1){ # handles any remainder beyond multiple of block length perm <- c(perm,sample((bl*bc+1):n,n%%bl,replace=F)) } if(n%%bl==1){ # handled separately due to behavior of sample() with single number perm <- c(perm,n) } ################################### # Handle missing data by removing SNPs with # no variance. We do this after permutation # so that the block sizes are still handled as intended. ################################### # if snp has no variance, remove from consideration perm2 <- perm perm <- perm[sd(snp.M[,perm],na.rm=T)>0] # update n in case some SNPs removed n <- length(perm) dat.M <- snp.M[,perm] ########################################## # "first selection" is straight forward: seq(1,n,by=k) # the w loop loops through the windows ########### mcmat.M<-matrix(0,k-1,n-1) #test: n=300 for(s in 1:(n-1)){ if((s+k)<=(n+1)){ mcmat.M[,s]<-cor(dat.M[,s],dat.M[,((s+1):(s+k-1))],use="pairwise.complete.obs") }else{ dum<-length((s+1):n) mcmat.M[1:dum,s]<-cor(dat.M[,s],dat.M[,(s+1):n],use="pairwise.complete.obs") } } mcmat.M<-cbind(mcmat.M,rep(1,k-1)) # this is only needed to avoid errors when checking correlation with the next "first selection" snp at the end of the chromosome # to prevent keeping strong negative correlations mcmat.M <- abs(mcmat.M) # avoid issues with missing pairwise correlations mcmat.M[is.na(mcmat.M)] <- 9 ######################################### # get the selections for the k subsets ######################################### selmat.M<-matrix(0,n,k) for(z in 1:k){ selsnp.V<-seq(z,n,by=k) if(z>1){ # look back at snps before window nn <- seq(1,z-1,by=1) # acts as a new window before z, requiring special rules nn <- nn[!diag(mcmat.M[z-nn,nn]>.1)] # remove those that correlate with first selection (snp z) nn <- nn[sample(1:length(nn),length(nn),replace=F)] # permute order to randomize augmentation while(length(nn)>0){ selsnp.V <- c(selsnp.V,nn[1]) lnn <- nn[1] # record one just used and remove it nn <- nn[-1] nnf <- nn[nn>lnn] # split elements forward, behind lnn nnb <- nn[nn.1] # remove those forward lvv that correlate with lvv nnb <- nnb[!diag(mcmat.M[lnn-nnb,nnb]>.1)] # remove those behind lvv that correlate with lvv if(length(c(nnb,nnf))>0){ # recombine, avoid bias toward vvf or vvb nn <- c(nnb,nnf) nn <- nn[sample(1:length(nn),length(nn),replace=F)] # if() avoids error with sample from nothing }else{ nn <- numeric(0) } } } for(w in seq(z,n,by=k)){ ww <- seq(w+1,w+k-1,by=1) # start with all in window ww <- ww[ww<=n] # remove numbers beyond the number of predictors vv <- ww[!(mcmat.M[ww-w,w]>.1)] # remove those that correlate with first selection vv <- vv[!diag(mcmat.M[w+k-vv,vv]>.1)] # remove those that correlate with first selection in next window vv <- vv[sample(1:length(vv),length(vv),replace=F)] # permute order to randomize augmentation while(length(vv)>0){ selsnp.V <- c(selsnp.V,vv[1]) lvv <- vv[1] # record one just used and remove it vv <- vv[-1] vvf <- vv[vv>lvv] # split elements forward, behind lvv vvb <- vv[vv.1] # remove those forward lvv that correlate with lvv vvb <- vvb[!diag(mcmat.M[lvv-vvb,vvb]>.1)] # remove those behind lvv that correlate with lvv if(length(c(vvb,vvf))>0){ # recombine, avoid bias toward vvf or vvb vv <- c(vvb,vvf) vv <- vv[sample(1:length(vv),length(vv),replace=F)] # if() avoids error with sample from nothing }else{ vv <- numeric(0) } } } selmat.M[1:length(selsnp.V),z]<-perm[selsnp.V] # use perm to convert back to original locations } return(selmat.M) } # end function # eof