# snp selection to disrupt LD
# Gitta Lubke and Raymond Walters, 2011
###########################################
# 
# INPUT:
# 	snp.M - matrix of snp data with columns properly ordered
# 	k - the size of the windows (also determines the number of data sets produced
# 	bl - the size of blocks to permute data within
# OUTPUT
#	selmat.M - k x k matrix; each column contains index numbers for SNPs in a data set
#
###########################################

snp_sel <- function(snp.M,k,bl)
{

n <- ncol(snp.M)
snp.M[snp.M==-9]<-NA

#################################
# Randomly permute columns within small blocks. This should help 
# randomize the snps that get grouped for the "first selection" 
# while maintaining the distances between snps for windows
#################################

bc <- n%/%bl					# count of blocks present in data
perm <- NULL
for(j in 1:bc){
	perm <- c(perm,sample((bl*(j-1)+1):(bl*j),bl,replace=F))
}
if(n%%bl>1){					# handles any remainder beyond multiple of block length
	perm <- c(perm,sample((bl*bc+1):n,n%%bl,replace=F))
}
if(n%%bl==1){					# handled separately due to behavior of sample() with single number
	perm <- c(perm,n)
}

###################################
# Handle missing data by removing SNPs with
# no variance. We do this after permutation
# so that the block sizes are still handled as intended.
###################################

# if snp has no variance, remove from consideration
perm2 <- perm
perm <- perm[sd(snp.M[,perm],na.rm=T)>0]

# update n in case some SNPs removed 
n <- length(perm)

dat.M <- snp.M[,perm]

##########################################
# "first selection" is straight forward: seq(1,n,by=k)
# the w loop loops through the windows
###########

mcmat.M<-matrix(0,k-1,n-1)
#test: n=300
for(s in 1:(n-1)){
	if((s+k)<=(n+1)){
		 mcmat.M[,s]<-cor(dat.M[,s],dat.M[,((s+1):(s+k-1))],use="pairwise.complete.obs")
	}else{
		dum<-length((s+1):n)
		mcmat.M[1:dum,s]<-cor(dat.M[,s],dat.M[,(s+1):n],use="pairwise.complete.obs")
	}
}
mcmat.M<-cbind(mcmat.M,rep(1,k-1)) # this is only needed to avoid errors when checking correlation with the next "first selection" snp at the end of the chromosome

# to prevent keeping strong negative correlations
mcmat.M <- abs(mcmat.M) 

# avoid issues with missing pairwise correlations
mcmat.M[is.na(mcmat.M)] <- 9

#########################################
# get the selections for the k subsets
#########################################
selmat.M<-matrix(0,n,k)
for(z in 1:k){
	selsnp.V<-seq(z,n,by=k)
	if(z>1){													# look back at snps before window
		nn <- seq(1,z-1,by=1)									# acts as a new window before z, requiring special rules
		nn <- nn[!diag(mcmat.M[z-nn,nn]>.1)] 					# remove those that correlate with first selection (snp z)
		nn <- nn[sample(1:length(nn),length(nn),replace=F)] 	# permute order to randomize augmentation
		while(length(nn)>0){
			selsnp.V <- c(selsnp.V,nn[1])
			lnn <- nn[1] 										# record one just used and remove it
			nn <- nn[-1] 
			nnf <- nn[nn>lnn] 									# split elements forward, behind lnn
			nnb <- nn[nn<lnn] 
			nnf <- nnf[!mcmat.M[nnf-lnn,lnn]>.1] 				# remove those forward lvv that correlate with lvv 
			nnb <- nnb[!diag(mcmat.M[lnn-nnb,nnb]>.1)] 			# remove those behind lvv that correlate with lvv
			if(length(c(nnb,nnf))>0){							# recombine, avoid bias toward vvf or vvb
				nn <- c(nnb,nnf)
				nn <- nn[sample(1:length(nn),length(nn),replace=F)]		# if() avoids error with sample from nothing
				}else{ 
				nn <- numeric(0) }
			}
		}
	for(w in seq(z,n,by=k)){
		ww <- seq(w+1,w+k-1,by=1)					# start with all in window
		ww <- ww[ww<=n]											# remove numbers beyond the number of predictors
		vv <- ww[!(mcmat.M[ww-w,w]>.1)] 						# remove those that correlate with first selection
		vv <- vv[!diag(mcmat.M[w+k-vv,vv]>.1)] 					# remove those that correlate with first selection in next window
		vv <- vv[sample(1:length(vv),length(vv),replace=F)] 	# permute order to randomize augmentation
		while(length(vv)>0){
			selsnp.V <- c(selsnp.V,vv[1])
			lvv <- vv[1] 									# record one just used and remove it
			vv <- vv[-1] 
			vvf <- vv[vv>lvv] 								# split elements forward, behind lvv
			vvb <- vv[vv<lvv] 
			vvf <- vvf[!mcmat.M[vvf-lvv,lvv]>.1] 			# remove those forward lvv that correlate with lvv 
			vvb <- vvb[!diag(mcmat.M[lvv-vvb,vvb]>.1)] 		# remove those behind lvv that correlate with lvv
			if(length(c(vvb,vvf))>0){						# recombine, avoid bias toward vvf or vvb
				vv <- c(vvb,vvf)
				vv <- vv[sample(1:length(vv),length(vv),replace=F)]		# if() avoids error with sample from nothing
				}else{ 
				vv <- numeric(0) }
			}
		}
	selmat.M[1:length(selsnp.V),z]<-perm[selsnp.V]			# use perm to convert back to original locations
	}

return(selmat.M)

} # end function 

# eof