#! /usr/bin/perl -w
use strict;
use Parallel::ForkManager;


unless($ARGV[5]){
print <<EOF;
	This program slides elements to generate null distributions. 
	Usage:
		perl xxx.pl  sliding_interval_start sliding_interval_end num_of_randomization derived_allele_frequency_cut_off Polymorphisms input_file number_categories_per_run
		
		* sliding_interval_start : the lower bound of sliding distance
		* sliding_interval_end :  the upper bound of sliding distance
		* num_of_randomization : number of randomizations
		* derived_allele_frequency_cut_off : DAF threshold to define rare variants
		* Polymorphisms : SNV file in BED format, with DAF as the fourth column (to define 'sensitive' regions, please prepare only non-coding SNVs)
		* input_file : one category per line (two columns: category_file_path, category_name); category files should be in BED format.
		* number_categories_per_run : if there are multiple categories, specify how many categories to run per time. Default is 5. 
		------------------
		Output will be in random/* 
EOF
exit;
}
my ($slide_start,$slide_end,$num_random,$daf_th,$snp_file,$input) = @ARGV[0..5];
my $num_per_run =5;
if ($ARGV[6]){
	$num_per_run = $ARGV[6];
}
unless (-d "random"){
	mkdir "random";
}


my %SNP;
my @DISTANCE;
$DISTANCE[0]= 0;
my %CHR;  # chromosome length is from hg19, UCSC.
my %CATEGORY; 
my @chr_label = ("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX");
my @chr_array = (@chr_label) x5;

&dis_gen($slide_start,$slide_end,$num_random);
&read_snp($snp_file);
if (scalar keys %SNP ==0){
	die "no SNVs with DAF ...\n";
}
&chr_length();
&read_cate($input);



my $pm = new Parallel::ForkManager($num_per_run);
my @category = sort keys %CATEGORY;
my $num = int(scalar @category / $num_per_run);
my $i;
my $j;
my $out_matrix = "random/randomization.matrix";
if (-f $out_matrix){
	unlink($out_matrix);
}
		
for $i (1 .. $num){
	my @names = ();
	for $j (($i-1)*$num_per_run  .. $i*$num_per_run - 1){
       		my $cate_name = $category[$j];
       		push @names,$cate_name;
       		my $pid = $pm -> start and next;
       		&main($cate_name);
        	$SIG{INT} = sub {kill 9, $pid;};
		$pm -> finish;
	}
	$pm -> wait_all_children;
			
	foreach my $cate_name(@names){
		my $tmp_in = "random/$cate_name";
		`cat $tmp_in >> $out_matrix`;
		unlink("$tmp_in");
	}
}
	
if ((scalar @category - $num*$num_per_run )>0){
	my @names = ();			
	for $j ($num*$num_per_run .. $#category){
		my $cate_name = $category[$j];
       		push @names,$cate_name;
       		my $pid = $pm -> start and next;
       		&main($cate_name);
        	$SIG{INT} = sub {kill 9, $pid;};
		$pm -> finish;
	}
	$pm -> wait_all_children;

	foreach my $cate_name(@names){
		my $tmp_in = "random/$cate_name";
		`cat $tmp_in >> $out_matrix`;
		unlink("$tmp_in");
	}			
}



#running 
sub main{
	my ($cate_name) = @_;
	my $fh;
	open($fh, ">random/$cate_name")||die;
	my $file = $CATEGORY{$cate_name};
	my @file_m = `cut -f 1,2,3 $file | sort -k1,1 -k2,2n | mergeBed`;
	my $out = &slide($cate_name,$fh,\@file_m);
	close $fh;
}

#randomization process
sub slide{
	my ($name,$fh,$file) = @_;
	my @elm = @{$file};
	my $i;
	my $j;
	my $overall; 
	my $rare;	
	my $line;
	my $distance;
	my $chr;
	my $start;
	my $end;
	my $adj_start;
	my $adj_end;
	my $flag;
	my $float;
	my $chr_pos;
	my $new_start;
	my $new_end;
	my $new_chr;
	my $adj_chr;
	my @output = ();

	print $fh $name," ";

	for $i (0 .. $#DISTANCE){
		$overall = 0;
		$rare = 0;
		$distance = $DISTANCE[$i];
		
		foreach $line(@elm){
			chomp $line;
			($chr,$start,$end) = (split /\t+/,$line)[0..2];
			$chr =~ s/^chr//gi;
			$chr =~ s/^/chr/;
			$adj_start = $start + $distance;
			$adj_end = $end + $distance;
		
			if ($chr =~ /\d+/ || $chr =~ /X/){
				# Situation 1 : start position sliding out of chromosome, then move to the next chromosome. 
				if ($adj_start > $CHR{$chr} -1){
						$chr_pos = (grep{$chr_array[$_] eq $chr} 0 .. $#chr_array)[0];
						$flag = $CHR{$chr};  # length of original chromosome 
						$float = 0;
						for  $j ($chr_pos+1 .. $#chr_array){
							$flag = $flag + $CHR{$chr_array[$j]}; 
							if ($adj_start < $flag){
								$float = $j;
								last;
							}	
						}
				
						$new_start = $adj_start - $flag + $CHR{$chr_array[$float]};
						$new_end = $new_start + ($end-$start);
						$new_chr = $chr_array[$float];
						
						#situ1.1 : new end position within new chromosome. 
						if ($new_end <= $CHR{$new_chr}){
							($overall,$rare) = &daf_cal($overall,$rare,$new_chr,$new_start,$new_end);
						}
						
						#situ1.2 : new end position out of new chromosome. Split the element.  
						if ($new_end > $CHR{$new_chr}){
                        		$new_end = $new_end - $CHR{$new_chr};  # assume none of the elements span more than 1 chromosome. 
                    			if ($new_chr =~ /chr(\d+)$/){
                                		if ($1+1 <= 22){
											($overall,$rare) = &cal($overall,$rare,$new_chr,$new_start,$CHR{$new_chr});
											$adj_chr = join("","chr",$1+1);
											($overall,$rare) = &cal($overall,$rare,$adj_chr,0,$new_end);
                                		}elsif($1 == 22){
											($overall,$rare) = &cal($overall,$rare,$new_chr,$new_start,$CHR{$new_chr});
											$adj_chr = "chrX";
											($overall,$rare) = &cal($overall,$rare,$adj_chr,0,$new_end);					
                        				}
                        		}		
                        		if ($new_chr =~ /chrX/){
                        				($overall,$rare) = &cal($overall,$rare,$new_chr,$new_start,$CHR{$new_chr});
										$adj_chr = "chr1";
										($overall,$rare) = &cal($overall,$rare,$adj_chr,0,$new_end);
								}
						}
				}		
				# Situ2 : elements stay on same chromosome
				if ($adj_end <=$CHR{$chr}){
					($overall,$rare) = &cal($overall,$rare,$chr,$adj_start,$adj_end);	
            	}
			
				#Situ3 : start position stays on same chromosome, while end position slides out. 
				if ($adj_end > $CHR{$chr} && $adj_start <= $CHR{$chr}-1){
					$new_start = $adj_start;
					$new_end = $adj_end - $CHR{$chr}; # assume none of the elements span more than 1 chromosome.
					if ($chr =~ /chr(\d+)$/){
						if ($1+1 <= 22){
								($overall,$rare) = &cal($overall,$rare,$chr,$new_start,$CHR{$chr});
								$adj_chr = join("","chr",$1+1);
								($overall,$rare) = &cal($overall,$rare,$adj_chr,0,$new_end);
						}elsif($1 == 22){
								($overall,$rare) = &cal($overall,$rare,$chr,$new_start,$CHR{$chr});
								$adj_chr = "chrX";
								($overall,$rare) = &cal($overall,$rare,$adj_chr,0,$new_end);	
						}
					}	
					if ($chr =~ /chrX/){
						($overall,$rare) = &cal($overall,$rare,$chr,$new_start,$CHR{$chr});
						$adj_chr = "chr1";
						($overall,$rare) = &cal($overall,$rare,$adj_chr,0,$new_end);
					}
				}	
			}
		}				
		if ($overall !=0){
			if ($distance == 0){
				print $fh $rare," ", $overall," ",$rare/$overall," ";
			}else{
				print $fh $rare/$overall," ";
			}
		}else{
			if ($distance ==0){
				print $fh $rare," ",$overall," ","0 ";
			}else{
				print $fh "0 ";
			}
		}	
	}
	print $fh "\n";
	$|++;
}

#enrichment calculation
sub cal{
	my ($overall,$rare,$chr,$start,$end) = @_;
	for my $i ($start .. $end -1){ 
			if (defined $SNP{$chr}{$i}){
					$overall ++;
					if ($SNP{$chr}{$i} < $daf_th ){
							$rare ++;
					}
			}
	}
	return ($overall,$rare);
}

#read input category
sub read_cate {
		my ($file) = @_;
        open(IN, $file)||die "input file no found ...\n";
        while(<IN>){
                chomp $_;
                my ($path,$name) = (split /\s+/,$_)[0,1];
                $CATEGORY{$name} = $path;
        }
        close IN;
}

#distance generator
sub dis_gen{
	my ($start,$end,$num) = @_;
	open(O,">random/sliding.distance");
	my %dis;
	while (scalar keys %dis < $num){
		$dis{int(rand($end-$start) + $start)}=1;
	}
	foreach my $tmp(sort {$a<=>$b} keys %dis){
		print O $tmp,"\n";
		push @DISTANCE,$tmp;
	}
	close O;
}

#read in polymorphisms data
sub read_snp{
	my ($file) = @_;
        my $chr;
	my $start;
	my $end;
	my $daf;

	open(IN,$file)||die "cannot open Polymorphisms file ... \n";
        while(<IN>){
		chomp $_;
		if (/\S+\s+\d+\s+\d+\s+\S+/){
         		($chr,$start,$end,$daf) = (split /\t+/,$_)[0..3];
			$chr =~ s/^chr//i;
			$chr =~ s/^/chr/;
			if ($end =~ /\d+/ && $start =~ /\d+/ && ($end-$start)==1 && ($daf =~ /^\d$/ || $daf =~ /^\d+\.\d+$/)){       
				$SNP{$chr}{$start} = $daf;
        		}
		}
        }
	close IN;
}

#chromosome length
sub chr_length{
        $CHR{"chr1"} = 249250621;
        $CHR{"chr2"} = 243199373;
        $CHR{"chr3"} = 198022430;
        $CHR{"chr4"} = 191154276;
        $CHR{"chr5"} = 180915260;
        $CHR{"chr6"} = 171115067;
        $CHR{"chr7"} = 159138663;
        $CHR{"chr8"} = 146364022;
        $CHR{"chr9"} = 141213431;
        $CHR{"chr10"} = 135534747;
        $CHR{"chr11"} = 135006516;
        $CHR{"chr12"} = 133851895;
        $CHR{"chr13"} = 115169878;
        $CHR{"chr14"} = 107349540;
        $CHR{"chr15"} = 102531392;
        $CHR{"chr16"} = 90354753;
        $CHR{"chr17"} = 81195210;
        $CHR{"chr18"} = 78077248;
        $CHR{"chr19"} = 59128983;
        $CHR{"chr20"} = 63025520;
        $CHR{"chr21"} = 48129895;
        $CHR{"chr22"} = 51304566;
        $CHR{"chrX"} = 155270560;
}
