#!/usr/bin/perl

use strict;
use Getopt::Long;
use File::Basename;


###############################################################
##################### DOCUMENTATION ###########################

## Qing Duan Nov 2016
## Goal: 
##	 Convert plink format ped/map to HapMix genotype and snp file

###############################################################
################# END DOCUMENTATION ###########################

# Wall-clock start of the run; main() prints the elapsed seconds at the end.
my $start = time();

# Parsed command-line options, keyed by option name.
my %opts = ();

# Every option takes a string value; no defaults are applied.
Getopt::Long::GetOptions(
        \%opts,
        qw(
                chr=s
                admixpop=s
                ped=s
                map=s
                hap1=s
                snps1=s
                ref1label=s
                hap2=s
                snps2=s
                ref2label=s
                rate=s
                outdir=s
                o=s
        )
) || die "Failed to parse options\n\n";

# Everything except -o is mandatory: show the help text and stop as soon as
# one of the mandatory options is missing.
if (grep { !defined $opts{$_} } qw(chr admixpop ped map hap1 snps1 ref1label hap2 snps2 ref2label rate outdir))
{
        usage();
        die "Failed to parse parameters\n\n";
}

printPar();

# State shared by the subs below.
my %snp_refAllele = ();         # key: snp
                                # value: reference allele from genotype file
my %indices_refAllele = ();     # key: linenumber
                                # value: reference allele from genotype file
my %gSNPs = ();                 # key: snp
                                # value: linenumber
my $chr = $opts{chr};

main();

# begin sub

# usage()
#
# Print the command-line help text to STDOUT: a short banner followed by
# one line per option in the form "<tab>-<option><tab> <description>".
# Takes no arguments and returns nothing useful.
sub usage
{
        # [ option name, description ] in the order they are shown.
        my @options = (
                [ 'chr',       'chromosome number' ],
                [ 'admixpop',  'e.g., AA' ],
                [ 'ped',       'plink ped file' ],
                [ 'map',       'plink map file' ],
                [ 'hap1',      'reference haplotypes for population1' ],
                [ 'snps1',     'reference snps for population1' ],
                [ 'ref1label', 'e.g., CEU' ],
                [ 'hap2',      'reference haplotypes for population2' ],
                [ 'snps2',     'reference snps for population2' ],
                [ 'ref2label', 'e.g., YRI' ],
                [ 'rate',      'rate file' ],
                [ 'outdir',    'output directory' ],
        );

        print "\n";
        print "By Qing Duan, Nov 2016\n";
        print "convert.pl: convert plink ped/map genotypes\n";
        print "Usage:\n";

        # Each entry carries its own line break, so no option can run into
        # the next one.
        foreach my $option (@options)
        {
                my ($name, $description) = @$option;
                print "\t-$name\t $description\n";
        }

        print "\n";
}


# printPar()
#
# Echo the option values in effect to STDOUT, one heading line plus one
# "-option 'value'" line per option, reading the values from %opts.
sub printPar
{
  # [ heading, option name ] in display order.
  my @rows = (
    [ "Input chr number", "chr" ],
    [ "Input population", "admixpop" ],
    [ "Input ped",        "ped" ],
    [ "Input map",        "map" ],
    [ "Input hap1",       "hap1" ],
    [ "Input snps1",      "snps1" ],
    [ "Input ref1",       "ref1label" ],
    [ "Input hap2",       "hap2" ],
    [ "Input snps2",      "snps2" ],
    [ "Input ref2",       "ref2label" ],
    [ "Input rate",       "rate" ],
    [ "Output directory", "outdir" ],
  );

  print "\n";
  print "Parameters in Effect:\n";

  foreach my $row (@rows)
  {
    my ($heading, $name) = @$row;
    print "\t $heading \n\t\t-$name '$opts{$name}'\n";
  }

  print "\n";
}


# main()
#
# Run the conversion steps one after another and report the elapsed
# wall-clock time. The order matters: formatRate() filters on %gSNPs,
# which is filled while convertHaplotypes() runs.
sub main
{
  my @steps = (
    \&convertGenotypes,
    \&convertHaplotypes,
    \&formatRate,
    \&writePar,
  );

  foreach my $step (@steps)
  {
    $step->();
  }

  my $diff = time() - $start;
  print "\nAnalysis took $diff seconds\n\n";
}

### sub

# convertGenotypes()
#
# Convert the PLINK ped/map pair given by -ped/-map by running the external
# program ./bin/convertf with a temporary parameter file that requests
# EIGENSTRAT output. The parameter file names three outputs inside -outdir:
#   genofile.<chr>, snpfile.<chr> and ind.<chr>
#
# Reads the file-level %opts and $chr; takes no arguments.
# Dies when the parameter file cannot be written or when convertf cannot be
# run or reports failure (later steps read snpfile.<chr>, so continuing
# would either fail there or silently reuse output from an earlier run).
sub convertGenotypes
{
  my $ped = $opts{ped};
  my $map = $opts{map};
  my $genofile = $opts{outdir}."/genofile.".$chr;
  my $snpfile = $opts{outdir}."/snpfile.".$chr;
  my $indfile = $opts{outdir}."/ind.".$chr;

  # Temporary parameter file for convertf; removed again after a
  # successful run.
  my $of = $opts{outdir}."/tmp.par.".$chr;
  open(my $par, '>', $of) or die "cannot create file $of: $!\n\n";

  # The content (including the blank after the first value and the missing
  # final newline) is exactly what this script has always written.
  print {$par} join("\n",
    "genotypename: $ped ",
    "snpname: $map",
    "indivname: $ped",
    "outputformat: EIGENSTRAT",
    "genotypeoutname: $genofile",
    "snpoutname: $snpfile",
    "indivoutname: $indfile",
    "familynames: NO");

  close($par) or die "cannot write file $of: $!\n\n";

  # List form: no shell is involved, so paths containing blanks or shell
  # metacharacters are passed to convertf unchanged.
  my $status = system("./bin/convertf", "-p", $of);
  if ($status == -1)
  {
    die "cannot run ./bin/convertf: $! (parameter file kept: $of)\n\n";
  }
  if ($status != 0)
  {
    my $exitCode = $status >> 8;
    die "./bin/convertf failed, wait status $status, exit code $exitCode (parameter file kept: $of)\n\n";
  }

  unlink($of) or warn "cannot remove file $of: $!\n";

  print "convert genotypes: $genofile $snpfile $indfile\n";
}


# convertHaplotypes()
#
# Load the reference alleles from <outdir>/snpfile.<chr>, then format both
# reference panels. The panels share the -rate file and differ only in
# their haplotype file, SNP list and output prefix (<outdir>/pop1 and
# <outdir>/pop2).
sub convertHaplotypes
{
  my $outdir = $opts{outdir};

  processGenoSNPs($outdir."/snpfile.".$chr);

  formatReference($opts{hap1}, $opts{snps1}, $opts{rate}, $outdir."/pop1");
  formatReference($opts{hap2}, $opts{snps2}, $opts{rate}, $outdir."/pop2");

} # end of sub

# processGenoSNPs($snpfile)
#
# Record, for every SNP listed in $snpfile, the allele found in its fifth
# whitespace-separated column. That allele is treated as the reference
# allele of the genotype data; formatReference() later codes the reference
# haplotypes against it so both data sets agree on the reference allele.
#
#   $snpfile  path of the SNP file to read (the caller passes the
#             snpfile.<chr> named as convertf's snpoutname)
#
# Fills the file-level hash %snp_refAllele (key: SNP id from column 1,
# value: allele from column 5). Dies when the file cannot be opened.
sub processGenoSNPs
{
  my ($snpfile) = @_;

  open(my $in, '<', $snpfile) or die "cannot open file $snpfile: $!\n\n";

  while (my $line = <$in>)
  {
	chomp($line);
	$line =~ s/^\s+//;	# rows may be indented; make column 1 the first field

	# A blank line carries no SNP; without this check it would be stored
	# under the empty string as a SNP id.
	next if $line eq '';

	my @lineArr = split(/\s+/, $line);
	my $snp = $lineArr[0];
	my $refAll = $lineArr[4];

	$snp_refAllele{$snp} = $refAll;
  }
  close($in);

} # end of sub

# formatReference($ref, $snps, $rate, $prefix)
#
# Write the SNP file and the genotype file of one reference panel.
#
#   $ref     haplotype file; every line is split into single characters and
#            the character at position k is taken as the allele of the SNP
#            on line k (0-based) of $snps
#   $snps    SNP list of the panel; each whole line is used as a SNP id
#   $rate    whitespace-separated file; columns 1, 2, 3 and 5 are read as
#            SNP id, chromosome, base-pair position and cM position
#   $prefix  output prefix; "snpfile.<chr>" and "genofile.<chr>" are
#            appended to it directly (no separator is inserted)
#
# Only SNPs known to %snp_refAllele (see processGenoSNPs) and present in
# $snps are used, in the order in which they appear in $rate.
#
# Output:
#   <prefix>snpfile.<chr>   one line per selected SNP: id, chr, cM, bp
#                           (tab-separated)
#   <prefix>genofile.<chr>  one line per selected SNP, one character per
#                           line of $ref: "1" when the haplotype carries
#                           the genotype reference allele, "0" otherwise
#
# Side effects: adds the panel's SNPs to the file-level %gSNPs and
# %indices_refAllele. Dies when a file cannot be opened or written.
sub formatReference
{
  my ($ref, $snps, $rate, $prefix) = @_;

  # Lookup tables of THIS panel only. They must not be shared between
  # calls: with the file-level hashes, a SNP that exists only in a panel
  # processed earlier would still be selected, using that panel's line
  # number to pick a column from this panel's haplotypes.
  my %lineOfSnp = ();        # key: snp, value: 0-based line number in $snps
  my %refAlleleAtLine = ();  # key: line number, value: genotype reference allele

  my $linenumber = 0;  # track snp position for extracting hap; start from 0

  open(my $snpsIn, '<', $snps) or die "cannot open file $snps: $!\n\n";

  while (my $line = <$snpsIn>)
  {
	chomp($line);

	my $snp = $line;

	if (exists $snp_refAllele{$snp})
	{
		$lineOfSnp{$snp} = $linenumber;
		$refAlleleAtLine{$linenumber} = $snp_refAllele{$snp};

		# formatRate() selects its rows with "exists $gSNPs{...}", so
		# the file-level tables are still kept up to date.
		$gSNPs{$snp} = $linenumber;
		$indices_refAllele{$linenumber} = $snp_refAllele{$snp};
	}

	$linenumber++;
  }
  close($snpsIn);

###### generate snpfile:
  my @outputindices = (); # line numbers (in $snps) of the output SNPs, in $rate order

  open(my $rateIn, '<', $rate) or die "cannot open file $rate: $!\n\n";

  my $snpOut = $prefix."snpfile.".$chr;
  open(my $snpOutFh, '>', $snpOut) or die "cannot create file $snpOut: $!\n\n";

  while (my $line = <$rateIn>)
  {
	chomp($line);
	my @linearr = split(/\s+/, $line);

	# The chromosome written out is the one found in the rate file, not
	# the -chr option.
	my ($snp, $rateChr, $bp, $cm) = @linearr[0, 1, 2, 4];

	next unless defined $snp && exists $lineOfSnp{$snp};

	push @outputindices, $lineOfSnp{$snp};
	print {$snpOutFh} "$snp\t$rateChr\t$cm\t$bp\n";
  }

  close($rateIn);
  close($snpOutFh) or die "cannot write file $snpOut: $!\n\n";

  print "generate snpfile: $rate $snpOut\n";

###### generate genofile:
  open(my $refIn, '<', $ref) or die "cannot open file $ref: $!\n\n";

  # Transposed output: one string per selected SNP, growing by one
  # character for every haplotype line read. Starting from exactly one
  # empty string per SNP also means that no stray blank line is written
  # when no SNP was selected.
  my @outline = ('') x scalar(@outputindices);

  while (my $line = <$refIn>)
  {
	chomp($line);

	# An empty line holds no alleles; coding it would add a haplotype
	# consisting of "0" only.
	next if $line eq '';

	my @alleles = split(//, $line);

	for my $row (0 .. $#outputindices)
	{
		my $index = $outputindices[$row];

		# A missing value on either side is compared as the empty
		# string, which is how the plain "eq" treated it before.
		my $allele = defined $alleles[$index] ? $alleles[$index] : '';
		my $refAllele = defined $refAlleleAtLine{$index} ? $refAlleleAtLine{$index} : '';

		$outline[$row] .= ($allele eq $refAllele) ? "1" : "0";
	}
  }
  close($refIn);

## output
  my $genoOut = $prefix."genofile.".$chr;
  open(my $genoOutFh, '>', $genoOut) or die "cannot create file $genoOut: $!\n\n";

  foreach my $row (@outline)
  {
	print {$genoOutFh} $row."\n";
  }

  close($genoOutFh) or die "cannot write file $genoOut: $!\n\n";

  print "generate reference genofile $ref $genoOut\n";

} # end of sub

# formatRate()
#
# Write <outdir>/rates.<chr> from the -rate file. Only rows whose SNP id
# (column 1) is a key of the file-level %gSNPs are kept, in file order.
#
# Output layout:
#   :sites:<number of kept rows>
#   <column 3 (bp) of every kept row, blank-separated>
#   <column 5 (cM) of every kept row, blank-separated>
#
# Reads %opts, $chr and %gSNPs; takes no arguments. Dies when the rate file
# cannot be opened or the output cannot be written.
sub formatRate
{
  my @bparray = ();
  my @cmarray = ();

  open(my $in, '<', $opts{rate}) or die "cannot open file $opts{rate}: $!\n\n";

  while (my $line = <$in>)
  {
        chomp($line);
        my @linearr = split(/\s+/, $line);

        # Column 2 (chromosome) is not needed here.
        my ($snp, $bp, $cm) = @linearr[0, 2, 4];

        next unless defined $snp && exists $gSNPs{$snp};

        push @bparray, $bp;
        push @cmarray, $cm;
  }
  close($in);

  my $out = $opts{outdir}."/rates.".$chr;

  open(my $outFh, '>', $out) or die "cannot create file $out: $!\n\n";

  my $sites = scalar(@bparray);

  print {$outFh} ":sites:$sites\n";
  print {$outFh} join(" ",@bparray),"\n";
  print {$outFh} join(" ",@cmarray),"\n";

  # Buffered write errors only surface when the handle is closed.
  close($outFh) or die "cannot write file $out: $!\n\n";

  print "generate rate file $out\n";

} # end of sub

# writePar()
#
# Write the parameter file <outdir>/hapmix.par.<chr>, which points at the
# files produced by the other steps (pop1/pop2 genofile and snpfile, rates,
# and the admixed snpfile/genofile/ind), and create the directory
# <outdir>/RUN that the parameter file names as OUTDIR.
#
# Reads %opts and $chr; takes no arguments. Dies when the parameter file
# cannot be written or the RUN directory cannot be created.
sub writePar
{
  my $outdir = $opts{outdir};
  my $out = $outdir."/hapmix.par.".$chr;

  open(my $par, '>', $out) or die "cannot create file $out: $!\n\n";

  # The content is written exactly as before, without a final newline.
  # NOTE(review): "ATESFILE:LES:0" is not referred to anywhere else in this
  # script and looks like an editing artifact (compare RATESFILE and
  # KEEPINTFILES below); it is kept unchanged here - confirm against the
  # HapMix documentation before removing it.
  print {$par} join("\n",
    "GENOTYPE:1",
    "OUTPUT_SITES:0",
    "SITE_POSITIONS: 1 1000000000",
    "THETA:0.2",
    "LAMBDA:6.0",
    "RECOMBINATION_VALS:600 900",
    "MUTATION_VALS:0.2 0.2 0.01",
    "MISCOPYING_VALS:0.05 0.05",
    "REF1LABEL: $opts{ref1label}",
    "REF2LABEL: $opts{ref2label}",
    "ADMIXPOP:  $opts{admixpop}",
    "HAPMIX_MODE:DIPLOID",
    "OUTPUT_DETAILS:PROB",
    "THRESHOLD:0.0",
    "ATESFILE:LES:0",
    "REFPOP1GENOFILE:$outdir/pop1genofile.$chr",
    "REFPOP2GENOFILE:$outdir/pop2genofile.$chr",
    "REFPOP1SNPFILE:$outdir/pop1snpfile.$chr",
    "REFPOP2SNPFILE:$outdir/pop2snpfile.$chr",
    "RATESFILE:$outdir/rates.$chr",
    "ADMIXSNPFILE:$outdir/snpfile.$chr",
    "ADMIXGENOFILE:$outdir/genofile.$chr",
    "ADMIXINDFILE:$outdir/ind.$chr",
    "CHR:$chr",
    "OUTDIR:$outdir/RUN",
    "KEEPINTFILES:0");

  close($par) or die "cannot write file $out: $!\n\n";

  print "generate parameter file: $out\n";

  # List form: the path reaches mkdir unchanged even when it contains
  # blanks or shell metacharacters.
  system("mkdir", "-p", "$outdir/RUN") == 0
    or die "cannot create directory $outdir/RUN\n\n";
} # end of sub


