#!/usr/bin/perl -w 

# Command line:
#   REFsnpfile AFRgenofile EURgenofile AAsnpfile AAgenofile AAindfile CHR
# The argument count is validated before any argument is used, so a wrong
# invocation prints only the usage text (instead of a series of
# "uninitialized value" warnings triggered by the -w switch) and reports
# failure through a non-zero exit status.
if(scalar(@ARGV) != 7)
  {
    print ("WRONG USAGE: Usage filterSNPs.pl REFsnpfile AFRgenofile EURgenofile AAsnpfile AAgenofile AAindfile CHR\n");
    exit 1;
  }

($refsnpfile, $refAFRgenofile, $refEURgenofile,
 $aasnpfile, $aagenofile, $aaindfile, $chr) = @ARGV;

# Names of the files this script creates; the first three carry the
# chromosome label, the last two are backup copies of the admixed input.
$snpbadfile = "badsnps.".$chr;
$ttestfile = "hwtest_chr".$chr.".dat";
$convertfile = "parconvert2_chr.".$chr;
$origaagenofile = $aagenofile.".ORIG";
$origaasnpfile = $aasnpfile.".ORIG";


# Input files.  The three-argument form of open is used so that a file name
# beginning with '>' or '|' (or ending in '|') is treated as a plain name and
# never as an open mode or a command.  Every failure reports which file
# could not be opened and the system error.
open(REFSNPFILE, '<', $refsnpfile) || die("Cannot open $refsnpfile for reading: $!");
open(REFAFRGENOFILE, '<', $refAFRgenofile) || die("Cannot open $refAFRgenofile for reading: $!");
open(REFEURGENOFILE, '<', $refEURgenofile) || die("Cannot open $refEURgenofile for reading: $!");
open(AASNPFILE, '<', $aasnpfile) || die("Cannot open $aasnpfile for reading: $!");
open(AAGENOFILE, '<', $aagenofile) || die("Cannot open $aagenofile for reading: $!");

# Scratch files (created in the current directory) that receive the
# per-SNP allele frequencies of each population.
$refAFRfreqfile = "AFR_freq.dat";
$refEURfreqfile = "EUR_freq.dat";
$aafreqfile = "AA_freq.dat";

open(REFAFRFREQFILE, '>', $refAFRfreqfile) || die("Cannot open $refAFRfreqfile for writing: $!");
open(REFEURFREQFILE, '>', $refEURfreqfile) || die("Cannot open $refEURfreqfile for writing: $!");
open(AAFREQFILE, '>', $aafreqfile) || die("Cannot open $aafreqfile for writing: $!");



#Read in the Afr and Eur files to calculate the frequencies
#
# REFSNPFILE supplies one SNP per line; the first whitespace-separated field
# is used as the SNP id.  Line i of each reference genotype file is paired
# with SNP i.  Every character of a genotype line is one allele call: '0'
# is counted as a variant allele, '1' as a reference allele, and any other
# character (a missing-data code, a stray carriage return, ...) is ignored.
# For every SNP with at least one called allele the line
#   id <TAB> number of called alleles <TAB> variant allele frequency
# is written to the population's frequency file.

# Count the allele calls on one reference genotype line.
# Returns the list (reference_count, variant_count).
sub _ref_panel_allele_counts
  {
    my ($genoline) = @_;
    # tr/// with an empty replacement list only counts; the line is unchanged.
    my $nvart = ($genoline =~ tr/0//);
    my $nref = ($genoline =~ tr/1//);
    return ($nref, $nvart);
  }

# Turn one reference genotype file into its frequency file.
#   $in, $out    - glob references of the genotype input and frequency output
#   $label       - population name used in warnings
#   $ref, $vart  - array references that receive the per-SNP allele counts
sub _write_ref_panel_freqs
  {
    my ($in, $out, $label, $ref, $vart) = @_;
    my $idx = -1;
    while(defined(my $genoline = <$in>))
      {
        $idx++;
        chomp($genoline);
        ($ref->[$idx], $vart->[$idx]) = _ref_panel_allele_counts($genoline);

        # A genotype row without a SNP id cannot be reported under any name.
        if(!defined($snp[$idx]))
          {
            warn("$label genotype line ".($idx+1)." has no matching SNP entry; skipped\n");
            next;
          }

        # Without any called allele the frequency is undefined (0/0); the
        # SNP is left out of the frequency file rather than aborting the run.
        my $ncalled = $ref->[$idx] + $vart->[$idx];
        if($ncalled == 0)
          {
            warn("SNP $snp[$idx] has no called $label alleles; no frequency written\n");
            next;
          }

        print $out $snp[$idx],"\t",$ncalled,"\t",$vart->[$idx]/$ncalled,"\n";
      }
  }

@row = <REFSNPFILE>;
@snp = ();
for my $i (0 .. $#row)
  {
    chomp($row[$i]);
    my @fields = split(' ',$row[$i]);
    $snp[$i] = $fields[0];
    $refAfr[$i] = 0;
    $vartAfr[$i] = 0;
    $refEur[$i] = 0;
    $vartEur[$i] = 0;
  }

_write_ref_panel_freqs(\*REFAFRGENOFILE, \*REFAFRFREQFILE, "AFR", \@refAfr, \@vartAfr);
_write_ref_panel_freqs(\*REFEURGENOFILE, \*REFEURFREQFILE, "EUR", \@refEur, \@vartEur);
	

# Allele frequencies of the admixed sample.
#
# AASNPFILE supplies one SNP per line (first whitespace-separated field is
# the id); line i of AAGENOFILE is paired with SNP i.  Every character of a
# genotype line is one genotype: '0' adds two variant alleles, '1' adds one
# reference and one variant allele, '2' adds two reference alleles, and any
# other character is ignored.  For every SNP with at least one called allele
# the line
#   id <TAB> number of called alleles <TAB> variant allele frequency
# is written to AAFREQFILE.

# Count the alleles on one admixed genotype line.
# Returns the list (reference_count, variant_count).
sub _admixed_allele_counts
  {
    my ($genoline) = @_;
    # tr/// with an empty replacement list only counts; the line is unchanged.
    my $n0 = ($genoline =~ tr/0//);
    my $n1 = ($genoline =~ tr/1//);
    my $n2 = ($genoline =~ tr/2//);
    return ($n1 + 2*$n2, 2*$n0 + $n1);
  }

@row = <AASNPFILE>;

# Start from an empty id list so that no id read earlier from another SNP
# file can be paired with an admixed genotype row.
@snp = ();
for my $i (0 .. $#row)
  {
    chomp($row[$i]);
    my @fields = split(' ',$row[$i]);
    $snp[$i] = $fields[0];
    $refAA[$i] = 0;
    $vartAA[$i] = 0;
  }

{
  my $idx = -1;
  while(defined(my $genoline = <AAGENOFILE>))
    {
      $idx++;
      chomp($genoline);
      ($refAA[$idx], $vartAA[$idx]) = _admixed_allele_counts($genoline);

      # A genotype row without a SNP id cannot be reported under any name.
      if(!defined($snp[$idx]))
        {
          warn("Admixed genotype line ".($idx+1)." has no matching SNP entry; skipped\n");
          next;
        }

      # Without any called allele the frequency is undefined (0/0); the SNP
      # is left out of the frequency file rather than aborting the run.
      my $ncalled = $refAA[$idx] + $vartAA[$idx];
      if($ncalled == 0)
        {
          warn("SNP $snp[$idx] has no called admixed alleles; no frequency written\n");
          next;
        }

      print AAFREQFILE $snp[$idx],"\t",$ncalled,"\t",$vartAA[$idx]/$ncalled,"\n";
    }
}

# Input handles: nothing is buffered, so a plain close is sufficient.
close REFAFRGENOFILE;
close REFEURGENOFILE;
close REFSNPFILE;
close AASNPFILE;
close AAGENOFILE;

# The frequency files are read back right after this point.  A write error
# on buffered output only surfaces at close time, so a failed close must
# stop the run instead of letting truncated tables be analysed.
close(REFAFRFREQFILE) || die("Error while writing $refAFRfreqfile: $!");
close(REFEURFREQFILE) || die("Error while writing $refEURfreqfile: $!");
close(AAFREQFILE) || die("Error while writing $aafreqfile: $!");



# Preparation for the t-test between the observed admixed frequencies and
# the frequencies expected from the two reference panels.
# The three frequency files (lines of "id <TAB> allele count <TAB> variant
# frequency") are loaded into hashes keyed by SNP id, and the two result
# files (bad SNP list, t-test table) are opened for writing.
# Three-argument open keeps file names from being interpreted as modes or
# commands; every failure names the file and the system error.

open(REFAFRFREQFILE, '<', $refAFRfreqfile) || die("Cannot open $refAFRfreqfile for reading: $!");
open(REFEURFREQFILE, '<', $refEURfreqfile) || die("Cannot open $refEURfreqfile for reading: $!");
open(AAFREQFILE, '<', $aafreqfile) || die("Cannot open $aafreqfile for reading: $!");
open(SNPBADFILE, '>', $snpbadfile) || die("Cannot open $snpbadfile for writing: $!");
open(GENOFILE, '>', $ttestfile) || die("Cannot open $ttestfile for writing: $!");

%snpnumrefaf = ();     # SNP id -> called alleles, African reference
%snpnumrefeur = ();    # SNP id -> called alleles, European reference
%snpnumrefafam = ();   # SNP id -> called alleles, admixed sample
%snpqrefaf = ();       # SNP id -> variant frequency, African reference
%snpqrefeur = ();      # SNP id -> variant frequency, European reference
%snpqrefafam = ();     # SNP id -> variant frequency, admixed sample
%qexp = ();            # SNP id -> expected admixed frequency (filled later)
%tstatmix = ();        # SNP id -> t statistic (filled later)

# One pass per population: (input handle, count table, frequency table).
for my $table ([\*REFAFRFREQFILE, \%snpnumrefaf, \%snpqrefaf],
               [\*REFEURFREQFILE, \%snpnumrefeur, \%snpqrefeur],
               [\*AAFREQFILE, \%snpnumrefafam, \%snpqrefafam])
  {
    my ($fh, $count_of, $freq_of) = @$table;
    while(defined(my $line = <$fh>))
      {
        chomp($line);
        my ($id, $nalleles, $freq) = split(/\t/,$line);
        $count_of->{$id} = $nalleles;
        $freq_of->{$id} = $freq;
      }
  }

print "##################################T_Test for African,European and African Americans###################################\n";

#t-test for African,European and African-American mix
#
# For every SNP present in both reference tables the expected admixed
# frequency is 0.20*European + 0.80*African.  The observed admixed frequency
# is compared with it using a standard deviation built from the average
# expected frequency and the three allele counts; SNPs with |t| > 3 are
# written to the bad SNP list.  SNP ids are processed in sorted order so the
# output files are identical from run to run.

# t statistic of one SNP, or nothing (undef) when it cannot be computed
# because an allele count is not positive or the pooled variance is zero.
sub _admixture_tstat
  {
    my ($qexpected, $qobserved, $qmean, $naf, $neur, $nafam) = @_;
    return unless ($naf > 0 and $neur > 0 and $nafam > 0);
    my $variance = ($qmean*(1.0-$qmean))*(1.0/$naf + 1.0/$neur + 1.0/$nafam);
    return unless ($variance > 0);
    return ($qexpected - $qobserved)/sqrt($variance);
  }

$qave = 0;
$numsnp = 0;
foreach my $key (sort keys %snpnumrefaf)
  {
    if(exists $snpnumrefeur{$key})
      {
        $qexp{$key} = (0.20*$snpqrefeur{$key}) + (0.80*$snpqrefaf{$key});
        $qave += $qexp{$key};
        $numsnp++;
      }
  }

if($numsnp > 0)
  {
    $qave = $qave/$numsnp;
  }
print $qave,"\t",$numsnp,"\n";

$nbad = 0;
foreach my $key (sort keys %qexp)
  {
    # Keys of %qexp are already known to be in both reference tables.
    next unless (exists $snpnumrefafam{$key});

    my $tstat = _admixture_tstat($qexp{$key}, $snpqrefafam{$key}, $qave,
                                 $snpnumrefaf{$key}, $snpnumrefeur{$key},
                                 $snpnumrefafam{$key});
    if(!defined($tstat))
      {
        warn("No t statistic can be computed for SNP $key; SNP not tested\n");
        next;
      }

    $tstatmix{$key} = $tstat;
    if(abs($tstat) > 3.0)
      {
        print SNPBADFILE $key,"\n";
        $nbad++;
      }
  }

#Combine all the information together in one file:
# This table is written before the bad SNP count is judged, so that it is
# complete on disk when the message below asks the user to inspect it.
print GENOFILE "SNP_ID\tRef_Afr_Freq\tRef_ Eur_Freq\tRef_AfrAm_Freq\tExp_AfrAm_Freq\tT-statMix\n";
foreach my $key (sort keys %tstatmix)
  {
    print GENOFILE $key,"\t",$snpqrefaf{$key},"\t",$snpqrefeur{$key},"\t",$snpqrefafam{$key},"\t",$qexp{$key},"\t",$tstatmix{$key},"\n";
  }

# Both result files are closed (and thereby flushed) here; the bad SNP list
# is later handed to an external program by name.
close(GENOFILE) || die("Error while writing $ttestfile: $!");
close(SNPBADFILE) || die("Error while writing $snpbadfile: $!");
close REFAFRFREQFILE;
close REFEURFREQFILE;
close AAFREQFILE;

if($nbad > $numsnp/3)
  {
    print "TOO MANY BAD SNPS, PLEASE CHECK THE T_TEST FILE AND CHECK YOUR DATA\n";
    exit;
  }
else
  {
    print "REMOVING $nbad SNPS FROM YOUR ADMIXED DATA\n";
  }

# Remove the scratch frequency files.  unlink is used instead of running
# "rm" through a shell: no extra processes are started and file names are
# never subject to shell interpretation.
foreach my $scratch ($refAFRfreqfile, $refEURfreqfile, $aafreqfile)
  {
    unlink($scratch) || warn("Could not remove $scratch: $!\n");
  }


print $nbad,"\n";

# The t-test table is only kept when at least one SNP was flagged.
if($nbad == 0)
  {
    unlink($ttestfile) || warn("Could not remove $ttestfile: $!\n");
  }

# When SNPs were flagged, bin/convertf is run with a generated parameter
# file to write copies of the admixed data without the SNPs listed in the
# bad SNP file.  The original genotype and SNP files are kept as *.ORIG and
# then replaced by the filtered copies.  The replacement only happens after
# convertf has finished successfully and produced both output files, so a
# failed conversion can never overwrite the input data.
if($nbad > 0)
  {
    use File::Copy qw(copy move);

    print "Removing $nbad BAD SNPs\n";
    open(CONVERTFILE, '>', $convertfile) || die("Unable to make file $convertfile: $!");

    $admixgenooutfile = $aagenofile.".convert";
    $admixsnpoutfile = $aasnpfile.".convert";
    $admixindoutfile = $aaindfile.".convert";

    print CONVERTFILE "genotypename: $aagenofile\n";
    print CONVERTFILE "snpname: $aasnpfile\n";
    print CONVERTFILE "indivname: $aaindfile\n";
    print CONVERTFILE "outputformat:    EIGENSTRAT\n";
    print CONVERTFILE "familynames: NO\n";
    print CONVERTFILE "badsnpname: $snpbadfile\n";
    print CONVERTFILE "genotypeoutname: $admixgenooutfile\n";
    print CONVERTFILE "snpoutname: $admixsnpoutfile\n";
    print CONVERTFILE "indivoutname: $admixindoutfile\n";

    # convertf reads this file next, so it must be completely on disk.
    close(CONVERTFILE) || die("Error while writing $convertfile: $!");

    # List-form pipe open runs the program without a shell.  Its standard
    # output is read and discarded; close() then reports a non-zero exit.
    open(my $convertf, '-|', 'bin/convertf', '-p', $convertfile)
      || die("Unable to run bin/convertf: $!");
    1 while(defined(my $discarded = <$convertf>));
    close($convertf)
      || die("bin/convertf failed (wait status $?); $aagenofile and $aasnpfile were left unchanged");

    foreach my $produced ($admixgenooutfile, $admixsnpoutfile)
      {
        (-e $produced)
          || die("bin/convertf did not create $produced; $aagenofile and $aasnpfile were left unchanged");
      }

    # The converted individual file is not needed.
    if(-e $admixindoutfile)
      {
        unlink($admixindoutfile) || warn("Could not remove $admixindoutfile: $!\n");
      }

    # Back up the originals first, then put the filtered files in place.
    copy($aagenofile, $origaagenofile) || die("Cannot copy $aagenofile to $origaagenofile: $!");
    copy($aasnpfile, $origaasnpfile) || die("Cannot copy $aasnpfile to $origaasnpfile: $!");
    move($admixgenooutfile, $aagenofile) || die("Cannot move $admixgenooutfile to $aagenofile: $!");
    move($admixsnpoutfile, $aasnpfile) || die("Cannot move $admixsnpoutfile to $aasnpfile: $!");

  }
exit;

