#! /usr/local/bin/perl # genomap.pl: Combines genotyper output files and produces MapMaker # input. Also identifies probably null alleles, checks for possible # genotyping errors, and analyzes segregation distortion # Steve DiFazio, 2-2003 #!!!!note: don't forget to delete Tongming's fix!!! ########################################## use Getopt::Long; GetOptions( "haplo" => \$haplo, "geno" => \$geno, "map" => \$map, "dist" => \$dist, "summ" => \$summ, "null:i" => \$minnull); if($dist) { $map = 1; } if((!$geno && !$haplo && ! $summ && !$map) || !($ARGV[0])) { die " _______________________________________________________________\n Usage: perl genomap.pl -g -h -s -m -d -null=n files.txt > out.txt \n where \t-g provides output for genotypic mapping \t (ABCD genotypes for each locus),\n \t-h provides output for haplotype mapping \t (1 or 0 genotypes per allele), \n \t-m provides output in Mapmaker format\n \t-d provides output in Mapmaker format for one \t allele per locus for distance calculations\n \t-s provides a summary of null alleles \t and progeny without parental alleles,\n \t-null=n, where n is the maximum number of loci \t for which parental mismatches are allowed before \t designating a segregating null allele. Default is 1.\n \t is the name of a file containing a list \t of all genotyper table files to be processed, \t each of which must have a \".dat\" extension, and\n \t is the name of the file to save the output\n ** You can choose unique names for these input and output files ** _______________________________________________________________\n"; } warn "\n"; # Read in data from config file open(config, "genomap.cfg") or die "Can't find file genomap.cfg\n"; while() { if(!(/^#/)) { if(/^mother/i) { upcase(($t,$mthr) = split()); } # name of mother. if(/^father/i) { upcase(($t,$fthr) = split()); } # name of father. if(/^blank/i) { upcase(($t,$blnk) = split()); } # name of negative control. Excluded from output. # optional identifiers for grandparents if(/^m_gmthr/) { ($t,$m_gmthr) = split(); } # name of mother of mother (maternal grandmother) if(/^m_gfthr/) { ($t,$m_gfthr) = split(); } # name of father of mother (maternal grandfather) if(/^p_gmthr/) { ($t,$p_gmthr) = split(); } # name of mother of mother (paternal grandmother) if(/^p_gfthr/) { ($t,$p_gfthr) = split(); } # name of father of father (paternal grandfather) # Optional: Parental species. For pure species, use a single letter (A-Z). For # hybrids, use two letters to indicate the parental species # Used for determining species origins of alleles in interspecific # crosses) if(/^sp_mthr/) { ($t,$sp{$mthr}) = split(); } if(/^sp_fthr/) { ($t,$sp{$fthr}) = split(); } if(/^sp_m_gmthr/) { ($t,$sp{$m_gmthr}) = split(); } if(/^sp_m_gfthr/) { ($t,$sp{$m_gfthr}) = split(); } if(/^sp_p_gmthr/) { ($t,$sp{$p_gmthr}) = split(); } if(/^sp_p_gfthr/) { ($t,$sp{$p_gfthr}) = split(); } if(/^exclude/i) { ($e,$ind,$loc,$all) = split(); $loc =~ tr/[a-z]/[A-Z]/; $ind =~ tr/[a-z]/[A-Z]/; if(($loc eq "ALL") && ($ind ne "ALL")) { warn "$ind excluded\n"; $exind{$ind} = 1; } elsif(($ind eq "ALL") && ($all eq "ALL")) { $exloc{$loc} = 1; warn "$loc excluded\n"; } elsif ($ind eq "ALL") { warn "$all $loc excluded\n"; $exallele{$loc.":".$all} = 1; } elsif ($all eq "ALL") { warn "$ind $loc excluded\n"; $exclude{$ind.":".$loc} = 1; } else { warn "$ind $all $loc excluded\n"; $exclude{$ind.":".$loc.":".$all} = 1; } } } } sub by_num { $a <=> $b; } # minimum number of parental mismatches so that null is assumed to be segregating if(!($minnull)) { $minnull = 1; } # set indicator for genotypes for which segregation shouldn't be tracked $noseg{"Father"} = 1; $noseg{"Mother"} = 1; $noseg{"BLANK"} = 1; $noseg{$m_gmthr} = 1; $noseg{$m_gfthr} = 1; $noseg{$p_gmthr} = 1; $noseg{$p_gfthr} = 1; # determine if species identity of alleles can be tracked for each parent if(!($sp{$mthr}) || (($sp{$mthr} =~ tr/[a-zA-Z]// > 1) && (!($sp{$m_gmthr})) && !($sp{$m_gfthr}))) { $unk{$mthr} = 1; } if(!($sp{$fthr}) || (($sp{$fthr} =~ tr/[a-zA-Z]// > 1) && (!($sp{$p_gmthr})) && !($sp{$p_gfthr}))) { $unk{$fthr} = 1; } # Process files and save genotype data for each individual while(<>) { @files=split(); if($files[-1] =~ /\.dat$/i) { open(file, ($files[-1])) or die "\n\t\t*** ERROR ***\n Can't find file $files[-1]\n"; warn "Processing ",$files[-1],"\n"; LINE: while() { chomp(); if(!(/\t/)) { die "\n\t\t*** ERROR ***\n Problem with format of file $files[-1]. \tIs line $. tab-delimited?\n"; } # save column titles and make sure required fields are present if(/Low Signal/i) { $titles=1; # reset pks array for each file to allow variable number of allele columns while ($pks[0]) { pop(@pks); } if(!(/Sample/i)) { die "\n\t\t*** ERROR ***\n Problem with format of file $files[-1]. Make sure 'Sample' field is present\n"; } if(!(/Categ/i)) { die "\n\t\t*** ERROR ***\n Problem with format of file $files[-1]. Make sure 'Category' field is present\n"; } if(!(/Peak/i)) { die "\n\t\t*** ERROR ***\n Problem with format of file $files[-1]. Make sure at least one 'Peak' field is present\n"; } $lw=$sm=$ct=$pk=$fl=0; (@titl) = split(/\t/); for($i=0;$i<=$#titl;$i++) { if($titl[$i] =~ /Low/i) { $lw = $i; } if($titl[$i] =~ /Sample/i) { $sm = $i; } if($titl[$i] =~ /Categ/i) { $ct = $i; } if($titl[$i] =~ /Peak/i) { push(@pks,$i); } if($titl[$i] =~ /File/i) { $fl = $i; } if(($titl[$i]) && (!($titl[$i-1]))) { die "\n\t\t*** ERROR ***\n Problem with format of file $files[-1]. Make sure all column titles are present\n"; } } } else { (@dat) = split(/\t/); if($#titl < $#dat) { if($#titl < 0) { die "\n\t\t*** ERROR ***\n Problem with format of file $files[-1]. Is 'Low Signal' present in title row?\n" } else { die "\n\t\t*** ERROR ***\n There are $#titl categories and $#dat data points at line $. of file $files[-1]\n"; } } # make mother, father, and blank names case-insensitive # ($samp) = split(/-/,$dat[$sm]); $samp = $dat[$sm]; ##########fix for Tongming's samples (delete when distributing) $samp =~ s/^m.*/M/i; $samp =~ s/^f.*/F/i; $samp =~ s/^gm.*/GM/i; $samp =~ s/^gf.*/GF/i; $samp =~ s/^zm.*/GM/i; $samp =~ s/^blank.*/blank/i; #capitalize all sample names $samp =~ tr/[a-z]/[A-Z]/; if($samp =~ /^$mthr$/i) { $samp = "Mother"; } elsif($samp =~ /^$fthr$/i) { $samp = "Father"; } # capitalize all locus names $loc = $dat[$ct]; $loc =~ tr/[a-z]/[A-Z]/; $loc =~ s/ORNL-//; if(!($exloc{$loc}) && (!($exind{$samp})) && (!($exclude{$samp.":".$loc}))) { if($samp && !($samps{$samp}) && ($samp ne "Mother") && ($samp ne "Father") && ($samp ne $blnk) && ($samp ne $m_gmthr) && ($samp ne $m_gfthr) && ($samp ne $p_gmthr) && ($samp ne $p_gfthr)) { $samps{$samp}=1; $nsamps++; } $samples{$samp} = 1; # check for repeated samples $cont=0; for $i (0 .. $#pks) { # capitalize allele names $dat[$pks[$i]] =~ tr/[a-z]/[A-Z]/; } if($samp ne $blnk) { while(defined($amp{$samp.":".$loc})) { $samp = rep_check($samp,$loc); } } # save alleles to hash of arrays for $i (0 .. $#pks) { if(($dat[$pks[$i]]) && ($dat[$pks[0]] ne "M")) { push(@{$geno{$samp.":".$loc}},$dat[$pks[$i]]); $alleles{$loc.":".$dat[$pks[$i]]} = 1; $amp{$samp.":".$loc} = 1; } } } #sort alleles alphabetically for each sample @{$geno{$samp.":".$loc}} = sort(@{$geno{$samp.":".$loc}}); # save filenames for later use push(@{$fname{$samp.":".$loc}},$files[-1].":".$dat[$fl]); if($fname{$loc} !~ /$dat[$fl]/) { $fname{$loc} .= ":".$dat[$fl]; $fname{$loc} =~ s/^://; } # detect missing data if((($dat[$lw] =~ /warning/i) || ($dat[$pks[0]] eq "M")) && (!($geno{$samp.":".$loc}[0])) || ($exloc{$loc}) || ($exind{$samp}) || ($exclude{$samp.":".$loc})) { $read{$samp.":".$loc} = 0; } elsif ($samp ne $blnk) { $read{$samp.":".$loc} = 1; } } } if(!($titles)) { die "\n\t\t*** ERROR ***\n Problem with format of file $files[-1]. Make sure 'Low Signal' field is present in title row\n"; } $titles=0 } } # single digit identifiers for alleles @lets = (A,B,C,D,E,F,G,H,I,J,K,L,a,b,c,d,e,f,g,h,i,j,k,l); # Assign one-letter identifier for alleles foreach $locall (sort (keys (%alleles))) { ($locus,$allele) = split(/:/,$locall); $id{$locus.":".$allele} = $lets[$allcnt{$locus}]; $all{$locus.":".$lets[$allcnt{$locus}]} = $allele; $allcnt{$locus}++; } # set genotypes foreach $locus (sort (keys (%allcnt))) { if(!($read{"Mother:".$locus})) { die "\n\t\t*** ERROR ***\n Data missing for mother for locus $locus Mother name must be the same for all loci If mother is not named $mthr, alter \"mother $mthr\" in genomap.cfg\n"; } if(!($read{"Father:".$locus})) { die "\n\t\t*** ERROR ***\n Data missing for father for locus $locus Father name must be the same for all loci If father is not named $fthr, alter \"father $fthr\" in genomap.cfg\n"; } foreach $sample (sort by_num (keys(%samples))) { # count of individuals with data if(($read{$sample.":".$locus}) && !($noseg{$sample})) { $amp{$locus}++; } # individuals with observed alleles if($amp{$sample.":".$locus}) { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { if(($id{$locus.":".$geno{$sample.":".$locus}[$i]}) && ($idgeno{$sample.":".$locus} !~ /$id{$locus.":".$geno{$sample.":".$locus}[$i]}/)) { $idgeno{$sample.":".$locus} .= $id{$locus.":".$geno{$sample.":".$locus}[$i]}; } } } # data present but no observed alleles, so designated null elsif ($read{$sample.":".$locus}) { $idgeno{$sample.":".$locus} = "N"; } # no extracted data else { $idgeno{$sample.":".$locus} = "M"; } } } # sort through loci and individuals and count alleles, then identify polysomy and potential nulls foreach $locus (sort (keys (%allcnt))) { foreach $sample (sort by_num (keys(%samples))) { if($idgeno{$sample.":".$locus} ne "M") { if ($sample && (!($noseg{$sample}))) { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { # count observations for each allele if($geno{$sample.":".$locus}[$i]) { $obs{$locus.":".$id{$locus.":".$geno{$sample.":".$locus}[$i]}}++; } # identify nonparental alleles if(($idgeno{"Mother:".$locus} !~ $id{$locus.":".$geno{$sample.":".$locus}[$i]}) && ($idgeno{"Father:".$locus} !~ $id{$locus.":".$geno{$sample.":".$locus}[$i]})) { $npar{$sample.":".$locus} .= $geno{$sample.":".$locus}[$i].":"; $npar{$locus} = 1; $npar{$sample} .= $locus; } # identify progeny of heterozygotes that lack a parental allele if($idgeno{"Mother:".$locus} =~ /$id{$locus.":".$geno{$sample.":".$locus}[$i]}/) { $mat{$sample.":".$locus} = 1; } if($idgeno{"Father:".$locus} =~ /$id{$locus.":".$geno{$sample.":".$locus}[$i]}/) { $pat{$sample.":".$locus} = 1; } } #identify maternal mismatches if(!($mat{$sample.":".$locus})) { $nomat{$locus}++; } #identify paternal mismatches if(!($pat{$sample.":".$locus})) { $nopat{$locus}++; } } } } # if mismatches in more than specified number of progeny (minnull, at top), null assumed to be segregating if($nomat{$locus} >= $minnull) { $m_null{$locus} = 1; # paternal alleles can be present with null, # so set null geno equal to paternal alleles # for later comparison $null{$locus} .= $idgeno{"Father:".$locus}; # if null is segregating and mother and father share an allele, # progeny with only that allele have ambiguous genotypes foreach $i (0 .. $#{$geno{"Mother:".$locus}}) { if($idgeno{"Father:".$locus} =~ /$id{$locus.":".$geno{"Mother:".$locus}[$i]}/) { $ambig{$locus.":".$id{$locus.":".$geno{"Mother:".$locus}[$i]}} = 1; $expect{$locus.":Mother"} = 0.25*($amp{$locus}); } } # expected number of maternal mismatches if(!($expect{$locus.":Mother"})) { $expect{$locus.":Mother"} = 0.5*($amp{$locus}); } # correct for double-null mother if($idgeno{"Mother:".$locus} eq "N") { $expect{$locus.":Mother"} = $amp{$locus}; } } if($nopat{$locus} >= $minnull) { $p_null{$locus} = 1; # maternal alleles can be present with null, # so set null geno equal to maternal alleles # for later comparison $null{$locus} .= $idgeno{"Mother:".$locus}; # if null is segregating and mother and father share an allele, # progeny with only that allele have ambiguous genotypes foreach $i (0 .. $#{$geno{"Father:".$locus}}) { if($idgeno{"Mother:".$locus} =~ /$id{$locus.":".$geno{"Father:".$locus}[$i]}/) { $ambig{$locus.":".$id{$locus.":".$geno{"Father:".$locus}[$i]}} = 1; $expect{$locus.":Father"} = 0.25*($amp{$locus}); } } if(!( $expect{$locus.":Father"})) { $expect{$locus.":Father"} = 0.5*($amp{$locus}); } # correct for double-null father if($idgeno{"Father:".$locus} eq "N") { $expect{$locus.":Father"} = $amp{$locus}; } } } foreach $locus (sort (keys (%allcnt))) { foreach $sample (sort by_num (keys(%samples))) { # Fix genotypes to account for homozygotes, nulls, and ambiguity if($sample && ($idgeno{$sample.":".$locus} ne "M") && !($noseg{$sample})) { if($idgeno{$sample.":".$locus} =~ tr/[A-LNa-l]// == 1) { #determine if putative homozygote could be null heterozygote if(($null{$locus}) && (($null{$locus} =~ $idgeno{$sample.":".$locus}) || ($idgeno{$sample.":".$locus} eq "N"))) { if($ambig{$locus.":".$idgeno{$sample.":".$locus}}) { #genotype is ambiguous because mother and father share visible allele $idgeno{$sample.":".$locus} .= "_"; } else { #unambiguous null heterozygote $idgeno{$sample.":".$locus} .= "N"; } } # identify unexplained lack of parental alleles and flag as possible aneuploids elsif((!($mat{$sample.":".$locus}) && (($nomat{$locus} < $minnull) || (($null{$locus} !~ $idgeno{$sample.":".$locus}) && ($idgeno{$sample.":".$locus} ne "N")))) || (!($pat{$sample.":".$locus}) && (($nopat{$locus} < $minnull) || (($null{$locus} !~ $idgeno{$sample.":".$locus}) && ($idgeno{$sample.":".$locus} ne "N"))))) { $idgeno{$sample.":".$locus} .= "X"; $aneu{$sample} .= ":".$locus; $aneu{$sample.":".$locus} = 1; } # homozygote, apparently else { $idgeno{$sample.":".$locus} .= $idgeno{$sample.":".$locus}; } } # identify unexplained parental mismatches and nulls (will subsequently be tagged as polysomic) elsif(($idgeno{$sample.":".$locus} =~ tr/[A-La-l]// == 2) && ($sample ne "Father") && ($sample ne "Mother") && ($sample ne $blnk) && !($npar{$sample.":".$locus})) { if(!($mat{$sample.":".$locus})){ if($nomat{$locus} < $minnull) { $idgeno{$sample.":".$locus} .= "X"; $ambmat{$locus}++; } # possibly a polysomic segregating null elsif($ambig{$locus.":".$idgeno{$sample.":".$locus}}) { #genotype is ambiguous because mother and father share visible allele $idgeno{$sample.":".$locus} .= "_"; } else { $idgeno{$sample.":".$locus} .= "N"; } } if(!($pat{$sample.":".$locus})){ if($nopat{$locus} < $minnull) { $idgeno{$sample.":".$locus} .= "X"; $ambpat{$locus}++; } # possibly a polysomic segregating null elsif($ambig{$locus.":".$idgeno{$sample.":".$locus}}) { #genotype is ambiguous because mother and father share visible allele $idgeno{$sample.":".$locus} .= "_"; } else { $idgeno{$sample.":".$locus} .= "N"; } } } # identify possible polysomy if($idgeno{$sample.":".$locus} =~ tr/[A-Na-lX]// == 3) { $polysom{$sample.":".$locus} = 1; $polysom{$sample} .= ":".$locus; # determine which parent contributed two alleles (if any) (@geno) = split(//,$idgeno{$sample.":".$locus}); $tpat=$tmat=0; for $i (0 .. $#geno) { if($idgeno{"Father:".$locus} =~ /$geno[$i]/) { $tpat++; } if($idgeno{"Mother:".$locus} =~ /$geno[$i]/) { $tmat++; } } if($tpat >= 2) { $mpoly++; } if($tmat >= 2) { $fpoly++; } } } } # fix parental genotypes if(!($npar{$locus})) { if ($p_null{$locus}) { $idgeno{"Father:".$locus} .= "N"; } if ($m_null{$locus}) { $idgeno{"Mother:".$locus} .= "N"; } } else { if ($p_null{$locus}) { $idgeno{"Father:".$locus} .= "X"; } if ($m_null{$locus}) { $idgeno{"Mother:".$locus} .= "X"; } } if ($ambpat{$locus} >= $minnull) { $idgeno{"Father:".$locus} .= "X"; } if ($ambmat{$locus} >= $minnull) { $idgeno{"Mother:".$locus} .= "X"; } if($idgeno{"Father:".$locus} =~ tr/[A-LNa-lX]// == 1) { $idgeno{"Father:".$locus} .= $idgeno{"Father:".$locus}; } if($idgeno{"Mother:".$locus} =~ tr/[A-LNa-lX]// == 1) { $idgeno{"Mother:".$locus} .= $idgeno{"Mother:".$locus}; } if($idgeno{"Father:".$locus} =~ tr/[A-LNa-lX]// == 3) { $polysom{"Father:".$locus} = 1; $polysom{"Father"} .= ":".$locus; } if($idgeno{"Mother:".$locus} =~ tr/[A-LNa-lX]// == 3) { $polysom{"Mother:".$locus} = 1; $polysom{"Mother"} .= ":".$locus; } } if($haplo) { # && ($files[-1])) { warn "\nPrinting data in haplotype format\n"; print "Alleles\tSize\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { print $sample,"\t"; } } print "\n"; foreach $locall (sort (keys (%alleles))) { if ($locall !~ /null/) { ($locus,$allele) = split(/:/,$locall); print $locus,"-",$id{$locus.":".$allele},"\t",$allele,"\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { $printed = 0; if(!($amp{$sample.":".$locus})) { print $idgeno{$sample.":".$locus} eq "M" ? "M\t" : "0\t"; } else { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { if(($allele eq $geno{$sample.":".$locus}[$i]) && !($printed)) { print "1\t"; $printed = 1; } } if (!($printed)) { print "0\t"; } } } } print "\n"; # print repulsion phase for each allele print $locus,"-",$all,"r\t",$allele,"r\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { $printed = 0; if(!($amp{$sample.":".$locus})) { print $idgeno{$sample.":".$locus} eq "M" ? "M\t" : "1\t"; } else { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { if(($allele eq $geno{$sample.":".$locus}[$i]) && !($printed)) { print "0\t"; $printed = 1; } } if (!($printed)) { print "1\t"; } } } } print "\n"; } } print "Aneuploid\tNA\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { if(($polysom{$sample}) || ($aneu{$sample})) { print $polysom{$sample},$aneu{$sample},"\t"; } else { print "0\t"; } } } print "\n"; print "Nonparental\tNA\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { if($npar{$sample}) { print $npar{$sample},"\t"; } else { print "0\t"; } } } print "\n\n"; } # determine origins of alleles and expected segregation if($map || $summ) { # && ($files[-1])) { foreach $locall (sort(keys(%obs))) { ($locus,$all) = split(/:/,$locall); # determine if allele came from mother, father, or both if($idgeno{"Mother:".$locus} =~ /$all/) { $ori{$locus.":".$all} = "Mthr"; if($idgeno{"Father:".$locus} =~ /$all/) { $ori{$locus.":".$all} = "Both"; } # homozygote mother if($idgeno{"Mother:".$locus} =~ /$all$all/) { $expect{$locus.":".$all} = $amp{$locus}; } # heterozygote mother elsif($idgeno{"Father:".$locus} =~ /$all/) { # homozygote father if($idgeno{"Father:".$locus} =~ /$all$all/) { $expect{$locus.":".$all} = $amp{$locus}; } # intercross else { $expect{$locus.":".$all} = 0.75*($amp{$locus}); if(!($exallele{$locus.":".$all{$locus.":".$all}})) { $ninter++; } } } # testcross else { $expect{$locus.":".$all} = 0.5*($amp{$locus}); if(!($nmat{$locus} && $dist) && (!($exallele{$locus.":".$all{$locus.":".$all}}))) { $nmat++; $nmat{$locus} = 1; } } } elsif($idgeno{"Father:".$locus} =~ /$all/) { $ori{$locus.":".$all} = "Fthr"; # homozygote father if($idgeno{"Father:".$locus} =~ /$all$all/) { $expect{$locus.":".$all} = $amp{$locus}; } # testcross else { $expect{$locus.":".$all} = 0.5*($amp{$locus}); if(!($npat{$locus} && $dist) && (!($exallele{$locus.":".$all{$locus.":".$all}}))) { $npat++; $npat{$locus} = 1; } } } # nonparental allele else { $ori{$locus.":".$all} = "NP"; $expect{$locus.":".$all} = 0; } } } if($map) { # && ($files[-1])) { warn "\nPrinting data in Mapmaker format\n"; if($dist) { warn "\tfor calculating map distance\n"; } print "data type f2 backcross\n"; print "$nsamps ",$nmat*2," 0\n"; print "# Maternal Backcross Alleles\n"; print "#\t"; foreach $sample (sort by_num (keys(%samps))) { if($sample && ($sample ne $blnk)) { print $sample,"\t"; } } print "\n"; foreach $locallele (sort (keys (%alleles))) { if ($locallele !~ /null/) { # print segregating maternal alleles first ($locus,$allele) = split(/:/,$locallele); $all = $id{$locallele}; if(($expect{$locus.":".$all} < $amp{$locus}) && ($ori{$locus.":".$all} eq "Mthr") && (!($exallele{$locus.":".$allele})) && (!($dist) || !($mprinted{$locus}))) { # the dist option prints only one maternal or paternal allele for each locus print "*"; if($locus =~ /^[0-9]/) { print "s"; } print $locus,"-",$all,"\t"; foreach $sample (sort by_num (keys(%samps))) { if($sample && ($sample ne $blnk)) { $printed = 0; if(!($amp{$sample.":".$locus})) { print $idgeno{$sample.":".$locus} eq "M" ? "-\t" : "A\t"; } else { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { if(($allele eq $geno{$sample.":".$locus}[$i]) && !($printed)) { print "H\t"; $printed = 1; } } if (!($printed)) { print "A\t"; } } } } print "\n"; # print repulsion phase for each allele print "*"; if($locus =~ /^[0-9]/) { print "s"; } print $locus,"-",$all,"r\t"; foreach $sample (sort by_num (keys(%samps))) { if($sample && ($sample ne $blnk)) { $printed = 0; if(!($amp{$sample.":".$locus})) { print $idgeno{$sample.":".$locus} eq "M" ? "-\t" : "H\t"; } else { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { if(($allele eq $geno{$sample.":".$locus}[$i]) && !($printed)) { print "A\t"; $printed = 1; } } if (!($printed)) { print "H\t"; } } } } print "\n"; $mprinted{$locus} = 1; } } } print "\n"; print "# Paternal Backcross Alleles\n"; print "data type f2 backcross\n"; print "$nsamps ",$npat*2," 0\n"; print "#\t"; foreach $sample (sort by_num (keys(%samps))) { if($sample && ($sample ne $blnk)) { print $sample,"\t"; } } print "\n"; foreach $locallele (sort (keys (%alleles))) { if ($locallele !~ /null/) { # print segregating paternal alleles ($locus,$allele) = split(/:/,$locallele); $all = $id{$locallele}; if(($expect{$locus.":".$all} < $amp{$locus}) && ($ori{$locus.":".$all} eq "Fthr") && (!($exallele{$locus.":".$allele})) && (!($dist) || !($pprinted{$locus}))) { print "*"; if($locus =~ /^[0-9]/) { print "s"; } print $locus,"-",$all,"\t"; foreach $sample (sort by_num (keys(%samps))) { if($sample && ($sample ne $blnk)) { $printed = 0; if(!($amp{$sample.":".$locus})) { print $idgeno{$sample.":".$locus} eq "M" ? "-\t" : "A\t"; } else { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { if(($allele eq $geno{$sample.":".$locus}[$i]) && !($printed)) { print "H\t"; $printed = 1; } } if (!($printed)) { print "A\t"; } } } } print "\n"; # print repulsion phase for each allele print "*"; if($locus =~ /^[0-9]/) { print "s"; } print $locus,"-",$all,"r\t"; foreach $sample (sort by_num (keys(%samps))) { if($sample && ($sample ne $blnk)) { $printed = 0; if(!($amp{$sample.":".$locus})) { print $idgeno{$sample.":".$locus} eq "M" ? "-\t" : "H\t"; } else { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { if(($allele eq $geno{$sample.":".$locus}[$i]) && !($printed)) { print "A\t"; $printed = 1; } } if (!($printed)) { print "H\t"; } } } } print "\n"; $pprinted{$locus} = 1; } } } print "\n"; # don't use intercross alleles for calculating map distance if(!($dist)) { print "# Intercross Alleles\n"; print "data type f2 Intercross\n"; print "$nsamps ",$ninter," 0\n"; print "#\t"; foreach $sample (sort by_num (keys(%samps))) { if($sample && ($sample ne $blnk)) { print $sample,"\t"; } } print "\n"; foreach $locallele (sort (keys (%alleles))) { if ($locallele !~ /null/) { # print intercross alleles ($locus,$allele) = split(/:/,$locallele); $all = $id{$locallele}; if(($expect{$locus.":".$all} < $amp{$locus}) && (!($exallele{$locus.":".$allele})) && ($ori{$locus.":".$all} eq "Both")) { print "*"; if($locus =~ /^[0-9]/) { print "s"; } print $locus,"-",$all,"\t"; foreach $sample (sort by_num (keys(%samps))) { if($sample && ($sample ne $blnk)) { $printed = 0; if(!($amp{$sample.":".$locus})) { print $idgeno{$sample.":".$locus} eq "M" ? "-\t" : "A\t"; } else { for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { if(($allele eq $geno{$sample.":".$locus}[$i]) && !($printed)) { print "H\t"; $printed = 1; } } if (!($printed)) { print "A\t"; } } } } print "\n"; # # don't print repulsion phase for each allele # print "*"; # if($locus =~ /^[0-9]/) { # print "s"; # } # print $locus,"-",$all,"r\t"; # foreach $sample (sort by_num (keys(%samps))) { # if($sample && ($sample ne $blnk)) { # $printed = 0; # if(!($amp{$sample.":".$locus})) { # print $idgeno{$sample.":".$locus} eq "M" ? "-\t" : "H\t"; # } # else { # for($i=0;$i<=$#{$geno{$sample.":".$locus}};$i++) { # if(($allele eq $geno{$sample.":".$locus}[$i]) && !($printed)) { # print "A\t"; # $printed = 1; # } # } # if (!($printed)) { print "H\t"; } # } # } # } # print "\n"; } } } print "\n\n"; } } if ($geno) { # && ($files[-1])) { warn "\nPrinting data in genotype format\n"; print "Locus\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { print $sample,"\t"; } } print "\n"; foreach $locus (sort (keys (%allcnt))) { print $locus,"\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { print $idgeno{$sample.":".$locus},"\t"; } } print "\n"; } print "Aneuploid\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { if($polysom{$sample} || $aneu{$sample}) { print $polysom{$sample},$aneu{$sample},"\t"; } else { print "0\t"; } } } print "\n"; print "Nonparental\t"; foreach $sample (sort by_num (keys(%samples))) { if($sample && ($sample ne $blnk)) { if($npar{$sample}) { print $npar{$sample},"\t"; } else { print "0\t"; } } } print "\n\n"; } if (($summ)) { # && ($files[-1])) { warn "\nPrinting summary of: \tsegregation \tmismatches \tnonparental alleles \tpossible aneuploidy \tmissing data \tand conficting repeated samples\n"; # first determine matches for parents and grandparents for each observed allele foreach $locus (sort(keys(%allcnt))) { @all = split(//,$idgeno{"Father:".$locus}); for $i (0 .. $#all) { if($all[$i] !~ /N|M/) { if($idgeno{$p_gfthr.":".$locus} =~ /$all[$i]/) { $fmatch{$p_gfthr.":".$locus.":".$all[$i]}++; $fmatch{$p_gfthr.":".$locus}++; } if($idgeno{$p_gmthr.":".$locus} =~ /$all[$i]/) { $fmatch{$p_gmthr.":".$locus.":".$all[$i]}++; $fmatch{$p_gmthr.":".$locus}++; } } } @all = split(//,$idgeno{"Mother:".$locus}); for $i (0 .. $#all) { if($all[$i] !~ /N|M/) { if($idgeno{$m_gfthr.":".$locus} =~ /$all[$i]/) { $mmatch{$m_gfthr.":".$locus.":".$all[$i]}++; $mmatch{$m_gfthr.":".$locus}++; } if($idgeno{$m_gmthr.":".$locus} =~ /$all[$i]/) { $mmatch{$m_gmthr.":".$locus.":".$all[$i]}++; $mmatch{$m_gmthr.":".$locus}++; } } } } print "Segregation_Analysis\n"; print "Locus\tAllele\tAlleleID\tOrigin\tSpecies\tMother\tFather\tN\tExpected\tObserved\tChi-Sq\tMu\n"; foreach $locall (sort(keys(%obs))) { ($locus,$all) = split(/:/,$locall); print "$locus\t",$all{$locus.":".$all},"\t$all\t"; # determine if allele came from mother, father, or both if($idgeno{"Mother:".$locus} =~ /$all/) { $ori{$locus.":".$all} = "Mthr"; if($idgeno{"Father:".$locus} =~ /$all/) { $ori{$locus.":".$all} = "Both"; } # homozygote mother if($idgeno{"Mother:".$locus} =~ /$all$all/) { $expect{$locus.":".$all} = $amp{$locus}; } # heterozygote mother elsif($idgeno{"Father:".$locus} =~ /$all/) { # homozygote father if($idgeno{"Father:".$locus} =~ /$all$all/) { $expect{$locus.":".$all} = $amp{$locus}; } # intercross else { $expect{$locus.":".$all} = 0.75*($amp{$locus}); } } # testcross else { $expect{$locus.":".$all} = 0.5*($amp{$locus}); } } elsif($idgeno{"Father:".$locus} =~ /$all/) { $ori{$locus.":".$all} = "Fthr"; # homozygote father if($idgeno{"Father:".$locus} =~ /$all$all/) { $expect{$locus.":".$all} = $amp{$locus}; } # testcross else { $expect{$locus.":".$all} = 0.5*($amp{$locus}); } } # nonparental allele else { $ori{$locus.":".$all} = "NP"; $expect{$locus.":".$all} = 0; } # determine species origin of alleles if(($ori{$locus.":".$all} eq "NP") || (($ori{$locus.":".$all} eq "Fthr") && ($unk{$fthr})) || (($ori{$locus.":".$all} eq "Mthr") && ($unk{$mthr}))) { $sp{$all.":".$locus} = "Unk"; } else { if (($ori{$locus.":".$all} eq "Fthr") || ($ori{$locus.":".$all} eq "Both")) { if($sp{$fthr} =~ tr/[a-zA-Z]// == 1) { $sp{$all.":".$locus} = $sp{$fthr}; } else { # grandfather data available if($read{$p_gfthr.":".$locus}) { if($fmatch{$p_gfthr.":".$locus.":".$all}) { # grandmother data available if($read{$p_gmthr.":".$locus}) { if($fmatch{$p_gmthr.":".$locus.":".$all}) { if($fmatch{$p_gfthr.":".$locus.":".$all} > 1) { # parent is homozygote that matches both grandparents if($fmatch{$p_gmthr.":".$locus.":".$all} > 1) { $sp{$all.":".$locus} = "Both"; } # this shouldn't happen, but just in case else { $sp{$all.":".$locus} = $sp{$p_gfthr}; } } # this shouldn't happen either elsif($fmatch{$p_gmthr.":".$locus.":".$all} > 1) { $sp{$all.":".$locus} = $sp{$p_gmthr}; } # if both match, and one matches two alleles and # the other only matches one, assign to the one # that matches one allele elsif(($fmatch{$p_gmthr.":".$locus} > 1) && ($fmatch{$p_gfthr.":".$locus} == 1)) { $sp{$all.":".$locus} = $sp{$p_gfthr}; } elsif(($fmatch{$p_gfthr.":".$locus} > 1) && ($fmatch{$p_gmthr.":".$locus} == 1)) { $sp{$all.":".$locus} = $sp{$p_gmthr}; } # both match same (single) allele in heterozygous progeny else { if($p_null{$locus}) { # give visible allele to grandparent with two visible alleles if(($idgeno{$p_gfthr.":".$locus} eq $all) && ($idgeno{$p_gmthr.":".$locus} ne $all)) { $sp{$all.":".$locus} = $sp{$p_gmthr}; } elsif(($idgeno{$p_gmthr.":".$locus} eq $all) && ($idgeno{$p_gfthr.":".$locus} ne $all)) { $sp{$all.":".$locus} = $sp{$p_gfthr}; } else { $sp{$all.":".$locus} = "Shared"; } } # no segregating nulls, yet grandparents only match one allele. # Hopefully unlikely else { $sp{$all.":".$locus} = "Shared"; } } } # grandfather matches, grandmother doesn't match else{ # if grandma matches another allele, or a null is segregating if(($fmatch{$p_gmthr.":".$locus}) || ($p_null{$locus})) { $sp{$all.":".$locus} = $sp{$p_gfthr}; } # something is strange else { $sp{$all.":".$locus} = $sp{$p_gfthr}."?"; } } } # no grandmother data else { # matches for one allele if($fmatch{$p_gfthr.":".$locus} == 1) { # if more than one visible allele in progeny, # matched allele is from grandfather if($idgeno{"Father:".$locus} =~ tr/[A-La-l]// > 1) { $sp{$all.":".$locus} = $sp{$p_gfthr}; } # assuming diploidy, heterozygous parent that matches for only one allele elsif($idgeno{$p_gfthr.":".$locus} =~ tr/[A-La-l]// > 1) { $sp{$all.":".$locus} = $sp{$p_gfthr}; } else { $sp{$all.":".$locus} = $sp{$p_gfthr}."?"; } } # progeny is homozygous for matching allele elsif($fmatch{$p_gfthr.":".$locus.":".$all} > 1) { $sp{$all.":".$locus} = "Both"; } # matches for multiple alleles, so insufficient data else { $sp{$all.":".$locus} = $sp{$p_gfthr}."?"; } } } # grandfather doesn't match else { # grandmother data available if($read{$p_gmthr.":".$locus}) { # grandfather doesn't match, grandma does if($fmatch{$p_gmthr.":".$locus.":".$all}) { # segregating null or grandpa matches at another locus if(($fmatch{$p_gfthr.":".$locus}) || ($p_null{$locus})) { $sp{$all.":".$locus} = $sp{$p_gmthr}; } # something is strange else { $sp{$all.":".$locus} = $sp{$p_gmthr}."?"; } } # hopefully this will be rare: neither grandparent matches else { $sp{$all.":".$locus} = "Unk"; } } # grandfather doesn't match, give allele to unamplified grandma # if grandpa matches another allele or grandpa has only one allele # and maternal null is segregating elsif(($fmatch{$p_gfthr.":".$locus}) || (($idgeno{$p_gfthr.":".$locus} =~ tr/[A-LNa-l]// == 1) && $p_null{$locus})) { $sp{$all.":".$locus} = $sp{$p_gmthr}; } # if observed grandparent doesn't match anything, and there are no nulls else { $sp{$all.":".$locus} = "Unk"; } } } # no grandfather data else { if($read{$p_gmthr.":".$locus}) { if($fmatch{$p_gmthr.":".$locus.":".$all}) { # matches for one allele if($fmatch{$p_gmthr.":".$locus} == 1) { # if more than one visible allele in progeny, # matched allele is from grandfather if($idgeno{"Father:".$locus} =~ tr/[A-La-l]// > 1) { $sp{$all.":".$locus} = $sp{$p_gmthr}; } # assuming diploidy, heterozygous parent that matches for only one allele elsif($idgeno{$p_gmthr.":".$locus} =~ tr/[A-La-l]// > 1) { $sp{$all.":".$locus} = $sp{$p_gmthr}; } else { $sp{$all.":".$locus} = $sp{$p_gmthr}."?"; } } # progeny is homozygous for matching allele elsif($fmatch{$p_gmthr.":".$locus.":".$all} > 1) { $sp{$all.":".$locus} = "Both"; } # multiple matching alleles, so indistinguishable else { $sp{$all.":".$locus} = $sp{$p_gmthr}."?"; } } # grandmother doesn't match, give allele to unamplified grandpa # if grandma matches another allele or grandma has only one allele # and paternal null is segregating elsif(($fmatch{$p_gmthr.":".$locus}) || (($idgeno{$p_gmthr.":".$locus} =~ tr/[A-LNa-l]// == 1) && $p_null{$locus})) { $sp{$all.":".$locus} = $sp{$p_gfthr}; } # if observed grandparent doesn't match anything, and there are no nulls else { $sp{$all.":".$locus} = "Unk"; } } # no grandparent data. Should never get here, but just in case. else { $sp{$all.":".$locus} = "Unk"; } } } } $sp1{$all.":".$locus} = $sp{$all.":".$locus}; # do it again for maternal alleles if (($ori{$locus.":".$all} eq "Mthr") || ($ori{$locus.":".$all} eq "Both")) { if($sp{$mthr} =~ tr/[a-zA-Z]// == 1) { $sp{$all.":".$locus} = $sp{$mthr}; } else { # grandfather data available if($read{$m_gfthr.":".$locus}) { if($mmatch{$m_gfthr.":".$locus.":".$all}) { # grandmother data available if($read{$m_gmthr.":".$locus}) { if($mmatch{$m_gmthr.":".$locus.":".$all}) { if($mmatch{$m_gfthr.":".$locus.":".$all} > 1) { # parent is homozygote that matches both grandparents if($mmatch{$m_gmthr.":".$locus.":".$all} > 1) { $sp{$all.":".$locus} = "Both"; } # this shouldn't happen, but just in case else { $sp{$all.":".$locus} = $sp{$m_gfthr}; } } # this shouldn't happen either elsif($mmatch{$m_gmthr.":".$locus.":".$all} > 1) { $sp{$all.":".$locus} = $sp{$m_gmthr}; } # if both match, and one matches two alleles and # the other only matches one, assign to the one # that matches one allele elsif(($mmatch{$m_gmthr.":".$locus} > 1) && ($mmatch{$m_gfthr.":".$locus} == 1)) { $sp{$all.":".$locus} = $sp{$m_gfthr}; } elsif(($mmatch{$m_gfthr.":".$locus} > 1) && ($mmatch{$m_gmthr.":".$locus} == 1)) { $sp{$all.":".$locus} = $sp{$m_gmthr}; } # both match same (single) allele in heterozygous progeny else { if($m_null{$locus}) { if(($idgeno{$m_gfthr.":".$locus} eq $all) && ($idgeno{$m_gmthr.":".$locus} ne $all)) { $sp{$all.":".$locus} = $sp{$m_gmthr}; } elsif(($idgeno{$m_gmthr.":".$locus} eq $all) && ($idgeno{$m_gfthr.":".$locus} ne $all)) { $sp{$all.":".$locus} = $sp{$m_gfthr}; } else { $sp{$all.":".$locus} = "Shared"; } } # no segregating nulls, yet grandparents only match one allele. # Hopefully unlikely else { $sp{$all.":".$locus} = "Shared"; } } } # grandfather matches, grandmother doesn't match else{ # segregating null or grandpa matches at another locus if(($mmatch{$m_gmthr.":".$locus}) || ($m_null{$locus})) { $sp{$all.":".$locus} = $sp{$m_gfthr}; } # something is strange else { $sp{$all.":".$locus} = $sp{$m_gfthr}."?"; } } } # no grandmother data else{ # matches for one allele if($mmatch{$m_gfthr.":".$locus} == 1) { # if more than one visible allele in progeny, # matched allele is from grandfather if($idgeno{"Mother:".$locus} =~ tr/[A-La-l]// > 1) { $sp{$all.":".$locus} = $sp{$m_gfthr}; } # assuming diploidy, heterozygous parent that matches for only one allele elsif($idgeno{$m_gfthr.":".$locus} =~ tr/[A-La-l]// > 1) { $sp{$all.":".$locus} = $sp{$m_gfthr}; } else { $sp{$all.":".$locus} = $sp{$m_gfthr}."?"; } } # progeny is homozygous for matching allele elsif($mmatch{$m_gfthr.":".$locus.":".$all} > 1) { $sp{$all.":".$locus} = "Both"; } else { $sp{$all.":".$locus} = $sp{$m_gfthr}."?"; } } } # grandfather doesn't match else{ # grandmother data available if($read{$m_gmthr.":".$locus}) { # grandfather doesn't match, grandma does if($mmatch{$m_gmthr.":".$locus.":".$all}) { # segregating null or grandpa matches at another locus if(($mmatch{$m_gfthr.":".$locus}) || ($m_null{$locus})) { $sp{$all.":".$locus} = $sp{$m_gmthr}; } # something is strange else { $sp{$all.":".$locus} = $sp{$m_gmthr}."?"; } } # hopefully this will be rare: neither grandparent matches else { $sp{$all.":".$locus} = "Unk"; } } # grandfather doesn't match, give allele to unamplified grandma # if grandpa matches another allele or grandpa has only one allele # and maternal null is segregating elsif(($mmatch{$m_gfthr.":".$locus}) || (($idgeno{$m_gfthr.":".$locus} =~ tr/[A-LNa-l]// == 1) && $m_null{$locus})) { $sp{$all.":".$locus} = $sp{$m_gmthr}; } # if observed grandparent doesn't match anything, and there are no nulls else { $sp{$all.":".$locus} = "Unk"; } } } # no grandfather data else { # grandmother data, no grandfather data if($read{$m_gmthr.":".$locus}) { if($mmatch{$m_gmthr.":".$locus.":".$all}) { # matches for one allele if($mmatch{$m_gmthr.":".$locus} == 1) { # if more than one visible allele in progeny, # matched allele is from grandmother if($idgeno{"Mother:".$locus} =~ tr/[A-La-l]// > 1) { $sp{$all.":".$locus} = $sp{$m_gmthr}; } # assuming diploidy, heterozygous parent that matches for only one allele elsif($idgeno{$m_gmthr.":".$locus} =~ tr/[A-La-l]// > 1) { $sp{$all.":".$locus} = $sp{$m_gmthr}; } else { $sp{$all.":".$locus} = $sp{$m_gmthr}."?"; } } # progeny is homozygous for matching allele elsif($mmatch{$m_gmthr.":".$locus.":".$all} > 1) { $sp{$all.":".$locus} = "Both"; } # multiple matching alleles, so indistinguishable else { $sp{$all.":".$locus} = $sp{$m_gmthr}."?"; } } # grandmother doesn't match, give allele to unamplified grandpa # if grandma matches another allele or grandma has only one allele # and maternal null is segregating elsif(($mmatch{$m_gmthr.":".$locus}) || (($idgeno{$m_gmthr.":".$locus} =~ tr/[A-LNa-l]// == 1) && $m_null{$locus})) { $sp{$all.":".$locus} = $sp{$m_gfthr}; } # if observed grandparent doesn't match anything, and there are no nulls else { $sp{$all.":".$locus} = "Unk"; } } # no grandparent data. Should never get here, but just in case else { $sp{$all.":".$locus} = "Unk"; } } } } } # for alleles present in both parents if(($sp1{$all.":".$locus}) && ($sp{$all.":".$locus}) && ($sp1{$all.":".$locus} ne $sp{$all.":".$locus})) { # unknown trumps all if(($sp1{$all.":".$locus} eq "Unk") || ($sp{$all.":".$locus} eq "Unk")) { $sp{$all.":".$locus} = "Unk"; } # if shared in one parent, shared in both elsif(($sp1{$all.":".$locus} eq "Shared") || ($sp{$all.":".$locus} eq "Shared")) { $sp{$all.":".$locus} = "Shared"; } # if both in one parent, both in both elsif(($sp1{$all.":".$locus} eq "Both") || ($sp{$all.":".$locus} eq "Both")) { $sp{$all.":".$locus} = "Both"; } # if species identity is the same, but questionable for one elsif((($sp1{$all.":".$locus} =~ /\?/) || ($sp{$all.":".$locus} =~ /\?/)) && (substr($sp{$all.":".$locus},0,1) eq substr($sp1{$all.":".$locus},0,1))) { $sp{$all.":".$locus} = (substr($sp{$all.":".$locus},0,1))."?"; } # each has a different species designation elsif(($sp{$all.":".$locus} =~ tr/[A-Za-z]// == 1) && ($sp{$all.":".$locus} =~ tr/[A-Za-z]// == 1)) { $sp{$all.":".$locus} = "Shared"; } # safety valve for unanticipated cases else { $sp{$all.":".$locus} = "Unk"; } } print $ori{$locus.":".$all},"\t",$sp{$all.":".$locus},"\t"; print $idgeno{"Mother:".$locus},"\t",$idgeno{"Father:".$locus},"\t"; print $amp{$locus},"\t",$expect{$locus.":".$all},"\t",$obs{$locus.":".$all},"\t"; # calculate chi-square for segregation distortion, assuming diploidy $o1 = $obs{$locus.":".$all}; $e1 = $expect{$locus.":".$all}; $o2 = $amp{$locus}-$o1; $e2 = $amp{$locus}-$e1; if ($e1 && $e2) { $chi{$locus.":".$all} = (((($o1-$e1)-0.5)**2)/$e1) + (((($o2-$e2)-0.5)**2)/$e2); } else { $chi{$locus.":".$all} = 0; } if($chi{$locus.":".$all} > 3.84) { $distort{$sp{$all.":".$locus}}++; if($ori{$locus.":".$all} eq "Fthr") { $mdistort++; } elsif($ori{$locus.":".$all} eq "Mthr") { $fdistort++; } } # calculate mu for segregation distortion $po = $o1/$amp{$locus}; $pe = $e1/$amp{$locus}; $qe= 1-$pe; $muex = (($pe*$qe)/$amp{$locus})**0.5; if($muex) { $mu{$locus.":".$all} = ($po-$pe)/$muex; } else { $mu{$locus.":".$all} = 0; } print $chi{$locus.":".$all},"\t",$mu{$locus.":".$all},"\n"; } # null segregation assessed based on mismatches with each parent foreach $locus (sort (keys (%allcnt))) { # if($nomat{$locus} > $minnull) { # $ori{"null:".$locus} = "Mthr"; # if($nopat{$locus} > $minnull) { # $ori{"null:".$locus} = "Both"; # } # } # elsif($nopat{$locus} > $minnull) { # $ori{"null:".$locus} = "Fthr"; # } if($m_null{$locus}) { # determine species origin of nulls if($unk{$mthr}) { $sp{"MNull:".$locus} = "Unk"; } elsif($sp{$mthr} =~ tr/[a-zA-Z]// == 1) { $sp{"MNull:".$locus} = $sp{$mthr}; } # double nulls got one from each grandparent elsif($idgeno{"Mother:".$locus} =~ /^N/) { $sp{"MNull:".$locus} = "Both"; } else { if($read{$m_gmthr.":".$locus}) { if($mmatch{$m_gmthr.":".$locus}) { if($read{$m_gfthr.":".$locus}) { # both parent genotypes match, so ambiguous if($mmatch{$m_gfthr.":".$locus}) { # grandmother is hetero, probably not source of null if($idgeno{$m_gmthr.":".$locus} =~ tr/A-La-l// > 1) { # grandpa is homo if($idgeno{$m_gfthr.":".$locus} =~ tr/A-La-l// == 1) { $sp{"MNull:".$locus} = $sp{$m_gfthr}; } # grandpa and grandma hetero (unlikely and unexplainable in diploid world) else { $sp{"MNull:".$locus} = "Unk"; } } # grandma is homo else { # grandpa is hetero if($idgeno{$m_gfthr.":".$locus} =~ tr/A-La-l// > 1) { $sp{"MNull:".$locus} = $sp{$m_gmthr}; } # both homozygotes else { $sp{"MNull:".$locus} = "Unk"; } } } # grandma matches, grandpa doesn't else { if($idgeno{$m_gfthr.":".$locus} =~ tr/A-LNa-l// == 1) { $sp{"MNull:".$locus} = $sp{$m_gfthr}; } else { $sp{"MNull:".$locus} = "Unk"; } } } # data only available for grandmother else { # can't tell if only one visible allele that matches if($idgeno{$m_gmthr.":".$locus} =~ tr/A-La-l// == 1) { $sp{"MNull:".$locus} = "Unk"; } # assuming diploidy, heterozygous matching # grandmother must mean grandfather contributed # null else { $sp{"MNull:".$locus} = $sp{$m_gfthr}; } } } # grandmother genotype doesn't match else { if($read{$m_gfthr.":".$locus}) { if($mmatch{$m_gfthr.":".$locus}) { if($idgeno{$m_gmthr.":".$locus} =~ tr/A-LNa-l// == 1) { $sp{"MNull:".$locus} = $sp{$m_gmthr}; } else { $sp{"MNull:".$locus} = "Unk"; } } # if they're both read and neither match, # and it's not null homozygote, then # something's wrong else { $sp{"MNull:".$locus} = "Unk"; } } else { # if grandma is homozygote and doesn't match, assume grandma gave null if($idgeno{$m_gmthr.":".$locus} =~ tr/A-La-lN// == 1) { $sp{"MNull:".$locus} = $sp{$m_gmthr}; } # if grandma is a mismatching heterozygote, something's wrong else { $sp{"MNull:".$locus} = "Unk"; } } } } # cases for unamplified grandma else { if($read{$m_gfthr.":".$locus}) { if($mmatch{$m_gfthr.":".$locus}) { # if only visible allele matches, can't tell if($idgeno{$m_gfthr.":".$locus} =~ tr/A-La-l// == 1) { $sp{"MNull:".$locus} = "Unk"; } # assuming diploidy, heterozygous matching # grandfather must mean grandmother contributed # null else { $sp{"MNull:".$locus} = $sp{$m_gmthr}; } } else { # if grandpa is homozygote and doesn't match, assume grandpa gave null if($idgeno{$m_gfthr.":".$locus} =~ tr/A-La-lN// == 1) { $sp{"MNull:".$locus} = $sp{$m_gfthr}; } # mismatching heterozygote makes no sense in diploid world else { $sp{"MNull:".$locus} = "Unk"; } } } # no data for grandparents. This is covered by first if, but doesn't hurt. else { $sp{"MNull:".$locus} = "Unk"; } } } print $locus,"\tNull\tN\tMthr\t",$sp{"MNull:".$locus},"\t",$idgeno{"Mother:".$locus},"\t", $idgeno{"Father:".$locus},"\t",$amp{$locus},"\t"; $o1 = $nomat{$locus}; $e1 = $expect{$locus.":Mother"}; $o2 = $amp{$locus}-$o1; $e2 = $amp{$locus}-$e1; if ($e1 && $e2) { $chi{$locus.":Mthrnull"} = (((($o1-$e1)-0.5)**2)/$e1) + (((($o2-$e2)-0.5)**2)/$e2); } else { $chi{$locus.":Mthrnull"} = 0; } # probably doesn't make sense to count null distortion # if($chi{$locus.":Mthrnull"} > 3.84) { # $fdistort++; # } # calculate mu for segregation distortion $po = $o1/$amp{$locus}; $pe = $e1/$amp{$locus}; $qe= 1-$pe; $muex = (($pe*$qe)/$amp{$locus})**0.5; if($muex) { $mu{$locus.":".$all} = ($po-$pe)/$muex; } else { $mu{$locus.":".$all} = 0; } print $expect{$locus.":Mother"} ? $expect{$locus.":Mother"} : "0", "\t",$nomat{$locus},"\t",$chi{$locus.":Mthrnull"},"\t",$mu{$locus.":".$all},"\n"; } # do it over again for paternal nulls if($p_null{$locus}) { if($unk{$fthr}) { $sp{"FNull:".$locus} = "Unk"; } elsif($sp{$fthr} =~ tr/[a-zA-Z]// == 1) { $sp{"FNull:".$locus} = $sp{$fthr}; } # double nulls got one from each grandparent elsif($idgeno{"Father:".$locus} =~ /^N/) { $sp{"FNull:".$locus} = "Both"; } else { if($read{$p_gmthr.":".$locus}) { if($fmatch{$p_gmthr.":".$locus}) { if($read{$p_gfthr.":".$locus}) { # both parent genotypes match, so ambiguous if($fmatch{$p_gfthr.":".$locus}) { # grandmother is hetero, probably not source of null if($idgeno{$p_gmthr.":".$locus} =~ tr/A-La-l// > 1) { # grandpa is homo if($idgeno{$p_gfthr.":".$locus} =~ tr/A-La-l// == 1) { $sp{"FNull:".$locus} = $sp{$p_gfthr}; } # grandpa and grandma hetero (unlikely and unexplainable in diploid world) else { $sp{"FNull:".$locus} = "Unk"; } } # grandma is homo else { # grandpa is hetero if($idgeno{$p_gfthr.":".$locus} =~ tr/A-La-l// > 1) { $sp{"FNull:".$locus} = $sp{$p_gmthr}; } # both homozygotes else { $sp{"FNull:".$locus} = "Unk"; } } } # grandma matches, grandpa doesn't else { if($idgeno{$p_gfthr.":".$locus} =~ tr/A-LNa-l// == 1) { $sp{"FNull:".$locus} = $sp{$p_gfthr}; } else { $sp{"FNull:".$locus} = "Unk"; } } } # data only available for grandmother else { # can't tell if only one visible allele that matches if($idgeno{$p_gmthr.":".$locus} =~ tr/A-La-l// == 1) { $sp{"FNull:".$locus} = "Unk"; } # assuming diploidy, heterozygous matching # grandmother must mean grandfather contributed # null else { $sp{"FNull:".$locus} = $sp{$p_gfthr}; } } } # grandmother genotype doesn't match else { if($read{$p_gfthr.":".$locus}) { if($fmatch{$p_gfthr.":".$locus}) { if($idgeno{$p_gmthr.":".$locus} =~ tr/A-LNa-l// == 1) { $sp{"FNull:".$locus} = $sp{$p_gmthr}; } else { $sp{"FNull:".$locus} = "Unk"; } } # if they're both read and neither match, # and it's not null homozygote, then # something's wrong else { $sp{"FNull:".$locus} = "Unk"; } } else { # if grandma is homozygote and doesn't match, assume grandma gave null if($idgeno{$p_gmthr.":".$locus} =~ tr/A-La-lN// == 1) { $sp{"FNull:".$locus} = $sp{$p_gmthr}; } # if grandma is a mismatching heterozygote, something's wrong else { $sp{"FNull:".$locus} = "Unk"; } } } } # cases for unamplified grandma else { if($read{$p_gfthr.":".$locus}) { if($fmatch{$p_gfthr.":".$locus}) { # can't tell if only one visible allele that matches if($idgeno{$p_gfthr.":".$locus} =~ tr/A-La-l// == 1) { $sp{"FNull:".$locus} = "Unk"; } # assuming diploidy, heterozygous matching # grandfather must mean grandmother contributed # null else { $sp{"FNull:".$locus} = $sp{$p_gmthr}; } } else { # if grandpa is homozygote and doesn't match, assume grandpa gave null if($idgeno{$p_gfthr.":".$locus} =~ tr/A-La-lN// == 1) { $sp{"FNull:".$locus} = $sp{$p_gfthr}; } # if grandpa is a mismatching heterozygote, something's wrong else { $sp{"FNull:".$locus} = "Unk"; } } } # no data for grandparents. This is covered by first if, but doesn't hurt. else { $sp{"FNull:".$locus} = "Unk"; } } } print $locus,"\tNull\tN\tFthr\t",$sp{"FNull:".$locus},"\t",$idgeno{"Mother:".$locus},"\t", $idgeno{"Father:".$locus},"\t",$amp{$locus},"\t"; $o1 = $nopat{$locus}; $e1 = $expect{$locus.":Father"}; $o2 = $amp{$locus}-$o1; $e2 = $amp{$locus}-$e1; if ($e1 && $e2) { $chi{$locus.":Fthrnull"} = (((($o1-$e1)-0.5)**2)/$e1) + (((($o2-$e2)-0.5)**2)/$e2); } else { $chi{$locus.":Fthrnull"} = 0; } # Probably shouldn't count null distortion # if($chi{$locus.":Fthrnull"} > 3.84) { # $mdistort++; # } # calculate mu for segregation distortion $po = $o1/$amp{$locus}; $pe = $e1/$amp{$locus}; $qe= 1-$pe; $muex = (($pe*$qe)/$amp{$locus})**0.5; if($muex) { $mu{$locus.":".$all} = ($po-$pe)/$muex; } else { $mu{$locus.":".$all} = 0; } print $expect{$locus.":Father"} ? $expect{$locus.":Father"} : "0", "\t",$nopat{$locus},"\t",$chi{$locus.":Fthrnull"},"\t",$mu{$locus.":".$all},"\n"; } } print "**********************************************************************\n"; print "\nParental Summary\n"; print "\n\tMother\tFather\n"; foreach $locus (sort(keys(%allcnt))) { (@fgeno) = split(//,$idgeno{"Mother:".$locus}); (@mgeno) = split(//,$idgeno{"Father:".$locus}); if($fgeno[0]) { $floci++; if($fgeno[0] ne $fgeno[1]) { $fhetero++; } for $i (0 .. $#fgeno) { if(!($fcounted{$locus.":".$fgeno[$i]}) && ($fgeno[$i] !~ /X|M|N/)) { $falleles++; $fcounted{$locus.":".$fgeno[$i]}=1; } } # identify polysomic mother geno if(($idgeno{"Mother:".$locus} =~ tr/A-La-lN//) >= 3) { for $i (0 .. $#mgeno) { $poly{$sp{$mgeno[$i].":".$locus}.":".$locus}++; } } if($idgeno{"Mother:".$locus} =~ /X/) { $fnonpar++; } if($idgeno{"Mother:".$locus} =~ /N/) { $fnull++; } } if($mgeno[0]) { $mloci++; if($mgeno[0] ne $mgeno[1]) { $mhetero++; } for $i (0 .. $#mgeno) { if(!($mcounted{$locus.":".$mgeno[$i]}) && ($mgeno[$i] !~ /X|M|N/)) { $malleles++; $mcounted{$locus.":".$mgeno[$i]}=1; } } # identify polysomic father geno if(($idgeno{"Father:".$locus} =~ tr/A-La-lN//) >= 3) { for $i (0 .. $#fgeno) { $poly{$sp{$fgeno[$i].":".$locus}.":".$locus}++; } } if($idgeno{"Father:".$locus} =~ /X/) { $mnonpar++; } if($idgeno{"Father:".$locus} =~ /N/) { $mnull++; } } } print "Loci\t",$floci ? $floci : 0,"\t",$mloci ? $mloci : 0,"\n"; print "Alleles\t",$falleles ? $falleles : 0,"\t",$malleles ? $malleles : 0,"\n"; print "Heterozygosity\t",$floci ? $fhetero/$floci : 0,"\t",$mloci ? $mhetero/$mloci : 0,"\n"; print "Nulls\t",$fnull ? $fnull : 0,"\t",$mnull ? $mnull : 0,"\n"; print "Nonparental Alleles\t",$fnonpar ? $fnonpar : 0,"\t",$mnonpar ? $mnonpar : 0,"\n"; print "Unreduced Gametes\t",$fpoly ? $fpoly : 0,"\t",$mpoly ? $mpoly : 0,"\n"; print "Segregation Distortion\t",$fdistort ? $fdistort : 0,"\t",$mdistort ? $mdistort : 0,"\n"; print "**********************************************************************\n"; print "\nSpecies Summary\n\n"; foreach $all_loc (sort(keys(%sp))) { ($all,$loc) = split(/:/,$all_loc); if($loc) { if(!($cntd{$all_loc})) { $loc{$sp{$all_loc}}++; $cntd{$all_loc}=1; } if($all =~ /[F|M]Null/) { $null{$sp{$all_loc}}++; } elsif(($all) && !($acntd{$all_loc})) { $acntd{$all_loc}=1; $alls{$sp{$all_loc}}++; } } } foreach $loc (sort(keys(%allcnt))) { foreach $sp (sort(keys(%loc))) { if($poly{$sp.":".$loc} > 1) { $unredcd{$sp}++; } } } print "\t"; foreach $sp (sort(keys(%loc))) { if($sp) { print "$sp\t"; } } print "\n"; print "Alleles\t"; foreach $sp (sort(keys(%loc))) { if($sp) { print $loc{$sp} ? $loc{$sp} : 0,"\t"; } } print "\n"; print "Visible Alleles\t"; foreach $sp (sort(keys(%loc))) { if($sp) { print $alls{$sp} ? $alls{$sp} : 0,"\t"; } } print "\n"; print "Nulls\t"; foreach $sp (sort(keys(%loc))) { if($sp) { print $null{$sp} ? $null{$sp} : 0,"\t"; } } print "\n"; print "Unreduced Gametes\t"; foreach $sp (sort(keys(%loc))) { if($sp) { print $unredcd{$sp} ? $unredcd{$sp} : 0,"\t"; } } print "\n"; print "Segregation Distortion\t"; foreach $sp (sort(keys(%loc))) { if($sp) { print $distort{$sp} ? $distort{$sp} : 0,"\t"; } } print "\n"; print "**********************************************************************\n"; print "\nPossible Aneuploidy\n"; print "\nSample\tLocus\tFiles\n"; foreach $locus (sort (keys (%allcnt))) { foreach $sample (sort by_num (keys(%samples))) { if(($polysom{$sample.":".$locus} || $aneu{$sample.":".$locus})) { print $sample,"\t",$locus,"\t"; for $i (0 .. $#{$fname{$sample.":".$locus}}) { print $fname{$sample.":".$locus}[$i],"; "; } print "\n"; } } } print "**********************************************************************\n"; print "\nNonparental_Alleles\n"; print "\nLocus\tSample\tAllele\tFiles\n"; foreach $locus (sort (keys (%allcnt))) { foreach $sample (sort by_num (keys(%samples))) { if($npar{$sample.":".$locus}) { print $locus,"\t",$sample,"\t",$npar{$sample.":".$locus},"\t"; for $i (0 .. $#{$fname{$sample.":".$locus}}) { print $fname{$sample.":".$locus}[$i],"; "; } print "\n"; } } } print "**********************************************************************\n"; print "\nRepeated Samples with Different Genotypes\n"; print "\nSample\tLocus\tNo. Observed\tGenotype\tFiles\n"; foreach $samp_loc (sort (keys (%geno))) { if($samp_loc =~ /_rep/) { ($samp,$loc) = split(/:/,$samp_loc); $bsamp = $samp; $bsamp =~ s/_rep//; print "$bsamp\t$loc\t"; print $rep{$bsamp.":".$loc} ? $rep{$bsamp.":".$loc}+1 : "1","\t"; if(!($geno{$bsamp.":".$loc}[0])) { print "\t";} else { for $i (0 .. $#{$geno{$bsamp.":".$loc}}) { print $geno{$bsamp.":".$loc}[$i],$i < $#{$geno{$bsamp.":".$loc}} ? "/" : "\t"; } } for $i (0 .. $#{$fname{$bsamp.":".$loc}}) { print $fname{$bsamp.":".$loc}[$i],"; "; } print "\n"; print "$samp\t$loc\t"; print $rep{$samp.":".$loc} ? $rep{$samp.":".$loc}+1 : "1","\t"; if(!($geno{$samp_loc}[0])) { print "\t"; } else { for $i (0 .. $#{$geno{$samp_loc}}) { print $geno{$samp_loc}[$i],$i < $#{$geno{$samp_loc}} ? "/" : "\t"; } } for $i (0 .. $#{$fname{$samp.":".$loc}}) { print $fname{$samp.":".$loc}[$i],"; "; } print "\n"; } } print "**********************************************************************\n"; print "\n$blnk with Observed Alleles\n"; print "\nLocus\tGenotype\tFiles\n"; foreach $loc (sort (keys (%allcnt))) { if($geno{$blnk.":".$loc}[0]) { print "$loc\t"; for $i (0 .. $#{$geno{$blnk.":".$loc}}) { print $geno{$blnk.":".$loc}[$i],$i < $#{$geno{$blnk.":".$loc}} ? "/" : "\t"; } for $i (0 .. $#{$fname{$blnk.":".$loc}}) { print $fname{$blnk.":".$loc}[$i],"; "; } print "\n"; } } print "**********************************************************************\n"; print "\nProgeny_with_unexpected_lack_of_parental_alleles\n"; print "\nLocus\tSample\tMaternal Mismatch\tPaternal Mismatch\tFiles\n"; foreach $locus (sort (keys (%allcnt))) { foreach $sample (sort by_num (keys(%samples))) { if($sample && (!($noseg{$sample})) && ($idgeno{$sample.":".$locus} ne "M" )) { if(((!($mat{$sample.":".$locus}) && (!($expect{$locus.":Mother"}))) || (!($pat{$sample.":".$locus}) && (!($expect{$locus.":Father"})))) || ($idgeno{$sample.":".$locus} =~ /X/)) { print $locus,"\t",$sample,"\t",!($mat{$sample.":".$locus}) ? "1" : "0","\t", !($pat{$sample.":".$locus}) ? "1" : "0","\t"; for $i (0 .. $#{$fname{$sample.":".$locus}}) { print $fname{$sample.":".$locus}[$i],"; "; } print "\n"; } } } } print "**********************************************************************\n"; print "\nMismatching_progeny_for_loci_with_segregation_distortion\n"; print "\nLocus\tSample\tMaternal Mismatch\tPaternal Mismatch\tFiles\n"; foreach $locus (sort (keys (%allcnt))) { foreach $sample (sort by_num (keys(%samples))) { if($sample && (!($noseg{$sample})) && ($idgeno{$sample.":".$locus} ne "M" )) { if(((!($mat{$sample.":".$locus}) && ($chi{$locus.":Mthrnull"} > 3.84)) || (!($pat{$sample.":".$locus}) && ($chi{$locus.":Fthrnull"} > 3.84)))) { print $locus,"\t",$sample,"\t",!($mat{$sample.":".$locus}) ? "1" : "0","\t", !($pat{$sample.":".$locus}) ? "1" : "0","\t"; for $i (0 .. $#{$fname{$sample.":".$locus}}) { print $fname{$sample.":".$locus}[$i],"; "; } print "\n"; } } } } print "**********************************************************************\n"; print "\nMissing_Data\n"; print "\nLocus\tSample\n"; foreach $locus (sort (keys (%allcnt))) { foreach $sample (sort by_num (keys(%samples))) { if($sample && ($idgeno{$sample.":".$locus} eq "M") && ($sample ne $blnk)) { print $locus,"\t",$sample,"\n"; } } } } sub rep_check { my($samp,$loc) = @_; if(!($read{$samp.":".$loc})) { # if both samples are missing, skip to next line if(($dat[$lw] =~ /warning/i) || ($dat[$pks[0]] eq "M")) { next LINE; } # if only old one is missing, process new one with orig. name return $samp; $cont = 1; } else { # old sample is amplified, new one isn't if(($dat[$lw] =~ /warning/i) || ($dat[$pks[0]] eq "M")) { next LINE; } # both samples have data else { if($geno{$samp.":".$loc}[0]) { for $i (0 .. $#{$geno{$samp.":".$loc}}) { $match = 0; for $j (0 .. $#pks) { if($geno{$samp.":".$loc}[$i] eq $dat[$pks[$j]]) { $match = 1; } } if(!($match)) { $samp .= "_rep"; return $samp; } } } # old sample is double null, see if new one matches else { for $i (0 .. $#pks) { # if any data, new one doesn't match if($dat[$pks[$i]]) { $samp .= "_rep"; return $samp; } } # if we get to this point, both are double-nulls $rep{$samp.":".$loc}++; # save unique filenames for repeats if($fname{$samp.":".$loc} !~ $dat[fl]) { push(@{$fname{$samp.":".$loc}},$files[-1].":".$dat[$fl]); } next LINE; } # if we get to this point, all alleles in old sample match # those of new sample. Now we need to know if all new # alleles match old ones for $i (0 .. $#pks) { if($dat[$pks[$i]]) { $match = 0; for $j (0 .. $#{$geno{$samp.":".$loc}}) { if($geno{$samp.":".$loc}[$j] eq $dat[$pks[$i]]) { $match = 1; } } if(!($match)) { $samp .= "_rep"; return $samp; } } } # if we get to this point, samples match exactly, so count # and skip to next line $rep{$samp.":".$loc}++; # save unique filenames for repeats if($fname{$samp.":".$loc} !~ $dat[fl]) { push(@{$fname{$samp.":".$loc}},$files[-1].":".$dat[$fl]); } next LINE; } } } sub upcase { my $word = @_; $word =~ tr/[a-z]/A-Z/; return $word; }