RADmarkers

#!/usr/bin/env perl

# RADmarkers
# Takes all the files in a folder with a .tags suffix and sorts the
# sequences into clusters, representing all alleles of tags
#
# History:
# 16/05/10 First public version
# 04/07/10 Bug fixes
# 05/07/10 Added fragments option - defaults to using read counts but with
#          -f will use fragment counts
# 03/08/10 Incorporated into RADtools
# 18/08/10 Run through perlcritic, tidied up options, added POD,
#          added main() for testing and refactored main into subroutines
# 20/08/10 Added -species option to specify tags directory
# 25/08/10 Version 1.0
# 06/09/10 Fixed names of individuals (was leaving file extension on)
# 29/10/10 1.0.2 Don't look for species name in input files
# 30/10/10 1.1   Load Sanger qualities, not Illumina
#                Load individuals from pools file and keep in this order
# 16/02/11 1.2   Produces R-compatible TSV output format; add -o option
#                to produce old output
# 16/05/11 1.2.1 Replaced Allele with Tag in output, specify pools file
#                should be in current directory

#############################################################################
###
### PREAMBLE
###
#############################################################################

use strict;
use warnings;
use English qw(-no_match_vars );
use Getopt::Long qw(:config bundling no_auto_abbrev auto_version);
use Pod::Usage;
use Cwd 'abs_path';

use RADtools qw(get_pools_filename QUAL_OFFSET);

# Would like to use Carp, but an outstanding bug causes cryptic errors
# when using caller(), so using die until this is fixed
# http://www.nntp.perl.org/group/perl.perl5.porters/2010/03/msg157461.html

# Used by Getopt::Long (auto_version) to provide --version.  The value must
# be quoted: a bare 1.2.1 literal is a v-string, i.e. the three bytes
# chr(1).chr(2).chr(1), which would print as control characters.
local $main::VERSION    = '1.2.1';
local $OUTPUT_AUTOFLUSH = 1;      # So reporting on progress works

main(@ARGV) unless caller;        # So test suite can call script

sub main {

    # Entry point.  Parses the command line, loads the list of individuals
    # from the pools file, clusters the tags of all individuals, summarises
    # the clusters by segregation pattern and writes the result to STDOUT
    # (TSV by default, the old layout with -o).
    # Note: GetOptions reads the global @ARGV; the argument list passed to
    # main() is not consulted.
    # Returns nothing.

    # Set up default options

    my $help  = 0;
    my $usage = 0;
    my $man   = 0;

    my $end_tagfiles = '.tags';
    my $directory    = abs_path;

    my $verbose    = 0;
    my $snpsout    = 0;
    my $qualsout   = 0;
    my $fragments  = 0;
    my $old_output = 0;

    my $tag_count_threshold = 0;
    my $hamming_threshold   = 0;

    my $include_singletons = 0;    # Tags only in one individual

    my $options_okay = GetOptions(

        'help|h'  => \$help,
        'usage|u' => \$usage,
        'man'     => \$man,

        'end_tagfiles|e=s' => \$end_tagfiles,
        'directory|d=s'    => \$directory,

        'verbose|v'    => \$verbose,
        'snpsout|s'    => \$snpsout,
        'qualsout|q'   => \$qualsout,
        'fragments|f'  => \$fragments,
        'old_output|o' => \$old_output,

        'tag_count_threshold|t=i' => \$tag_count_threshold,
        'mismatches|m=i'          => \$hamming_threshold,

        'include_singletons|i' => \$include_singletons,

    ) or pod2usage( -verbose => 0 );

    pod2usage( -verbose => 0 ) if $usage;
    pod2usage( -verbose => 1 ) if $help;
    pod2usage( -verbose => 2 ) if $man;

    my %tag_clusters;
    my %tags;

#############################################################################
###
### MAIN LOOP
###
#############################################################################

    # Find pools file for list of individuals
    # NOTE(review): despite the helper's name, its return value is read
    # from and closed as a filehandle below - confirm against RADtools.pm
    my $pools_file = get_pools_filename($directory);

    my @individuals;
    while ( my $pools_line = <$pools_file> ) {
        chomp $pools_line;

        # The first whitespace-separated field is the individual's name.
        # The awk-style split ignores leading whitespace, so blank or
        # whitespace-only lines yield no name and are skipped instead of
        # adding an undefined individual (which would later fail in open)
        my ($name) = split q{ }, $pools_line;
        next if !defined $name;

        push @individuals, $name;
    }

    close $pools_file;

    generate_tag_clusters(
        {
            end_tagfiles        => $end_tagfiles,
            directory           => $directory,
            tag_clusters        => \%tag_clusters,
            tags                => \%tags,
            individuals         => \@individuals,
            hamming_threshold   => $hamming_threshold,
            tag_count_threshold => $tag_count_threshold,
            fragments           => $fragments,
            verbose             => $verbose,
        }
    );

    if ($verbose) {
        print STDERR scalar( keys %tag_clusters )
          . ' clusters found from '
          . scalar( keys %tags )
          . " tags\n"
          or die "Can't print\n";
    }

    my %cluster_summary;
    summarise_tag_clusters(
        {
            tag_clusters       => \%tag_clusters,
            individuals        => \@individuals,
            cluster_summary    => \%cluster_summary,
            tags               => \%tags,
            include_singletons => $include_singletons,
        }
    );

    # summarise_tag_clusters removes singleton tags from both hashes, so
    # the counts are reported again when that filtering was applied
    if ( $verbose && !$include_singletons ) {
        print STDERR scalar( keys %tag_clusters )
          . ' clusters found from '
          . scalar( keys %tags )
          . " tags after tags found in only one individual removed\n";
    }

    if ( !$old_output ) {
        output_markers(
            {
                cluster_summary => \%cluster_summary,
                individuals     => \@individuals,
                qualsout        => $qualsout,
                snpsout         => $snpsout,
                fragments       => $fragments
            }
        );
    }
    else {

        # Old output: one section per number of tags in a cluster, then
        # one block per combined segregation pattern
        foreach my $tag_count ( sort { $a <=> $b } keys %cluster_summary ) {
            print "$tag_count $cluster_summary{$tag_count}{count}\n";

            foreach my $seg_pattern (
                sort
                keys %{ $cluster_summary{$tag_count}{segs} }
              )
            {
                print_seg_tags(
                    {
                        seg_pattern => $seg_pattern,
                        seg_tags =>
                          $cluster_summary{$tag_count}{segs}{$seg_pattern},
                        ind       => \@individuals,
                        qualsout  => $qualsout,
                        snpsout   => $snpsout,
                        fragments => $fragments,
                    }
                );
                print "\n";
            }
        }
    }

    return;
}

#############################################################################
###
### SUBROUTINES
###
#############################################################################

#############################################################################
### Name:       GENERATE TAG CLUSTERS
### Function:   Cluster tags across all individuals. For each individual the
###             file "<directory>/<individual><end_tagfiles>" is read; a
###             blank line ends the cluster being loaded, and lines starting
###             with whitespace are skipped. A cluster still pending at end
###             of file (no trailing blank line) is processed as well.
### Parameters: end_tagfiles        - suffix appended to each sample name
###             directory           - folder to load tags files from
###             tag_clusters        - hash of clusters containing tags
###             tags                - hash of tags alone for reference
###             individuals         - list of sample names
###             hamming_threshold   - cutoff for Hamming distance
###             tag_count_threshold - throw away tags with counts below this
###             fragments           - if set, threshold on fragment counts
###             verbose             - output progress messages
### Returns:    Nothing (fills tag_clusters and tags)
### Dies:       if a tags file cannot be opened or closed
#############################################################################

sub generate_tag_clusters {

    my ($arg_ref) = @_;

    my $end_tagfiles        = $arg_ref->{end_tagfiles};
    my $directory           = $arg_ref->{directory};
    my $tag_clus_ref        = $arg_ref->{tag_clusters};
    my $tags_ref            = $arg_ref->{tags};
    my $ind_ref             = $arg_ref->{individuals};
    my $hamming_threshold   = $arg_ref->{hamming_threshold};
    my $tag_count_threshold = $arg_ref->{tag_count_threshold};
    my $fragments           = $arg_ref->{fragments};
    my $verbose             = $arg_ref->{verbose};

    # Shared across all individuals so that new cluster IDs never collide
    my $new_cluster_id = 0;

    # Everything the per-cluster helper needs apart from the cluster itself
    my %merge_args = (
        tag_clusters      => $tag_clus_ref,
        tags              => $tags_ref,
        hamming_threshold => $hamming_threshold,
        new_cluster_id    => \$new_cluster_id,
    );

    foreach my $individual ( @{$ind_ref} ) {

        my $tag_filename = "$directory/$individual$end_tagfiles";
        open my $tag_file, '<', $tag_filename
          or die "Can't open $tag_filename: $OS_ERROR!\n";

        my %cluster;

        if ($verbose) { print STDERR "Loading tags from $individual...\n"; }
        while ( my $tag_line = <$tag_file> ) {
            chomp $tag_line;

            # If cluster is fully loaded, process cluster
            if ( $tag_line eq q{} ) {
                _merge_loaded_cluster( { %merge_args, cluster => \%cluster } );

                # Empty cluster and get more input
                %cluster = ();

                next;
            }

            next if ( $tag_line =~ /^\s/ );

            load_tag_into_cluster(
                {
                    tag_line            => $tag_line,
                    tags                => $tags_ref,
                    cluster             => \%cluster,
                    individual          => $individual,
                    tag_count_threshold => $tag_count_threshold,
                    fragments           => $fragments,
                }
            );
        }

        # A file that does not end with a blank line leaves its last
        # cluster pending; process it rather than silently dropping it
        if (%cluster) {
            _merge_loaded_cluster( { %merge_args, cluster => \%cluster } );
        }

        close $tag_file or die "Can't close tag file\n";
    }
    return;
}

# Private helper: merges one freshly loaded cluster into tag_clusters,
# joining it to every existing cluster it matches or giving it a new ID.
# Parameters: cluster, tag_clusters, tags, hamming_threshold and
# new_cluster_id (reference to the running ID counter). Returns nothing.
sub _merge_loaded_cluster {
    my ($arg_ref) = @_;

    my $clus_ref           = $arg_ref->{cluster};
    my $tag_clus_ref       = $arg_ref->{tag_clusters};
    my $tags_ref           = $arg_ref->{tags};
    my $hamming_threshold  = $arg_ref->{hamming_threshold};
    my $new_cluster_id_ref = $arg_ref->{new_cluster_id};

    my %cluster_ids;
    get_cluster_ids(
        {
            cluster           => $clus_ref,
            cluster_ids       => \%cluster_ids,
            tags              => $tags_ref,
            hamming_threshold => $hamming_threshold,
        }
    );

    my $this_cluster_id = -1;

    # If matching previous cluster IDs, collapse all these
    # clusters into one cluster
    if ( keys %cluster_ids > 0 ) {
        $this_cluster_id = collapse_existing_clusters(
            {
                tag_clusters => $tag_clus_ref,
                cluster_ids  => \%cluster_ids,
                tags         => $tags_ref,
            }
        );
    }

    # If not matching any IDs, make a new cluster
    else {
        ${$new_cluster_id_ref}++;
        $this_cluster_id = ${$new_cluster_id_ref};
    }

    store_current_cluster(
        {
            cluster         => $clus_ref,
            tag_cluster     => $tag_clus_ref,
            this_cluster_id => $this_cluster_id,
            tags            => $tags_ref,
        }
    );

    return;
}

#############################################################################
### Name:       GET CLUSTER IDS
### Function:   Finds matching clusters for current set of sequences. A
###             sequence that already belongs to a cluster matches that
###             cluster only; otherwise, when hamming_threshold is non-zero,
###             it matches the cluster of every clustered tag within that
###             Hamming distance.
### Parameters: cluster           - the set of tags to check
###             cluster_ids       - the list of IDs to fill
###             tags              - already clustered tags to check against
###             hamming_threshold - cutoff Hamming distance
### Returns:    nothing (fills cluster_ids)
#############################################################################

sub get_cluster_ids {
    my ($arg_ref) = @_;

    my $clus_ref          = $arg_ref->{cluster};
    my $clus_ids_ref      = $arg_ref->{cluster_ids};
    my $tags_ref          = $arg_ref->{tags};
    my $hamming_threshold = $arg_ref->{hamming_threshold};

    # Get existing cluster IDs for these sequences
  SEQUENCE:
    foreach my $sequence ( keys %{$clus_ref} ) {

        # Check for exact match (exists first, so that looking up an
        # unknown sequence does not create an empty entry in tags)
        if ( exists $tags_ref->{$sequence}
            && defined $tags_ref->{$sequence}{cluster} )
        {
            $clus_ids_ref->{ $tags_ref->{$sequence}{cluster} }++;
            next SEQUENCE;
        }

        next SEQUENCE if ( $hamming_threshold == 0 );

        # Check for Hamming distance matches
      TAG:
        foreach my $tag_seq ( keys %{$tags_ref} ) {
            next TAG if ( $tag_seq eq $sequence );

            # Tags not yet assigned to a cluster cannot contribute an ID,
            # and clusters already matched need no further comparisons
            my $tag_cluster = $tags_ref->{$tag_seq}{cluster};
            next TAG if ( !defined $tag_cluster );
            next TAG if ( defined $clus_ids_ref->{$tag_cluster} );

            # XOR gives a NUL byte wherever the two strings agree, so the
            # number of non-NUL bytes is the Hamming distance.  Escapes in
            # tr are octal: \377 is byte 255 (\255 would stop at byte 173)
            my $hamming = ( $tag_seq ^ $sequence ) =~ tr/\001-\377//;
            if ( $hamming <= $hamming_threshold ) {
                $clus_ids_ref->{$tag_cluster}++;
            }
        }
    }
    return;
}

#############################################################################
### Name:       COLLAPSE EXISTING CLUSTERS
### Function:   Merges every cluster listed in cluster_ids into the one with
###             the numerically smallest ID, repointing each affected tag
###             at that ID and deleting the clusters that were merged away
### Parameters: tag_clusters - hash of clustered tags to revise
###             cluster_ids  - hash whose keys are the clusters to combine
###             tags         - hash of tags for reference
### Returns:    ID of final cluster
#############################################################################

sub collapse_existing_clusters {
    my ($arg_ref) = @_;

    my $clusters     = $arg_ref->{tag_clusters};
    my $ids_to_merge = $arg_ref->{cluster_ids};
    my $tags         = $arg_ref->{tags};

    # Numeric sort, so that cluster 5 is "smaller" than cluster 10
    my @ordered_ids = sort { $a <=> $b } keys %{$ids_to_merge};
    my $target_id   = $ordered_ids[0];

    foreach my $source_id (@ordered_ids) {
        my $is_target = $source_id eq $target_id;

        foreach my $seq ( keys %{ $clusters->{$source_id} } ) {
            $tags->{$seq}{cluster} = $target_id;

            # The target's own tags are already where they belong
            next if $is_target;

            foreach my $sample ( keys %{ $clusters->{$source_id}{$seq} } ) {
                my $from = $clusters->{$source_id}{$seq}{$sample};
                foreach my $field (qw(count unique qual)) {
                    $clusters->{$target_id}{$seq}{$sample}{$field} =
                      $from->{$field};
                }
            }
        }

        if ( !$is_target ) {
            delete $clusters->{$source_id};
        }
    }

    return $target_id;
}

#############################################################################
### Name:       STORE CURRENT CLUSTER
### Function:   Copies the count, unique and qual values of every tag in
###             the freshly loaded cluster into the chosen cluster, and
###             records that cluster's ID against each tag
### Parameters: cluster         - cluster to add
###             tag_cluster     - hash of clustered tags to add to
###             this_cluster_id - ID of the cluster to add to
###             tags            - hash of tags for reference
### Returns:    nothing (fills tag_cluster and tags)
#############################################################################

sub store_current_cluster {
    my ($arg_ref) = @_;

    my $incoming   = $arg_ref->{cluster};
    my $clusters   = $arg_ref->{tag_cluster};
    my $cluster_id = $arg_ref->{this_cluster_id};
    my $tags       = $arg_ref->{tags};

    foreach my $seq ( keys %{$incoming} ) {

        foreach my $sample ( keys %{ $incoming->{$seq} } ) {
            my $from = $incoming->{$seq}{$sample};

            foreach my $field (qw(count unique qual)) {
                $clusters->{$cluster_id}{$seq}{$sample}{$field} =
                  $from->{$field};
            }
        }

        # Remember which cluster this tag now lives in
        $tags->{$seq}{cluster} = $cluster_id;
    }

    return;
}

#############################################################################
### Name:       LOAD TAG INTO CLUSTER
### Function:   Parses one space-separated tag line (sequence, quality,
###             read count, fragment count) and records it for the current
###             individual in both the tags hash and the cluster being
###             loaded. Tags containing GATCGGA, or whose count is below
###             the threshold, are ignored.
### Parameters: tag_line            - unprocessed tag line from file
###             tags                - tags hash for reference
###             cluster             - current cluster to add tag to
###             individual          - name of current sample
###             tag_count_threshold - throw away tags with counts below this
###             fragments           - if set, use fragment count, not reads
### Returns:    nothing (fills cluster, tags)
#############################################################################

sub load_tag_into_cluster {
    my ($arg_ref) = @_;

    my $line      = $arg_ref->{tag_line};
    my $all_tags  = $arg_ref->{tags};
    my $loading   = $arg_ref->{cluster};
    my $sample    = $arg_ref->{individual};
    my $threshold = $arg_ref->{tag_count_threshold};

    my ( $seq, $qual, $reads, $frags ) = split / /, $line;

    # Reject tags containing P2 adapter site
    return if $seq =~ m/GATCGGA/;

    # The threshold applies to whichever count the user asked for
    my $filter_count = $reads;
    if ( $arg_ref->{fragments} ) {
        $filter_count = $frags;
    }
    return if $filter_count < $threshold;

    my %value_for = ( count => $reads, unique => $frags, qual => $qual );
    foreach my $field (qw(count unique qual)) {
        $all_tags->{$seq}{ind}{$sample}{$field} = $value_for{$field};
        $loading->{$seq}{$sample}{$field}       = $value_for{$field};
    }

    return;
}

#############################################################################
### Name:       SUMMARISE TAG CLUSTERS
### Function:   Builds a segregation pattern for every tag (one character
###             per individual: 1 if present, - if absent), optionally
###             removes tags seen in exactly one individual, and files each
###             surviving cluster under its number of tags and its combined,
###             sorted segregation patterns
### Parameters: tag_clusters       - hash of clustered tags
###             individuals        - names of samples
###             cluster_summary    - hash of summary information to fill
###             tags               - hash of tags for reference
###             include_singletons - if set, report on tags present in
###                                  only one individual
### Returns:    nothing (fills cluster_summary; when singletons are
###             excluded, also deletes them from tag_clusters and tags,
###             along with any cluster left without tags)
#############################################################################

sub summarise_tag_clusters {
    my ($arg_ref) = @_;

    my $clusters     = $arg_ref->{tag_clusters};
    my $samples      = $arg_ref->{individuals};
    my $summary      = $arg_ref->{cluster_summary};
    my $tags         = $arg_ref->{tags};
    my $keep_singles = $arg_ref->{include_singletons};

  CLUSTER:
    foreach my $cluster_id ( sort { $a <=> $b } keys %{$clusters} ) {
        my $members = $clusters->{$cluster_id};
        my @alleles;

      TAG:
        foreach my $seq ( sort keys %{$members} ) {

            # One character per individual, in the order of the sample list
            my $pattern = join q{},
              map { defined $members->{$seq}{$_} ? '1' : '-' } @{$samples};

            my $present = ( $pattern =~ tr/1// );
            if ( !$keep_singles && $present == 1 ) {
                delete $tags->{$seq};
                delete $members->{$seq};
                next TAG;
            }

            push @alleles,
              {
                seg_pattern => $pattern,
                sequence    => $seq,
                individuals => $members->{$seq},
              };
        }

        # Nothing left once singletons are gone: drop the whole cluster
        if ( !@alleles ) {
            delete $clusters->{$cluster_id};
            next CLUSTER;
        }

        my $tag_count = keys %{$members};
        $summary->{$tag_count}{count}++;

        my @sorted_alleles =
          sort { $a->{seg_pattern} cmp $b->{seg_pattern} } @alleles;

        # Key for the cluster: each pattern followed by a single space
        my $locus_pattern = join q{},
          map { $_->{seg_pattern} . q{ } } @sorted_alleles;

        $summary->{$tag_count}{segs}{$locus_pattern}{count}++;
        push @{ $summary->{$tag_count}{segs}{$locus_pattern}{tags} },
          \@sorted_alleles;
    }

    return;
}

#############################################################################
### Name:       OUTPUT MARKERS
### Function:   Output all markers in TSV format: a header row, then one
###             row per tag giving a running cluster ID, the number of tags
###             in the cluster, the tag's segregation pattern, its sequence
###             and one count per individual (NA where the count is missing
###             or zero)
### Parameters: cluster_summary - all marker data
###             individuals     - names of samples
###             qualsout        - accepted but not used by this format
###             snpsout         - accepted but not used by this format
###             fragments       - if set, output fragment counts, not reads
### Returns:    nothing (writes to STDOUT)
#############################################################################

sub output_markers {
    my ($arg_ref) = @_;

    my $clusters_ref    = $arg_ref->{cluster_summary};
    my $individuals_ref = $arg_ref->{individuals};
    my $fragments       = $arg_ref->{fragments};

    # Which per-individual value to report
    my $count_key = $fragments ? 'unique' : 'count';

    my $cluster_id = 0;
    print "ClusterID\tClusterTags\tSegPattern\tTag";
    foreach my $individual ( @{$individuals_ref} ) {
        print "\t$individual";
    }
    print "\n";

    foreach my $alleles_at_locus ( sort { $a <=> $b } keys %{$clusters_ref} ) {
        foreach my $seg_pattern (
            sort keys %{ $clusters_ref->{$alleles_at_locus}{segs} } )
        {
            foreach my $locus_ref (
                @{
                    $clusters_ref->{$alleles_at_locus}{segs}{$seg_pattern}{tags}
                }
              )
            {

                # Cluster IDs are simply numbered in output order
                $cluster_id++;
                foreach my $allele_ref (
                    sort { $a->{seg_pattern} cmp $b->{seg_pattern} }
                    @{$locus_ref} )
                {
                    print
"$cluster_id\t$alleles_at_locus\t$allele_ref->{seg_pattern}\t$allele_ref->{sequence}";
                    foreach my $individual ( @{$individuals_ref} ) {

                        # Look the individual up first, so that absent
                        # individuals are not created in the data as a
                        # side effect of printing
                        my $ind_data =
                          $allele_ref->{individuals}{$individual};
                        my $count =
                          defined $ind_data ? $ind_data->{$count_key} : undef;
                        print( $count ? "\t$count" : "\tNA" );
                    }
                    print "\n";
                }
            }
        }
    }
    return;
}

#############################################################################
### Name:       PRINT SEG TAGS
### Function:   Writes one block of the old output format: a summary line
###             for the segregation pattern, a header row of sample names,
###             then the tags of every cluster with that pattern (followed
###             by their SNPs when requested)
### Parameters: seg_pattern - segregation pattern to output
###             seg_tags    - tags for this segregation pattern
###             ind         - names of samples
###             qualsout    - if set, output qualities
###             snpsout     - if set, output SNPs
###             fragments   - if set, output fragment counts, not reads
### Returns:    nothing (writes to STDOUT)
#############################################################################

sub print_seg_tags {
    my ($arg_ref) = @_;

    my $pattern     = $arg_ref->{seg_pattern};
    my $pattern_ref = $arg_ref->{seg_tags};
    my $samples     = $arg_ref->{ind};
    my $show_quals  = $arg_ref->{qualsout};
    my $show_snps   = $arg_ref->{snpsout};
    my $use_frags   = $arg_ref->{fragments};

    # Summary line: pattern, number of clusters, mirror markers.
    # is_mirror writes its markers itself and returns an empty list.
    print "$pattern ";
    printf '%3s ', $pattern_ref->{count};
    print is_mirror($pattern);
    print "\n";

    # Header row: blank space as wide as a tag, then the sample names in
    # columns matching the seven-character count fields printed below
    my $first_sequence = $pattern_ref->{tags}[0][0]{sequence};
    print "\t";
    print q{ } x length $first_sequence;
    print q{    };
    foreach my $name ( @{$samples} ) {
        my $name_length = length $name;
        if ( $name_length < 3 ) {
            printf '%3s    ', $name;
        }
        elsif ( $name_length < 7 ) {
            printf '%-7s', $name;
        }
        else {
            print "$name ";
        }
    }
    print "\n";

    foreach my $locus_ref ( @{ $pattern_ref->{tags} } ) {

        foreach my $allele ( @{$locus_ref} ) {
            print "\t$allele->{sequence}    ";
            print_tag_counts( $allele->{individuals},
                $samples, $show_quals, $use_frags );
        }

        if ($show_snps) {
            print "\n";
            print_seg_snps( $locus_ref, $samples, $use_frags );
        }

        print "\n";
    }

    return;
}

#############################################################################
### Name:       PRINT TAG COUNTS
### Function:   Writes one row of counts for a tag (one fixed-width field
###             per sample, a dash where the sample has no count) and, when
###             requested, one line per sample flagging low-quality bases
###             (! below 20, ? below 30, space otherwise)
### Parameters: seq_ref   - tag information
###             ind_ref   - names of samples
###             quals     - if set, output qualities
###             fragments - if set, output fragment counts, not reads
### Returns:    nothing (writes to STDOUT)
#############################################################################

sub print_tag_counts {
    my ( $seq_ref, $ind_ref, $quals, $fragments ) = @_;

    my $count_field = $fragments ? 'unique' : 'count';

    foreach my $sample ( @{$ind_ref} ) {
        my $count = $seq_ref->{$sample}{$count_field};
        if ( !defined $count ) {
            print '  -    ';
            next;
        }
        printf '%6.2f ', $count;
    }
    print "\n";

    return if !$quals;

    foreach my $sample ( @{$ind_ref} ) {
        my $qual_string = $seq_ref->{$sample}{qual};
        next if !defined $qual_string;

        print "\t";
        foreach my $qual_char ( split //, $qual_string ) {

            # Quality score is the character code less the encoding offset
            my $score = ord($qual_char) - QUAL_OFFSET;

            my $flag = q{ };
            if    ( $score < 20 ) { $flag = q{!}; }
            elsif ( $score < 30 ) { $flag = q{?}; }
            print $flag;
        }
        print " $sample\n";
    }

    return;
}

#############################################################################
### Name:       PRINT SEG SNPS
### Function:   Print SNPs for a particular tag: for every position at
###             which the alleles carry more than one base, one line per
###             base with the summed counts of each sample. Positions are
###             taken from the length of the first allele's sequence.
### Parameters: tag_ref   - tag information to output
###             ind_ref   - names of samples
###             fragments - if set, output fragment counts, not reads
### Returns:    nothing (outputs to STDOUT)
#############################################################################

sub print_seg_snps {
    my ( $tag_ref, $ind_ref, $fragments ) = @_;

    # Nothing to compare without at least one allele
    return if !@{$tag_ref};

    my $tag_length = length( $tag_ref->[0]->{sequence} );
    my $count_key  = $fragments ? 'unique' : 'count';

    # Positions are zero-based, so the last one is $tag_length - 1
    foreach my $pos ( 0 .. $tag_length - 1 ) {

        # base => individual => summed count at this position
        my %pos_bases;
        foreach my $allele_ref ( @{$tag_ref} ) {
            my $allele_base = substr $allele_ref->{sequence}, $pos, 1;
            foreach my $individual ( keys %{ $allele_ref->{individuals} } ) {
                my $count =
                  $allele_ref->{individuals}{$individual}{$count_key};
                if ( defined $count ) {
                    $pos_bases{$allele_base}{$individual} += $count;
                }
            }
        }

        # Only positions with more than one base are SNPs
        if ( keys %pos_bases > 1 ) {
            foreach my $base ( sort keys %pos_bases ) {

                # Pad so that the base sits under its position in the
                # tag sequence printed above
                print "\t";
                print q{ } x $pos;
                print $base;
                print q{ } x ( $tag_length - $pos );
                print q{   };
                foreach my $individual ( @{$ind_ref} ) {
                    if ( defined $pos_bases{$base}{$individual} ) {
                        printf '%6.2f ', $pos_bases{$base}{$individual};
                    }
                    else {
                        print '  -    ';
                    }
                }
                print "\n";
            }
            print "\n";
        }
    }
    return;
}

#############################################################################
### Name:       IS MIRROR
### Function:   Checks if any two segregation patterns in one cluster
###             are mirrors of each other, i.e. every individual is present
###             (1) in exactly one of the two. Prints one M for each
###             ordered pair of mirrored patterns, so a mirrored pair
###             produces two.
### Parameters: seg_pattern - patterns to check, separated by whitespace;
###                           1 marks presence, any other character
###                           (- or 0) marks absence
### Returns:    nothing (outputs to STDOUT)
#############################################################################

sub is_mirror {
    my ($seg_pattern) = @_;

    my @genotypes = split q{ }, $seg_pattern;

    # Reduce every pattern to a 1/0 presence mask. Testing the characters
    # for truth does not work: the absence marker '-' is a true value.
    my @masks = map { ( my $mask = $_ ) =~ tr/1/0/c; $mask } @genotypes;

    # The mirror of a mask has presence and absence swapped
    my @mirrors = map { ( my $mirror = $_ ) =~ tr/01/10/; $mirror } @masks;

    foreach my $index_a ( 0 .. $#genotypes ) {
        foreach my $index_b ( 0 .. $#genotypes ) {
            next if ( $genotypes[$index_a] eq $genotypes[$index_b] );

            if ( $masks[$index_a] eq $mirrors[$index_b] ) {
                print 'M';
            }
        }
    }
    return;
}

__END__

#############################################################################
###
### DOCUMENTATION
###
#############################################################################

=head1 NAME

RADmarkers - generates candidate markers from RAD tags across many individuals

=head1 VERSION

This documentation refers to RADmarkers version 1.2.1.

=head1 SYNOPSIS

=over 8

=item RADmarkers [options]

=item RADmarkers --help

=back

=head1 OPTIONS

=over 8

=item B<-h, --help>

Print a brief help message and exit

=item B<-u, --usage>

Print concise usage and exit

=item B<--man>

Print the manual page and exit

=item B<--version>

Print version number and exit

=item B<-e, --end_tagfiles>

Suffix of filenames containing RAD tags (default '.tags')

=item B<-d, --directory>

Name of directory containing tags files (ie prefix of pools filename if using RADtools conventions) (default .)

=item B<-v, --verbose>

Output progress messages and summary statistics (default off)

=item B<-s, --snpsout>

Output SNPs for clusters with more than one tag (default off)

=item B<-q, --qualsout>

Output quality summaries for tags (default off; ? = Q below 30, ! = Q below 20)

=item B<-f, --fragments>

Output fragment counts instead of read counts (default off). A fragment is a unique single/paired end combination. Multiple copies of a fragment are likely to be PCR duplicates, and so the use of this option is recommended.

=item B<-t, --tag_count_threshold>

Tags with counts below this threshold are rejected (default 0; uses read counts by default, fragment counts if -f is set)

=item B<-m, --mismatches>

Two tags with this number of mismatches or fewer will be clustered together (default 0, i.e. only associate clusters from different pools where they contain at least one tag each with exactly the same sequence)

=item B<-i, --include_singletons>

Include tags present in only one individual (default off)

=item B<-o, --old_output>

Produces old output format, now deprecated (default off)

=back

=head1 DESCRIPTION

B<RADmarkers> loads all tag files output by RADtags for all individuals and clusters together tags with similar sequences into candidate loci. The loci are then sorted by segregation pattern and output with counts against each individual. SNPs and qualities can also be output. 

=head1 AUTHOR

John Davey <john.davey@ed.ac.uk>

=head1 LICENCE AND COPYRIGHT

Copyright 2010 John Davey, University of Edinburgh john.davey@ed.ac.uk

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

=cut