# Source code for cluster.Cluster_Command

from classes.ChewbaccaCommand import ChewbaccaCommand
from Cluster_Program_Crop import Cluster_Program_Crop
from Cluster_Program_Swarm import Cluster_Program_Swarm
from Cluster_Program_Vsearch import Cluster_Program_Vsearch


class Cluster_Command(ChewbaccaCommand):
    # The docstring is deliberately NOT a raw string: a trailing backslash is a line continuation that keeps a
    # wrapped bullet on one logical line.  Literal backslashes meant for reST are therefore written doubled.
    """Clusters a set of fasta files.  This command generates a fasta file of unique sequences
    (each representing a cluster) and a .groups file.  This command also takes an optional .groups file containing
    replication data from previous commands.  If a .groups file is supplied, only one output .groups file is generated
    (regardless of the number of inputs).

    **Inputs**:
        * One or more fasta files to cluster.
        * Optional: :ref:`.groups` - A list of representative names and the names of their seed \
                                            sequences.  You likely have one of these files if you've previously run a \
                                            clustering or dereplication command.

    **Outputs**:
        * \\*.fasta file - A fasta file with unique sequences and their replication counts.
        * \\*.groups  - A :ref:`.groups`

    **Notes**:
        * The input fasta file(s) should have been dereplicated before clustering.
        * For a single experiment with multiple fasta files, it is best to merge all input fasta files, dereplicate \
            them, then cluster the single merged and dereplicated fasta file.  This provides the best OTU groupings.

    **Example**:

    ::

        ./
            Data.fasta:
                >seq1_3
                AAAAAAAAAA
                >seq2_1
                ATAAAAAAAA
                >seq3_1
                TTTTTTTTTT
                >seq4_1
                TTTTTTATTT
                >seq5_1
                TTTTTTATCT


            Data.groups:
                seq1	seq6 seq1 seq7

    ``$ python chewbacca.py cluster_seqs -i Data.fasta -o rslt -g Data.groups``

    ::

        rslt/
            Data_clustered_seeds.fasta:
                >seq1_4
                AAAAAAAAAA
                >seq3_3
                TTTTTTTTTT

        rslt_groups_files/
            postcluster_updated.groups:
                seq3	seq3 seq5 seq4
                seq1	seq2 seq1 seq7 seq6
    """
    # Program classes associated with this command (CROP, Swarm and Vsearch variants).
    # NOTE(review): how this list is consumed is decided by ChewbaccaCommand, which is not visible here.
    supported_programs = [
        Cluster_Program_Crop,
        Cluster_Program_Swarm,
        Cluster_Program_Vsearch,
    ]
    # NOTE(review): presumably the program used when the caller does not choose one -- confirm in ChewbaccaCommand.
    default_program = Cluster_Program_Swarm
    # Name string attached to this command class.
    command_name = "Cluster"