from classes.ChewbaccaCommand import ChewbaccaCommand
from Cluster_Program_Crop import Cluster_Program_Crop
from Cluster_Program_Swarm import Cluster_Program_Swarm
from Cluster_Program_Vsearch import Cluster_Program_Vsearch
[docs]class Cluster_Command(ChewbaccaCommand):
"""Clusters a set of fasta files. This command generates a fasta file of unique sequences
(each representing a cluster) and a .groups file. This command also takes an optional .groups file containing
replication data from previous commands. If a .groups file is supplied, only one output .groups file is generated
(regardless of the number of inputs).
**Inputs**:
* One or more fasta files to cluster.
* Optional: :ref:`.groups` - A list of representative names and the names of their seed \
sequences. You likely have one of these files if you've previously run a \
clustering or dereplication command.
**Outputs**:
* \*.fasta file - A fasta file with unique sequences and their replication counts.
* \*.groups - A :ref:`.groups`
**Notes**:
* The input fasta file(s) should have been dereplicated before clustering. \
* For a single experiment with multiple fasta files, it is best to merge all input fasta files, dereplicate \
them, then cluster the single merged and dereplicated fasta file. This provides the best OTU groupings.
**Example**:
::
./
Data.fasta:
>seq1_3
AAAAAAAAAA
>seq2_1
ATAAAAAAAA
>seq3_1
TTTTTTTTTT
>seq4_1
TTTTTTATTT
>seq5_1
TTTTTTATCT
Data.groups:
seq1 seq6 seq1 seq7
``$ python chewbacca.py cluster_seqs -i Data.fasta -o rslt -g Data.groups``
::
rslt/
Data_clustered_seeds.fasta:
>seq1_4
AAAAAAAAAA
>seq3_3
TTTTTTTTTT
rslt_groups_files/
postcluster_updated.groups:
seq3 seq3 seq5 seq4
seq1 seq2 seq1 seq7 seq6
"""
supported_programs = [Cluster_Program_Crop,
Cluster_Program_Swarm,
Cluster_Program_Vsearch
]
default_program = Cluster_Program_Swarm
command_name = "Cluster"