Source code for demux.Demux_Barcode_Program_Fastx

from classes.ChewbaccaProgram import ChewbaccaProgram
from classes.Helpers import getInputFiles, debugPrintInputInfo, init_pool, run_parallel, printVerbose, \
    makeAuxDir, bulk_move_to_dir, cleanup_pool
from classes.ProgramRunner import ProgramRunner, ProgramRunnerCommands


class Demux_Program_Fastx(ChewbaccaProgram):
    """Splits a fasta/fastq file on a set of barcodes.  For a set of k input files, each input file is assigned a
    file_id. Then, each file is split into a set of subfiles, where each subfile named <sample>_splitOut_<k> contains
    the sequences belonging to <sample> from file <k>.
    Note, the assignment of a file to its file_id is arbitrary and should not be used to identify file groupings.
    E.x:
    File Alpha.fastq contains sequences belonging to samples A,B, and C,
    File Beta.fastq contains sequences belonging to samples A and C,
    we expect to see the output files:
    A_splitOut_0.fastq #sequences from sample A that were in Alpha
    A_splitOut_1.fastq #sequences from sample A that were in Beta
    B_splitOut_0.fastq #sequences from sample B that were in Alpha
    C_splitOut_0.fastq #sequences from sample C that were in Alpha
    C_splitOut_1.fastq #sequences from sample C that were in Beta
    """
    name = "fastx"

    def execute_program(self):
        args = self.args
        self.demux_fastx(args.input_f, args.barcodes, args.outdir, args.processes, args.extraargstring)

    def demux_fastx(self, input_f, barcodes, outdir, processes, extraargstring):
        """Demuxes using FAST X BARCODE SPLITTER.

        :param input_f: File path to input file or folder of input files.
        :param barcodes: File path to input barcodes file.
        :param outdir: Filepath to output directory.
        :param processes: Number of processes to use to demux input fileset.
        :param extraargstring: Advanced program parameter string.
        """
        # Get input files
        files_to_split = getInputFiles(input_f)
        # Assign the files shard numbers
        file_id = range(len(files_to_split))
        file_id_pairs = zip(files_to_split, file_id)
        debugPrintInputInfo(files_to_split, "demux")
        pool = init_pool(min(len(file_id_pairs), processes))

        printVerbose("Demuxing sequences...")
        run_parallel([ProgramRunner(ProgramRunnerCommands.DEMUX_FASTX,
                                    [input_, barcodes, "%s/" % outdir, "_%d_demux.fastq" % id_],
                                    {"exists": [input_, barcodes]}, extraargstring)
                      for input_, id_ in file_id_pairs], pool)
        printVerbose("Demuxed sequences.")

        # Grab all the auxillary files
        aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
        # make aux dir for extraneous files and move them there
        bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        cleanup_pool(pool)