# Source code for assemble.Assemble_Program_Pear

from classes.ChewbaccaProgram import ChewbaccaProgram
from classes.Helpers import validate_paired_fastq_reads, init_pool, printVerbose, debugPrintInputInfo, run_parallel, \
                            getInputFiles, bulk_move_to_dir, makeAuxDir, cleanup_pool, getFileName
from classes.ProgramRunner import ProgramRunner, ProgramRunnerCommands


class Assemble_Program_Pear(ChewbaccaProgram):
    """Uses Pear to assemble reads from two (left and right) fastq files/directories.

    For a set of k forward read files, and k reverse read files, return k assembled files.  Matching forward and
    reverse files should be identically named, except for a <forward>/<reverse> suffix that indicates the read
    orientation.  Two suffix pairs are supported:
        '_forwards' and '_reverse',
        and
        '_R1' and '_R2'
    Choose ONE suffix style and stick to it.

    e.g. Sample_100_forwards.fq and Sample_100_reverse.fq will be assembled into Sample_100_assembled.fq.
      Alternatively, Sample_100_R1.fq and Sample_100_R2.fq will be assembled into Sample_100_assembled.fq.
      You can provide as many pairs of files as you wish as long as they follow exactly one of the above naming
      conventions.  If a 'name' parameter is provided, it is used as a prefix for all assembled sequence files.

    NOTE(review): the output prefix handed to PEAR is built as "<outdir>/<name>_<getFileName(forwards)>"; confirm
    the exact final file names in the examples above against PEAR's own output naming.
    """
    name = "pear"

    def execute_program(self):
        """Unpacks the values stored on self.args and forwards them to assemble_pear."""
        args = self.args
        self.assemble_pear(args.input_f, args.input_r, args.outdir, args.name, args.processes, args.pearthreads,
                           args.extraargstring)

    def assemble_pear(self, input_f, input_r, outdir, name, processes, pearthreads, extraargstring):
        """Uses PEAR to assemble paired F/R read files in run_parallel.

        After assembly, every file in outdir that does not match "*.assembled.*" is moved to the auxiliary
        directory returned by makeAuxDir(outdir).  The worker pool is always cleaned up, even if assembly or the
        file move fails.

        :param input_f: File path to forward Fastq Reads file or folder.
        :param input_r: File path to reverse Fastq Reads file or folder.
        :param outdir: File path to the output directory.
        :param name: File prefix for the assembled reads.
        :param processes: The maximum number of processes to use.
        :param pearthreads: The number of threads per process to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Example of the underlying command line (kept for reference):
        # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
        inputs = validate_paired_fastq_reads(input_f, input_r)
        # Never request more workers than there are file pairs to assemble.
        pool = init_pool(min(len(inputs), processes))
        try:
            printVerbose("\tAssembling reads with pear")
            debugPrintInputInfo(inputs, "assemble")
            # One PEAR invocation per (forward, reverse) pair.  The dict lists the conditions attached to each
            # command: both input files under "exists", the thread count under "positive".
            commands = [ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR,
                                      [forwards, reverse, "%s/%s_%s" % (outdir, name, getFileName(forwards)),
                                       pearthreads],
                                      {"exists": [forwards, reverse], "positive": [pearthreads]},
                                      extraargstring)
                        for forwards, reverse in inputs]
            run_parallel(commands, pool)

            printVerbose("Done assembling sequences...")
            # Grab all the auxiliary files (everything not containing ".assembled.")
            aux_files = getInputFiles(outdir, "*", "*.assembled.*", ignore_empty_files=False)
            # make aux dir for extraneous files and move them there
            bulk_move_to_dir(aux_files, makeAuxDir(outdir))
        finally:
            # Release the pool on every exit path so a failed run does not leak worker processes.
            cleanup_pool(pool)