# Source code for preclean.Preclean_Program_Bayeshammer

from classes.ChewbaccaProgram import ChewbaccaProgram
from classes.Helpers import getInputFiles, debugPrintInputInfo, init_pool, run_parallel, printVerbose, strip_ixes, \
    cleanup_pool, bulk_move_to_dir, makeAuxDir, validate_paired_fastq_reads, move
from classes.ProgramRunner import ProgramRunner, ProgramRunnerCommands


class Preclean_Program_Bayeshammer(ChewbaccaProgram):
    """Uses bayeshammer (Spades) to fix sequencing errors via kmer clustering and probabilistic substitution.
    """
    name = "bayeshammer"

    def execute_program(self):
        """Unpacks the parsed arguments stored on ``self.args`` and runs the preclean step with them."""
        args = self.args
        self.preclean_bayeshammer(args.input_f, args.input_r, args.outdir, args.processes, args.bayesthreads,
                                  args.extraargstring)

    def preclean_bayeshammer(self, input_f, input_r, outdir, processes, bayesthreads, extraargstring):
        """Cleans paired reads from two (left and right) fastq files/directories, then tidies the output directory.

        One PRECLEAN_SPADES command is run per forward/reverse pair.  Afterwards the contents of
        ``<outdir>/corrected`` are moved up into ``outdir``, everything else is moved to the auxiliary directory,
        and the remaining files are renamed to ``<name>_corrected.fastq``.

        :param input_f: File path to file or folder of left reads to clean.
        :param input_r: File path to file or folder of right reads to clean.
        :param outdir: Filepath to output directory.
        :param processes: The maximum number of processes to use.
        :param bayesthreads: The number of threads per process to use.
        :param extraargstring: Advanced program parameter string.
        """
        # Local import: only needed for the existence check on the leftover file below.
        import os

        # Collect input files, and validate that they match
        inputs = validate_paired_fastq_reads(input_f, input_r)
        pool = init_pool(min(len(inputs), processes))
        try:
            printVerbose("\tPrecleaning %s reads with Spades-Baye's Hammer..." % len(inputs))
            debugPrintInputInfo(inputs, "preclean/fix.")

            run_parallel([ProgramRunner(ProgramRunnerCommands.PRECLEAN_SPADES,
                                        [forwards, reverse, outdir, bayesthreads],
                                        {"exists": [forwards, reverse], "positive": [bayesthreads]},
                                        extraargstring)
                          for forwards, reverse in inputs], pool)
            printVerbose("Done cleaning reads.")

            # Select output files.
            # The aux listing is taken BEFORE the corrected files are moved up into outdir, so the files moved
            # in on the following line are not part of this first listing.
            aux_files = getInputFiles(outdir, "*", ignore_empty_files=False)
            corrected_dir = "%s/corrected" % outdir
            bulk_move_to_dir(getInputFiles(corrected_dir, "*"), outdir)
            # Files matching these patterns also go to the aux dir, even though they arrived with the
            # corrected files.
            aux_files += getInputFiles(outdir, "*unpaired*", ignore_empty_files=False)
            aux_files += getInputFiles(outdir, "configs", ignore_empty_files=False)

            # Gather aux files
            aux_dir = makeAuxDir(outdir)
            bulk_move_to_dir(aux_files, aux_dir)

            # Rename output files
            # NOTE(review): the third positional argument is presumably an exclusion pattern - confirm against
            # Helpers.getInputFiles.
            output_files = getInputFiles(outdir, "*", "corrected_*")
            for out_file in output_files:
                move(out_file, "%s/%s_corrected.fastq" % (outdir, strip_ixes(out_file)))

            # Move the last minute log file.  Presumably a non-read file that the rename loop above turned into
            # "corrected_corrected.fastq" - TODO confirm.  This step is best-effort: a missing file is skipped,
            # and a failed move is reported instead of aborting the run.
            leftover = "%s/corrected_corrected.fastq" % outdir
            if os.path.exists(leftover):
                try:
                    move(leftover, "%s/corrected_corrected.fastq" % aux_dir)
                except Exception as err:
                    printVerbose("Warning: could not move %s to %s: %s" % (leftover, aux_dir, err))
        finally:
            # Always release the worker pool, even if a command or a file operation above failed.
            cleanup_pool(pool)