Analysis pipeline for PEANUT performance and sensitivity¶

The following Snakemake pipeline conducts the complete performance and sensitivity analysis of PEANUT and its competitors as presented in our publication. The corresponding Snakefile can be downloaded here.
# vim: set syntax=python expandtab!:
import csv
from functools import partial
from collections import defaultdict, Counter
import matplotlib
matplotlib.use("agg")
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid.axislines import Subplot
import pysam


__author__ = "Johannes Köster"
__license__ = "MIT"


"""
PEANUT analysis workflow.
The workflow expects the homo sapiens reference as "ref/genome.fa" together with
BWA and Bowtie 2, cushaw2 (with prefix genome.cushaw2), cushaw3 (with prefix genome.cushaw3) and nvbwt (with prefix genome.nvbwt) indexes in the same directory.
This can be achieved via symlinks.
For unbiased performance estimates, it should be ensured that all files are accessible
from the same hard disk and do not reside on a network share.
Further, the workflow should not be executed multithreaded.
"""


############## definitions ################

# occupancy
BLOCKSIZES = list(range(32, 577, 32))

# sensitivity
RABEMA_SCORES = list(range(60, 101, 5))
RABEMA_ERRORS = list(range(6)) + list(range(10, 21, 5)) #+ list(range(30, 51, 10))

SENSITIVITY_TESTS = expand("rabema/evaluation/sim.{readlength}.{score}.{error}.txt", readlength=[100], score=RABEMA_SCORES, error=RABEMA_ERRORS)

# performance
PEANUT_TESTS = expand("peanut.align.{type}", type="strata all".split())
PEANUT_TESTS_RECALL = expand("peanut.align.{type}", type="strata_noxa all_noxa".split())
PERFORMANCE_MAPPERS = "bowtie2.best cushaw2-gpu bowtie2.all razers3 ngm cushaw3 bwamem nvbowtie mrfast".split()
# bowtie2.all does not terminate due to swapping
PERFORMANCE_MAPPERS.remove("bowtie2.all")
# nvbowtie is optimized for Tesla GPUs with larger memory than our test system and fails with bad_alloc
PERFORMANCE_MAPPERS.remove("nvbowtie")
# ngm exits with an error for the large dataset, razers3 badalloc, mrfast crashes
PERFORMANCE_MAPPERS_LARGE_BLACKLIST = "ngm".split()
# peanut all not needed, razers3 throwing error (TODO find better parameters for razers3)
PERFORMANCE_VS_RECALL_BLACKLIST = "razers3 peanut.align.all_noxa".split()


PERFORMANCE_FASTQ = {
    "ERR281333.5000000": expand("sampled/ERR281333_{read}.5000000.fastq.gz", read="1 2".split()),
    "ERR281333_1.5000000": ["sampled/ERR281333_1.5000000.fastq.gz"],
    "simulated.5000000": ["sampled/simulated.5000000.fastq.gz"],
    "simulated.1000": ["sampled/simulated.1000.fastq.gz"],
    "ERR091787.25000000": expand("sampled/ERR091787_{read}.25000000.fastq.gz", read="1 2".split())
}

def get_performance_fastq(wildcards, gzip=True):
    fastqs = PERFORMANCE_FASTQ[wildcards.dataset]
    if not gzip:
        fastqs = ["".join(os.path.splitext(f)[:-1]) for f in fastqs]
    return fastqs

PERFORMANCE_RECALL = expand(
    "performance/evaluation/simulated.5000000.{mapper}.{run}.{benchmark_type}.rabema_report_tsv".split(),
    mapper=PERFORMANCE_MAPPERS + PEANUT_TESTS_RECALL,
    benchmark_type="recall precision".split(),
    run=0)
PERFORMANCE_RUNTIME = expand("performance/{dataset}.{mapper}.{run}.time".split(),
    dataset="ERR281333_1.5000000 simulated.5000000 ERR281333.5000000".split(),
    mapper=PERFORMANCE_MAPPERS + PEANUT_TESTS,
    run=0)
PERFORMANCE_RUNTIME_LARGE = expand("performance/{dataset}.{mapper}.{run}.time".split(),
    dataset="ERR091787.25000000".split(),
    mapper=[mapper for mapper in PERFORMANCE_MAPPERS if mapper not in PERFORMANCE_MAPPERS_LARGE_BLACKLIST] + PEANUT_TESTS,
    run=0)
PERFORMANCE_PEANUT = expand("performance/{dataset}.{mapper}.{run}.time".split(),
    dataset="ERR281333_1.5000000 simulated.5000000 ERR281333.5000000".split(),
    mapper=PEANUT_TESTS,
    run=0)
RUN_TIME_DIST = expand(
    "plots/{dataset}.run_time_dist.pdf",
    dataset="ERR281333_1.5000000 simulated.5000000 ERR281333.5000000 ERR091787.25000000".split()
)


shell.prefix("set -o pipefail; ")

##################### targets #####################


rule all:
    input:
        "plots/occupancy.pdf",
        "plots/benchmark_best-mappers_recall.pdf",
        "plots/benchmark_best-mappers_precision.pdf",
        "plots/benchmark_all-mappers_all.pdf",
        "plots/index_size.pdf",
        "plots/mapq_fpr.pdf",
        RUN_TIME_DIST,
        PERFORMANCE_RECALL,
        PERFORMANCE_RUNTIME,
        PERFORMANCE_RUNTIME_LARGE,
        SENSITIVITY_TESTS


rule all_performance_peanut:
    input:
        PERFORMANCE_PEANUT


rule all_performance:
    input:
        PERFORMANCE_RUNTIME


rule all_run_time_dist:
    input:
        RUN_TIME_DIST


##################### general #####################


rule index_repeatcount:
    input:
        "ref/genome.fa"
    output:
        "stats/index/genome.{minrepeat}.csv", "index/genome.{minrepeat}.hdf5"
    resources: gpu=1
    shell:
        "peanut index {input} --min-repeat-count {wildcards.minrepeat} --stats {output}"


rule index_default:
    input:
        "{prefix}.fasta"
    output:
        "{prefix}.hdf5"
    shell:
        "peanut index {input} {output}"


rule sample_fastq:
    input: "reads/{dataset}.fastq.gz"
    output: "sampled/{dataset}.{reads}.fastq.gz"
    params:
        lines=lambda wildcards: str(4 * int(wildcards.reads))
    shell:
        "set +o pipefail; zcat {input} | head -n {params.lines} | gzip > {output}"


rule sam_to_bam:
    input:  "{prefix}.sam"
    output: "{prefix}.bam"
    shell:  "samtools view -Sb {input} > {output}"


rule sort_bam:
    input:  "{prefix}.bam"
    output: "{prefix}.{sorttype,(sorted|namesorted)}.bam"
    params: flags=lambda wildcards: "-n" if wildcards.sorttype == "namesorted" else ""
    shell:  "samtools sort {params.flags} {input} {wildcards.prefix}.{wildcards.sorttype}"


rule index_bam:
    input:
        "{prefix}.sorted.bam"
    output:
        "{prefix}.sorted.bam.bai"
    shell:
        "samtools index {input}"


rule unzip_fastq:
    input:  "{prefix}.fastq.gz"
    output: "{prefix}.fastq"
    shell:
        "gzip -d -c {input} > {output}"


##################### occupancy ##################

ALL_KERNELS = set("create_queries_index create_queries_occ_count create_queries_occ popcount_index filter_reference create_candidates validate_hits".split())
REPRESENTATIVE_KERNELS = {"create_queries_index": "index construction", "filter_reference": "filtration", "validate_hits": "validation"}
PROFILE = "COMPUTE_PROFILE=1 COMPUTE_PROFILE_CSV=1 COMPUTE_PROFILE_CONFIG=profile.config"


rule profile_peanut:
    input:
        "sampled/{reads}.fastq.gz", "index/genome.2500.hdf5"
    output:
        profile="profile/{reads}.{blocksize}.profile.txt"
    resources: gpu=1
    threads: 8
    shell:
        "{PROFILE} CL_LOG_ERRORS=stdout COMPUTE_PROFILE_LOG={output.profile} "
        "peanut map --threads {threads} "
        "--blocksize-filtration {wildcards.blocksize} "
        "--blocksize-validation {wildcards.blocksize} "
        "{input} > /dev/null"


rule extract_occupancy:
    input:
        expand("profile/1000000.{blocksize}.profile.txt", blocksize=range(32, 513, 32))
    output:
        csv="profile/occupancy.csv"
    run:
        kernels = ALL_KERNELS

        occupancy = defaultdict(dict)
        for f in input:
            with open(f) as f:
                reader = csv.reader(f, delimiter=",")
                for l in reader:
                    if not len(l) > 10:
                        continue
                    try:
                        method = l[0]
                        blocksize = int(l[5]) * int(l[6]) * int(l[7])
                        occ = float(l[10])
                        if method in kernels:
                            occupancy[method][blocksize] = occ
                    except ValueError:
                        continue

        with open(output.csv, "w") as out:
            blocksizes = sorted(set(b for occ in occupancy.values() for b in occ))
            print("method", *blocksizes, sep="\t", file=out)
            for method, occ in occupancy.items():
                print(method, *[occ[b] if b in occ else "-" for b in blocksizes], sep="\t", file=out)


rule plot_occupancy:
    input:
        csv="profile/occupancy.csv"
    output:
        pdf="plots/occupancy.pdf"
    run:
        figure()
        styles = "- -- :".split()
        with open(input.csv) as f:
            reader = csv.reader(f, delimiter="\t")
            header = next(reader)
            x = [int(b) for b in header[1:]]
            i = 0
            for l in reader:
                kernel = l[0]
                occupancy = l[1:]
                if kernel not in REPRESENTATIVE_KERNELS:
                    continue
                x_ = [b for b, occ in zip(x, occupancy) if occ != "-"]
                y = [float(occ) for occ in occupancy if occ != "-"]
                plt.plot(x_, y, styles[i], label=REPRESENTATIVE_KERNELS[kernel])
                i += 1
        plt.xlabel("block size")
        plt.ylabel("occupancy")
        plt.legend(loc="lower right", handlelength=2.5)
        plt.ylim([0, 1])
        plt.savefig(output.pdf)


################## sensitivity ##################


rule rabema_download_data:
    output: 
        "rabema/data/saccharomyces/genome.fasta",
        "rabema/data/saccharomyces/reads_454/SRR000853.10k.fastq"
    params:
        archive="rabema-data.tar.bz2"
    shell:  
        """
        wget -O {params.archive} http://www.seqan.de/wp-content/plugins/download-monitor/download.php?id=27
        tar -xf {params.archive}
        mv rabema-data rabema
        rm {params.archive}
        """


rule rabema_create_fastq_simulated:
    input:
        "rabema/data/saccharomyces/genome.fasta"
    output:
        "rabema/sim.{readlength,\d+}.fastq"
    shell:
        """
        mason illumina -N 10000 --no-N -n {wildcards.readlength} -sq -o {output} {input}
        rm {output}.sam
        """


rule razers3_create_sam:
    input:
        "rabema/data/saccharomyces/genome.fasta",
        "rabema/sim.{readlength}.fastq"
    output:
        "rabema/gold.{readlength}.{error,\d+}.pre.sam"
    params:
        identity=lambda wildcards: str(100 - int(wildcards.error))
    threads: 6
    shell:
        "razers3 --dont-shrink-alignments --verbose "
        "--thread-count {threads} "
        "--recognition-rate 100 "
        "--percent-identity {params.identity} "
        "--max-hits 10000000 "
        "--output {output} "
        "{input}"


rule rabema_prepare_sam:
    input:
        "rabema/gold.{readlength}.{error}.pre.sam"
    output:
        "rabema/gold.{readlength,\d+}.{error,\d+}.sam"
    shell:
        "rabema_prepare_sam -i {input} -o {output}"


rule rabema_build_gold_standard:
    input:
        ref="rabema/data/saccharomyces/genome.fasta",
        bam="rabema/gold.{readlength}.{error}.sorted.bam"
    output: "rabema/gold.{readlength}.{error}.gsi"
    shell:
        "rabema_build_gold_standard "
        "--max-error {wildcards.error} "
        "--distance-metric edit "
        "--out-gsi {output} "
        "--reference {input.ref} "
        "--in-bam {input.bam}"


rule rabema_peanut:
    input:
        ref="rabema/data/saccharomyces/genome.hdf5",
        fastq="rabema/sim.{readlength,\d+}.fastq"
    output:
        "rabema/peanut.{readlength,\d+}.{score,\d+}.bam"
    resources: gpu=1
    shell:
        "peanut map --no-xa --max-hits 1000 --semiglobal --strata all --gap-open-penalty 2 --gap-extend-penalty 1 --percent-identity {wildcards.score} {input.ref} {input.fastq} | samtools view -Sbh - > {output}"



rule debug_peanut:
    input:
        ref="rabema/data/saccharomyces/genome.hdf5",
        fastq="rabema/debug.fastq"
    output:
        aligned="rabema/debug.bam"
    resources: gpu=1
    shell:
        """
        peanut map --max-hits 1000 --semiglobal --gap-open-penalty 2 --gap-extend-penalty 1 --percent-identity 60 {input.fastq} {input.ref} | samtools view -Shb - {output}
        """


rule rabema_evaluate_simulated:
    input:
        ref="rabema/data/saccharomyces/genome.fasta",
        peanut="rabema/peanut.{readlength}.{score}.namesorted.bam",
        gold="rabema/gold.{readlength}.{error}.gsi"
    output:
        "rabema/evaluation/sim.{readlength}.{score}.{error}.txt"
    log:
        "logs/rabema_evaluate.{readlength}.{score}.{error}.log"
    shell:
        "rabema_evaluate --reference {input.ref} "
        "--show-missed-intervals "
        "--max-error {wildcards.error} "
        "--distance-metric edit "
        "--benchmark-category all "
        "--in-gsi {input.gold} "
        "--in-bam {input.peanut} > {output} 2> {log}"


######################### performance ######################


rule performance_dataset_real:
    output:
        "reads/ERR281333_{read}.fastq.gz"
    shell:
        "wget -O {output} ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR281/ERR281333/ERR281333_{wildcards.read}.fastq.gz"


rule performance_dataset_real2:
    # download an illumina platinum genome from an african male
    output:
        "reads/ERR091787_{read}.fastq.gz"
    shell:
        "wget -O {output} ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR091/ERR091787/ERR091787_{wildcards.read}.fastq.gz"


rule performance_dataset_simulated:
    input:
        "ref/genome.fa"
    output:
        "sampled/simulated.{count}.fastq.gz", "sampled/simulated.{count}.fastq.sam"
    params:
        fastq="sampled/simulated.{count}.fastq"
    shell:
        """
        mason illumina -N {wildcards.count} -n 100 -sq -o {params.fastq} {input}
        gzip {params.fastq}
        """


rule performance_gold_standard_oracle:
    input:
        bam="sampled/simulated.{count}.fastq.sorted.bam",
        ref="ref/genome.fa"
    output:
        "performance/simulated.{count,\d+}.gsi"
    shell:
        "rabema_build_gold_standard --oracle-mode -o {output} -r {input.ref} -b {input.bam}"


rule performance_gold_standard_map_all:
    input:
        "ref/genome.fa",
        "sampled/simulated.{count}.fastq"
    output:
        "sampled/simulated.{count}.gold_all.pre.sam"
    threads: 8
    shell:
        "razers3 --dont-shrink-alignments "
        "--parallel-window-size 20000 "
        "--thread-count {threads} "
        "--recognition-rate 100 "
        "--percent-identity 85 "
        "--max-hits 1000000 "
        "--output {output} "
        "{input}"


rule performance_rabema_prepare_bam:
    input:
        "sampled/simulated.{count}.gold_all.pre.sam"
    output:
        "sampled/simulated.{count}.gold_all.sam"
    shell:
        "rabema_prepare_sam -i {input} -o {output}"


rule performance_gold_standard_all:
    input:
        bam="sampled/simulated.{count}.gold_all.sorted.bam",
        ref="ref/genome.fa"
    output:
        "performance/simulated.{count}.all.gsi"
    shell:
        "rabema_build_gold_standard "
        "--max-error 15 "
        "--distance-metric edit "
        "--out-gsi {output} "
        "--reference {input.ref} "
        "--in-bam {input.bam}"


rule performance_rabema:
    input:
        ref="ref/genome.fa",
        gsi="performance/simulated.{count}.gsi",
        bam="performance/simulated.{count}.{mapper}.{run}.namesorted.bam"
    output:
        "performance/evaluation/simulated.{count,[0-9]+}.{mapper}.{run,\d+}.{type,(precision|recall)}.rabema_report_tsv"
    params:
        lambda wildcards: "--only-unique-reads" if wildcards.type == "precision" else ""
    log:
        "logs/performance/rabema_evaluate.simulated.{count}.{mapper}.{run}.{type}.log"
    shell:
        "rabema_evaluate --show-missed-intervals --show-additional-hits "
        "--show-invalid-hits --oracle-mode "
        "{params} "
        "-r {input.ref} -g {input.gsi} -b {input.bam} "
        "--out-tsv {output} 2> {log}"


rule performance_rabema_all:
    input:
        ref="ref/genome.fa",
        gsi="performance/simulated.{count}.all.gsi",
        bam="performance/simulated.{count}.{mapper}.{run}.namesorted.bam"
    output:
        "performance/evaluation/simulated.{count,[0-9]+}.{mapper}.{run,\d+}.{type,(all|any-best|all-best)}.rabema_report_tsv"
    log:
        "logs/performance/rabema_evaluate.simulated.{count}.{mapper}.{run}.{type}.log"
    shell:
        "rabema_evaluate --show-missed-intervals --show-additional-hits "
        "--show-invalid-hits "
        "--max-error 15 "
        "--distance-metric edit "
        "--benchmark-category {wildcards.type} "
        "-r {input.ref} -g {input.gsi} -b {input.bam} "
        "--out-tsv {output} 2> {log}"


RUN_PARAMS = {
    "peanut": ["", "--percent-identity 90", "--percent-identity 95"],
    "bwa_mem": ["", "-r 2.5", "-r 3.5"],
    "bowtie2": ["", "--very-sensitive", "--very-fast"],
    "razers3": ["", "--percent-identity 80", "--percent-identity 90"],
    "ngm": ["", "--min-identity 0.8", "--min-identity 0.95"],
    "cushaw2_gpu": ["", "-min_id 0.8", "-min_id 0.95"],
    "cushaw3": ["", "-min_id 0.8", "-min_id 0.95"]
}


def get_run_params(mapper):
    def _get(wildcards):
        return RUN_PARAMS[mapper][int(wildcards.run)]
    return _get


PEANUT_TYPE2PARAMS = dict(
    all="--strata all",
    strata="--strata 1",
    strata_noxa="--strata 1 --no-xa",
    all_noxa="--strata all --no-xa",
    noalign="--strata 1 --no-alignments")


rule performance_peanut:
    resources: benchmark=1, gpu=1
    input:
        fastq=get_performance_fastq, index="index/genome.2500.hdf5"
    output:
        sam="performance/{dataset}.peanut.align.{type,(all|strata|strata_noxa|all_noxa|noalign)}.{run,\d+}.sam",
        stats="performance/{dataset}.peanut.align.{type}.{run,\d+}.stats",
        time="performance/{dataset}.peanut.align.{type}.{run,\d+}.time"
    params:
        type=lambda wildcards: PEANUT_TYPE2PARAMS[wildcards.type],
        run=get_run_params("peanut")
    log:
        "logs/{dataset}.peanut.align.{type}.{run}.log"
    shell:
        "rm -f {output.time}; "
        "for i in 1 2 3; do "
        "(time peanut --stats {output.stats} map --threads 8 --query-buffer 1000000 {params.type} "
        " {input.index} {input.fastq} --semiglobal {params.run} "
        " > {output.sam} 2> {log}) 2>> {output.time}; "
        "done"


rule performance_bwa_mem:
    resources: benchmark=1
    input:
        "ref/genome.fa",
        get_performance_fastq
    output:
        sam="performance/{dataset}.bwamem.{run,\d+}.sam",
        time="performance/{dataset}.bwamem.{run,\d+}.time"
    params:
        run=get_run_params("bwa_mem")
    log:
        "logs/{dataset}.bwamem.{run}.log"
    shell:
        "rm -f {output.time}; "
        "for i in 1 2 3; "
        "do (time bwa mem {params.run} -t 8 {input} > {output.sam} 2> {log}) 2>> {output.time}; "
        "done"


rule performance_razers3:
    resources: benchmark=1
    input:
        "ref/genome.fa",
        partial(get_performance_fastq, gzip=False)
    output:
        sam="performance/{dataset}.razers3.{run,\d+}.sam",
        time="performance/{dataset}.razers3.{run,\d+}.time"
    params:
        run=get_run_params("razers3")
    log:
        "logs/{dataset}.razers3.{run}.log"
    shell:
        "rm -f {output.time}; "
        "for i in 1 2 3; "
        "do (time razers3 {params.run} -tc 8 {input} --output {output.sam} 2> {log}) 2>> {output.time}; "
        "done"


rule performance_razers3_partitioned:
    resources: benchmark=1
    input:
        ref="ref/genome.fa",
        fastq=expand("sampled/ERR091787_{read}.25000000.fastq", read="1 2".split())
    output:
        sam="performance/ERR091787.25000000.razers3.{run,\d+}.sam",
        time="performance/ERR091787.25000000.razers3.{run,\d+}.time"
    params:
        run=get_run_params("razers3"),
        lines=str(int(25000000 * 4 / 2)),
        args = " ".join(map("{:02d}".format, range(2)))
    log:
        "logs/ERR091787.25000000.razers3.{run}.log"
    run:
        seq_args = " ".join("{}.{{}}.fastq".format(f) for f in input.fastq)
        shell(
        """
        for f in {input.fastq}
        do
            rm -f $f.*
            split -a 2 -d -l {params.lines} $f $f.
            rename 's/$/.fastq/' $f.*
        done
        rm -f {output.time}
        rm -f {log}
        for i in 1 2 3
        do
            (time (
                for chunk in {params.args}
                do
                    echo "chunk $chunk"
                    razers3 {params.run} -tc 8 {input.ref} {input.fastq[0]}.$chunk.fastq {input.fastq[1]}.$chunk.fastq --output {output.sam} &>> {log}
                done
            )) 2>> {output.time}
        done
        """
        )

ruleorder: performance_razers3_partitioned > performance_razers3


rule performance_ngm:
    resources: benchmark=1
    input:
        ref="ref/genome.fa",
        fastq=get_performance_fastq
    output:
        sam="performance/{dataset}.ngm.{run,\d+}.sam",
        time="performance/{dataset}.ngm.{run,\d+}.time"
    params:
        run=get_run_params("ngm")
    log:
        "logs/{dataset}.ngm.{run}.log"
    run:
        if len(input.fastq) > 1:
            fastqs = "-1 {} -2 {}".format(*input.fastq)
        else:
            fastqs = "-q {}".format(input.fastq)
        shell("rm -f {output.time}; "
        "for i in 1 2 3; "
        "do (time ngm {params.run} --gpu -t 8 {fastqs} -r {input.ref} -o {output.sam} &> {log}) 2>> {output.time}; "
        "done")


rule performance_cushaw3:
    resources: benchmark=1
    input:
        ref="ref/genome.fa",
        fastq=get_performance_fastq
    output:
        sam="performance/{dataset}.cushaw3.{run,\d+}.sam",
        time="performance/{dataset}.cushaw3.{run,\d+}.time"
    params:
        run=get_run_params("cushaw3")
    log:
        "logs/{dataset}.cushaw3.{run}.log"
    run:
        inputparam = "-q" if len(input.fastq) > 1 else "-f"
        shell(
            "rm -f {output.time}; "
            "for i in 1 2 3; "
            "do (time cushaw3 align {params.run} -t 8 -r ref/genome.cushaw3 {inputparam} {input.fastq} -o {output.sam} &> {log}) 2>> {output.time}; "
            "done")


rule performance_cushaw2_gpu:
    resources: benchmark=1
    input:
        ref="ref/genome.fa",
        fastq=get_performance_fastq
    output:
        sam="performance/{dataset}.cushaw2-gpu.{run,\d+}.sam",
        time="performance/{dataset}.cushaw2-gpu.{run,\d+}.time"
    params:
        run=get_run_params("cushaw2_gpu")
    log:
        "logs/{dataset}.cushaw2-gpu.{run}.log"
    run:
        inputparam = "-q" if len(input.fastq) > 1 else "-f"
        shell(
            "rm -f {output.time}; "
            "for i in 1 2 3; "
            "do (time cushaw2-gpu {params.run} -t 8 {inputparam} {input.fastq} -r ref/genome.cushaw2 -o {output.sam} &> {log}) 2>> {output.time}; "
            "done")


rule performance_bowtie2:
    resources: benchmark=1
    input:
        "ref/genome.1.bt2",
        fastq=get_performance_fastq
    output:
        sam="performance/{dataset}.bowtie2.{type,(best|all)}.{run,\d+}.sam",
        time="performance/{dataset}.bowtie2.{type,(best|all)}.{run,\d+}.time"
    params:
        prefix="ref/genome",
        all=lambda wildcards: "--all" if wildcards.type == "all" else "",
        run=get_run_params("bowtie2")
    log:
        "logs/{dataset}.bowtie2.{run}.log"
    run:
        fastq = "-U {}".format(input.fastq) if len(input.fastq) == 1 else "-1 {} -2 {}".format(*input.fastq)
        shell("rm -f {output.time}; for i in 1 2 3; do (time bowtie2 {params.run} --threads 8 {params.all} {params.prefix} {fastq} > {output.sam} 2> {log}) 2>> {output.time}; done")


rule performance_nvbowtie:
    resources: benchmark=1
    input:
        "ref/genome.fa",
        fastq=get_performance_fastq
    output:
        sam="performance/{dataset}.nvbowtie.{run,\d+}.sam",
        time="performance/{dataset}.nvbowtie.{run,\d+}.time"
    params:
        prefix="ref/genome.nvbwt"
    log:
        "logs/{dataset}.nvbowtie.{run}.log"
    shell:
        "rm -f {output.time}; "
        "for i in 1 2 3; "
        "do (time nvBowtie --file-ref {params.prefix} {input.fastq} {output.sam} &> {log}) 2>> {output.time}; "
        "done"


rule performance_mrfast:
    resources: benchmark=1
    input:
        ref="ref/genome.fa",
        fastq=partial(get_performance_fastq, gzip=False)
    output:
        sam="performance/{dataset}.mrfast.{run,\d+}.sam",
        time="performance/{dataset}.mrfast.{run,\d+}.time"
    log:
        "logs/{dataset}.mrfast.{run}.log"
    run:
        _, reads = wildcards.dataset.split(".")
        chunks = 1000 if int(reads) > 5000000 else 100
        args = " ".join(map("{:04d}".format, range(chunks)))
        seq_args = "--seq {}.{{}}".format(input.fastq) if len(input.fastq) == 1 else "--pe --min 100 --max 500 --seq1 {}.{{}} --seq2 {}.{{}}".format(*input.fastq)
        shell("""
        rm -f {output.sam}.*
        for f in {input.fastq}
        do
            flines=`wc -l $f | cut -f1 -d' '`
            lines=$(($flines / {chunks}))
            rm -f $f.*
            split -a 4 -d -l $lines $f $f.
        done

        rm -f {output.time}
        for i in 1 2 3
        do
            (time parallel -j 8 -i sh -c 'mrfast --search {input.ref} {seq_args} -o {output.sam}.{{}}' -- {args} ) 2>> {output.time}
        done
        samtools view -SH {output.sam}.0000 > {output.sam}
        for f in {output.sam}.*
        do
            # ignore errors, SAM will be incomplete but we are only interested in run time
            samtools view -S $f >> {output.sam} || true
        done
        """)


####################### PLOT_BENCHMARK_RESULTS #################################

BENCHMARK_BEST_MAPPERS = [
    mapper
    for mapper in ["peanut.align.strata_noxa"] + PERFORMANCE_MAPPERS
    if mapper != "razers3" and mapper != "mrfast"  # only compare best mappers here
]
BENCHMARK_ALL_MAPPERS = "peanut.align.all_noxa razers3 mrfast".split()

MAPPER_LABELS = {
    "peanut": "PEANUT",
    "bwamem": "BWA-MEM",
    "ngm": "NextGenMap",
    "bowtie2": "Bowtie 2",
    "cushaw3": "CUSHAW3",
    "cushaw2-gpu": "CUSHAW2-GPU",
    "razers3": "RazerS 3",
    "mrfast": "MrFast"
}


def plot_benchmarks_input(wildcards):
    pattern = "performance/evaluation/simulated.{reads}.{mapper}.0.{type}.rabema_report_tsv"
    if wildcards.mapper_type == "best-mappers":
        return expand(pattern, mapper=BENCHMARK_BEST_MAPPERS, type=wildcards.type, reads=5000000)
    return expand(pattern, mapper=BENCHMARK_ALL_MAPPERS, type=wildcards.type, reads=1000)


rule plot_benchmarks:
    input:
        benchmarks=plot_benchmarks_input
    output:
        "plots/benchmark_{mapper_type}_{type}.pdf"
    run:
        mappers = BENCHMARK_BEST_MAPPERS if wildcards.mapper_type == "best-mappers" else BENCHMARK_ALL_MAPPERS

        def parse_rabema(f):
            d = np.genfromtxt(f, dtype=None, names="error_rate num_max num_found percent_found norm_max norm_found percent_norm_found".split())
            p = np.cumsum(d["norm_max"])
            tp = np.cumsum(d["norm_found"])
            rate = tp / p * 100
            return rate, d["percent_norm_found"]

        error_rate = np.arange(16)
        figure()
        styles = "- -- : k- k-- k:".split()
        for i, (mapper, benchmark_file) in enumerate(zip(mappers, input.benchmarks)):
            cumulative_results, results = parse_rabema(benchmark_file)
            results = cumulative_results
            l = results.size
            results = np.resize(results, error_rate.size)
            results[l:] = results[l-1]

            plt.plot(error_rate, results, styles[i], label=MAPPER_LABELS[mapper.split(".")[0]])
        plt.xlabel("maximum edit distance")
        plt.ylabel((wildcards.type if wildcards.type != "all" else "sensitivity") + " [%]")
        plt.legend(loc="lower left", ncol=2 if wildcards.type != "all" else 1, handlelength=2.5)
        ymin = 80 if wildcards.mapper_type == "best-mappers" else 60
        plt.ylim([ymin, 100])
        plt.xlim([0,15])

        plt.savefig(output[0])



##################### MAPQ ###############################

rule extract_mapq_fpr:
    input:
        peanut="performance/simulated.5000000.peanut.align.all_noxa.0.namesorted.bam",
        gold="sampled/simulated.5000000.fastq.sorted.bam",
        idx="sampled/simulated.5000000.fastq.sorted.bam.bai"
    output:
        pdf="performance/evaluation/mapq_fpr.txt"
    run:
        import pysam
        total_reads = 5000000
        with pysam.Samfile(input.peanut, "rb") as peanut, pysam.Samfile(input.gold, "rb") as gold:
            FP = Counter()
            P = Counter()
            n = 0
            last = None
            for i, hit in enumerate(peanut):
                if not i % 1000:
                    print("processing hit", i)
                if not hit.is_unmapped:
                    ref = peanut.getrname(hit.tid)
                    is_fp = not any(
                        a.qname == hit.qname
                        for a in gold.fetch(
                            reference=ref, start=hit.pos, end=hit.pos + 1
                        )
                    )
                    if is_fp:
                        FP[hit.mapq] += 1
                    P[hit.mapq] += 1
                if i > total_reads:
                    break
        mapqs = sorted(P)
        p = [P[m] for m in mapqs]
        fp = [FP[m] for m in mapqs]
        np.savetxt(output[0], [mapqs, p, fp])

rule plot_mapq_fpr:
    input:
        "performance/evaluation/mapq_fpr.txt"
    output:
        pdf="plots/mapq_fpr.pdf"
    run:
        d = np.loadtxt(input[0])
        mapqs = d[0]
        p = d[1]
        fp = d[2]

        figure()
        plt.plot(mapqs, fp / p, "-", label="measured FPR")
        plt.plot(mapqs, [10 ** (-q / 10) for q in mapqs], "--", label="expected FPR")
        plt.legend(loc="upper right", handlelength=2.5)
        plt.xlabel("mapping quality")
        plt.ylim([-0.01, 1])
        plt.savefig(output.pdf)


####################### index size ##########################

rule plot_index_size:
    output:
        "plots/index_size.pdf"
    run:
        figure()
        epsilon = np.linspace(0, 1)
        ratio = 1 + epsilon / (16 * (1 + epsilon))
        line, = plt.semilogx(epsilon, ratio, "-")

        calc_ratio = lambda k: (k / 16 + 2) / (k + 1)
        K = np.linspace(1, 100)
        ratio = calc_ratio(K)
        plt.semilogx(K, ratio, "-")
        plt.xlabel("K")
        plt.ylabel("index size ratio")
        #plt.xlim([1, 100])

        break_x = 16 / 15
        break_y = calc_ratio(break_x)
        
        plt.plot([break_x] * 2, [0, break_y], "k--")

        plt.savefig(output[0])


###################### run time distribution ##################

rule plot_run_time_dist:
    input:
        "performance/{dataset}.peanut.align.strata.0.stats"
    output:
        "plots/{dataset}.run_time_dist.pdf"
    run:
        durations = dict()
        with open(input[0]) as f:
            reader = csv.reader(f, delimiter="\t")
            for l in reader:
                if l[0].endswith("duration"):
                    durations[l[0].split("_")[0]] = sum(map(float, l[1:]))
        items = "index filtration validation postprocessing writing".split()
        durations = [durations[i] for i in items]
        plt.figure(figsize=(3,2))
        cmap = plt.cm.Blues_r
        colors = cmap(np.linspace(0., 1., len(items) * 2))
        # fix index label
        items[0] = "indexing"
        patches, texts, autotexts = plt.pie(durations, labels=items, autopct="%1.1f%%", colors=colors)
        for p in patches:
            p.set_edgecolor("w")
        for t in autotexts:
            t.set_color("w")
        plt.gca().set_aspect("equal")
        plt.savefig(output[0], bbox_inches="tight")


def figure(figsize=None, right=False, top=False):
    fig = plt.figure(figsize=figsize)
    ax = Subplot(fig, 111)
    ax.axis["right"].set_visible(False)
    ax.axis["top"].set_visible(False)
    fig.add_subplot(ax)