diff --git a/prepare_data/alignment_pipelines/calc_ideal_methylation.nf b/prepare_data/alignment_pipelines/calc_ideal_methylation.nf
new file mode 100644
index 0000000..a597878
--- /dev/null
+++ b/prepare_data/alignment_pipelines/calc_ideal_methylation.nf
@@ -0,0 +1,464 @@
+// this constructs an idealized but realistic methylation dataset
+// from a standard DNA library and a set of methylation data (bedGraphs)
+// steps:
+// - process unconverted fastq files (*.[12].fastq.gz), adapter trim, then align reads to
+//   the grch38 reference (downloaded by the tool) with bwa-mem
+// - process input bedGraphs to calculate an aggregate methylation frequency for each C
+// - in-silico convert the bwa-mem aligned reads
+// - re-map the converted reads to a converted reference using bwa-meth
+
+
+params.outdir = 'output'
+params.bwameth_genome = file('genome/grch38_core+bs_controls.fa').toAbsolutePath() //todo - move to zenodo?
+params.bwamem_genome = file('/mnt/galaxy/data/genome/grch38_core+bs_controls/bwa_index/grch38_core+bs_controls/grch38_core+bs_controls.fa').toAbsolutePath() //todo - move to zenodo?
+params.unconverted_fastqs_path = './*.{1,2}.fastq.gz'
+params.bedgraphs_path = './*.bedGraph.gz'
+params.aligner_cpus = 5
+path_to_convert_script = params.path_to_convert_script
+
+bedgraph_files = Channel.fromPath(params.bedgraphs_path)
+unconverted_fastq_files = Channel.fromFilePairs(params.unconverted_fastqs_path, flat: true)
+
+process cache_genome {
+    publishDir file(params.bwameth_genome).parent
+
+    output:
+    val(file("*.fa")) into grch38_ref
+
+    when:
+    ! params.bwameth_genome.exists()
+
+    shell:
+    '''
+    if [ -f !{params.bwameth_genome} ]; then
+        echo "already present"
+    else
+        wget https://neb-em-seq-sra.s3.amazonaws.com/grch38_core%2Bbs_controls.fa
+        mkdir -p $(dirname !{params.bwameth_genome})
+        cp grch38_core+bs_controls.fa !{params.bwameth_genome}
+    fi
+    '''
+}
+
+process bwameth_index {
+
+    input: file(grch38_ref) //this is only used to cause this step to wait for the genome download to complete
+
+    genome_dir = file(params.bwameth_genome).parent
+
+    when:
+    ! file("${file(params.bwameth_genome).parent}/bwameth").exists()
+
+    conda 'bwameth=0.2.2'
+
+    shell:
+    '''
+    mkdir -p !{genome_dir}/bwameth \
+    && cd !{genome_dir}/bwameth \
+    && ln -s !{params.bwameth_genome} genome.fa \
+    && bwameth.py --index genome.fa
+    '''
+}
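+// Background: bwameth indexes an in-silico bisulfite-converted copy of the
+// reference (C->T for the top strand, G->A for the bottom) so converted reads
+// can be mapped with plain bwa. A minimal sketch of the idea in Python
+// (illustrative only; bwameth.py --index handles this internally):
+//
+//     def convert_reference(seq):
+//         top = seq.upper().replace("C", "T")     # C->T converted strand
+//         bottom = seq.upper().replace("G", "A")  # G->A converted strand
+//         return top, bottom
+//
+//     # convert_reference("ACGT") -> ("ATGT", "ACAT")
+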
+process bwa_index {
+
+    input: file(grch38_ref) //this is only used to cause this step to wait for download to complete
+
+    genome_dir = file(params.bwameth_genome).parent
+
+    conda 'bwa=0.7.17'
+
+    when:
+    ! file("${file(params.bwameth_genome).parent}/bwa-mem").exists()
+
+    shell:
+    '''
+    mkdir -p !{genome_dir}/bwa-mem \
+    && cd !{genome_dir}/bwa-mem \
+    && ln -s !{params.bwameth_genome} genome.fa \
+    && bwa index genome.fa
+    '''
+}
+
+process prepare_metadata_for_bedGraphs {
+    input: file(bedgraph) from bedgraph_files.toList()
+    output: file('bedgraph_lib_info.csv') into bedgraph_info
+
+    shell:
+    '''
+    for bg in `find "$(pwd)" -maxdepth 1 -name '*.bedGraph.gz'`; do
+        dirname=`dirname $bg`
+        cell_line=`echo $(basename $bg) | cut -f 2 -d _`
+        context=`echo $(basename $bg) | cut -f 3 -d .`
+        printf "${cell_line},${context},${bg}\n" >> 'bedgraph_lib_info.csv'
+    done
+    '''
+}
+
+bedgraph_info.splitCsv()
+    .set{library_bedgraphs}
+
+
+// merge bedgraphs by cell line and recalculate % meth across all libs
+process merge_bedgraphs_context {
+    cpus 1
+    tag {"${cell_line} ${context}"}
+
+    input:
+    tuple cell_line, context, file(bed) from library_bedgraphs.groupTuple(by: [0,1])
+    output:
+    tuple cell_line, file("combined_bedgraph_*") into combined_bedgraphs_context
+
+    conda 'bedtools=2.29.2 pigz=2.3.4 parallel=20200722 sed=4.8'
+
+    shell:
+    '''
+    # the bedGraph paths arrive as plain strings, so nextflow stages them as
+    # input.N files whose *contents* are the paths; symlink the real files in
+    for file in input.*; do ln -s $(cat $file) ./; done
+    ungz_bedgs=$(
+        for f in *.bedGraph.gz; do
+            echo -n "<(pigz -d < $f | sed 's/\\t/:/g4') "
+        done
+    ) && echo " bedtools unionbedg -filler '0:0:0' -i ${ungz_bedgs}" | bash \
+    | tr ':' "\\t" | \
+    awk -v OFS='\\t' '{
+        offset=3; meth=0; unmeth=0
+        for(i=0; i<=NF-offset+1; i++) {
+            if( i%3 == 1) { meth += $(i+offset+1) }
+            else if( i%3 == 2) { unmeth += $(i+offset+1) }
+        };
+        avg_meth=meth/(meth+unmeth);
+        print $1,$2,$3,avg_meth,meth,unmeth
+    }' > combined_bedgraph_!{context}
+    '''
+}
+
+process merge_bedgraphs {
+    cpus 1
+    tag {cell_line}
+
+    input:
+    tuple cell_line, file(bed) from combined_bedgraphs_context.groupTuple()
+    output:
+    tuple cell_line, file('combined_bedgraph') into combined_bedgraphs
+
+    shell:
+    '''
+    cat combined_bedgraph_* > combined_bedgraph
+    '''
+}
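+// The awk above computes a coverage-weighted frequency per position: summed
+// methylated counts over summed total counts, not a mean of per-library
+// percentages. The same arithmetic in Python (hypothetical helper, shown for
+// clarity only):
+//
+//     def aggregate(counts):
+//         """counts: [(n_meth, n_unmeth), ...] for one position."""
+//         meth = sum(m for m, u in counts)
+//         unmeth = sum(u for m, u in counts)
+//         return meth / (meth + unmeth)
+//
+//     # aggregate([(8, 2), (0, 90)]) -> 0.08, whereas naively averaging the
+//     # per-library frequencies (80% and 0%) would give 40%
+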
+// process prepare_metadata_for_fastqs {
+//     input:
+//     tuple base, file(r1), file(r2) from unconverted_fastq_files.flatten().toList()
+//     output:
+//     file('fastq_lib_info.csv') into fastq_files
+
+//     shell:
+//     '''
+//     for read1_fq in `find . -maxdepth 1 -name '*1.fastq*'`; do
+
+//         dirname=`dirname $read1_fq`
+//         library=!{base} #`basename $(basename $read1_fq .1.fastq) .1.fastq.gz`
+//         method=`echo $library | cut -f 1 -d _`
+//         cell_line=`echo $library | cut -f 2 -d _`
+//         lab=`echo $library | cut -f 3 -d _`
+//         tile=all
+//         header=`zcat -f $read1_fq | head -n 1`
+//         flowcell=`echo $header | cut -f 3 -d ':'`
+//         lane=`echo $header | head -n 1 | cut -f 4 -d ':'`
+//         read2_fq=$(ls ${dirname}/*${library}*2.*);
+
+//         if [ -f $read1_fq ] && [ -f $read2_fq ]; then
+//             printf "${library},${method},${lab},${cell_line},${read1_fq},${read2_fq}\n" >> 'fastq_lib_info.csv'
+//         else
+//             echo "Unable to find $read1_fq or $read2_fq"
+//             exit 255
+//         fi
+//     done
+//     '''
+// }
+
+// fastq_files
+//     .splitCsv(header: ['library', 'method', 'lab', 'cell_line', 'flowcell', 'read1', 'read2'])
+//     .set {fastq_for_bwamem}
+
+process split_fastqs {
+    cpus 2
+
+    input:
+    tuple library, file(read1), file(read2) from unconverted_fastq_files
+
+    output:
+    tuple file('*1.fastq.gz_*'), file('*2.fastq.gz_*') into split_fastqs
+
+    shell:
+    '''
+    # note: split writes plain-text chunks; the .fastq.gz_ prefix is kept only
+    # so chunk names remain traceable to their source files
+    split <(zcat !{read1}) !{read1}_ -l 4000000 &
+    split <(zcat !{read2}) !{read2}_ -l 4000000
+    wait
+    '''
+}
+
+process bwamem_md {
+    cpus params.aligner_cpus
+    errorStrategy 'retry'
+    tag {read1}
+
+    conda 'bwa=0.7.17 fastp=0.20.1 seqtk=1.3 samblaster=0.1.24 sambamba'
+
+    input:
+    tuple file(read1), file(read2) from split_fastqs.transpose()
+
+    output:
+    tuple file('*.sorted.bam'), file('*.sorted.bam.bai') into split_bams
+
+    shell:
+    '''
+    library=$(basename $PWD)
+    inst_name=$(head -n 1 < <(cat *1.fastq.gz*) | cut -f 1 -d ':' | sed 's/^@//')
+    fastq_barcode=$(head -n 1 < <(cat *1.fastq.gz*) | sed -r 's/.*://')
+    flowcell=$(head -n 1 < <(cat *1.fastq.gz*) | cut -f 3 -d ':')
+    if [[ "${inst_name:0:2}" == 'A0' ]] || [[ "${inst_name:0:2}" == 'NS' ]] || [[ "${inst_name:0:2}" == 'NB' ]]; then
+        #these 2-color instruments should have poly-g stretches trimmed since they are likely indicative of inactive clusters
+        trim_polyg='--trim_poly_g'
+    else
+        trim_polyg=''
+    fi
+
+    seqtk mergepe <(cat *1.fastq.gz*) <(cat *2.fastq.gz*) \
+    | fastp --stdin --stdout -l 2 -Q ${trim_polyg} --interleaved_in --overrepresentation_analysis -j "${library}_combined_fastp.json" \
+    | bwa mem -p -t !{task.cpus} -R "@RG\\\\tID:${fastq_barcode}\\\\tSM:${library}\\\\tBC:${fastq_barcode}\\\\tCN:Multiple\\\\tPL:ILLUMINA\\\\tPU:${flowcell}-${fastq_barcode}" !{params.bwamem_genome} /dev/stdin 2> "${library}_${fastq_barcode}${flowcell}.log.bwamem" \
+    | sambamba view -t 2 -S -f bam -o "${library}_${fastq_barcode}.bwa.md.bam" /dev/stdin
+
+    sambamba sort *.md.bam
+    sambamba index *.sorted.bam
+    '''
+}
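+// fastp's --trim_poly_g is enabled above only for two-color instruments
+// (NovaSeq 'A0...', NextSeq 'NS'/'NB' prefixes), where no-signal cycles are
+// read as G. The instrument test, sketched in Python (hypothetical helper):
+//
+//     def needs_polyg_trim(first_fastq_header):
+//         inst = first_fastq_header.lstrip("@").split(":")[0]
+//         return inst[:2] in ("A0", "NS", "NB")
+//
+//     # needs_polyg_trim("@A00336:41:HXXXX:1:1101:1000:1000 1:N:0:ACGT") -> True
+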
+chrom_list = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM', 'chrEBV', 'phage_lambda', 'plasmid_puc19c', 'phage_T4', 'phage_Xp12']
+
+process split_bams {
+    cpus 2
+    conda 'sambamba'
+
+    input:
+    tuple file(bam), file(bai) from split_bams
+    each chrom from chrom_list
+
+    output:
+    tuple chrom, file('*.split.bam'), file('*.split.bam.bai') into chrom_split_bams
+
+    shell:
+    '''
+    identifier=$(basename $PWD)
+    my_chrom=$(echo "!{chrom}" | tr -d \\n)
+    sambamba view -h -f bam -t 2 -o "${my_chrom}.${identifier}.bwa.md.split.bam" *.sorted.bam ${my_chrom}
+    sambamba index "${my_chrom}.${identifier}.bwa.md.split.bam"
+    '''
+}
+
+process merge_bams {
+    cpus 1
+    publishDir "output", mode: 'copy', pattern: '*.sorted.bam*'
+    conda 'samtools sambamba'
+
+    input:
+    tuple chrom, file(bam), file(bai) from chrom_split_bams.groupTuple()
+
+    output:
+    tuple chrom, file('*.unconverted.sorted.bam') into chrom_merged_bams
+
+    shell:
+    '''
+    my_chrom=$(echo "!{chrom}" | tr -d \\n)
+
+    samtools cat *.split.bam > ${my_chrom}.unconverted.cat.bam
+
+    # name-sort so downstream samtools fastq sees read pairs together
+    sambamba sort -n -o ${my_chrom}.unconverted.sorted.bam ${my_chrom}.unconverted.cat.bam
+    '''
+}
+
+// merged_bams.into{bwa_mem_contigs; bwa_mem_split}
+
+
+process insilico_convert {
+    conda 'pysam samtools'
+
+    input:
+    tuple chrom, file(bam), cell_line, file(bedgraph) from chrom_merged_bams.combine(combined_bedgraphs)
+
+    output:
+    tuple cell_line, file('*.1.fastq.gz'), file('*.2.fastq.gz') into fastq_for_splitting
+
+    shell:
+    '''
+    identifier=$(basename $PWD)
+    !{path_to_convert_script}/convert_reads.py --bam !{bam} --bed !{bedgraph} --out /dev/stdout \
+    | samtools fastq -N -s singletons.fastq.gz -1 !{chrom}.!{cell_line}.1.fastq.gz -2 !{chrom}.!{cell_line}.2.fastq.gz /dev/stdin
+    '''
+}
+
+process split_converted_fastqs {
+    cpus 2
+
+    input:
+    tuple cell_line, file(read1), file(read2) from fastq_for_splitting.groupTuple()
+
+    output:
+    tuple cell_line, file('*.1.fastq.gz_*'), file('*.2.fastq.gz_*') into fastq_for_bwameth
+
+    shell:
+    '''
+    split <(zcat *.1.fastq.gz) !{cell_line}.1.fastq.gz_ -l 40000000 &
+    split <(zcat *.2.fastq.gz) !{cell_line}.2.fastq.gz_ -l 40000000
+    wait
+    '''
+}
+
+// process bam_to_fastq {
+//     conda 'samtools'
+//     publishDir "output", mode: 'copy', pattern: '*.fastq.gz'
+
+//     input:
+//     tuple cell_line, file(bam) from converted_bams
+
+//     output:
+//     tuple cell_line, file('*_read1.fastq.gz'), file('*_read2.fastq.gz') into fastq_for_bwameth
+
+//     shell:
+//     '''
+//     samtools fastq -N -s singletons.fastq.gz -1 !{cell_line}_read1.fastq.gz -2 !{cell_line}_read2.fastq.gz !{bam}
+//     '''
+// }
+
+process bwameth_md {
+    cpus 16
+    // errorStrategy 'retry'
+    publishDir "output", mode: 'copy', pattern: '*.md.bam*'
+
+    conda 'bwameth=0.2.2 fastp=0.20.1 seqtk=1.3 samblaster=0.1.24 sambamba'
+
+    input:
+    tuple cell_line, file(read1), file(read2) from fastq_for_bwameth.transpose()
+
+    output:
+    tuple cell_line, file("*.bwameth.md.bam"), file("*.bwameth.md.bam.bai") into bwameth_aligned_files
+
+    shell:
+    '''
+    inst_name=$(head -n 1 < <(zcat -f !{read1}) | cut -f 1 -d ':' | sed 's/^@//')
+    fastq_barcode=$(head -n 1 < <(zcat -f !{read1}) | sed -r 's/.*://')
+    if [[ "${inst_name:0:2}" == 'A0' ]] || [[ "${inst_name:0:2}" == 'NS' ]] || [[ "${inst_name:0:2}" == 'NB' ]]; then
+        #these 2-color instruments should have poly-g stretches trimmed since they are likely indicative of inactive clusters
+        trim_polyg='--trim_poly_g'
+    else
+        trim_polyg=''
+    fi
+
+    seqtk mergepe <(zcat -f !{read1}) <(zcat -f !{read2}) \
+    | fastp --stdin --stdout -l 2 -Q ${trim_polyg} --interleaved_in --overrepresentation_analysis -j "combined_fastp.json" \
+    | sed 's|/1| 1:N:0:XXXXX|g' | sed 's|/2| 2:N:0:XXXXX|g' \
+    | bwameth.py -p -t !{task.cpus} \
+        --read-group "@RG\\\\tID:${fastq_barcode}\\\\tBC:${fastq_barcode}\\\\tCN:Multiple\\\\tPL:ILLUMINA\\\\tPU:${fastq_barcode}" \
+        --reference !{params.bwameth_genome} /dev/stdin 2> "!{cell_line}.log.bwamem" \
+    | samblaster 2> log.samblaster \
+    | sambamba view -t 2 -l 0 -S -f bam /dev/stdin \
+    | sambamba sort --tmpdir=./ -t !{task.cpus} -m 20GB -o "!{read1}.bwameth.md.bam" /dev/stdin
+
+    # name the output after the input chunk so merged chunks never collide,
+    # and index it so the declared *.bwameth.md.bam.bai output exists
+    sambamba index "!{read1}.bwameth.md.bam"
+    '''
+}
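+// samtools fastq -N names reads 'xxx/1' and 'xxx/2'; the sed pair above rewrites
+// those suffixes to Casava-style comments ('xxx 1:N:0:XXXXX', with XXXXX as a
+// placeholder barcode) so fastp and bwameth treat the interleaved mates as a
+// proper pair. The rewrite in Python (illustrative only):
+//
+//     def casava_name(header):
+//         return header.replace("/1", " 1:N:0:XXXXX").replace("/2", " 2:N:0:XXXXX")
+//
+//     # casava_name("@M00336:1:000:1:1101:100:100/1")
+//     #   -> "@M00336:1:000:1:1101:100:100 1:N:0:XXXXX"
+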
+process combine_sort_bams {
+    conda 'samtools'
+
+    input:
+    tuple cell_line, file(bam), file(bai) from bwameth_aligned_files.groupTuple()
+
+    output:
+    tuple cell_line, file('merged_converted.sorted.bam'), file('merged_converted.sorted.bam.bai') into combined_bams
+
+    shell:
+    '''
+    samtools merge merged_converted.sorted.bam *.bwameth.md.bam
+    samtools index merged_converted.sorted.bam
+    '''
+}
+
+combined_bams.into{aligned_for_mbias; aligned_for_extract}
+
+process methylDackel_mbias {
+    cpus 8
+    errorStrategy 'retry'
+    conda "methyldackel=0.4.0 samtools=1.9"
+    publishDir "output", mode: 'copy'
+
+    input:
+    tuple cell_line, file(md_file), file(md_bai) from aligned_for_mbias
+
+    output:
+    tuple cell_line, file('*.svg') into mbias_output_svg
+    tuple cell_line, file('*.tsv') into mbias_output_tsv
+
+    shell:
+    '''
+    echo -e "chr\tcontext\tstrand\tRead\tPosition\tnMethylated\tnUnmethylated\tnMethylated(+dups)\tnUnmethylated(+dups)" > PCRfree_!{cell_line}_combined_mbias.tsv
+    chrs=(`samtools view -H !{md_file} | grep @SQ | cut -f 2 | sed 's/SN://' | grep -v _random | grep -v chrUn | sed 's/|/\\|/'`)
+
+    for chr in ${chrs[*]}; do
+        for context in CHH CHG CpG; do
+            arg=''
+            if [ $context = 'CHH' ]; then
+                arg='--CHH --noCpG'
+            elif [ $context = 'CHG' ]; then
+                arg='--CHG --noCpG'
+            fi
+            # two calls are needed to add columns with the counts kept when duplicate reads are not filtered (for rrEM-seq, where start/end is constrained)
+            # not sure why we need both --keepDupes and -F, probably a bug in mbias
+            join -t $'\t' -j1 -o 1.2,1.3,1.4,1.5,1.6,2.5,2.6 -a 1 -e 0 \
+            <( \
+                MethylDackel mbias --noSVG $arg -@ !{task.cpus} -r $chr !{params.bwameth_genome} !{md_file} | \
+                tail -n +2 | awk '{print $1"-"$2"-"$3"\t"$0}' | sort -k 1b,1 \
+            ) \
+            <( \
+                MethylDackel mbias --noSVG --keepDupes -F 2816 $arg -@ !{task.cpus} -r $chr !{params.bwameth_genome} !{md_file} | \
+                tail -n +2 | awk '{print $1"-"$2"-"$3"\t"$0}' | sort -k 1b,1 \
+            ) \
+            | sed "s/^/${chr}\t${context}\t/" \
+            >> PCRfree_!{cell_line}_combined_mbias.tsv
+        done
+    done
+    # make the svg files for trimming checks
+    MethylDackel mbias -@ !{task.cpus} --noCpG --CHH --CHG -r ${chrs[0]} !{params.bwameth_genome} !{md_file} !{cell_line}_chn
+    for f in *chn*.svg; do sed -i "s/Strand<\\/text>/Strand $f ${chrs[0]} CHN <\\/text>/" $f; done;
+
+    MethylDackel mbias -@ !{task.cpus} -r ${chrs[0]} !{params.bwameth_genome} !{md_file} !{cell_line}_cpg
+    for f in *cpg*.svg; do sed -i "s/Strand<\\/text>/Strand $f ${chrs[0]} CpG<\\/text>/" $f; done;
+    '''
+}
+
+process methylDackel_extract {
+    cpus 8
+    // publishDir "${default_dest_path}/${email}/${flowcell}${dest_modifier}", mode: 'copy'
+    conda "methyldackel=0.4.0 pigz=2.4"
+    publishDir "output", mode: 'copy'
+
+    input:
+    tuple cell_line, file(md_file), file(md_bai) from aligned_for_extract
+
+    output:
+    tuple cell_line, file('*.methylKit.gz') into extract_output
+
+    shell:
+    '''
+    # --OT/--OB inclusion bounds exclude read positions flagged by the m-bias step
+    MethylDackel extract --methylKit --OT 0,0,0,95 --OB 0,0,5,0 -@ !{task.cpus} --CHH --CHG -o PCRfree_!{cell_line} !{params.bwameth_genome} !{md_file}
+    pigz -p !{task.cpus} *.methylKit
+    '''
+}
diff --git a/prepare_data/alignment_pipelines/convert_reads.py b/prepare_data/alignment_pipelines/convert_reads.py
new file mode 100755
index 0000000..973f942
--- /dev/null
+++ b/prepare_data/alignment_pipelines/convert_reads.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+import argparse
+import pysam
+import random
+
+def argparser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bed", required = True, help = "Bed file of methylation values")
+    parser.add_argument("--bam", required = True, help = "Input bam or sam file")
+    parser.add_argument("--out", required = True, help = "Output bam file (may be /dev/stdout)")
+
+    args = parser.parse_args()
+    return args
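+# Expected --bed layout, matching the combined bedGraph written by the
+# pipeline's merge steps: chrom, start, end, methylated fraction (0-1),
+# n_meth, n_unmeth, tab-separated. An illustrative (made-up) row:
+#
+#   chr1    10468   10469   0.85    17      3
+#
+# Only the first four columns are read below; the fraction is rescaled to a
+# percentage to match the 0-99 draw in decide_to_convert().
+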
+def read_bedgraph_file(bed_file):
+    my_bed_dict = {}
+
+    bed = open(bed_file, "r")
+    next(bed)  # skip the first line (assumed to be a header/track line)
+    for line in bed:
+        line = line.strip().split("\t")
+        chrom = line[0]
+        pos1 = int(line[1])
+        pos2 = int(line[2])
+        meth_percent = float(line[3]) * 100
+
+        if chrom not in my_bed_dict:
+            my_bed_dict[chrom] = {}
+        for x in range(pos1, pos2):
+            my_bed_dict[chrom][x] = meth_percent
+    bed.close()
+
+    return my_bed_dict
+
+def decide_to_convert(meth_percent):
+    # convert when the random draw exceeds the methylation percent, i.e.
+    # unmethylated positions are converted, methylated positions are protected
+    return random.randrange(0, 100) > meth_percent
+
+def check_alignment_positions(read, original, converted, bed_dict):
+
+    my_seq = list(read.query_sequence)
+    quals = read.query_qualities
+    chrom_meth = bed_dict.get(read.reference_name, {})  # contigs absent from the bed get no conversion
+    for x, alignment_pos in enumerate(read.get_reference_positions(full_length = True)):
+        if alignment_pos in chrom_meth and decide_to_convert(chrom_meth[alignment_pos]):
+            if my_seq[x] == original:
+                my_seq[x] = converted
+    read.query_sequence = "".join(my_seq)
+    # assigning query_sequence clears the qualities in pysam, so restore them
+    read.query_qualities = quals
+
+    return read
+
+
+def read_bam_file(bam_file, bed_dict, out_file):
+    my_bam = pysam.AlignmentFile(bam_file, "rb")
+    out_bam = pysam.AlignmentFile(out_file, "wb", template=my_bam)
+
+    for read in my_bam:
+        if (read.is_reverse and read.is_read2) or (read.mate_is_reverse and read.is_read1):  # 'Top' strand
+            new_read = check_alignment_positions(read, "C", "T", bed_dict)
+        elif (read.is_reverse and read.is_read1) or (read.mate_is_reverse and read.is_read2):  # 'Bottom' strand
+            new_read = check_alignment_positions(read, "G", "A", bed_dict)
+        else:
+            # unpaired or ambiguous orientation: drop the read rather than guess a strand
+            continue
+        out_bam.write(new_read)
+
+    my_bam.close()
+    out_bam.close()  # flushes the BGZF EOF block
+
+
+def run_script():
+    args = argparser()
+
+    bed_dict = read_bedgraph_file(args.bed)
+
+    read_bam_file(args.bam, bed_dict, args.out)
+
+
+if __name__ == "__main__":
+    run_script()