diff --git a/CHANGES.md b/CHANGES.md index 8948f7d..1be0b3c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # CHANGES +## 1.13.16 + +* Correct occasional memory blowout caused by split step not excluding an ignore region. + ## 1.13.15 * Add checks to sam_iter_next to ensure result is checked for errors diff --git a/Makefile b/Makefile index 42279aa..6aedb75 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CAVEMAN_VERSION=1.13.15 +CAVEMAN_VERSION=1.13.16 TEST_REF?="" #Compiler diff --git a/src/ignore_reg_access.c b/src/ignore_reg_access.c index 44fc0e9..ed36449 100644 --- a/src/ignore_reg_access.c +++ b/src/ignore_reg_access.c @@ -40,156 +40,166 @@ #include int ignore_reg_access_get_ign_reg_count_for_chr(char *ign_file, char *chr){ - assert(ign_file != NULL); - assert(chr != NULL); - FILE *file = fopen(ign_file,"r"); - check(file != NULL,"Couldn't open ignored region file: %s.",ign_file); - //Read all lines, only including the ones that are of the correct chromosome in the count - int entry_count = 0; - //read and count - char rd[200]; - while(fgets(rd, 200, file) != NULL){ - check(rd != NULL,"Invalid line read in ignored region file."); - char *chr_nom = malloc(sizeof(char)*50); - check_mem(chr_nom); - int chk = sscanf(rd,"%s",chr_nom); - check(chk == 1,"Incorrect line read.\n"); - if(strcmp(chr_nom,chr) == 0){ - entry_count++; - } - free(chr_nom); - } - check(fclose(file)==0,"Error closing ignored region file '%s'.",ign_file); - return entry_count; + assert(ign_file != NULL); + assert(chr != NULL); + FILE *file = fopen(ign_file,"r"); + check(file != NULL,"Couldn't open ignored region file: %s.",ign_file); + //Read all lines, only including the ones that are of the correct chromosome in the count + int entry_count = 0; + //read and count + char rd[200]; + while(fgets(rd, 200, file) != NULL){ + check(rd != NULL,"Invalid line read in ignored region file."); + char *chr_nom = malloc(sizeof(char)*50); + check_mem(chr_nom); + int chk = sscanf(rd,"%s",chr_nom); + check(chk == 1,"Incorrect line read.\n"); + if(strcmp(chr_nom,chr) == 0){ + entry_count++; + } + free(chr_nom); + } + check(fclose(file)==0,"Error closing ignored region file '%s'.",ign_file); + return entry_count; error: - if(file) fclose(file); - return -1; + if(file) fclose(file); + return -1; } seq_region_t *ignore_reg_access_get_ign_reg_overlap(int pos, struct seq_region_t **regions, int entry_count){ - int i=0; - for(i=0; ibeg <= pos && regions[i]->end >= pos){ - seq_region_t *reg_copy = malloc(sizeof(struct seq_region_t)); - check_mem(reg_copy); - reg_copy->beg = regions[i]->beg; - reg_copy->end = regions[i]->end; - return reg_copy; - } - } + int i=0; + for(i=0; ibeg <= pos && regions[i]->end >= pos){ + seq_region_t *reg_copy = malloc(sizeof(struct seq_region_t)); + check_mem(reg_copy); + reg_copy->beg = regions[i]->beg; + reg_copy->end = regions[i]->end; + return reg_copy; + } + } error: - return NULL; + return NULL; } int ignore_reg_access_get_ign_reg_for_chr(char *ign_file,char *chr, int entry_count, struct seq_region_t **regions){ - assert(ign_file != NULL); - assert(chr != NULL); - assert(entry_count >= 0); - assert(sizeof(regions)>0); - int is_bed = 0; - if(entry_count == 0){ - return 0; - } - //assign the right size to the array - //then reread so we can parse the actual lines. - //Check for bed extension - const char *ext = strrchr(ign_file, '.'); - if(ext && ext != ign_file && strcmp(ext+1,"bed")==0){ - is_bed = 1; - } + assert(ign_file != NULL); + assert(chr != NULL); + assert(entry_count >= 0); + assert(sizeof(regions)>0); + int is_bed = 0; + if(entry_count == 0){ + return 0; + } + //assign the right size to the array + //then reread so we can parse the actual lines. + //Check for bed extension + const char *ext = strrchr(ign_file, '.'); + if(ext && ext != ign_file && strcmp(ext+1,"bed")==0){ + is_bed = 1; + } - FILE *file = fopen(ign_file,"r"); - check(file != NULL,"Couldn't open ignored region file: %s.",ign_file); - int found_count = 0; + FILE *file = fopen(ign_file,"r"); + check(file != NULL,"Couldn't open ignored region file: %s.",ign_file); + int found_count = 0; - char rd[200]; - while(fgets(rd, 200, file) != NULL){ - check(rd != NULL,"Invalid line read in ignored region file."); - char *chr_nom = malloc(sizeof(char)*250); - check_mem(chr_nom); - int beg,end; - int chk = sscanf(rd,"%s\t%d\t%d",chr_nom,&beg,&end); - if(chk==3){ - if(strcmp(chr_nom,chr) == 0){ - regions[found_count] = malloc(sizeof(struct seq_region_t)); - check_mem(regions[found_count]); - regions[found_count]->beg = beg + is_bed; - regions[found_count]->end = end; - found_count++; - } - }else if(1==sscanf(rd,"%s",chr_nom)){//Check for just a chromosome. - if(strcmp(chr_nom,chr) == 0){ - regions[found_count] = malloc(sizeof(struct seq_region_t)); - check_mem(regions[found_count]); - regions[found_count]->beg = 1; - regions[found_count]->end = INT_MAX; - found_count++; - } - }else{ - free(chr_nom); - sentinel("Incorrect line read from ignore file %s.",rd); - } - free(chr_nom); - } - check(entry_count == found_count,"Wrong number of lines found %d for chr: %s. Expected %d.",found_count,chr,entry_count); - check(fclose(file)==0,"Error closing ignored region file '%s'.",ign_file); - return 0; + char rd[200]; + while(fgets(rd, 200, file) != NULL){ + check(rd != NULL,"Invalid line read in ignored region file."); + char *chr_nom = malloc(sizeof(char)*250); + check_mem(chr_nom); + int beg,end; + int chk = sscanf(rd,"%s\t%d\t%d",chr_nom,&beg,&end); + if(chk==3){ + if(strcmp(chr_nom,chr) == 0){ + regions[found_count] = malloc(sizeof(struct seq_region_t)); + check_mem(regions[found_count]); + regions[found_count]->beg = beg + is_bed; + regions[found_count]->end = end; + found_count++; + } + }else if(1==sscanf(rd,"%s",chr_nom)){//Check for just a chromosome. + if(strcmp(chr_nom,chr) == 0){ + regions[found_count] = malloc(sizeof(struct seq_region_t)); + check_mem(regions[found_count]); + regions[found_count]->beg = 1; + regions[found_count]->end = INT_MAX; + found_count++; + } + }else{ + free(chr_nom); + sentinel("Incorrect line read from ignore file %s.",rd); + } + free(chr_nom); + } + check(entry_count == found_count,"Wrong number of lines found %d for chr: %s. Expected %d.",found_count,chr,entry_count); + check(fclose(file)==0,"Error closing ignored region file '%s'.",ign_file); + return 0; error: - if(file) fclose(file); - if(regions) ignore_reg_access_destroy_seq_region_t_arr(entry_count, regions); - return -1; + if(file) fclose(file); + if(regions) ignore_reg_access_destroy_seq_region_t_arr(entry_count, regions); + return -1; } -List *ignore_reg_access_get_ign_reg_contained(int from, int to, struct seq_region_t **regions, int entry_count){ - List *li = List_create(); - int i=0; - for(i=0; ibeg >= from && regions[i]->end <= to){ - //Make a copy of this region and put in the list - seq_region_t *reg_copy = malloc(sizeof(struct seq_region_t)); - check_mem(reg_copy); - reg_copy->beg = regions[i]->beg; - reg_copy->end = regions[i]->end; - List_push(li,reg_copy); - } - } - return li; +List *ignore_reg_access_get_ign_regs_covered(int from, int to, struct seq_region_t **regions, int entry_count){ + List *li = List_create(); + int i=0; + for(i=0; i= regions[i]->beg && from <= regions[i]->end) || + (to >= regions[i]->beg && to <= regions[i]->end) || + from <= regions[i]->beg && to >= regions[i]->end){ + seq_region_t *reg_copy = malloc(sizeof(struct seq_region_t)); + check_mem(reg_copy); + reg_copy->beg = regions[i]->beg; + reg_copy->end = regions[i]->end; + List_push(li,reg_copy); + } //End of checking if this ignore region overlaps + } + return li; error: - return NULL; + return NULL; } List *ignore_reg_access_resolve_ignores_to_analysis_sections(int start, int end, struct seq_region_t **regions, int entry_count){ - List *li = ignore_reg_access_get_ign_reg_contained(start,end,regions,entry_count); - check(li != NULL,"Error fetching contained ignore regions."); + List *li = ignore_reg_access_get_ign_regs_covered(start,end,regions,entry_count); + check(li != NULL,"Error fetching contained ignore regions."); - List *reg_for_analysis = List_create(); - seq_region_t *range = malloc(sizeof(struct seq_region_t)); - range->beg = start; - LIST_FOREACH(li, first, next, cur){ - range->end = ((seq_region_t *) cur->value)->beg - 1; - List_push(reg_for_analysis,range); - range = malloc(sizeof(struct seq_region_t)); - range->beg = ((seq_region_t *) cur->value)->end + 1; - } - range->end = end; - List_push(reg_for_analysis,range); - List_clear_destroy(li); - return reg_for_analysis; + List *reg_for_analysis = List_create(); + seq_region_t *range = malloc(sizeof(struct seq_region_t)); + + range->beg = start; + if(List_count(li) > 0){ + if(start >= ((seq_region_t *) li->first->value)->beg){ + range->beg = ((seq_region_t *) li->first->value)->end+1; + List_unshift(li); + } + } + LIST_FOREACH(li, first, next, cur){ + range->end = ((seq_region_t *) cur->value)->beg - 1; + List_push(reg_for_analysis,range); + range = malloc(sizeof(struct seq_region_t)); + range->beg = ((seq_region_t *) cur->value)->end + 1; + } + + range->end = end; + List_push(reg_for_analysis,range); + List_clear_destroy(li); + return reg_for_analysis; error: - List_clear_destroy(li); - return NULL; + List_clear_destroy(li); + return NULL; } void ignore_reg_access_destroy_seq_region_t_arr(int entry_count, seq_region_t **regions){ - if(sizeof(regions) > 0){ - int i=0; - for(i=0;i 0){ + int i=0; + for(i=0;i 0",optarg); - } - break; + case 'i': + if(sscanf(optarg, "%i", &idx) != 1){ + sentinel("Error parsing -i argument '%s'. Should be an integer > 0",optarg); + } + break; - case 'e': - if(sscanf(optarg, "%i", &max_read_count) != 1){ - sentinel("Error parsing -e argument '%s'. Should be an integer > 0",optarg); - } - break; + case 'e': + if(sscanf(optarg, "%i", &max_read_count) != 1){ + sentinel("Error parsing -e argument '%s'. Should be an integer > 0",optarg); + } + break; - case '?': - split_print_usage (1); - break; + case '?': + split_print_usage (1); + break; - default: - split_print_usage (1); + default: + split_print_usage (1); - }; // End of args switch statement + }; // End of args switch statement - }//End of iteration through options + }//End of iteration through options - //Do some checking to ensure required arguments were passed - if(idx == 0){ - split_print_usage(1); - } + //Do some checking to ensure required arguments were passed + if(idx == 0){ + split_print_usage(1); + } - if(check_exist(config_file) != 1){ - printf("Config file %s does not appear to exist. Have you run caveman setup?\n",config_file); - split_print_usage(1); - } + if(check_exist(config_file) != 1){ + printf("Config file %s does not appear to exist. Have you run caveman setup?\n",config_file); + split_print_usage(1); + } - return 0; + return 0; error: - split_print_usage(1); - return -1; + split_print_usage(1); + return -1; } int round_divide_integer(int dividend, int divisor){ - if(dividend == 0 || divisor == 0){ - return 1; - } + if(dividend == 0 || divisor == 0){ + return 1; + } return (dividend + (divisor / 2)) / divisor; } uint32_t min(uint32_t one, uint32_t two){ - if(one<=two){ - return one; - }else if(two= 0,"Error trying to check the number of ignored regions for this chromosome."); - - printf("Found %d ignored regions for chromosome %s.\n",ignore_reg_count,chr_name); - - //Now create a store for said regions. - ignore_regs = malloc(sizeof(struct seq_region_t *) * ignore_reg_count); - check_mem(ignore_regs); - check(ignore_reg_access_get_ign_reg_for_chr(ignore_regions_file,chr_name,ignore_reg_count,ignore_regs)==0,"Error fetching ignored regions from file."); - - //Check there's not a whole chromosome block. - if(!(ignore_reg_count == 1 && ignore_regs[0]->beg == 1 && ignore_regs[0]->end >= chr_length)){ - //No chromosome block, so carry on. - uint32_t start = 1; - uint32_t stop = chr_length; - uint64_t rd_count = 0; - - sf_norm = bam_access_populate_file(norm_bam_file,ref_idx); - check(sf_norm!=NULL,"Error populating file norm seq file %s.",norm_bam_file); - idx_norm = bam_access_populate_file_index(sf_norm, norm_bam_file); - check(idx_norm!=NULL,"Error populating index for norm seq file %s.",norm_bam_file); - sf_tum = bam_access_populate_file(tum_bam_file,ref_idx); - check(sf_tum!=NULL,"Error populating file for tum seq file %s.",tum_bam_file); - idx_tum = bam_access_populate_file_index(sf_tum, tum_bam_file); - check(idx_tum!=NULL,"Error populating index for tum seq file %s.",tum_bam_file); - - //read the first 100 reads and get an idea of average read length. - int avg_read_len_norm = bam_access_get_avg_readlength_from_bam(sf_norm); - int avg_read_len_tum = bam_access_get_avg_readlength_from_bam(sf_tum); - //Use a comparison of average read length to read_length_base in order to calculate a useful split size. - float avg_read_len = ((float)avg_read_len_norm + (float)avg_read_len_tum) / (float)2; - //Adjust max read count according to difference between avg_read_len and read_length_base - float proportion_rd_length = (float)read_length_base / avg_read_len; - max_read_count = (int)((float)max_read_count * proportion_rd_length); - - hts_close(sf_norm); - hts_idx_destroy(idx_norm); - hts_close(sf_tum); - hts_idx_destroy(idx_tum); - - sf_norm = bam_access_populate_file(norm_bam_file,ref_idx); - check(sf_norm!=NULL,"Error populating file norm seq file %s.",norm_bam_file); - idx_norm = bam_access_populate_file_index(sf_norm, norm_bam_file); - check(idx_norm!=NULL,"Error populating index for norm seq file %s.",norm_bam_file); - sf_tum = bam_access_populate_file(tum_bam_file,ref_idx); - check(sf_tum!=NULL,"Error populating file for tum seq file %s.",tum_bam_file); - idx_tum = bam_access_populate_file_index(sf_tum, tum_bam_file); - check(idx_tum!=NULL,"Error populating index for tum seq file %s.",tum_bam_file); - - iter_norm = bam_access_get_hts_itr(sf_norm, idx_norm, chr_name, start, stop); - check(iter_norm!=NULL,"Error fetching normal iterator or section %s:%d-%d.",chr_name,start,stop); - iter_tum = bam_access_get_hts_itr(sf_tum, idx_tum, chr_name, start, stop); - check(iter_tum!=NULL,"Error fetching tumour iterator or section %s:%d-%d.",chr_name,start,stop); - - //Setup a read for iteration - norm_read = bam_init1(); - tum_read = bam_init1(); - int iter_n_status = 0; - int iter_t_status = 0; - uint32_t sect_start = 1; - uint32_t sect_stop = 0; - uint32_t curr_n_pos = 0; - uint32_t curr_t_pos = 0; - - - //Have both iterators, now need to iterate through each in sync so we don't get ahead of the stops. - while(iter_n_status>=0 || iter_t_status>=0){ //Keep iterating until both iterators are out of reads. - - while(curr_n_pos<=curr_t_pos && iter_n_status>=0 && iter_t_status>=0 && rd_count<=max_read_count){ //While the positions aren't equal and tumour has reads left. Normal jumps ahead - iter_n_status = sam_itr_next(sf_norm,iter_norm,norm_read); - check(iter_n_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_n_status); - curr_n_pos = norm_read->core.pos; - if(iter_n_status>=0 && bam_access_check_bam_flags(norm_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_n_pos,ignore_regs,ignore_reg_count) == NULL){ - rd_count++; - } - }//End of this iteration through normal reads - - while(curr_t_pos<=curr_n_pos && iter_t_status>=0 && iter_n_status>=0){ //While the positions aren't equal and normal has reads left - iter_t_status = sam_itr_next(sf_tum,iter_tum,tum_read); - check(iter_t_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_t_status); - curr_t_pos = tum_read->core.pos; - if(iter_t_status>=0 && bam_access_check_bam_flags(tum_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_t_pos,ignore_regs,ignore_reg_count) == NULL){ - rd_count++; - } - }//End of this iteration through tumour reads - - //An extra section for where one or the other iterator is out of reads (we still need to count for the mstep). - if(iter_n_status<0 && iter_t_status>=0){ //No more normal reads - while(iter_t_status>=0 && rd_count<=max_read_count){ - iter_t_status = sam_itr_next(sf_tum,iter_tum,tum_read); - check(iter_t_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_t_status); - curr_t_pos = tum_read->core.pos; - if(iter_t_status>=0 && bam_access_check_bam_flags(tum_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_t_pos,ignore_regs,ignore_reg_count) == NULL){ - rd_count++; - } - } - }//End of iteration through tumour reads where only tumour reads remain - - if(iter_t_status<0 && iter_n_status>=0){ //No more tumour reads - while(iter_n_status>=0 && rd_count<=max_read_count){ - iter_n_status = sam_itr_next(sf_norm,iter_norm,norm_read); - check(iter_n_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_n_status); - curr_n_pos = norm_read->core.pos; - if(iter_n_status>=0 && bam_access_check_bam_flags(norm_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_n_pos,ignore_regs,ignore_reg_count) == NULL){ - rd_count++; - } - } - }//End of iteration through normal reads where only normal reads remain - - //Reads have equal start positions, check the count. - if(rd_count>=max_read_count){ - //Set old stop position (Min of curr_t_pos & curr_n_pos) - sect_stop = min(curr_t_pos,curr_n_pos); - //This is the position on which to separate the split sections so print it. - if(sect_stop>0 && sect_stop >= sect_start) { - split_access_print_section(output,chr_name,sect_start,sect_stop); - } - //printf("Found %d reads for %s:%d-%d\n",rd_count,chr_name,sect_start,sect_stop); - //Reset read count - rd_count=1;//Set as 1 due to the way the loop records read counts . - //Set new start position - sect_start = sect_stop+1; - }//End of checking if we've hit our read cutoff. - - }//End of moving through both normal and tumour iterators - - //No more reads left so we must print the last section. - split_access_print_section(output,chr_name,sect_start,chr_length); - bam_destroy1(norm_read); - bam_destroy1(tum_read); - hts_close(sf_norm); - hts_idx_destroy(idx_norm); - hts_close(sf_tum); - hts_idx_destroy(idx_tum); - hts_itr_destroy(iter_norm); - hts_itr_destroy(iter_tum); - }//End of checking if this is a valid contig to split. - - - return 0; + strcpy(fname,list_loc); + strcat(fname,"."); + strcat(fname,chr_name); + FILE *output = fopen(fname,"w"); + free(fname); + check(output != NULL, "Error opening file %s for write.",fname); + + //Load in a set of ignore regions from tsv format, only require this chromosome. + ignore_reg_count = ignore_reg_access_get_ign_reg_count_for_chr(ignore_regions_file,chr_name); + check(ignore_reg_count >= 0,"Error trying to check the number of ignored regions for this chromosome."); + + printf("Found %d ignored regions for chromosome %s.\n",ignore_reg_count,chr_name); + + //Now create a store for said regions. + ignore_regs = malloc(sizeof(struct seq_region_t *) * ignore_reg_count); + check_mem(ignore_regs); + check(ignore_reg_access_get_ign_reg_for_chr(ignore_regions_file,chr_name,ignore_reg_count,ignore_regs)==0,"Error fetching ignored regions from file."); + + //Check there's not a whole chromosome block. + if(!(ignore_reg_count == 1 && ignore_regs[0]->beg == 1 && ignore_regs[0]->end >= chr_length)){ + //No chromosome block, so carry on. + uint32_t start = 1; + uint32_t stop = chr_length; + uint64_t rd_count = 0; + + sf_norm = bam_access_populate_file(norm_bam_file,ref_idx); + check(sf_norm!=NULL,"Error populating file norm seq file %s.",norm_bam_file); + idx_norm = bam_access_populate_file_index(sf_norm, norm_bam_file); + check(idx_norm!=NULL,"Error populating index for norm seq file %s.",norm_bam_file); + sf_tum = bam_access_populate_file(tum_bam_file,ref_idx); + check(sf_tum!=NULL,"Error populating file for tum seq file %s.",tum_bam_file); + idx_tum = bam_access_populate_file_index(sf_tum, tum_bam_file); + check(idx_tum!=NULL,"Error populating index for tum seq file %s.",tum_bam_file); + + //read the first 100 reads and get an idea of average read length. + int avg_read_len_norm = bam_access_get_avg_readlength_from_bam(sf_norm); + int avg_read_len_tum = bam_access_get_avg_readlength_from_bam(sf_tum); + //Use a comparison of average read length to read_length_base in order to calculate a useful split size. + float avg_read_len = ((float)avg_read_len_norm + (float)avg_read_len_tum) / (float)2; + //Adjust max read count according to difference between avg_read_len and read_length_base + float proportion_rd_length = (float)read_length_base / avg_read_len; + max_read_count = (int)((float)max_read_count * proportion_rd_length); + + hts_close(sf_norm); + hts_idx_destroy(idx_norm); + hts_close(sf_tum); + hts_idx_destroy(idx_tum); + + sf_norm = bam_access_populate_file(norm_bam_file,ref_idx); + check(sf_norm!=NULL,"Error populating file norm seq file %s.",norm_bam_file); + idx_norm = bam_access_populate_file_index(sf_norm, norm_bam_file); + check(idx_norm!=NULL,"Error populating index for norm seq file %s.",norm_bam_file); + sf_tum = bam_access_populate_file(tum_bam_file,ref_idx); + check(sf_tum!=NULL,"Error populating file for tum seq file %s.",tum_bam_file); + idx_tum = bam_access_populate_file_index(sf_tum, tum_bam_file); + check(idx_tum!=NULL,"Error populating index for tum seq file %s.",tum_bam_file); + + iter_norm = bam_access_get_hts_itr(sf_norm, idx_norm, chr_name, start, stop); + check(iter_norm!=NULL,"Error fetching normal iterator or section %s:%d-%d.",chr_name,start,stop); + iter_tum = bam_access_get_hts_itr(sf_tum, idx_tum, chr_name, start, stop); + check(iter_tum!=NULL,"Error fetching tumour iterator or section %s:%d-%d.",chr_name,start,stop); + + //Setup a read for iteration + norm_read = bam_init1(); + tum_read = bam_init1(); + int iter_n_status = 0; + int iter_t_status = 0; + uint32_t sect_start = 1; + uint32_t sect_stop = 0; + uint32_t curr_n_pos = 0; + uint32_t curr_t_pos = 0; + + + //Have both iterators, now need to iterate through each in sync so we don't get ahead of the stops. + while(iter_n_status>=0 || iter_t_status>=0){ //Keep iterating until both iterators are out of reads. + + while(curr_n_pos<=curr_t_pos && iter_n_status>=0 && iter_t_status>=0 && rd_count<=max_read_count){ //While the positions aren't equal and tumour has reads left. Normal jumps ahead + iter_n_status = sam_itr_next(sf_norm,iter_norm,norm_read); + check(iter_n_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_n_status); + curr_n_pos = norm_read->core.pos; + if(iter_n_status>=0 && bam_access_check_bam_flags(norm_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_n_pos,ignore_regs,ignore_reg_count) == NULL){ + rd_count++; + } + }//End of this iteration through normal reads + + while(curr_t_pos<=curr_n_pos && iter_t_status>=0 && iter_n_status>=0){ //While the positions aren't equal and normal has reads left + iter_t_status = sam_itr_next(sf_tum,iter_tum,tum_read); + check(iter_t_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_t_status); + curr_t_pos = tum_read->core.pos; + if(iter_t_status>=0 && bam_access_check_bam_flags(tum_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_t_pos,ignore_regs,ignore_reg_count) == NULL){ + rd_count++; + } + }//End of this iteration through tumour reads + + //An extra section for where one or the other iterator is out of reads (we still need to count for the mstep). + if(iter_n_status<0 && iter_t_status>=0){ //No more normal reads + while(iter_t_status>=0 && rd_count<=max_read_count){ + iter_t_status = sam_itr_next(sf_tum,iter_tum,tum_read); + check(iter_t_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_t_status); + curr_t_pos = tum_read->core.pos; + if(iter_t_status>=0 && bam_access_check_bam_flags(tum_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_t_pos,ignore_regs,ignore_reg_count) == NULL){ + rd_count++; + } + } + }//End of iteration through tumour reads where only tumour reads remain + + if(iter_t_status<0 && iter_n_status>=0){ //No more tumour reads + while(iter_n_status>=0 && rd_count<=max_read_count){ + iter_n_status = sam_itr_next(sf_norm,iter_norm,norm_read); + check(iter_n_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_n_status); + curr_n_pos = norm_read->core.pos; + if(iter_n_status>=0 && bam_access_check_bam_flags(norm_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_n_pos,ignore_regs,ignore_reg_count) == NULL){ + rd_count++; + } + } + }//End of iteration through normal reads where only normal reads remain + + //Reads have equal start positions, check the count. + if(rd_count>=max_read_count){ + //Set old stop position (Min of curr_t_pos & curr_n_pos) + sect_stop = min(curr_t_pos,curr_n_pos); + seq_region_t *reg = ignore_reg_access_get_ign_reg_overlap(sect_stop+1,ignore_regs,ignore_reg_count); + if(reg != NULL){ + sect_stop = reg->end+1; + } + //This is the position on which to separate the split sections so print it. + if(sect_stop>0 && sect_stop >= sect_start) { + split_access_print_section(output,chr_name,sect_start,sect_stop); + } + //printf("Found %d reads for %s:%d-%d\n",rd_count,chr_name,sect_start,sect_stop); + //Reset read count + rd_count=1;//Set as 1 due to the way the loop records read counts . + //Set new start position + sect_start = sect_stop+1; + }//End of checking if we've hit our read cutoff. + + }//End of moving through both normal and tumour iterators + + //No more reads left so we must print the last section. + split_access_print_section(output,chr_name,sect_start,chr_length); + bam_destroy1(norm_read); + bam_destroy1(tum_read); + hts_close(sf_norm); + hts_idx_destroy(idx_norm); + hts_close(sf_tum); + hts_idx_destroy(idx_tum); + hts_itr_destroy(iter_norm); + hts_itr_destroy(iter_tum); + }//End of checking if this is a valid contig to split. + + + return 0; error: - if(norm_read) bam_destroy1(norm_read); - if(tum_read) bam_destroy1(tum_read); - if(sf_norm) hts_close(sf_norm); - if(idx) hts_idx_destroy(idx_norm); - if(sf_tum) hts_close(sf_tum); - if(idx) hts_idx_destroy(idx_tum); - if(iter_norm) hts_itr_destroy(iter_norm); - if(iter_tum) hts_itr_destroy(iter_tum); - return -1; + if(norm_read) bam_destroy1(norm_read); + if(tum_read) bam_destroy1(tum_read); + if(sf_norm) hts_close(sf_norm); + if(idx) hts_idx_destroy(idx_norm); + if(sf_tum) hts_close(sf_tum); + if(idx) hts_idx_destroy(idx_tum); + if(iter_norm) hts_itr_destroy(iter_norm); + if(iter_tum) hts_itr_destroy(iter_tum); + return -1; } diff --git a/tests/ign_region_access_tests.c b/tests/ign_region_access_tests.c index 2310df8..e8f3703 100644 --- a/tests/ign_region_access_tests.c +++ b/tests/ign_region_access_tests.c @@ -69,54 +69,88 @@ char *test_ignore_reg_access_get_ign_reg_overlap(){ } char *test_ignore_reg_access_get_ign_reg_for_chr(){ - struct seq_region_t **ignore_regs; - int ignore_reg_count = 2; - char *chr = "10"; - ignore_regs = malloc(sizeof(struct seq_region_t *) * ignore_reg_count); - ignore_reg_access_get_ign_reg_for_chr(test_ign_file,chr,ignore_reg_count, ignore_regs); - mu_assert(((seq_region_t *)ignore_regs[0])->beg == 13,"Wrong start retrieved for ignored region 1.\n"); - mu_assert(((seq_region_t *)ignore_regs[0])->end == 5678,"Wrong stop retrieved for ignored region 1.\n"); - mu_assert(((seq_region_t *)ignore_regs[1])->beg == 5680,"Wrong start retrieved for ignored region 2.\n"); - mu_assert(((seq_region_t *)ignore_regs[1])->end == 5699,"Wrong stop retrieved for ignored region 2.\n"); - ignore_reg_access_destroy_seq_region_t_arr(ignore_reg_count, ignore_regs); - return NULL; + struct seq_region_t **ignore_regs; + int ignore_reg_count = 2; + char *chr = "10"; + ignore_regs = malloc(sizeof(struct seq_region_t *) * ignore_reg_count); + ignore_reg_access_get_ign_reg_for_chr(test_ign_file,chr,ignore_reg_count, ignore_regs); + mu_assert(((seq_region_t *)ignore_regs[0])->beg == 13,"Wrong start retrieved for ignored region 1.\n"); + mu_assert(((seq_region_t *)ignore_regs[0])->end == 5678,"Wrong stop retrieved for ignored region 1.\n"); + mu_assert(((seq_region_t *)ignore_regs[1])->beg == 5680,"Wrong start retrieved for ignored region 2.\n"); + mu_assert(((seq_region_t *)ignore_regs[1])->end == 5699,"Wrong stop retrieved for ignored region 2.\n"); + ignore_reg_access_destroy_seq_region_t_arr(ignore_reg_count, ignore_regs); + return NULL; } -char *test_ignore_reg_access_get_ign_reg_contained(){ - //List *ignore_reg_access_get_ign_reg_contained(int from, int to, struct seq_region_t **regions, int entry_count) - struct seq_region_t **ignore_regs; - int ignore_reg_count = 2; - ignore_regs = malloc(sizeof(struct seq_region_t *) * ignore_reg_count); - char *chr = "10"; - ignore_reg_access_get_ign_reg_for_chr(test_ign_file,chr,ignore_reg_count, ignore_regs); - List *contained = ignore_reg_access_get_ign_reg_contained(10,5679,ignore_regs,ignore_reg_count); - mu_assert(List_count(contained)==1,"Incorrect number of regions found.\n") - List_clear_destroy(contained); - contained = ignore_reg_access_get_ign_reg_contained(10,5700,ignore_regs,ignore_reg_count); - mu_assert(List_count(contained)==2,"Incorrect number of regions found.\n") - List_clear_destroy(contained); - ignore_reg_access_destroy_seq_region_t_arr(ignore_reg_count, ignore_regs); - return NULL; +char *test_ignore_reg_access_get_ign_regs_covered(){ + //List *ignore_reg_access_get_ign_reg_contained(int from, int to, struct seq_region_t **regions, int entry_count) + struct seq_region_t **ignore_regs; + int ignore_reg_count = 2; + //10 13 5678 + //10 5680 5699 + ignore_regs = malloc(sizeof(struct seq_region_t *) * ignore_reg_count); + char *chr = "10"; + ignore_reg_access_get_ign_reg_for_chr(test_ign_file,chr,ignore_reg_count, ignore_regs); + List *contained = ignore_reg_access_get_ign_regs_covered(10,5679,ignore_regs,ignore_reg_count); + mu_assert(List_count(contained)==1,"Incorrect number of regions found.\n") + List_clear_destroy(contained); + contained = ignore_reg_access_get_ign_regs_covered(10,5700,ignore_regs,ignore_reg_count); + mu_assert(List_count(contained)==2,"Incorrect number of regions found.\n") + List_clear_destroy(contained); + //Try with a beginning overlap - covered + contained = ignore_reg_access_get_ign_regs_covered(14,5700,ignore_regs,ignore_reg_count); + mu_assert(List_count(contained)==2,"Incorrect number of regions found.\n") + List_clear_destroy(contained); + //Try with a end overlap - covered + contained = ignore_reg_access_get_ign_regs_covered(5697,5700,ignore_regs,ignore_reg_count); + fprintf(stderr,"REGIONS4: %d\n",List_count(contained)); + mu_assert(List_count(contained)==1,"Incorrect number of regions found.\n") + List_clear_destroy(contained); + ignore_reg_access_destroy_seq_region_t_arr(ignore_reg_count, ignore_regs); + return NULL; } char *test_ignore_reg_access_resolve_ignores_to_analysis_sections(){ - //List *ignore_reg_access_resolve_ignores_to_analysis_sections(int start, int end, struct seq_region_t **regions, int entry_count) - struct seq_region_t **ignore_regs; - int ignore_reg_count = 2; - ignore_regs = malloc(sizeof(struct seq_region_t *) * ignore_reg_count); - char *chr = "10"; - ignore_reg_access_get_ign_reg_for_chr(test_ign_file,chr,ignore_reg_count, ignore_regs); -//10 13 5678 -//10 5680 5699 - List *sects = ignore_reg_access_resolve_ignores_to_analysis_sections(11,5679,ignore_regs,ignore_reg_count); - mu_assert(List_count(sects)==2,"Incorrect number of sections resolved.\n"); - mu_assert(((seq_region_t *)sects->first->value)->beg == 11,"Incorrect first section start.\n"); - mu_assert(((seq_region_t *)sects->first->value)->end == 12,"Incorrect first section stop.\n"); - mu_assert(((seq_region_t *)sects->last->value)->beg == 5679,"Incorrect first section start.\n"); - mu_assert(((seq_region_t *)sects->last->value)->end == 5679,"Incorrect first section stop.\n"); - List_clear_destroy(sects); - ignore_reg_access_destroy_seq_region_t_arr(ignore_reg_count, ignore_regs); - return NULL; + //List *ignore_reg_access_resolve_ignores_to_analysis_sections(int start, int end, struct seq_region_t **regions, int entry_count) + struct seq_region_t **ignore_regs; + int ignore_reg_count = 2; + ignore_regs = malloc(sizeof(struct seq_region_t *) * ignore_reg_count); + char *chr = "10"; + ignore_reg_access_get_ign_reg_for_chr(test_ign_file,chr,ignore_reg_count, ignore_regs); + //10 13 5678 + //10 5680 5699 + List *sects = ignore_reg_access_resolve_ignores_to_analysis_sections(11,5679,ignore_regs,ignore_reg_count); + mu_assert(List_count(sects)==2,"Incorrect number of sections resolved.\n"); + mu_assert(((seq_region_t *)sects->first->value)->beg == 11,"Incorrect first section start.\n"); + mu_assert(((seq_region_t *)sects->first->value)->end == 12,"Incorrect first section stop.\n"); + mu_assert(((seq_region_t *)sects->last->value)->beg == 5679,"Incorrect first section start.\n"); + mu_assert(((seq_region_t *)sects->last->value)->end == 5679,"Incorrect first section stop.\n"); + List_clear_destroy(sects); + + //Try with a start overlap (1 inside the region) + sects = ignore_reg_access_resolve_ignores_to_analysis_sections(14,5800,ignore_regs,ignore_reg_count); + LIST_FOREACH(sects, first, next, cur){ + fprintf(stderr,"REGION FOUND: %d-%d\n",((seq_region_t *) cur->value)->beg,((seq_region_t *) cur->value)->end); + } + mu_assert(List_count(sects)==2,"Incorrect number of sections resolved.\n"); + mu_assert(((seq_region_t *)sects->first->value)->beg == 5679,"Incorrect first section start.\n"); + mu_assert(((seq_region_t *)sects->first->value)->end == 5679,"Incorrect first section stop.\n"); + mu_assert(((seq_region_t *)sects->last->value)->beg == 5700,"Incorrect first section start.\n"); + mu_assert(((seq_region_t *)sects->last->value)->end == 5800,"Incorrect first section stop.\n"); + List_clear_destroy(sects); + + // Try with a start overlap exactly one before the start of the region as this is possible with the split logic + sects = ignore_reg_access_resolve_ignores_to_analysis_sections(12,5800,ignore_regs,ignore_reg_count); + mu_assert(List_count(sects)==3,"Incorrect number of sections resolved.\n"); + mu_assert(((seq_region_t *)sects->first->value)->beg == 12,"Incorrect first section start.\n"); + mu_assert(((seq_region_t *)sects->first->value)->end == 12,"Incorrect first section stop.\n"); + mu_assert(((seq_region_t *)sects->last->value)->beg == 5700,"Incorrect first section start.\n"); + mu_assert(((seq_region_t *)sects->last->value)->end == 5800,"Incorrect first section stop.\n"); + List_clear_destroy(sects); + ignore_reg_access_destroy_seq_region_t_arr(ignore_reg_count, ignore_regs); + + + return NULL; } char *all_tests() { @@ -124,7 +158,7 @@ char *all_tests() { mu_run_test(test_ignore_reg_access_get_ign_reg_count_for_chr); mu_run_test(test_ignore_reg_access_get_ign_reg_for_chr); mu_run_test(test_ignore_reg_access_get_ign_reg_overlap); - mu_run_test(test_ignore_reg_access_get_ign_reg_contained); + mu_run_test(test_ignore_reg_access_get_ign_regs_covered); mu_run_test(test_ignore_reg_access_resolve_ignores_to_analysis_sections); return NULL;