diff --git a/src/Newickform.c b/src/Newickform.c index 7f014de6..7e58c152 100644 --- a/src/Newickform.c +++ b/src/Newickform.c @@ -390,16 +390,16 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp } // Define data structures needed to record statistics and mask recombined sequence - int ** parent_recombinations_array = calloc(num_nodes,sizeof(int*)); + int ** recombinations_array = calloc(num_nodes,sizeof(int*)); for (int i = 0; i < num_nodes; ++i) { - parent_recombinations_array[i] = NULL; + recombinations_array[i] = NULL; } - int * parent_num_recombinations_array = calloc(num_nodes,sizeof(int)); + int * num_recombinations_array = calloc(num_nodes,sizeof(int)); int * current_total_snps_array = calloc(num_nodes,sizeof(int)); int * num_blocks_array = calloc(num_nodes,sizeof(int)); for (int i = 0; i < num_nodes; ++i) { - parent_num_recombinations_array[i] = 0; + num_recombinations_array[i] = 0; current_total_snps_array[i] = 0; num_blocks_array[i] = 0; } @@ -411,36 +411,28 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp int num_jobs = get_job_counts(node_depths,depth,num_nodes); int * jobNodeIndexArray = malloc(num_jobs * sizeof(int)); get_job_node_indices(jobNodeIndexArray,nodeArray,node_depths,depth,num_nodes); - printf("Depth is %d, num jobs is %d\n",depth,num_jobs); for (int node_num_index = 0; node_num_index < num_jobs; ++node_num_index) { int node_index = jobNodeIndexArray[node_num_index]; int parent_node_index = parents[node_index]; - if (parent_node_index > -1) - { - fill_in_recombinations_with_gaps(nodeArray, - node_index, - parent_node_index, - parent_recombinations_array, - parent_num_recombinations_array, - current_total_snps_array, - num_blocks_array, - nodeArray[node_index]->block_coordinates, - length_of_original_genome, - snp_locations, - number_of_snps); - } + fill_in_recombinations_with_gaps(nodeArray, + node_index, + parent_node_index, + recombinations_array, + num_recombinations_array, + current_total_snps_array, + num_blocks_array, + length_of_original_genome, + snp_locations, + number_of_snps); } } // Free gaps arrays - free(parent_num_recombinations_array); + free(num_recombinations_array); free(current_total_snps_array); free(num_blocks_array); - for (int i = 0; i < num_nodes; ++i) { - free(parent_recombinations_array[i]); - } - free(parent_recombinations_array); + free(recombinations_array); // Free general arrays free(nodeArray); diff --git a/src/branch_sequences.c b/src/branch_sequences.c index acd45b16..4a9b0b0f 100644 --- a/src/branch_sequences.c +++ b/src/branch_sequences.c @@ -105,21 +105,12 @@ int get_list_of_snp_indices_which_fall_in_downstream_recombinations(int ** curre // Go through the tree and build up the recombinations list from root to branch. Print out each sample name and a list of recombinations -void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** current_recombinations, int * num_recombinations, int * current_total_snps, int * num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps) +void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** current_recombinations, int * num_recombinations, int * current_total_snps, int * num_blocks, int length_of_original_genome,int * snp_locations, int number_of_snps) { // Define the relevant tree nodes newick_node * node = nodeArray[node_index]; - newick_node * parent = nodeArray[parent_node_index]; - - // Identify the recombinations leading to this node from the root - current_recombinations[node_index] = (int *) calloc((node->num_recombinations+1+num_recombinations[parent_node_index]),sizeof(int)); - num_recombinations[node_index] = copy_and_concat_integer_arrays(node->recombinations, - node->num_recombinations, - current_recombinations[parent_node_index], - num_recombinations[parent_node_index], - current_recombinations[node_index]); - + // overwrite the bases of snps with N's int i; int sequence_index; @@ -129,108 +120,113 @@ void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, set_number_of_recombinations_for_sample(node->taxon,node->num_recombinations); set_number_of_snps_for_sample(node->taxon,node->number_of_snps); - // Get parental sequence - char * node_sequence = (char *) calloc((length_of_original_genome +1),sizeof(char)); - get_sequence_for_sample_name(node_sequence, node->taxon); - int genome_length_excluding_blocks_and_gaps = calculate_genome_length_excluding_blocks_and_gaps(node_sequence, - length_of_original_genome, - current_block_coordinates, - num_blocks[parent_node_index]); - - set_genome_length_excluding_blocks_and_gaps_for_sample(node->taxon, - genome_length_excluding_blocks_and_gaps); - - // Merge block coordinates putting most recent blocks first - int ** merged_block_coordinates; - merged_block_coordinates = (int **) calloc(3,sizeof(int *)); - merged_block_coordinates[0] = (int*) calloc((num_blocks[parent_node_index] + node->number_of_blocks+1),sizeof(int )); - merged_block_coordinates[1] = (int*) calloc((num_blocks[parent_node_index] + node->number_of_blocks+1),sizeof(int )); - copy_and_concat_2d_integer_arrays(current_block_coordinates, - num_blocks[parent_node_index], - node->block_coordinates, - node->number_of_blocks, - merged_block_coordinates - ); - - // Set the number of recombination blocks for the sample - set_number_of_blocks_for_sample(node->taxon, node->number_of_blocks); - - // Set number of branch bases in recombination by iterating through - // the first part of merged blocks (i.e. only blocks on the branch to this node) - set_number_of_branch_bases_in_recombinations(node->taxon, - calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates, - node->number_of_blocks, - node_sequence, - snp_locations, - current_total_snps[parent_node_index]) -// root->number_of_snps) - ); -// current_total_snps[node_index] = current_total_snps[parent_node_index] + root->number_of_snps; // restore? - // Set number of total bases in recombination by iterating through - // all merged blocks leading to this node - set_number_of_bases_in_recombinations(node->taxon, - calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates, - (num_blocks[parent_node_index] + node->number_of_blocks), - node_sequence, - snp_locations, - current_total_snps[parent_node_index]) -// current_total_snps[node_index]) + // Set root-specific values + if (parent_node_index == -1) + { + set_number_of_bases_in_recombinations(node->taxon,0); + set_number_of_branch_bases_in_recombinations(node->taxon,0); + set_internal_node(1,sequence_index); + set_genome_length_excluding_blocks_and_gaps_for_sample(node->taxon, + length_of_original_genome); + } + else + { + + // Get parental sequence + newick_node * parent = nodeArray[parent_node_index]; + char * node_sequence = (char *) calloc((length_of_original_genome +1),sizeof(char)); + get_sequence_for_sample_name(node_sequence, node->taxon); + + // Calculate clonal frame remaining at the parental node + int ** current_block_coordinates = parent->block_coordinates; + int genome_length_excluding_blocks_and_gaps = calculate_genome_length_excluding_blocks_and_gaps(node_sequence, + length_of_original_genome, + current_block_coordinates, + num_blocks[parent_node_index]); + + set_genome_length_excluding_blocks_and_gaps_for_sample(node->taxon, + genome_length_excluding_blocks_and_gaps); + + // Identify the recombinations leading to this node from the root + current_recombinations[node_index] = (int *) calloc((node->num_recombinations+1+num_recombinations[parent_node_index]),sizeof(int)); + num_recombinations[node_index] = copy_and_concat_integer_arrays(node->recombinations, + node->num_recombinations, + current_recombinations[parent_node_index], + num_recombinations[parent_node_index], + current_recombinations[node_index]); + + // Merge block coordinates putting most recent blocks first + int ** merged_block_coordinates; + merged_block_coordinates = (int **) calloc(3,sizeof(int *)); + merged_block_coordinates[0] = (int*) calloc((num_blocks[parent_node_index] + node->number_of_blocks+1),sizeof(int )); + merged_block_coordinates[1] = (int*) calloc((num_blocks[parent_node_index] + node->number_of_blocks+1),sizeof(int )); + copy_and_concat_2d_integer_arrays(node->block_coordinates, + node->number_of_blocks, + current_block_coordinates, + num_blocks[parent_node_index], + merged_block_coordinates ); - free(node_sequence); - - for(i = 0; i < num_recombinations[node_index]; i++) - { - update_sequence_base('N', sequence_index, current_recombinations[node_index][i]); - } - - // TODO: The stats for the number of snps in recombinations will need to be updated. - int * snps_in_recombinations = (int *) calloc((number_of_snps +1),sizeof(int)); - int num_snps_in_recombinations = get_list_of_snp_indices_which_fall_in_downstream_recombinations(merged_block_coordinates, - (num_blocks[parent_node_index] + node->number_of_blocks), - snp_locations, - number_of_snps, - snps_in_recombinations); - - for(i = 0; i < num_snps_in_recombinations; i++) - { - update_sequence_base('N', sequence_index, snps_in_recombinations[i]); - } - free(snps_in_recombinations); - - if (node->childNum > 0) - { -// child = root->child; - set_internal_node(1,sequence_index); - // Update number of SNPs - current_total_snps[node_index] = current_total_snps[parent_node_index] + node->number_of_snps; - num_blocks[node_index] = num_blocks[parent_node_index] + node->number_of_blocks; - -// while (child != NULL) -// { -// fill_in_recombinations_with_gaps(child->node, -// current_recombinations, -// num_current_recombinations, -// (current_total_snps + root->number_of_snps), -// (num_blocks + root->number_of_blocks), -// merged_block_coordinates, -// length_of_original_genome, -// snp_locations, -// number_of_snps -// ); -// child = child->next; -// -// } - } - else - { - set_internal_node(0,sequence_index); - } - free(merged_block_coordinates[0]); - free(merged_block_coordinates[1]); - free(merged_block_coordinates); + + // Set the number of recombination blocks for the sample + set_number_of_blocks_for_sample(node->taxon, node->number_of_blocks); + + // Set number of branch bases in recombination by iterating through + // the first part of merged blocks (i.e. only blocks on the branch to this node) + set_number_of_branch_bases_in_recombinations(node->taxon, + calculate_number_of_bases_in_recombinations_excluding_gaps(merged_block_coordinates, + node->number_of_blocks, + node_sequence, + snp_locations, + node->num_recombinations + node->number_of_snps) + ); + + // Set number of total bases in recombination by iterating through + // all merged blocks leading to this node + set_number_of_bases_in_recombinations(node->taxon, + calculate_number_of_bases_in_recombinations_excluding_gaps(merged_block_coordinates, + (num_blocks[parent_node_index] + node->number_of_blocks), + node_sequence, + snp_locations, + current_total_snps[parent_node_index] + node->num_recombinations + node->number_of_snps) + ); + free(node_sequence); + + for(i = 0; i < num_recombinations[node_index]; i++) + { + update_sequence_base('N', sequence_index, current_recombinations[node_index][i]); + } + + // TODO: The stats for the number of snps in recombinations will need to be updated. + int * snps_in_recombinations = (int *) calloc((number_of_snps +1),sizeof(int)); + int num_snps_in_recombinations = get_list_of_snp_indices_which_fall_in_downstream_recombinations(merged_block_coordinates, + (num_blocks[parent_node_index] + node->number_of_blocks), + snp_locations, + number_of_snps, + snps_in_recombinations); + + for(i = 0; i < num_snps_in_recombinations; i++) + { + update_sequence_base('N', sequence_index, snps_in_recombinations[i]); + } + free(snps_in_recombinations); + + if (node->childNum > 0) + { + set_internal_node(1,sequence_index); + // Update number of SNPs + current_total_snps[node_index] = current_total_snps[parent_node_index] + node->num_recombinations + node->number_of_snps; + num_blocks[node_index] = num_blocks[parent_node_index] + node->number_of_blocks; + nodeArray[node_index]->block_coordinates = merged_block_coordinates; + } + else + { + set_internal_node(0,sequence_index); + } + } + } -int calculate_number_of_bases_in_recombations_excluding_gaps(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations,int current_total_snps) +int calculate_number_of_bases_in_recombinations_excluding_gaps(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations,int current_total_snps) { int total_bases = 0; int current_block = 1; @@ -279,17 +275,15 @@ int calculate_number_of_bases_in_recombations_excluding_gaps(int ** block_coordi } for(start_block = 0; start_block < num_blocks; start_block++) { - if(block_coordinates[0][start_block] == -1 || block_coordinates[1][start_block] == -1) + if(block_coordinates[0][start_block] != -1 && block_coordinates[1][start_block] != -1) { - continue; - } - - - total_bases += calculate_block_size_without_gaps(child_sequence, + total_bases += calculate_block_size_without_gaps(child_sequence, snp_locations, block_coordinates[0][start_block], block_coordinates[1][start_block], current_total_snps); + } + } return total_bases; diff --git a/src/branch_sequences.h b/src/branch_sequences.h index e0a7548f..9b70fac5 100644 --- a/src/branch_sequences.h +++ b/src/branch_sequences.h @@ -30,7 +30,7 @@ int calculate_window_size(int branch_genome_size, int number_of_branch_snps,int double calculate_threshold(int branch_genome_size, int window_size, float uncorrected_p_value); int p_value_test(int branch_genome_size, int window_size, int num_branch_snps, int block_snp_count, int min_snps, float uncorrected_p_value); double reduce_factorial(int l, int i); -void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** current_recombinations, int * num_recombinations, int * current_total_snps,int * num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps); +void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** current_recombinations, int * num_recombinations, int * current_total_snps,int * num_blocks, int length_of_original_genome,int * snp_locations, int number_of_snps); int copy_and_concat_integer_arrays(int * array_1, int array_1_size, int * array_2, int array_2_size, int * output_array); int copy_and_concat_2d_integer_arrays(int ** array_1, int array_1_size, int ** array_2, int array_2_size, int ** output_array); double snp_density(int length_of_sequence, int number_of_snps); @@ -38,7 +38,7 @@ int calculate_cutoff(int branch_genome_size, int window_size, int num_branch_snp int get_smallest_log_likelihood(double * candidate_blocks, int number_of_candidate_blocks); int exclude_snp_sites_in_block(int window_start_coordinate, int window_end_coordinate, int * snp_site_coords, int number_of_branch_snps); int flag_smallest_log_likelihood_recombinations(int ** candidate_blocks, int number_of_candidate_blocks, int number_of_branch_snps, int * snp_site_coords, int * recombinations, int number_of_recombinations,newick_node * current_node, FILE * block_file_pointer, newick_node *root,int * snp_locations, int total_num_snps, FILE * gff_file_pointer, double * block_likelihooods); -int calculate_number_of_bases_in_recombations_excluding_gaps(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations,int current_total_snps); +int calculate_number_of_bases_in_recombinations_excluding_gaps(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations,int current_total_snps); void carry_unambiguous_gaps_up_tree(newick_node *root); void move_blocks_inwards_while_likelihood_improves(int number_of_blocks,int ** block_coordinates, int min_snps, int * snp_site_coords, int number_of_branch_snps,char * branch_snp_sequence, int * snp_locations, int branch_genome_size,char * child_sequence, int length_of_sequence, double * block_likelihoods, int cutoff_value, float trimming_ratio); int get_blocks(int ** block_coordinates, int branch_genome_size,int * snp_site_coords,int number_of_branch_snps, int window_size, int cutoff, char * original_sequence, int * snp_locations, int number_of_snps); diff --git a/src/snp_searching.c b/src/snp_searching.c index bee6e5df..ece5e192 100644 --- a/src/snp_searching.c +++ b/src/snp_searching.c @@ -211,7 +211,7 @@ int calculate_block_size_without_gaps(char * child_sequence, int * snp_locations int calculate_block_size_without_gaps_with_start_end_index(char * child_sequence, int * snp_locations, int starting_coordinate, int ending_coordinate, int current_snp_count, int start_index,int end_index) { int i; - int block_size = ending_coordinate - starting_coordinate; + int block_size = ending_coordinate - starting_coordinate + 1; // start and end coordinate are inclusive as they mark the bounding SNPs start_index = find_starting_index( starting_coordinate, snp_locations,start_index, end_index); for(i = start_index; i < current_snp_count ; i++)