Skip to content

Commit

Permalink
Fix code styling
Browse files Browse the repository at this point in the history
  • Loading branch information
Vini2 committed Feb 8, 2021
1 parent 8fdbad3 commit deae548
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 42 deletions.
47 changes: 28 additions & 19 deletions src/metacoag_SPAdes.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,12 @@

# Name vertices
for i in range(node_count):
assembly_graph.vs[i]["id"]= i
assembly_graph.vs[i]["label"]= "NODE_"+str(contigs_map[i])
assembly_graph.vs[i]["id"] = i
assembly_graph.vs[i]["label"] = "NODE_"+str(contigs_map[i])

# Get list of edges
edge_list = graph_utils.get_graph_edges_spades(assembly_graph_file, node_count, contigs_map, contigs_map_rev, paths, segment_contigs)
edge_list = graph_utils.get_graph_edges_spades(assembly_graph_file, node_count,
contigs_map, contigs_map_rev, paths, segment_contigs)

# Add edges to the graph
assembly_graph.add_edges(edge_list)
Expand Down Expand Up @@ -199,12 +200,12 @@
# Get contigs containing each marker gene for each iteration
seed_iter = {}

n=0
n = 0
for i in range(len(my_gene_counts)):
for item in marker_contig_counts:
if marker_contig_counts[item] == my_gene_counts[i]:
seed_iter[n] = marker_contigs[item]
n+=1
n += 1


# Initialise bins
Expand Down Expand Up @@ -268,7 +269,7 @@
top_nodes = []
edges = []

if len(to_bin)!=0:
if len(to_bin) != 0:

n_bins = len(bins)

Expand All @@ -292,15 +293,16 @@

for j in range(n_contigs):

tetramer_dist = matching_utils.get_tetramer_distance(normalized_tetramer_profiles[contigid], normalized_tetramer_profiles[bins[b][j]])
tetramer_dist = matching_utils.get_tetramer_distance(normalized_tetramer_profiles[contigid],
normalized_tetramer_profiles[bins[b][j]])
prob_comp = matching_utils.get_comp_probability(tetramer_dist)
prob_cov = matching_utils.get_cov_probability(coverages[contigid], coverages[bins[b][j]])

prob_product = prob_comp * prob_cov

log_prob = 0

if prob_product!=0.0:
if prob_product != 0.0:
log_prob = - (math.log(prob_comp, 10) + math.log(prob_cov, 10))
else:
log_prob = max_weight
Expand All @@ -325,7 +327,7 @@

edge_weights_per_iteration[i] = edge_weights

top_nodes = {n for n, d in B.nodes(data=True) if d['bipartite']==0}
top_nodes = {n for n, d in B.nodes(data=True) if d['bipartite'] == 0}
bottom_nodes = set(B) - top_nodes

my_matching = nx.algorithms.bipartite.matching.minimum_weight_full_matching(B, top_nodes, "weight")
Expand All @@ -352,7 +354,7 @@

avg_path_len = path_len_sum/len(bins[b])

if edge_weights[(l, my_matching[l])]<=w_intra and math.floor(avg_path_len)<=d_limit:
if edge_weights[(l, my_matching[l])] <= w_intra and math.floor(avg_path_len) <= d_limit:
bins[b].append(my_matching[l])
bin_of_contig[my_matching[l]] = b
matched_in_seed_iter.append(my_matching[l])
Expand All @@ -364,7 +366,7 @@

for nb in not_binned:

if edge_weights_per_iteration[i][(not_binned[nb][0], nb)]>=w_inter:
if edge_weights_per_iteration[i][(not_binned[nb][0], nb)] >= w_inter:

path_len_sum = 0

Expand All @@ -379,10 +381,10 @@

if math.floor(avg_path_len) >= d_limit:
logger.debug("Creating new bin...")
bins[n_bins]=[nb]
bins[n_bins] = [nb]
bin_of_contig[nb] = n_bins
binned_contigs.append(nb)
n_bins+=1
n_bins += 1
binned_contigs_with_markers.append("NODE_"+str(contigs_map[nb]))

logger.debug("Bins with contigs containing seed marker genes")
Expand Down Expand Up @@ -426,6 +428,7 @@
#-----------------------------------------------------

isolated = graph_utils.get_isolated(node_count, assembly_graph)

non_isolated = graph_utils.get_non_isolated(node_count, assembly_graph, binned_contigs)

logger.info("Number of non-isolated contigs: "+str(len(non_isolated)))
Expand All @@ -451,7 +454,8 @@
contigs_to_bin.update(closest_neighbours)

sorted_node_list = []
sorted_node_list_ = [list(label_prop_utils.runBFS(x, depth, min_length, binned_contigs, bin_of_contig, assembly_graph, normalized_tetramer_profiles, coverages, contig_lengths)) for x in contigs_to_bin]
sorted_node_list_ = [list(label_prop_utils.runBFS(x, depth, min_length, binned_contigs, bin_of_contig,
assembly_graph, normalized_tetramer_profiles, coverages, contig_lengths)) for x in contigs_to_bin]
sorted_node_list_ = [item for sublist in sorted_node_list_ for item in sublist]

for data in sorted_node_list_:
Expand All @@ -478,7 +482,8 @@
heapq.heapify(sorted_node_list)

for n in unbinned_neighbours:
candidates = list(label_prop_utils.runBFS(n, depth, min_length, binned_contigs, bin_of_contig, assembly_graph, normalized_tetramer_profiles, coverages, contig_lengths))
candidates = list(label_prop_utils.runBFS(n, depth, min_length, binned_contigs, bin_of_contig,
assembly_graph, normalized_tetramer_profiles, coverages, contig_lengths))
for c in candidates:
heapq.heappush(sorted_node_list, DataWrap(c))

Expand Down Expand Up @@ -534,8 +539,10 @@
# Assign contigs
with Pool(nthreads) as p:
assigned = list(tqdm(p.starmap(label_prop_utils.assign,
zip(unbinned_contigs, repeat(min_length), repeat(contig_lengths), repeat(normalized_tetramer_profiles),
repeat(bin_tetramer_profiles), repeat(len(bins)))), total=len(unbinned_contigs)))
zip(unbinned_contigs, repeat(min_length), repeat(contig_lengths),
repeat(normalized_tetramer_profiles), repeat(bin_tetramer_profiles),
repeat(len(bins)))),
total=len(unbinned_contigs)))

put_to_bins = list(filter(lambda x: x is not None, assigned))

Expand Down Expand Up @@ -565,7 +572,8 @@
contigs_to_bin.update(closest_neighbours)

sorted_node_list = []
sorted_node_list_ = [list(label_prop_utils.runBFS(x, depth, min_length, binned_contigs, bin_of_contig, assembly_graph, normalized_tetramer_profiles, coverages, contig_lengths)) for x in contigs_to_bin]
sorted_node_list_ = [list(label_prop_utils.runBFS(x, depth, min_length, binned_contigs, bin_of_contig,
assembly_graph, normalized_tetramer_profiles, coverages, contig_lengths)) for x in contigs_to_bin]
sorted_node_list_ = [item for sublist in sorted_node_list_ for item in sublist]

for data in sorted_node_list_:
Expand All @@ -588,7 +596,8 @@
heapq.heapify(sorted_node_list)

for n in unbinned_neighbours:
candidates = list(label_prop_utils.runBFS(n, depth, min_length, binned_contigs, bin_of_contig, assembly_graph, normalized_tetramer_profiles, coverages, contig_lengths))
candidates = list(label_prop_utils.runBFS(n, depth, min_length, binned_contigs, bin_of_contig,
assembly_graph, normalized_tetramer_profiles, coverages, contig_lengths))
for c in candidates:
heapq.heappush(sorted_node_list, DataWrap(c))

Expand Down
26 changes: 14 additions & 12 deletions src/metacoag_utils/feature_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_rc(seq):


def mer2bits(kmer):
bit_mer=nt_bits[kmer[0]]
bit_mer = nt_bits[kmer[0]]
for c in kmer[1:]:
bit_mer = (bit_mer << 2) | nt_bits[c]
return bit_mer
Expand Down Expand Up @@ -63,49 +63,51 @@ def get_tetramer_profiles(output_path, seqs, nthreads):
tetramer_profiles = {}
normalized_tetramer_profiles = {}

if os.path.isfile(output_path+"contig_tetramers.txt") and os.path.isfile(output_path+"normalized_contig_tetramers.txt"):
i=0
if os.path.isfile(output_path+"contig_tetramers.txt") and
os.path.isfile(output_path+"normalized_contig_tetramers.txt"):
i = 0
with open(output_path+"contig_tetramers.txt") as tetramers_file:
for line in tetramers_file.readlines():
f_list = [float(i) for i in line.split(" ") if i.strip()]
tetramer_profiles[i] = f_list
i+=1
i += 1

i=0
i = 0
with open(output_path+"normalized_contig_tetramers.txt") as tetramers_file:
for line in tetramers_file.readlines():
f_list = [float(i) for i in line.split(" ") if i.strip()]
normalized_tetramer_profiles[i] = f_list
i+=1
i += 1

else:

kmer_inds_4, kmer_count_len_4 = compute_kmer_inds(4)

pool = Pool(nthreads)
record_tetramers = pool.map(count_kmers, [(seq, 4, kmer_inds_4, kmer_count_len_4) for seq in seqs])
record_tetramers = pool.map(count_kmers,
[(seq, 4, kmer_inds_4, kmer_count_len_4) for seq in seqs])
pool.close()

normalized = [x[1] for x in record_tetramers]
unnormalized = [x[0] for x in record_tetramers]

i=0
i = 0

for l in range(len(unnormalized)):
tetramer_profiles[i] = unnormalized[l]
i+=1
i += 1

with open(output_path+"contig_tetramers.txt", "w+") as myfile:
for l in range(len(unnormalized)):
for j in range(len(unnormalized[l])):
myfile.write(str(unnormalized[l][j])+" ")
myfile.write("\n")

i=0
i = 0

for l in range(len(normalized)):
normalized_tetramer_profiles[i] = normalized[l]
i+=1
i += 1

with open(output_path+"normalized_contig_tetramers.txt", "w+") as myfile:
for l in range(len(normalized)):
Expand Down Expand Up @@ -145,6 +147,6 @@ def get_cov_len_spades(contigs_file, contigs_map_rev):

seqs.append(str(record.seq))

i+=1
i += 1

return seqs, coverages, contig_lengths
6 changes: 4 additions & 2 deletions src/metacoag_utils/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def get_segment_paths_spades(contig_paths):



def get_graph_edges_spades(assembly_graph_file, node_count, contigs_map, contigs_map_rev, paths, segment_contigs):
def get_graph_edges_spades(
assembly_graph_file, node_count, contigs_map,
contigs_map_rev, paths, segment_contigs):

links = []
links_map = defaultdict(set)
Expand Down Expand Up @@ -110,7 +112,7 @@ def get_graph_edges_spades(assembly_graph_file, node_count, contigs_map, contigs
for new_link in new_links:
if new_link in segment_contigs:
for contig in segment_contigs[new_link]:
if i!=contigs_map_rev[int(contig)]:
if i != contigs_map_rev[int(contig)]:
# Add edge to list of edges
edge_list.append((i,contigs_map_rev[int(contig)]))

Expand Down
17 changes: 11 additions & 6 deletions src/metacoag_utils/label_prop_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ def __lt__(self, other):


# The BFS function to search labelled nodes
def runBFS(node, threhold, min_length, binned_contigs, bin_of_contig, assembly_graph, tetramer_profiles, coverages, contig_lengths):
def runBFS(
node, threhold, min_length, binned_contigs, bin_of_contig,
assembly_graph, tetramer_profiles, coverages, contig_lengths):

queue = []
visited = set()
queue.append(node)
Expand All @@ -43,16 +46,16 @@ def runBFS(node, threhold, min_length, binned_contigs, bin_of_contig, assembly_g
prob_cov = matching_utils.get_cov_probability(coverages[node], coverages[active_node])

if contig_lengths[node] >= min_length and contig_lengths[active_node] >= min_length:
if prob_cov!=0.0 and prob_comp!=0.0:
if prob_cov != 0.0 and prob_comp != 0.0:
labelled_nodes.add((node, active_node, contig_bin, depth[active_node], -(math.log(prob_cov, 10)+math.log(prob_comp, 10))))
elif prob_cov==0.0 and prob_comp!=0.0:
elif prob_cov == 0.0 and prob_comp != 0.0:
labelled_nodes.add((node, active_node, contig_bin, depth[active_node], -math.log(prob_comp, 10)))
elif prob_cov!=0.0 and prob_comp==0.0:
elif prob_cov != 0.0 and prob_comp == 0.0:
labelled_nodes.add((node, active_node, contig_bin, depth[active_node], -math.log(prob_cov, 10)))
else:
labelled_nodes.add((node, active_node, contig_bin, depth[active_node], max_weight))
else:
if prob_cov!=0.0:
if prob_cov != 0.0:
labelled_nodes.add((node, active_node, contig_bin, depth[active_node], -math.log(prob_cov, 10)))
else:
labelled_nodes.add((node, active_node, contig_bin, depth[active_node], max_weight))
Expand All @@ -69,7 +72,9 @@ def runBFS(node, threhold, min_length, binned_contigs, bin_of_contig, assembly_g



def assign(contig, min_length, contig_lengths, normalized_tetramer_profiles, bin_tetramer_profiles, n_bins):
def assign(
contig, min_length, contig_lengths, normalized_tetramer_profiles,
bin_tetramer_profiles, n_bins):

if contig_lengths[contig] >= min_length:

Expand Down
6 changes: 4 additions & 2 deletions src/metacoag_utils/marker_gene_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ def scan_for_marker_genes(contig_file, nthreads, hard=0):

software_path = pathlib.Path(__file__).parent.parent.absolute()

fragScanURL = os.path.join(software_path.parent, 'auxiliary', 'FragGeneScan1.31', 'run_FragGeneScan.pl')
hmmExeURL = os.path.join(software_path.parent, 'auxiliary', 'hmmer-3.3', 'src', 'hmmsearch')
fragScanURL = os.path.join(software_path.parent, 'auxiliary',
'FragGeneScan1.31', 'run_FragGeneScan.pl')
hmmExeURL = os.path.join(software_path.parent, 'auxiliary', 'hmmer-3.3',
'src', 'hmmsearch')
markerURL = os.path.join(software_path.parent, 'auxiliary', 'marker.hmm')

logger.debug(markerURL)
Expand Down
4 changes: 3 additions & 1 deletion src/metacoag_utils/matching_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ def get_tetramer_distance(seq1, seq2):


def get_comp_probability(tetramer_dist):
return normpdf(tetramer_dist, mu_intra, sigma_intra)/(normpdf(tetramer_dist, mu_intra, sigma_intra)+normpdf(tetramer_dist, mu_inter, sigma_inter))
gaus_intra = normpdf(tetramer_dist, mu_intra, sigma_intra)
gaus_inter = normpdf(tetramer_dist, mu_inter, sigma_inter)
return gaus_intra/(gaus_intra+gaus_inter)


def get_cov_probability(cov1, cov2):
Expand Down

0 comments on commit deae548

Please sign in to comment.