From 6e759024ced21f66b838c2142e82cbfa537e4db5 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Thu, 16 Jun 2016 16:36:47 +0200 Subject: [PATCH 01/13] Open files as binary (b flag) --- scoary/methods.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scoary/methods.py b/scoary/methods.py index 7ec6962..0c2b010 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -95,12 +95,12 @@ def main(): starttime = time.time() - with open(args.genes, "rU") as genes, open(args.traits, "rU") as traits: + with open(args.genes, "rUb") as genes, open(args.traits, "rUb") as traits: if args.restrict_to is not None: allowed_isolates = [isolate for line in - open(args.restrict_to) + open(args.restrict_to,"rUb") for isolate in line.rstrip().split(",")] else: # Despite the confusing name @@ -427,7 +427,7 @@ def StoreTraitResult(Trait, Traitname, max_hits, p_cutoff, correctionmethod, upg """ The method that actually stores the results. Only accepts results from a single trait at a time """ - with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "w") as outfile: + with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "wb") as outfile: # Sort genes by p-value. 
sort_instructions = SortResultsAndSetKey(Trait) @@ -552,7 +552,7 @@ def StoreUPGMAtreeToFile(upgmatree): hamming distances in the gene presence/absence matrix """ treefilename = str("Tree" + time.strftime("_%d_%m_%Y_%H%M") + ".nwk") - with open(treefilename, "w") as treefile: + with open(treefilename, "wb") as treefile: Tree = str(upgmatree) Tree = Tree.replace("[", "(") Tree = Tree.replace("]", ")") From c94c6176d611ba206f80e4a0943f916828676122 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Thu, 16 Jun 2016 17:40:49 +0200 Subject: [PATCH 02/13] Allow writing of reduced subset file --- scoary/methods.py | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/scoary/methods.py b/scoary/methods.py index 0c2b010..a20738b 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -71,6 +71,12 @@ def main(): ' strains. SCOARY will read the provided ' 'comma-separated table of strains and restrict ' 'analyzes to these.') + parser.add_argument('-w' '--write_reduced', + help='Use with -r if you want Scoary to create a new ' + 'gene presence absence file from your reduced set of ' + 'isolates.', + default=False, + action='store_true') parser.add_argument('-s', '--start_col', help='On which column in the gene presence/absence ' 'file do individual strain info start. Default=15. 
' @@ -94,7 +100,7 @@ def main(): args = parser.parse_args() starttime = time.time() - + with open(args.genes, "rUb") as genes, open(args.traits, "rUb") as traits: if args.restrict_to is not None: @@ -107,12 +113,15 @@ def main(): # this actually means all isolates are allowed # and included in the analysis allowed_isolates = None + if args.write_reduced: + sys.exit("You cannot use the -w argument without specifying a subset (-r)") print("Reading gene presence absence file") genedic_and_matrix = Csv_to_dic_Roary(genes, args.delimiter, startcol=args.start_col - 1, - allowed_isolates=allowed_isolates) + allowed_isolates=allowed_isolates, + writereducedset=args.write_reduced) genedic = genedic_and_matrix["Roarydic"] zeroonesmatrix = genedic_and_matrix["Zero_ones_matrix"] strains = genedic_and_matrix["Strains"] @@ -189,15 +198,39 @@ def PopulateQuadTreeWithDistances(TDM): PopulatedQuadtree.insert_row(i, Quadmatrix[i]) return PopulatedQuadtree - -def Csv_to_dic_Roary(genefile, delimiter, startcol=0, allowed_isolates=None): +def ReduceSet(genefile, delimiter, startcol=14, allowed_isolates=None): + csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter) + header = next(csvfile) + allowed_indexes = range(startcol) + for c in xrange(len(header)): + if header[c] in allowed_isolates: + allowed_indexes.append(c) + + print("Writing gene presence absence file for the reduced set of isolates") + reducedfilename = "gene_presence_absence_reduced_" + time.strftime("_%d_%m_%Y_%H%M") + ".csv" + with open(reducedfilename, "wb") as csvout: + wtr = csv.writer(csvout, delimiter = delimiter) + newheader = [header[a] for a in allowed_indexes] + wtr.writerow(newheader) + for r in csvfile: + wtr.writerow( tuple(r[a] for a in allowed_indexes) ) + print("Finished writing reduced gene presence absence list to file " + reducedfilename) + return reducedfilename + +def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, writereducedset=False): """ Converts a 
gene presence/absence file into dictionaries that are readable by Roary """ r = {} - csvfile = csv.reader(genefile, skipinitialspace=True) + if writereducedset: + file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rUb") + csvfile = csv.reader(file, skipinitialspace=True, delimiter=delimiter) + file.close() + else: + csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter) header = next(csvfile) + roaryfile = True strains = header[startcol:] From 07b5232771f8c29e398658cbd43f75c913c31ede Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Fri, 17 Jun 2016 10:33:18 +0200 Subject: [PATCH 03/13] Fix writereduced --- scoary/methods.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scoary/methods.py b/scoary/methods.py index a20738b..3a20d2b 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -71,7 +71,7 @@ def main(): ' strains. SCOARY will read the provided ' 'comma-separated table of strains and restrict ' 'analyzes to these.') - parser.add_argument('-w' '--write_reduced', + parser.add_argument('-w', '--write_reduced', help='Use with -r if you want Scoary to create a new ' 'gene presence absence file from your reduced set of ' 'isolates.', @@ -226,7 +226,6 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, wr if writereducedset: file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rUb") csvfile = csv.reader(file, skipinitialspace=True, delimiter=delimiter) - file.close() else: csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter) header = next(csvfile) @@ -276,6 +275,8 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, wr zero_ones_matrix.append(zero_ones_line) # Transpose list for distance calculation purposes + if writereduced: + file.close() zero_ones_matrix = list(map(list, zip(*zero_ones_matrix))) return {"Roarydic": r, "Zero_ones_matrix": zero_ones_matrix, From 15ab4e6023506c7f37b045d0d1981bfb0cf4ea4d 
Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Fri, 17 Jun 2016 13:05:20 +0200 Subject: [PATCH 04/13] Add newline --- scoary/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoary/methods.py b/scoary/methods.py index 3a20d2b..a285f74 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -147,7 +147,7 @@ def main(): args.max_hits, args.p_value_cutoff, args.correction, upgmatree, GTC) - print("Finished. Checked a total of %d genes for associations to %d trait(s). " + print("\nFinished. Checked a total of %d genes for associations to %d trait(s). " "Total time used: %d seconds." % (len(genedic), len(traitsdic), int(time.time()-starttime))) From b24983169bbc831f7adca09427698863aeb13bd8 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 13:16:47 +0200 Subject: [PATCH 05/13] Update methods.py --- scoary/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoary/methods.py b/scoary/methods.py index a285f74..d4430b1 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -275,7 +275,7 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, wr zero_ones_matrix.append(zero_ones_line) # Transpose list for distance calculation purposes - if writereduced: + if writereducedset: file.close() zero_ones_matrix = list(map(list, zip(*zero_ones_matrix))) return {"Roarydic": r, From 14cdc6b8679fbffdf01bc218c5f94e492916ce99 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 13:22:06 +0200 Subject: [PATCH 06/13] 1.3.5 --- scoary/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoary/__init__.py b/scoary/__init__.py index ac422f1..5b8f37a 100644 --- a/scoary/__init__.py +++ b/scoary/__init__.py @@ -1 +1 @@ -__version__ = '1.3.4' +__version__ = '1.3.5' From a70b7b9e77a0aecbf0384903b768d71a20d288a4 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 13:41:00 +0200 Subject: [PATCH 07/13] Require range on p --- 
scoary/methods.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scoary/methods.py b/scoary/methods.py index d4430b1..dc70c47 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -98,6 +98,9 @@ def main(): version=SCOARY_VERSION) args = parser.parse_args() + + if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff =< 0.0): + sys.exit("P must be between 0.0 and 1.0 or exactly 1.0") starttime = time.time() From da296d3881363774a1942aff6bae8b67ca1e44c0 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 13:47:44 +0200 Subject: [PATCH 08/13] Enforce single-char delimiter --- scoary/methods.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scoary/methods.py b/scoary/methods.py index dc70c47..d1f53b0 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -101,6 +101,8 @@ def main(): if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff =< 0.0): sys.exit("P must be between 0.0 and 1.0 or exactly 1.0") + if (len(args.delimiter) > 1): + sys.exit("Delimiter must be a single character string. There is no support for tab.") starttime = time.time() From cda665feaa79421a53c6f885afc69ddf46e07cd5 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 13:49:58 +0200 Subject: [PATCH 09/13] Fix typo in p_value_cutoff --- scoary/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoary/methods.py b/scoary/methods.py index d1f53b0..19c55d7 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -99,7 +99,7 @@ def main(): args = parser.parse_args() - if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff =< 0.0): + if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff <= 0.0): sys.exit("P must be between 0.0 and 1.0 or exactly 1.0") if (len(args.delimiter) > 1): sys.exit("Delimiter must be a single character string. 
There is no support for tab.") From ade0c13fc67a73b4dbdeb046065c87c8790d0589 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 14:01:58 +0200 Subject: [PATCH 10/13] Update to reflect 1.3.5 changes --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 24df3a9..7fc62f6 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,11 @@ Scoary is designed to take the gene_presence_absence.csv file from [Roary] (http - [Contact] (#contact) ## What's new? +v1.3.5 (PRE-RELEASE) (5th Jul 2016) +- You can now use the -w option with -r to write a reduced gene presence/absence file containing only the subset isolates. This ensures that the program will run much faster if you have a large dataset (1000s of isolates) but only want to analyze a subset. Scoary automatically opens and analyzes the newly written file. +- All files are now opened as binary files. (Should speed up analysis in most cases.) +- This is a pre-release version. There might still be bugs in the code, in which case I would be grateful if you report them. + v1.3.4 (16th Jun 2016) - Scoary no longer crashes when using Scipy 0.16 instead of 0.17. - More information about what's going on is printed. (Useful for very large datasets that take long to analyze) @@ -197,6 +202,9 @@ optional arguments: On which column in the gene presence/absence file do individual strain info start. Default=15. (1-based indexing) + -w, --write_reduced Use with -r if you want Scoary to create a new gene + presence absence file from your reduced set of + isolates. --delimiter DELIMITER The delimiter between cells in the gene presence/absence and trait files. @@ -216,6 +224,9 @@ Strain1,Strain2,Strain4,Strain9 This will restrict the current analysis to isolates 1,2,4 and 9, and will omit all others.
+#### The -w flag +Using the **-w** flag with **-r** will make Scoary write a reduced gene presence/absence file containing only those isolates specified with **-r**. This makes the program run much faster if you are analyzing a small subset of a large dataset. + #### The -s parameter The **-s** parameter is used to indicate to Scoary which column in the gene_presence_absence.csv file is the _first_ column representing an isolate. By default it is set to 15 (1-based indexing). From 992eea0bfada86c42c27a1d250ad3e7a091eb0db Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 14:11:26 +0200 Subject: [PATCH 11/13] Remove binary enconding (python3) --- scoary/methods.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scoary/methods.py b/scoary/methods.py index 19c55d7..e1ca29c 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -106,12 +106,12 @@ def main(): starttime = time.time() - with open(args.genes, "rUb") as genes, open(args.traits, "rUb") as traits: + with open(args.genes, "rU") as genes, open(args.traits, "rU") as traits: if args.restrict_to is not None: allowed_isolates = [isolate for line in - open(args.restrict_to,"rUb") + open(args.restrict_to,"rU") for isolate in line.rstrip().split(",")] else: # Despite the confusing name @@ -213,7 +213,7 @@ def ReduceSet(genefile, delimiter, startcol=14, allowed_isolates=None): print("Writing gene presence absence file for the reduced set of isolates") reducedfilename = "gene_presence_absence_reduced_" + time.strftime("_%d_%m_%Y_%H%M") + ".csv" - with open(reducedfilename, "wb") as csvout: + with open(reducedfilename, "w") as csvout: wtr = csv.writer(csvout, delimiter = delimiter) newheader = [header[a] for a in allowed_indexes] wtr.writerow(newheader) @@ -229,7 +229,7 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, wr """ r = {} if writereducedset: - file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rUb") + file = 
open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rU") csvfile = csv.reader(file, skipinitialspace=True, delimiter=delimiter) else: csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter) @@ -466,7 +466,7 @@ def StoreTraitResult(Trait, Traitname, max_hits, p_cutoff, correctionmethod, upg """ The method that actually stores the results. Only accepts results from a single trait at a time """ - with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "wb") as outfile: + with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "w") as outfile: # Sort genes by p-value. sort_instructions = SortResultsAndSetKey(Trait) @@ -591,7 +591,7 @@ def StoreUPGMAtreeToFile(upgmatree): hamming distances in the gene presence/absence matrix """ treefilename = str("Tree" + time.strftime("_%d_%m_%Y_%H%M") + ".nwk") - with open(treefilename, "wb") as treefile: + with open(treefilename, "w") as treefile: Tree = str(upgmatree) Tree = Tree.replace("[", "(") Tree = Tree.replace("]", ")") From 40bfc871c297a227f51b9580a34d1b9a8d4de4d6 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 14:11:46 +0200 Subject: [PATCH 12/13] Remove binary enconding (python3) --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 7fc62f6..c1c42cb 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,6 @@ Scoary is designed to take the gene_presence_absence.csv file from [Roary] (http ## What's new? v1.3.5 (PRE-RELEASE) (5th Jul 2016) - You can now use the -w option with -r to write a reduced gene presence/absence file containing only the subset isolates. This ensures that the program will run much faster if you have a large dataset (1000s of isolates) but only want to analyze a subset. Scoary automatically opens and analyzes the newly written file. -- All files are now opened as binary files. (Should speed up analysis in most cases.) - This is a pre-release version. 
There might still be bugs in the code, in which case I would be grateful if you report them. v1.3.4 (16th Jun 2016) From c291c0916f93ca7f419faf3c348c97a0ed2159d6 Mon Sep 17 00:00:00 2001 From: Ola Brynildsrud Date: Tue, 5 Jul 2016 14:15:40 +0200 Subject: [PATCH 13/13] Range to list(range) (Python3) --- scoary/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoary/methods.py b/scoary/methods.py index e1ca29c..2dd75bf 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -206,7 +206,7 @@ def PopulateQuadTreeWithDistances(TDM): def ReduceSet(genefile, delimiter, startcol=14, allowed_isolates=None): csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter) header = next(csvfile) - allowed_indexes = range(startcol) + allowed_indexes = list(range(startcol)) for c in xrange(len(header)): if header[c] in allowed_isolates: allowed_indexes.append(c)