From 6e759024ced21f66b838c2142e82cbfa537e4db5 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Thu, 16 Jun 2016 16:36:47 +0200
Subject: [PATCH 01/13] Open files as binary (b flag)

---
 scoary/methods.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scoary/methods.py b/scoary/methods.py
index 7ec6962..0c2b010 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -95,12 +95,12 @@ def main():
 
     starttime = time.time()
 
-    with open(args.genes, "rU") as genes, open(args.traits, "rU") as traits:
+    with open(args.genes, "rUb") as genes, open(args.traits, "rUb") as traits:
 
         if args.restrict_to is not None:
             allowed_isolates = [isolate
                                 for line in
-                                open(args.restrict_to)
+                                open(args.restrict_to,"rUb")
                                 for isolate in line.rstrip().split(",")]
         else:
             # Despite the confusing name
@@ -427,7 +427,7 @@ def StoreTraitResult(Trait, Traitname, max_hits, p_cutoff, correctionmethod, upg
     """
     The method that actually stores the results. Only accepts results from a single trait at a time
     """
-    with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "w") as outfile:
+    with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "wb") as outfile:
         # Sort genes by p-value.
         sort_instructions = SortResultsAndSetKey(Trait)
 
@@ -552,7 +552,7 @@ def StoreUPGMAtreeToFile(upgmatree):
     hamming distances in the gene presence/absence matrix
     """
     treefilename = str("Tree" + time.strftime("_%d_%m_%Y_%H%M") + ".nwk")
-    with open(treefilename, "w") as treefile:
+    with open(treefilename, "wb") as treefile:
         Tree = str(upgmatree)
         Tree = Tree.replace("[", "(")
         Tree = Tree.replace("]", ")")

From c94c6176d611ba206f80e4a0943f916828676122 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Thu, 16 Jun 2016 17:40:49 +0200
Subject: [PATCH 02/13] Allow writing of reduced subset file

---
 scoary/methods.py | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/scoary/methods.py b/scoary/methods.py
index 0c2b010..a20738b 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -71,6 +71,12 @@ def main():
                         ' strains. SCOARY will read the provided '
                         'comma-separated table of strains and restrict '
                         'analyzes to these.')
+    parser.add_argument('-w' '--write_reduced',
+                        help='Use with -r if you want Scoary to create a new '
+                        'gene presence absence file from your reduced set of '
+                        'isolates.',
+                        default=False,
+                        action='store_true')
     parser.add_argument('-s', '--start_col',
                         help='On which column in the gene presence/absence '
                         'file do individual strain info start. Default=15. '
@@ -94,7 +100,7 @@ def main():
     args = parser.parse_args()
 
     starttime = time.time()
-
+    
     with open(args.genes, "rUb") as genes, open(args.traits, "rUb") as traits:
 
         if args.restrict_to is not None:
@@ -107,12 +113,15 @@ def main():
             # this actually means all isolates are allowed
             # and included in the analysis
             allowed_isolates = None
+            if args.write_reduced:
+                sys.exit("You cannot use the -w argument without specifying a subset (-r)")
             
         print("Reading gene presence absence file")    
         genedic_and_matrix = Csv_to_dic_Roary(genes,
                                               args.delimiter,
                                               startcol=args.start_col - 1,
-                                              allowed_isolates=allowed_isolates)
+                                              allowed_isolates=allowed_isolates,
+                                              writereducedset=args.write_reduced)
         genedic = genedic_and_matrix["Roarydic"]
         zeroonesmatrix = genedic_and_matrix["Zero_ones_matrix"]
         strains = genedic_and_matrix["Strains"]
@@ -189,15 +198,39 @@ def PopulateQuadTreeWithDistances(TDM):
         PopulatedQuadtree.insert_row(i, Quadmatrix[i])
     return PopulatedQuadtree
 
-
-def Csv_to_dic_Roary(genefile, delimiter, startcol=0, allowed_isolates=None):
+def ReduceSet(genefile, delimiter, startcol=14, allowed_isolates=None):
+    csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter)
+    header = next(csvfile)
+    allowed_indexes = range(startcol)
+    for c in xrange(len(header)):
+        if header[c] in allowed_isolates:
+            allowed_indexes.append(c)
+    
+    print("Writing gene presence absence file for the reduced set of isolates")
+    reducedfilename = "gene_presence_absence_reduced_" + time.strftime("_%d_%m_%Y_%H%M") + ".csv"
+    with open(reducedfilename, "wb") as csvout:
+        wtr = csv.writer(csvout, delimiter = delimiter)
+        newheader = [header[a] for a in allowed_indexes]
+        wtr.writerow(newheader)
+        for r in csvfile:
+            wtr.writerow( tuple(r[a] for a in allowed_indexes) )
+    print("Finished writing reduced gene presence absence list to file " + reducedfilename)
+    return reducedfilename
+
+def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, writereducedset=False):
     """
     Converts a gene presence/absence file into dictionaries
     that are readable by Roary
     """
     r = {}
-    csvfile = csv.reader(genefile, skipinitialspace=True)
+    if writereducedset:
+        file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rUb")
+        csvfile = csv.reader(file, skipinitialspace=True, delimiter=delimiter)
+        file.close()
+    else:
+        csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter)
     header = next(csvfile)
+            
     roaryfile = True
 
     strains = header[startcol:]

From 07b5232771f8c29e398658cbd43f75c913c31ede Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Fri, 17 Jun 2016 10:33:18 +0200
Subject: [PATCH 03/13] Fix writereduced

---
 scoary/methods.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scoary/methods.py b/scoary/methods.py
index a20738b..3a20d2b 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -71,7 +71,7 @@ def main():
                         ' strains. SCOARY will read the provided '
                         'comma-separated table of strains and restrict '
                         'analyzes to these.')
-    parser.add_argument('-w' '--write_reduced',
+    parser.add_argument('-w', '--write_reduced',
                         help='Use with -r if you want Scoary to create a new '
                         'gene presence absence file from your reduced set of '
                         'isolates.',
@@ -226,7 +226,6 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, wr
     if writereducedset:
         file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rUb")
         csvfile = csv.reader(file, skipinitialspace=True, delimiter=delimiter)
-        file.close()
     else:
         csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter)
     header = next(csvfile)
@@ -276,6 +275,8 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, wr
             zero_ones_matrix.append(zero_ones_line)
 
     # Transpose list for distance calculation purposes
+    if writereduced:
+        file.close()
     zero_ones_matrix = list(map(list, zip(*zero_ones_matrix)))
     return {"Roarydic": r,
             "Zero_ones_matrix": zero_ones_matrix,

From 15ab4e6023506c7f37b045d0d1981bfb0cf4ea4d Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Fri, 17 Jun 2016 13:05:20 +0200
Subject: [PATCH 04/13] Add newline

---
 scoary/methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scoary/methods.py b/scoary/methods.py
index 3a20d2b..a285f74 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -147,7 +147,7 @@ def main():
                      args.max_hits,
                      args.p_value_cutoff,
                      args.correction, upgmatree, GTC)
-        print("Finished. Checked a total of %d genes for associations to %d trait(s). "
+        print("\nFinished. Checked a total of %d genes for associations to %d trait(s). "
               "Total time used: %d seconds." % (len(genedic),
                                                 len(traitsdic),
                                                 int(time.time()-starttime)))

From b24983169bbc831f7adca09427698863aeb13bd8 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 13:16:47 +0200
Subject: [PATCH 05/13] Fix undefined name writereduced (use writereducedset)

---
 scoary/methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scoary/methods.py b/scoary/methods.py
index a285f74..d4430b1 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -275,7 +275,7 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, wr
             zero_ones_matrix.append(zero_ones_line)
 
     # Transpose list for distance calculation purposes
-    if writereduced:
+    if writereducedset:
         file.close()
     zero_ones_matrix = list(map(list, zip(*zero_ones_matrix)))
     return {"Roarydic": r,

From 14cdc6b8679fbffdf01bc218c5f94e492916ce99 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 13:22:06 +0200
Subject: [PATCH 06/13] 1.3.5

---
 scoary/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scoary/__init__.py b/scoary/__init__.py
index ac422f1..5b8f37a 100644
--- a/scoary/__init__.py
+++ b/scoary/__init__.py
@@ -1 +1 @@
-__version__ = '1.3.4'
+__version__ = '1.3.5'

From a70b7b9e77a0aecbf0384903b768d71a20d288a4 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 13:41:00 +0200
Subject: [PATCH 07/13] Require range on p

---
 scoary/methods.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scoary/methods.py b/scoary/methods.py
index d4430b1..dc70c47 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -98,6 +98,9 @@ def main():
                         version=SCOARY_VERSION)
 
     args = parser.parse_args()
+    
+    if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff =< 0.0):
+        sys.exit("P must be between 0.0 and 1.0 or exactly 1.0")
 
     starttime = time.time()
     

From da296d3881363774a1942aff6bae8b67ca1e44c0 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 13:47:44 +0200
Subject: [PATCH 08/13] Enforce single-char delimiter

---
 scoary/methods.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scoary/methods.py b/scoary/methods.py
index dc70c47..d1f53b0 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -101,6 +101,8 @@ def main():
     
     if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff =< 0.0):
         sys.exit("P must be between 0.0 and 1.0 or exactly 1.0")
+    if (len(args.delimiter) > 1):
+        sys.exit("Delimiter must be a single character string. There is no support for tab.")
 
     starttime = time.time()
     

From cda665feaa79421a53c6f885afc69ddf46e07cd5 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 13:49:58 +0200
Subject: [PATCH 09/13] Fix typo in p_value_cutoff

---
 scoary/methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scoary/methods.py b/scoary/methods.py
index d1f53b0..19c55d7 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -99,7 +99,7 @@ def main():
 
     args = parser.parse_args()
     
-    if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff =< 0.0):
+    if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff <= 0.0):
         sys.exit("P must be between 0.0 and 1.0 or exactly 1.0")
     if (len(args.delimiter) > 1):
         sys.exit("Delimiter must be a single character string. There is no support for tab.")

From ade0c13fc67a73b4dbdeb046065c87c8790d0589 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 14:01:58 +0200
Subject: [PATCH 10/13] Update to reflect 1.3.5 changes

---
 README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 24df3a9..7fc62f6 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,11 @@ Scoary is designed to take the gene_presence_absence.csv file from [Roary] (http
 - [Contact] (#contact)
 
 ## What's new?
+v1.3.5 (PRE-RELEASE) (5th Jul 2016)
+- You can now use the -w option with -r to write a reduced gene presence/absence file containing only the subset isolates. This ensures that the program will run much faster if you have a large dataset (1000s of isolates) but only want to analyze a subset. Scoary automatically opens and analyzes the newly written file.
+- All files are now opened as binary files. (Should speed up analysis in most cases.)
+- This is a pre-release version. There might still be bugs in the code, in which case I would be grateful if you report them.
+
 v1.3.4 (16th Jun 2016)
 - Scoary no longer crashes when using Scipy 0.16 instead of 0.17.
 - More information about what's going on is printed. (Useful for very large datasets that take long to analyze)
@@ -197,6 +202,9 @@ optional arguments:
                         On which column in the gene presence/absence file do
                         individual strain info start. Default=15. (1-based
                         indexing)
+  -w, --write_reduced   Use with -r if you want Scoary to create a new gene
+                        presence absence file from your reduced set of
+                        isolates.
   --delimiter DELIMITER
                         The delimiter between cells in the gene
                         presence/absence and trait files.
@@ -216,6 +224,9 @@ Strain1,Strain2,Strain4,Strain9
 
 This will restrict the current analysis to isolates 1,2,4 and 9, and will omit all others.
 
+#### The -w flag
+Using the **-w** flag with **-r** will make Scoary write a reduced gene presence/absence file containing only those isolates specified with **-r**. This makes the program run much faster if you are analyzing a small subset of a large dataset.
+
 #### The -s parameter
 The **-s** parameter is used to indicate to Scoary which column in the gene_presence_absence.csv file is the _first_ column representing an isolate. By default it is set to 15 (1-based indexing).
 

From 992eea0bfada86c42c27a1d250ad3e7a091eb0db Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 14:11:26 +0200
Subject: [PATCH 11/13] Remove binary encoding (python3)

---
 scoary/methods.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scoary/methods.py b/scoary/methods.py
index 19c55d7..e1ca29c 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -106,12 +106,12 @@ def main():
 
     starttime = time.time()
     
-    with open(args.genes, "rUb") as genes, open(args.traits, "rUb") as traits:
+    with open(args.genes, "rU") as genes, open(args.traits, "rU") as traits:
 
         if args.restrict_to is not None:
             allowed_isolates = [isolate
                                 for line in
-                                open(args.restrict_to,"rUb")
+                                open(args.restrict_to,"rU")
                                 for isolate in line.rstrip().split(",")]
         else:
             # Despite the confusing name
@@ -213,7 +213,7 @@ def ReduceSet(genefile, delimiter, startcol=14, allowed_isolates=None):
     
     print("Writing gene presence absence file for the reduced set of isolates")
     reducedfilename = "gene_presence_absence_reduced_" + time.strftime("_%d_%m_%Y_%H%M") + ".csv"
-    with open(reducedfilename, "wb") as csvout:
+    with open(reducedfilename, "w") as csvout:
         wtr = csv.writer(csvout, delimiter = delimiter)
         newheader = [header[a] for a in allowed_indexes]
         wtr.writerow(newheader)
@@ -229,7 +229,7 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, wr
     """
     r = {}
     if writereducedset:
-        file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rUb")
+        file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rU")
         csvfile = csv.reader(file, skipinitialspace=True, delimiter=delimiter)
     else:
         csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter)
@@ -466,7 +466,7 @@ def StoreTraitResult(Trait, Traitname, max_hits, p_cutoff, correctionmethod, upg
     """
     The method that actually stores the results. Only accepts results from a single trait at a time
     """
-    with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "wb") as outfile:
+    with open(Traitname + time.strftime("_%d_%m_%Y_%H%M") + ".csv", "w") as outfile:
         # Sort genes by p-value.
         sort_instructions = SortResultsAndSetKey(Trait)
 
@@ -591,7 +591,7 @@ def StoreUPGMAtreeToFile(upgmatree):
     hamming distances in the gene presence/absence matrix
     """
     treefilename = str("Tree" + time.strftime("_%d_%m_%Y_%H%M") + ".nwk")
-    with open(treefilename, "wb") as treefile:
+    with open(treefilename, "w") as treefile:
         Tree = str(upgmatree)
         Tree = Tree.replace("[", "(")
         Tree = Tree.replace("]", ")")

From 40bfc871c297a227f51b9580a34d1b9a8d4de4d6 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 14:11:46 +0200
Subject: [PATCH 12/13] Remove binary encoding (python3)

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 7fc62f6..c1c42cb 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,6 @@ Scoary is designed to take the gene_presence_absence.csv file from [Roary] (http
 ## What's new?
 v1.3.5 (PRE-RELEASE) (5th Jul 2016)
 - You can now use the -w option with -r to write a reduced gene presence/absence file containing only the subset isolates. This ensures that the program will run much faster if you have a large dataset (1000s of isolates) but only want to analyze a subset. Scoary automatically opens and analyzes the newly written file.
-- All files are now opened as binary files. (Should speed up analysis in most cases.)
 - This is a pre-release version. There might still be bugs in the code, in which case I would be grateful if you report them.
 
 v1.3.4 (16th Jun 2016)

From c291c0916f93ca7f419faf3c348c97a0ed2159d6 Mon Sep 17 00:00:00 2001
From: Ola Brynildsrud <olbb@fhi.no>
Date: Tue, 5 Jul 2016 14:15:40 +0200
Subject: [PATCH 13/13] Range to list(range) (Python3)

---
 scoary/methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scoary/methods.py b/scoary/methods.py
index e1ca29c..2dd75bf 100644
--- a/scoary/methods.py
+++ b/scoary/methods.py
@@ -206,7 +206,7 @@ def PopulateQuadTreeWithDistances(TDM):
 def ReduceSet(genefile, delimiter, startcol=14, allowed_isolates=None):
     csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter)
     header = next(csvfile)
-    allowed_indexes = range(startcol)
+    allowed_indexes = list(range(startcol))
     for c in xrange(len(header)):
         if header[c] in allowed_isolates:
             allowed_indexes.append(c)