diff --git a/README.md b/README.md index 24df3a9..c1c42cb 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,10 @@ Scoary is designed to take the gene_presence_absence.csv file from [Roary] (http - [Contact] (#contact) ## What's new? +v1.3.5 (PRE-RELEASE) (5th Jul 2016) +- You can now use the -w option with -r to write a reduced gene presence/absence file containing only the subset isolates. This ensures that the program will run much faster if you have a large dataset (1000s of isolates) but only want to analyze a subset. Scoary automatically opens and analyzes the newly written file. +- This is a pre-release version. There might still be bugs in the code, in which case I would be grateful if you report them. + v1.3.4 (16th Jun 2016) - Scoary no longer crashes when using Scipy 0.16 instead of 0.17. - More information about what's going on is printed. (Useful for very large datasets that take long to analyze) @@ -197,6 +201,9 @@ optional arguments: On which column in the gene presence/absence file do individual strain info start. Default=15. (1-based indexing) + -w, --write_reduced Use with -r if you only want to analyze a subset of your + strains. SCOARY will read the provided comma-separated + table of strains and restrict analyzes to these. --delimiter DELIMITER The delimiter between cells in the gene presence/absence and trait files. @@ -216,6 +223,9 @@ Strain1,Strain2,Strain4,Strain9 This will restrict the current analysis to isolates 1,2,4 and 9, and will omit all others. +#### The -w flag +Using the **-w** flag with **-r** will make Scoary write a reduced gene presence/absence file containing only those isolates specified with **-r**. This makes the program run much faster if you are analyzing a small subset of a large dataset. + #### The -s parameter The **-s** parameter is used to indicate to Scoary which column in the gene_presence_absence.csv file is the _first_ column representing an isolate. By default it is set to 15 (1-based indexing). diff --git a/scoary/__init__.py b/scoary/__init__.py index ac422f1..5b8f37a 100644 --- a/scoary/__init__.py +++ b/scoary/__init__.py @@ -1 +1 @@ -__version__ = '1.3.4' +__version__ = '1.3.5' diff --git a/scoary/methods.py b/scoary/methods.py index 7ec6962..2dd75bf 100644 --- a/scoary/methods.py +++ b/scoary/methods.py @@ -71,6 +71,12 @@ def main(): ' strains. SCOARY will read the provided ' 'comma-separated table of strains and restrict ' 'analyzes to these.') + parser.add_argument('-w', '--write_reduced', + help='Use with -r if you want Scoary to create a new ' + 'gene presence absence file from your reduced set of ' + 'isolates.', + default=False, + action='store_true') parser.add_argument('-s', '--start_col', help='On which column in the gene presence/absence ' 'file do individual strain info start. Default=15. ' @@ -92,27 +98,35 @@ def main(): version=SCOARY_VERSION) args = parser.parse_args() + + if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff <= 0.0): + sys.exit("P must be between 0.0 and 1.0 or exactly 1.0") + if (len(args.delimiter) > 1): + sys.exit("Delimiter must be a single character string. There is no support for tab.") starttime = time.time() - + with open(args.genes, "rU") as genes, open(args.traits, "rU") as traits: if args.restrict_to is not None: allowed_isolates = [isolate for line in - open(args.restrict_to) + open(args.restrict_to,"rU") for isolate in line.rstrip().split(",")] else: # Despite the confusing name # this actually means all isolates are allowed # and included in the analysis allowed_isolates = None + if args.write_reduced: + sys.exit("You cannot use the -w argument without specifying a subset (-r)") print("Reading gene presence absence file") genedic_and_matrix = Csv_to_dic_Roary(genes, args.delimiter, startcol=args.start_col - 1, - allowed_isolates=allowed_isolates) + allowed_isolates=allowed_isolates, + writereducedset=args.write_reduced) genedic = genedic_and_matrix["Roarydic"] zeroonesmatrix = genedic_and_matrix["Zero_ones_matrix"] strains = genedic_and_matrix["Strains"] @@ -138,7 +152,7 @@ def main(): args.max_hits, args.p_value_cutoff, args.correction, upgmatree, GTC) - print("Finished. Checked a total of %d genes for associations to %d trait(s). " + print("\nFinished. Checked a total of %d genes for associations to %d trait(s). " "Total time used: %d seconds." % (len(genedic), len(traitsdic), int(time.time()-starttime))) @@ -189,15 +203,38 @@ def PopulateQuadTreeWithDistances(TDM): PopulatedQuadtree.insert_row(i, Quadmatrix[i]) return PopulatedQuadtree - -def Csv_to_dic_Roary(genefile, delimiter, startcol=0, allowed_isolates=None): +def ReduceSet(genefile, delimiter, startcol=14, allowed_isolates=None): + csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter) + header = next(csvfile) + allowed_indexes = list(range(startcol)) + for c in xrange(len(header)): + if header[c] in allowed_isolates: + allowed_indexes.append(c) + + print("Writing gene presence absence file for the reduced set of isolates") + reducedfilename = "gene_presence_absence_reduced_" + time.strftime("_%d_%m_%Y_%H%M") + ".csv" + with open(reducedfilename, "w") as csvout: + wtr = csv.writer(csvout, delimiter = delimiter) + newheader = [header[a] for a in allowed_indexes] + wtr.writerow(newheader) + for r in csvfile: + wtr.writerow( tuple(r[a] for a in allowed_indexes) ) + print("Finished writing reduced gene presence absence list to file " + reducedfilename) + return reducedfilename + +def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, writereducedset=False): """ Converts a gene presence/absence file into dictionaries that are readable by Roary """ r = {} - csvfile = csv.reader(genefile, skipinitialspace=True) + if writereducedset: + file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rU") + csvfile = csv.reader(file, skipinitialspace=True, delimiter=delimiter) + else: + csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter) header = next(csvfile) + roaryfile = True strains = header[startcol:] @@ -243,6 +280,8 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=0, allowed_isolates=None): zero_ones_matrix.append(zero_ones_line) # Transpose list for distance calculation purposes + if writereducedset: + file.close() zero_ones_matrix = list(map(list, zip(*zero_ones_matrix))) return {"Roarydic": r, "Zero_ones_matrix": zero_ones_matrix,