Skip to content

Commit

Permalink
Merge pull request #25 from AdmiralenOla/development
Browse files Browse the repository at this point in the history
Pre-release version
  • Loading branch information
AdmiralenOla authored Jul 5, 2016
2 parents c1d7322 + c291c09 commit 0f6791e
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 8 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ Scoary is designed to take the gene_presence_absence.csv file from [Roary] (http
- [Contact] (#contact)

## What's new?
v1.3.5 (PRE-RELEASE) (5th Jul 2016)
- You can now use the -w option with -r to write a reduced gene presence/absence file containing only the subset isolates. This ensures that the program will run much faster if you have a large dataset (1000s of isolates) but only want to analyze a subset. Scoary automatically opens and analyzes the newly written file.
- This is a pre-release version. There might still be bugs in the code, in which case I would be grateful if you report them.

v1.3.4 (16th Jun 2016)
- Scoary no longer crashes when using Scipy 0.16 instead of 0.17.
- More information about what's going on is printed. (Useful for very large datasets that take long to analyze)
Expand Down Expand Up @@ -197,6 +201,9 @@ optional arguments:
On which column in the gene presence/absence file do
individual strain info start. Default=15. (1-based
indexing)
-w, --write_reduced Use with -r if you want Scoary to create a new gene
presence absence file from your reduced set of
isolates.
--delimiter DELIMITER
The delimiter between cells in the gene
presence/absence and trait files.
Expand All @@ -216,6 +223,9 @@ Strain1,Strain2,Strain4,Strain9

This will restrict the current analysis to isolates 1,2,4 and 9, and will omit all others.

#### The -w flag
Using the **-w** flag with **-r** will make Scoary write a reduced gene presence/absence file containing only those isolates specified with **-r**. This makes the program run much faster if you are analyzing a small subset of a large dataset.

#### The -s parameter
The **-s** parameter is used to indicate to Scoary which column in the gene_presence_absence.csv file is the _first_ column representing an isolate. By default it is set to 15 (1-based indexing).

Expand Down
2 changes: 1 addition & 1 deletion scoary/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.3.4'
__version__ = '1.3.5'
53 changes: 46 additions & 7 deletions scoary/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ def main():
' strains. SCOARY will read the provided '
'comma-separated table of strains and restrict '
'analyzes to these.')
parser.add_argument('-w', '--write_reduced',
help='Use with -r if you want Scoary to create a new '
'gene presence absence file from your reduced set of '
'isolates.',
default=False,
action='store_true')
parser.add_argument('-s', '--start_col',
help='On which column in the gene presence/absence '
'file do individual strain info start. Default=15. '
Expand All @@ -92,27 +98,35 @@ def main():
version=SCOARY_VERSION)

args = parser.parse_args()

if (args.p_value_cutoff > 1.0) or (args.p_value_cutoff <= 0.0):
sys.exit("P must be between 0.0 and 1.0 or exactly 1.0")
if (len(args.delimiter) > 1):
sys.exit("Delimiter must be a single character string. There is no support for tab.")

starttime = time.time()

with open(args.genes, "rU") as genes, open(args.traits, "rU") as traits:

if args.restrict_to is not None:
allowed_isolates = [isolate
for line in
open(args.restrict_to)
open(args.restrict_to,"rU")
for isolate in line.rstrip().split(",")]
else:
# Despite the confusing name
# this actually means all isolates are allowed
# and included in the analysis
allowed_isolates = None
if args.write_reduced:
sys.exit("You cannot use the -w argument without specifying a subset (-r)")

print("Reading gene presence absence file")
genedic_and_matrix = Csv_to_dic_Roary(genes,
args.delimiter,
startcol=args.start_col - 1,
allowed_isolates=allowed_isolates)
allowed_isolates=allowed_isolates,
writereducedset=args.write_reduced)
genedic = genedic_and_matrix["Roarydic"]
zeroonesmatrix = genedic_and_matrix["Zero_ones_matrix"]
strains = genedic_and_matrix["Strains"]
Expand All @@ -138,7 +152,7 @@ def main():
args.max_hits,
args.p_value_cutoff,
args.correction, upgmatree, GTC)
print("Finished. Checked a total of %d genes for associations to %d trait(s). "
print("\nFinished. Checked a total of %d genes for associations to %d trait(s). "
"Total time used: %d seconds." % (len(genedic),
len(traitsdic),
int(time.time()-starttime)))
Expand Down Expand Up @@ -189,15 +203,38 @@ def PopulateQuadTreeWithDistances(TDM):
PopulatedQuadtree.insert_row(i, Quadmatrix[i])
return PopulatedQuadtree


def Csv_to_dic_Roary(genefile, delimiter, startcol=0, allowed_isolates=None):
def ReduceSet(genefile, delimiter, startcol=14, allowed_isolates=None):
    """
    Write a reduced gene presence/absence file that keeps only a subset
    of the isolate columns, and return the name of the written file.

    The first ``startcol`` columns of every row are always kept. Of the
    remaining columns, only those whose header cell is listed in
    ``allowed_isolates`` are copied to the output.

    Parameters:
        genefile: An open, line-iterable handle on the gene
            presence/absence CSV. It is consumed by this function.
        delimiter: Single-character cell delimiter, used both for
            reading ``genefile`` and for writing the reduced file.
        startcol: 0-based index of the first isolate column.
        allowed_isolates: Iterable of isolate names to keep. ``None``
            keeps every column (no restriction).

    Returns:
        The name of the reduced file. It is created relative to the
        current working directory and carries a timestamp with minute
        resolution, so two runs within the same minute overwrite each
        other.
    """
    csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter)
    header = next(csvfile)

    if allowed_isolates is None:
        # No restriction requested: keep every column instead of failing
        # on a membership test against None.
        allowed_indexes = list(range(len(header)))
    else:
        # Build the lookup once; large datasets have thousands of columns.
        wanted = set(allowed_isolates)
        allowed_indexes = list(range(startcol))
        # Only scan the isolate columns. Scanning from column 0 would add
        # a leading column a second time if its header happened to match
        # an isolate name. enumerate() also replaces the Python 2-only
        # xrange().
        allowed_indexes.extend(
            index
            for index, name in enumerate(header[startcol:], startcol)
            if name in wanted)

    print("Writing gene presence absence file for the reduced set of isolates")
    reducedfilename = ("gene_presence_absence_reduced_"
                       + time.strftime("_%d_%m_%Y_%H%M") + ".csv")
    with open(reducedfilename, "w") as csvout:
        wtr = csv.writer(csvout, delimiter=delimiter)
        wtr.writerow([header[a] for a in allowed_indexes])
        for row in csvfile:
            wtr.writerow(tuple(row[a] for a in allowed_indexes))
    print("Finished writing reduced gene presence absence list to file "
          + reducedfilename)
    return reducedfilename

def Csv_to_dic_Roary(genefile, delimiter, startcol=14, allowed_isolates=None, writereducedset=False):
"""
Converts a gene presence/absence file into dictionaries
that are readable by Roary
"""
r = {}
csvfile = csv.reader(genefile, skipinitialspace=True)
if writereducedset:
file = open(ReduceSet(genefile,delimiter,startcol,allowed_isolates),"rU")
csvfile = csv.reader(file, skipinitialspace=True, delimiter=delimiter)
else:
csvfile = csv.reader(genefile, skipinitialspace=True, delimiter=delimiter)
header = next(csvfile)

roaryfile = True

strains = header[startcol:]
Expand Down Expand Up @@ -243,6 +280,8 @@ def Csv_to_dic_Roary(genefile, delimiter, startcol=0, allowed_isolates=None):
zero_ones_matrix.append(zero_ones_line)

# Transpose list for distance calculation purposes
if writereducedset:
file.close()
zero_ones_matrix = list(map(list, zip(*zero_ones_matrix)))
return {"Roarydic": r,
"Zero_ones_matrix": zero_ones_matrix,
Expand Down

0 comments on commit 0f6791e

Please sign in to comment.