forked from demitri/peak-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
peak_parser.py
161 lines (131 loc) · 6.38 KB
/
peak_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#! /usr/bin/env python3
import re
#object will be path defined by user when this method is called:
#path = sys.argv[1] or argparse
#newfile = Bedfile(path=path)
class Bedfile(object):
'''
Class to read a bedfile type file.
'''
def __init__(self, path=None): #standard idiom when defining classes
self.chromosomes = dict() # key=chromosome name, value=Chromosome object
self.filepath = path
with open(path) as bedfile:
for line in bedfile:
line = line.rstrip()
fields = line.split("\t")
chr_name = fields[0]
# get existing or create new chromosome
if chr_name in self.chromosomes:
chromosome = self.chromosomes[chr_name] #chromosome_name is associated with the chromosome value, which in the else statement below we turn into a Chromosome object list
else:
chromosome = Chromosome(name=chr_name) # we creating a new object in our dictionary, chromosome, and that object is of the Class Chromosome and its name is chr_name
self.chromosomes[chr_name.lower()] = chromosome
# create new peak from row
new_peak = Peak() #create new object with Peak class method
new_peak.start = int(fields[1]) #convert the string to an int
new_peak.end = int(fields[2])
new_peak.signal = float(fields[6]) #signal = peak height
peak_average = str(fields[3]).split(":")
new_peak.mean = peak_average[1]
#append the list of gene properties to the chromosome dictionary
chromosome.peaks.append(new_peak) #chromosome is an object containing a list of objects
#print(chromosome.peaks)
def chromosome_names(self):
return self.chromosomes.keys()
class Chromosome(object): #this is an ontology class designation object: we want to be able to attribute object lists to our dictionaries. Making a list a type Chromosome class allows us include properties in a list form.
def __init__(self, name=None):
self.name = name
self.length = None # number of base pairs
self.peaks = list() # will be used for a list of peak objects
self.genes = list() # will be used for a list of gene objects
self.degenes = list() # will be used for a list of differentially expressed gene objects---RNAseq foldchange file
class Peak(object):
def __init__(self):
#parenthesis .start() means I'm calling a function. .start means it is a property (variable)
#properties for holding peak information. Must be defined here as something or nothing, in this case, None.
#avoid placeholder properties until you know what you want: it takes up memory and other things
self.start = None
self.end = None
self.signal = None
self.mean = None
def width(self):
return (self.end - self.start) #must use this variable (property) within this class because it hasn't been defined elsewhere.
def intersects_with(self, other_peak):
pass # return True or False
def __repr__(self): #Default override. unix returns the print function from this. If we define it here, then it won't return the default object location
return "<Peak {0}-{1}>".format(self.start, self.end)
class Gff_file(object):
def __init__(self, path=None):
self.chromosomes= dict()
self.filepath = path
#parse file as normal
with open(path) as gf:
for line in gf:
line = line.rstrip()
fields = line.split("\t")
chr_name = fields[0] #setting chromosome name
# get existing or create new chromosome
if chr_name in self.chromosomes:
chromosome = self.chromosomes[chr_name] #add chromosome name as a key, this will NOT replace the values already associated with an existing identical key. Else will turn chromosome into a Chromosome object
else:
chromosome = Chromosome(name=chr_name) #creates Chromosome object with key name as the chromosome name
self.chromosomes[chr_name.lower()] = chromosome #fills the dictionary with the Chromosome object.
#get line only if it is a gene
if "gene" == fields[2]:
gene = Gene() #set new class so we can attribute properties.
if "+" == fields[6]: #identify if TSS is for a gene on positive or negative strand
tss = int(fields[3])
gene.tss = tss
gene.tss_1kb_upstream = (gene.tss - 1000)
elif "-" == fields[6]:
tss = int(fields[4]) # 5' to 3' direction changes on opposite strand
gene.tss = tss
gene.tss_1kb_upstream = (gene.tss + 1000)
gene.start = int(fields[3]) #start is tss for positive strand -1000 for peak analysis
gene.stop = int(fields[4]) #stop is tss for negative strange +1000 for peak analysis
gene.strand = fields[6]
gene_id = re.split(";|=", fields[8])
gene.ID = gene_id[1]
#append the list of gene properties to the chromosome dictionary
chromosome.genes.append(gene) #this populates the above chromosome dictionary.
class Gene(object):
def __init__(self):
self.start = None
self.stop = None
self.ID = None
self.strand = None
self.tss = None
self.tss_1kb_upstream = None
class DE_genes(object):
def __init__(self, path=None):
self.expression_values_dict = dict()
self.filepath=path
with open(path) as de_file:
# assert len(de_file) > 0, "file is empty"
for line in de_file:
line = line.rstrip()
fields = line.split('\t')
gene_n = re.split("_",fields[2])
gene_name = gene_n[-1]
if "gene" in self.expression_values_dict: #giving all the values the same key "gene" so it is easier to iterate over in execution script
chromosome = self.expression_values_dict["gene"]
else:
chromosome = Chromosome(name="gene") #calling Chromosome class here to populate list of objects later
self.expression_values_dict["gene"] = chromosome
gene = DEexpression()
gene.cluster = fields[0]
gene.ID = gene_name.upper()
gene.foldchange = (fields[3:len(fields)-1]) ##fix this: need to convert values in expression to float here
gene.annotation = fields[-1]
chromosome.degenes.append(gene)
# def __repr__(self): #Default override. unix returns the print function from this. If we define it here, then it won't return the default object location
# return "<Gene{0}\t{1}-{2}>".format(self.ID, self.end)
class DEexpression(object):
def __init__(self):
self.cluster = None
self.ID = None
self.foldchange = None
self.annotation = None
def __repr__(self): #Default override. unix returns the print function from this. If we define it here, then it won't return the default object location
return "<Gene{0}\t{1}-{2}>".format(self.ID, self.foldchange)