-
Notifications
You must be signed in to change notification settings - Fork 21
/
HLA-TAPAS.py
410 lines (292 loc) · 18.9 KB
/
HLA-TAPAS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
# -*- coding: utf-8 -*-
import os, sys, re
from os.path import exists, join, dirname, basename
import argparse, textwrap
from NomenCleaner.NomenCleaner import HATK_NomenCleaner
from MakeReference.MakeReference_v2 import MakeReference_v2
from SNP2HLA.SNP2HLA import SNP2HLA
from HLAassoc.HLAassoc import HLAassoc
from HLAManhattan.HLAmanhattan import HLAManhattan
# Patterns to use
p_Ggroup_HLA_allele = re.compile(r'^(\w+\*)?\d{2,3}(:\d{2,3})+G$')
std_MAIN_PROCESS_NAME = "[%s]: " % (basename(__file__))
std_ERROR_MAIN_PROCESS_NAME = "[%s::ERROR]: " % (basename(__file__))
std_WARNING_MAIN_PROCESS_NAME = "[%s::WARNING]: " % (basename(__file__))
def HLA_TAPAS(_target, _reference, _hg, _out, _hped, _chped=None,
f_save_intermediates=False, _dependency='./dependency',
b_mem = '2g', b_nthreads=1, b_niterations=5,
_hat='NomenCleaner/data/HLA_ALLELE_TABLE.imgt3320.hat',
f_oneF=False, f_twoF=False, f_threeF=False, f_fourF=False, f_Ggroup=False, f_Pgroup=False, f_NoCaption=False,
_dict_AA='MakeReference/data/HLA_DICTIONARY_AA.hg19.imgt3320.Ggroup',
_dict_SNPS='MakeReference/data/HLA_DICTIONARY_SNPS.hg19.imgt3320.Ggroup', _tolerated_diff=0.15,
_pheno=None, _pheno_name=None, _condition=None, _condition_list=None,
_reference_bim=None, _covar=None, _covar_name=None,
_pop=None, _sex=None, _pcs=None, _maf_threshold=None, f_aa_only=False,
f_remove_samples_by_haplo=False, f_remove_samples_aa_pattern=False, _min_haplo_count=10, _condition_gene=None,
f_exclude_composites=False, f_output_composites=False, f_exhaustive=False,
_exhaustive_aa_pos=None, _exhaustive_min_aa=2, _exhaustive_max_aa=2, f_exhaustive_no_filter=False
):
"""
Main execution part of HLA-TAPAS pipeline.
[1] NomenCleaner
[2] MakeReference_v2
[3] SNP2HLA
[4] HLAassoc
"""
print("")
### Exception Handling
# './dependency' folder
if not exists(_dependency):
print(std_ERROR_MAIN_PROCESS_NAME + "Folder for dependent software('{}') can't be found. Please check '--dependency' argument again.".format(_dependency))
sys.exit()
# Target data check.
if not exists(_target+'.bed'):
print(std_ERROR_MAIN_PROCESS_NAME + "One of Target SNP data can't be found('{}'). Please check '--target/-t' argument again.".format(_target + '.bed'))
sys.exit()
if not exists(_target+'.bim'):
print(std_ERROR_MAIN_PROCESS_NAME + "One of Target SNP data can't be found('{}'). Please check '--target/-t' argument again.".format(_target + '.bim'))
sys.exit()
if not exists(_target+'.fam'):
print(std_ERROR_MAIN_PROCESS_NAME + "One of Target SNP data can't be found('{}'). Please check '--target/-t' argument again.".format(_target + '.fam'))
sys.exit()
# Reference data check.
if not exists(_reference+'.bed'):
print(std_ERROR_MAIN_PROCESS_NAME + "One of Reference data can't be found('{}'). Please check '--reference/-ref' argument again.".format(_reference + '.bed'))
sys.exit()
if not exists(_reference+'.bim'):
print(std_ERROR_MAIN_PROCESS_NAME + "One of Reference data can't be found('{}'). Please check '--reference/-ref' argument again.".format(_reference + '.bim'))
sys.exit()
if not exists(_reference+'.fam'):
print(std_ERROR_MAIN_PROCESS_NAME + "One of Reference data can't be found('{}'). Please check '--reference/-ref' argument again.".format(_reference + '.fam'))
sys.exit()
if not exists(_hped):
print(std_ERROR_MAIN_PROCESS_NAME + "HLA type data for generating Reference panel can't be found('{}'). Please check '--hped' argument again.".format(_hped))
sys.exit()
# Dictionary
if _hg == '18':
_dict_AA = 'MakeReference/data/hg18/HLA_DICTIONARY_AA.hg18.imgt3320.Ggroup'
_dict_SNPS = 'MakeReference/data/hg18/HLA_DICTIONARY_SNPS.hg18.imgt3320.Ggroup'
elif _hg == '19':
_dict_AA = 'MakeReference/data/hg19/HLA_DICTIONARY_AA.hg19.imgt3320.Ggroup'
_dict_SNPS = 'MakeReference/data/hg19/HLA_DICTIONARY_SNPS.hg19.imgt3320.Ggroup'
elif _hg == '38':
_dict_AA = 'MakeReference/data/hg38/HLA_DICTIONARY_AA.hg38.imgt3320.Ggroup'
_dict_SNPS = 'MakeReference/data/hg38/HLA_DICTIONARY_SNPS.hg38.imgt3320.Ggroup'
else:
print(std_ERROR_MAIN_PROCESS_NAME + "Bizarre Human Genome version is given. Please check '--hg' argument again.")
sys.exit()
# Intermediate path check.
_out = _out if not _out.endswith('/') else _out.rstrip('/')
if bool(dirname(_out)):
out_dirname = dirname(_out)
os.makedirs(out_dirname, exist_ok=True)
else:
out_dirname = ''
### [1] NomenCleaner
# HPED G-group allele check
if CheckGgroupHPED(_hped) == -1:
sys.exit()
print(std_MAIN_PROCESS_NAME + "Generating G-group CHPED with given HPED('{}')." \
.format(_hped))
CHPED = HATK_NomenCleaner(_hped, _hat, _out+'.Ggroup', __f_NoCaption=False, __leave_NotFound=True,
__oneF=False, __twoF=False, __threeF=False, __fourF=False, __Ggroup=True, __Pgroup=False)
# '--leave-NotFound' as True by default as Yang had requested.
CHPED = CHPED.chped
print(std_MAIN_PROCESS_NAME + "Generated CHPED: '{}'.\n\n".format(CHPED))
### [2] MakeReference_v2
print(std_MAIN_PROCESS_NAME + "Generating Reference panel.")
REFERENCE = MakeReference_v2(CHPED, _out + '.REF.bglv4', _hg, _dict_AA, _dict_SNPS, _reference,
_java_heap_mem=b_mem, _p_dependency=_dependency,
f_save_intermediates=f_save_intermediates, f_phasing=True, _nthreads=b_nthreads)
print(std_MAIN_PROCESS_NAME + "Generated Reference panel : '{}'.\n\n".format(REFERENCE))
### [3] SNP2HLA
print(std_MAIN_PROCESS_NAME + "Performing SNP2HLA imputation.")
IMPUTED = SNP2HLA(_target, REFERENCE, _out+'.IMPUTED',
_tolerated_diff=_tolerated_diff, _p_dependency=_dependency,
_mem=b_mem, _b_nthreads=b_nthreads, _b_niterations=b_niterations)
print(std_MAIN_PROCESS_NAME + "Imputed result : '{}'\n\n".format(IMPUTED))
### [4] HLAassoc
# LOGISTIC REGRESSION.
ASSOC_STUDY = HLAassoc('LOGISTIC', _out + '.IMPUTED', _dependency,
_vcf=IMPUTED, _reference_bim=_reference_bim, _covar=_covar, _covar_name=_covar_name,
_pheno=_pheno, _pheno_name=_pheno_name, _condition=_condition, _condition_list=_condition_list,
_hped=_hped, _chped=CHPED)
LOGISTIC_RESULT = ASSOC_STUDY.assoc_result
print(std_MAIN_PROCESS_NAME + "Output Logistic Regression result : '{}'.\n\n".format(LOGISTIC_RESULT))
# OMNIBUS TEST.
OMNIBUS = HLAassoc('OMNIBUS', _out + '.IMPUTED.OMNIBUS', _dependency,
_vcf=IMPUTED, _file=None, _pop=_pop, _phased=None, _fam=_target+'.fam', _bim=REFERENCE+'.bim',
_pheno=_pheno, _sex=_sex, _pcs=_pcs, _maf_threshold=_maf_threshold,
f_aa_only=f_aa_only, _nthreads=1, f_remove_samples_by_haplo=f_remove_samples_by_haplo,
f_remove_samples_aa_pattern=f_remove_samples_aa_pattern, _min_haplo_count=_min_haplo_count,
_condition=_condition, _condition_gene=_condition_gene,
f_exclude_composites=f_exclude_composites,
f_output_composites=f_output_composites, f_exhaustive=f_exhaustive,
_exhaustive_aa_pos=_exhaustive_aa_pos, _exhaustive_min_aa=_exhaustive_min_aa,
_exhaustive_max_aa=_exhaustive_max_aa, f_exhaustive_no_filter=f_exhaustive_no_filter,
_java_heap_mem=b_mem)
OMNIBUS_RESULT = OMNIBUS.omnibus_result
print(std_MAIN_PROCESS_NAME + "Output Omnibus Test result : '{}'.\n\n".format(OMNIBUS_RESULT))
### [5] Manhattan Plot
print(std_MAIN_PROCESS_NAME + "Plotting Manhattan Plot.")
MANHATTAN_PLOT = HLAManhattan([LOGISTIC_RESULT], LOGISTIC_RESULT+'.manhattan', _hg)
print(std_MAIN_PROCESS_NAME + "Manhattan plot result : '{}'.\n\n".format(MANHATTAN_PLOT))
print(std_MAIN_PROCESS_NAME + "HLA-TAPAS done.")
return 0
def CheckGgroupHPED(_hped_Ggroup):
with open(_hped_Ggroup, 'r') as f_hped_Ggroup:
flag_stop = False
count = 0
for line in f_hped_Ggroup:
l = re.split(r'\s+', line.rstrip('\n'))
"""
0: FID,
1: IID,
2: PID,
3: MID,
4: Sex,
5: Phe,
6,7 : HLA-A,
8,9 : HLA-B,
...
20,21 : HLA-DRB1
Total 22 columns
"""
for i in range(6, 22, 2):
# allele1
if not (l[i] == '0' or bool(p_Ggroup_HLA_allele.match(l[i]))):
print(std_ERROR_MAIN_PROCESS_NAME + "Allele '{}' is not in G-group HLA allele name form." \
.format(l[i]))
flag_stop = True
# allele2
if not (l[i + 1] == '0' or bool(p_Ggroup_HLA_allele.match(l[i + 1]))):
print(std_ERROR_MAIN_PROCESS_NAME + "Allele '{}' is not in G-group HLA allele name form." \
.format(l[i + 1]))
flag_stop = True
if flag_stop:
break
if flag_stop:
print("Please check again the row {} of HPED file('{}').".format(count, _hped_Ggroup))
print("(row {}: {})".format(count, l))
return -1
count += 1
# if count > 5: break
return 0
if __name__ == "__main__":
parser = argparse.ArgumentParser(prog='HLA-TAPAS',
add_help=False,
formatter_class=argparse.RawTextHelpFormatter,
description=textwrap.dedent('''\
#################################################################################################
HLA-TAPAS
Author : Yang Luo
E-mail : [email protected]
Description here...
#################################################################################################
'''))
parser._optionals.title = "OPTIONS"
parser.add_argument("-h", "--help", help="\nShow this help message and exit\n\n", action='help')
### General
parser.add_argument("--target", "-t", help="\nThe prefix of Target PLINK SNP data to impute.(.bed/.bim/.fam)\n\n", required=True)
parser.add_argument("--reference", "-ref", help="\nThe prefix of reference panel for imputation.\n\n", required=True)
parser.add_argument("--out", "-o", help="\nOutput file prefix.\n\n", required=True)
parser.add_argument("--hg", help="\nHuman Genome version of Reference panel to generate.\n\n", choices=["18", "19", "38"], metavar="hg", required=True)
parser.add_argument("--hped-Ggroup", help="\nHLA Type Data with G-group HLA allele label.\n\n", dest="hped", required=True)
# parser.add_argument("--chped", help="\nHLA Type Data(.chped)\n\n")
parser.add_argument("--save-intermediates", help="\nDon't remove intermediate files.\n\n", action='store_true')
parser.add_argument("--dependency", help="\nSpecify a folder for external software.\n\n", default='dependency/')
# Beagle4.1
parser.add_argument("--mem", help="\nJava Heap Memory size for Bealge4.1. (ex. 2g, 500m)\n\n", default="4g")
parser.add_argument("--nthreads", help="\nThe number of threads to use in Bealge4.1. (ex. 2)\n\n", default=1, type=int)
parser.add_argument("--niterations", help="\n'niterations' parameter in Beagle4.1 (default: 5).\n\n", default=5)
### [1] NomenCleaner
# parser.add_argument("--hat", help="\nHLA Allele Table file(*.hat).\n\n",
# default='NomenCleaner/data/HLA_ALLELE_TABLE.imgt3320.hat')
# parser.add_argument("--leave-NotFound", help="\nLeaving HLA alleles which can't be found in given *.hat file(Novel or Erroneous allele) intact.\n\n", action='store_true') # True by default in 'HLA-TAPAS'.
# Output format
# output_digit_selection = parser.add_mutually_exclusive_group()
# output_digit_selection.add_argument("--1field", help="\nMake converted HLA alleles have maximum 1 field.\n\n",
# action="store_true", dest="oneF")
# output_digit_selection.add_argument("--2field", help="\nMake converted HLA alleles have maximum 2 fields.\n\n",
# action="store_true", dest="twoF")
# output_digit_selection.add_argument("--3field", help="\nMake converted HLA alleles have maximum 3 fields.\n\n",
# action="store_true", dest="threeF")
# output_digit_selection.add_argument("--4field", help="\nMake converted HLA alleles have maximum 4 fields\n\n",
# action="store_true", dest="fourF")
# output_digit_selection.add_argument("--Ggroup", help="\nMake converted HLA alleles have G code names.\n\n",
# action="store_true")
# output_digit_selection.add_argument("--Pgroup", help="\nMake converted HLA alleles have P code names.\n\n",
# action="store_true")
# Flag to remove HLA gene caption.
# parser.add_argument("--NoCaption", help="\nMake converted HLA alleles NOT have HLA gene prefix(ex. \"A*\").\n\n",
# action='store_true')
### [2] MakeReference_v2
# parser.add_argument("--phasing", help="\nPerform phasing with Beagle4.1.\n\n", action='store_true') # True by default in HLA-TAPAS.
# parser.add_argument("--dict-AA", help="\nPrefix of AA HLA Dictionary file(*.txt, *.map).\n\n",
# default='MakeReference/data/HLA_DICTIONARY_AA.hg19.imgt3320')
# parser.add_argument("--dict-SNPS", help="\nPrefix of SNP HLA Dictionary file(*.txt, *.map).\n\n",
# default='MakeReference/data/HLA_DICTIONARY_SNPS.hg19.imgt3320')
### [3] SNP2HLA
parser.add_argument("--tolerated-diff", help="\nTolerated diff (default : 0.15).\n\n", default=0.15, type=float)
### [4] HLAassoc
# Common
parser.add_argument("--pheno", help="\nSpecify phenotype information file (Plink v1.9).\n\n")
parser.add_argument("--pheno-name", help="\nSpecify the column name in phenotype file which you will use. (Plink v1.9)\n\n")
CondVars = parser.add_mutually_exclusive_group()
CondVars.add_argument("--condition", help="\nSpecify a single variant ID to condition(i.e. To set it as covariate). (Plink v1.9)\n\n")
CondVars.add_argument("--condition-list", help="\nSpecify a tsv file of multiple variant IDs to condition(i.e. To set it as covariates). (Plink v1.9)\n\n")
# Logistic Regression
parser.add_argument("--reference-bim", help="\nReference bim file to decode ATtrick.\n\n")
parser.add_argument("--covar", help="\nSpecify .covar file (Plink v1.9).\n\n")
parser.add_argument("--covar-name", help="\nSpecify the column name(s) in .covar file which you will use. (Plink v1.9)\n\n")
# Omnibus Test
parser.add_argument("--pop", help="\nFile name for Population information of samples.\n\n")
parser.add_argument("--sex", help="\nFile name for Sex information of samples.\n\n")
parser.add_argument("--pcs", help="\nFile name for Principal Components information.\n\n")
parser.add_argument("--maf-threshold", help="\nThreshold value for Minor Allele Frequency.\n\n", default=0.005)
parser.add_argument("--aa-only", help="\nRun association test only for AA changes.\n\n", action='store_true')
parser.add_argument("--remove-samples-by-haplo", action='store_true',
help="\nRemove samples based on haplotypes constructed using a subset of AAs.\n\n")
parser.add_argument("--remove-samples-aa-pattern", type=str, help="\nPrefix of markers to exclude. (ex. AA_B)\n\n")
parser.add_argument("--min-haplo-count", type=int, default=10, help="\nThe minimum number of haplotype count.\n\n")
parser.add_argument("--condition-gene", type=str, help="\nGene name for conditioning.\n\n")
parser.add_argument("--exclude-composites", action='store_true',
help="\nExclude composite amino acids from omnibus test.\n\n")
parser.add_argument("--output-composites", action='store_true',
help="\nOutput composite amino acids in .haplo.txt.\n\n")
parser.add_argument("--exhaustive", action='store_true',
help="\nRun exhaustive testing of AA combinations in a single HLA gene.\n\n")
parser.add_argument("--exhaustive-aa-pos", type=int,
help="\nSpecify a AA pos of the first element(s) of combinations.\n\n")
parser.add_argument("--exhaustive-min-aa", type=int, default=2,
help="\nMinimum number of AA positions to form a combination.\n\n")
parser.add_argument("--exhaustive-max-aa", type=int, default=2,
help="\nMaximum number of AA positions to form a combination.\n\n")
parser.add_argument("--exhaustive-no-filter", action='store_true',
help="\nDon't filter non-increasing AA combinations.\n\n")
##### <for Test> #####
# args = parser.parse_args('--target example/1958BC '
# '--hg 18 '
# '--reference example/HAPMAP_CEU '
# '--hped example/HAPMAP_CEU_HLA.Ggroup.hped '
# '--out tests/MyHLA-TAPAS/myHLA-TAPAS '
# '--pheno example/1958BC.phe'.split(' '))
##### <for Publication> #####
args = parser.parse_args()
print(args)
# Main implementation
HLA_TAPAS(args.target, args.reference, args.hg, args.out, args.hped,
f_save_intermediates=args.save_intermediates, _dependency=args.dependency,
b_mem=args.mem, b_nthreads=args.nthreads, b_niterations=args.niterations,
_tolerated_diff=args.tolerated_diff,
_pheno=args.pheno, _pheno_name=args.pheno_name, _condition=args.condition, _condition_list=args.condition_list,
_reference_bim=args.reference_bim, _covar=args.covar, _covar_name=args.covar_name,
_pop=args.pop, _sex=args.sex, _pcs=args.pcs, _maf_threshold=args.maf_threshold, f_aa_only=args.aa_only,
f_remove_samples_by_haplo=args.remove_samples_by_haplo,
f_remove_samples_aa_pattern=args.remove_samples_aa_pattern,
_min_haplo_count=args.min_haplo_count, _condition_gene=args.condition_gene,
f_exclude_composites=args.exclude_composites, f_output_composites=args.output_composites,
f_exhaustive=args.exhaustive, _exhaustive_aa_pos=args.exhaustive_aa_pos,
_exhaustive_min_aa=args.exhaustive_min_aa, _exhaustive_max_aa=args.exhaustive_max_aa,
f_exhaustive_no_filter=args.exhaustive_no_filter
)