-
Notifications
You must be signed in to change notification settings - Fork 1
/
clinical_3feat.py
168 lines (131 loc) · 5.05 KB
/
clinical_3feat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
import os.path as path
from glob import glob
import csv
import numpy as np
from utils import ensure_dir, unique
# Root of the per-WSI patch features. The script globs '<feat_root>/<split>/*'
# for the splits 'train', 'valid' and 'test' and uses each entry's basename
# as a WSI id.
feat_root = '/gpfs/scratch/huidliu/disk/huidong/BMI_projects/data/brca_data/WSI_patch_data_feat_sel_1024'
# CSV read by get_wsi_id_age() and get_wsi_id_sex().
brca_info_path = './dataset/brca_info.csv'
# CSV read by get_wsi_id_labels() and get_wsi_id_feats().
csv_file_path = './dataset/dataset_for_survival.csv'
# Output root: one 'feat.npy' is saved to '<clinic_feat_dir>/<split>/<wsi_id>/'.
clinic_feat_dir = '/gpfs/scratch/huidliu/disk/huidong/BMI_projects/data/brca_data/clinical_features_3feat'
def get_wsi_id_labels(csv_file_path):
    """Read the survival label of every WSI listed in a CSV file.

    The first row is treated as a header and skipped.  Fields are split on
    commas with ``|`` as the reader's quote character, so columns 4 and 6
    still carry their first and last character (presumably surrounding
    quote marks -- confirm against the data file); both are dropped here.

    Args:
        csv_file_path: Path of the CSV file.  Column 3 is the day count
            (an integer or the string ``'NA'``), column 4 the dead flag and
            column 6 the WSI id.

    Returns:
        Dict mapping WSI id to ``[dead, days]`` (two ints).  Rows whose day
        count is ``'NA'`` or negative are left out, as are blank lines.

    Raises:
        ValueError: If a day count or dead flag is not an integer.
        IndexError: If a non-empty row has fewer than seven columns.
    """
    wsi_labels = {}
    with open(csv_file_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError.
                continue
            wsi_id = row[6][1:-1]
            if row[3] == 'NA':
                continue
            days = int(row[3])
            if days < 0:
                continue
            wsi_labels[wsi_id] = [int(row[4][1:-1]), days]
    return wsi_labels
def get_wsi_id_age(csv_file_path):
    """Read an integer age for every id listed in a CSV file.

    The first row is treated as a header and skipped.

    Args:
        csv_file_path: Path of the CSV file.  Column 1 is used as the id
            and column 3 as the age.

    Returns:
        Dict mapping id to age (int).  Blank lines and rows whose age field
        is empty or ``'NA'`` are left out, matching the ``'NA'`` handling of
        ``get_wsi_id_labels``; the script's feature loop already substitutes
        a default age for ids that are missing from this mapping.

    Raises:
        ValueError: If an age field is neither empty, ``'NA'`` nor an
            integer.
        IndexError: If a non-empty row has fewer than four columns.
    """
    wsi_age = {}
    with open(csv_file_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line.
                continue
            age_field = row[3].strip()
            if age_field in ('', 'NA'):
                # Unknown age: leave the id out instead of crashing in int().
                continue
            wsi_age[row[1]] = int(age_field)
    return wsi_age
def get_wsi_id_sex(csv_file_path):
    """Read a numeric sex code for every id listed in a CSV file.

    The first row is treated as a header and skipped.

    Args:
        csv_file_path: Path of the CSV file.  Column 1 is used as the id
            and column 50 as the sex string.

    Returns:
        Dict mapping id to ``1.0`` when the sex field is exactly ``'Male'``
        and to ``2.0`` for any other value.  Blank lines are skipped.

    Raises:
        IndexError: If a non-empty row has fewer than 51 columns.
    """
    wsi_sex = {}
    with open(csv_file_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line.
                continue
            wsi_sex[row[1]] = 1.0 if row[50] == 'Male' else 2.0
    return wsi_sex
def get_wsi_id_feats(csv_file_path):
    """Read the collapsed stage and survival fields of every WSI in a CSV.

    The first row is treated as a header and skipped.  Fields are split on
    commas with ``|`` as the reader's quote character, so columns 4, 6 and
    13 still carry their first and last character (presumably surrounding
    quote marks -- confirm against the data file); these are dropped here.

    Only the columns that end up in the result are parsed.  In particular
    column 10 is no longer converted to ``float``: its value was never used,
    and the conversion raised ``ValueError`` on non-numeric entries.

    Args:
        csv_file_path: Path of the CSV file.  Column 3 is the day count
            (an integer or ``'NA'``), column 4 the dead flag, column 6 the
            WSI id and column 13 the collapsed stage.

    Returns:
        Dict mapping WSI id to ``[collapsed_stage, days, dead]`` where the
        stage is a string and the other two are ints.  Rows whose day count
        is ``'NA'`` or negative are left out, as are blank lines.

    Raises:
        ValueError: If a day count or dead flag is not an integer.
        IndexError: If a kept row has fewer than 14 columns.
    """
    wsi_data = {}
    with open(csv_file_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line.
                continue
            wsi_id = row[6][1:-1]
            if row[3] == 'NA':
                continue
            days = int(row[3])
            if days < 0:
                continue
            collapsed_stage = row[13][1:-1]
            dead = int(row[4][1:-1])
            wsi_data[wsi_id] = [collapsed_stage, days, dead]
    return wsi_data
# ---------------------------------------------------------------------------
# Script body: build a 3-value clinical feature vector [sex, age, stage] for
# every WSI found under feat_root and save it as feat.npy under
# clinic_feat_dir/<split>/<wsi_id>/.
# ---------------------------------------------------------------------------

# Load the per-WSI / per-id tables from the two CSV files.
wsi_id_label = get_wsi_id_labels(csv_file_path)
wsi_id_age = get_wsi_id_age(brca_info_path)
wsi_id_sex = get_wsi_id_sex(brca_info_path)
wsi_id_feats = get_wsi_id_feats(csv_file_path)

# Report which collapsed-stage strings occur in the data.
col_stage = [feats[0] for feats in wsi_id_feats.values()]
sex = []  # never filled, so unique_sex is computed from an empty list
unique_col_stage = unique(col_stage)
unique_sex = unique(sex)
print(unique_col_stage)
print(unique_sex)

# String -> number lookup tables.  Only col_stage2num is used below;
# call2num and stage2num are not referenced in the visible part of the script.
call2num = {'Her2': 3.0, 'LumB': 2.0, 'LumA': 1.0, 'Basal': 4.0}
stage2num = {
    'stage iib': 2.2,
    'stage ia': 0.8,
    'stage iiia': 2.8,
    'stage iia': 1.8,
    'stage i': 1.0,
    'stage iiic': 3.4,
    'stage iv': 4.0,
    'stage ib': 1.2,
    'stage iiib': 3.2,
    'stage x': 5.0,
    'not reported': 0.0,
    'stage iii': 3.0,
    'stage ii': 2.0,
}
col_stage2num = {
    'stage_ii': 2.0,
    'stage_i': 1.0,
    'stage_iii': 3.0,
    'stage_iv': 4.0,
    'stage_x/NR': 5.0,
}

# For every split, the WSI ids are the basenames of the entries found
# directly under feat_root/<split>.
valid_wsi_ids = {}
for dataset in ('train', 'valid', 'test'):
    pattern = '{}/*'.format(path.join(feat_root, dataset))
    valid_wsi_ids[dataset] = [path.basename(wsi_path) for wsi_path in glob(pattern)]

# Turn each record [stage_str, days, dead] into
# [sex, age, stage_num, days, dead], modifying the lists in place.
for wsi_id, feats in wsi_id_feats.items():
    # The age/sex tables are looked up with the first three dash-separated
    # parts of the WSI id.
    short_id = '-'.join(wsi_id.split('-')[:3])
    feats[0] = col_stage2num[feats[0]]
    age = wsi_id_age.get(short_id, 0)       # default age when the id is unknown
    sex_ = wsi_id_sex.get(short_id, 2.0)    # default sex code when the id is unknown
    feats[:0] = [sex_, float(age)]
print(wsi_id_feats)

# Keep only the leading [sex, age, stage_num] (the trailing days/dead pair is
# sliced off) for the WSIs that actually have a feature directory.
set_id_feat = {}
for dataset in ('train', 'valid', 'test'):
    set_id_feat[dataset] = {
        wsi_id: np.asarray(wsi_id_feats[wsi_id][:-2], dtype=np.float32)
        for wsi_id in valid_wsi_ids[dataset]
    }
print(set_id_feat)

# Write one feat.npy per WSI, creating the directories as needed.
for dataset, id_to_feat in set_id_feat.items():
    dataset_dir = path.join(clinic_feat_dir, dataset)
    ensure_dir(dataset_dir)
    for wsi_id, feat in id_to_feat.items():
        wsi_id_dir = path.join(dataset_dir, wsi_id)
        ensure_dir(wsi_id_dir)
        np.save(path.join(wsi_id_dir, 'feat.npy'), feat)
print('Feature extraction done!')