-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot_data.py
181 lines (151 loc) · 8.9 KB
/
plot_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import line_printer
import pandas as pd
from load_symbol import LoadYahooSymbol
import os
from statistic_generator import StatisticGenerator
from line_printer import LinePrinter
from data_plotter import Plotter
import random
from tqdm import tqdm
import timeit
# Settings forwarded to StatisticGenerator by PlotData.__init__.
# Passed as 'max_date_gap'. NOTE(review): unit (days vs. bars) is not visible here - confirm
# against StatisticGenerator.
MAX_DATE_GAP = 10
# Passed as 'min_data_volatility'.
MIN_DATA_VOLATILITY = 0.1
# Passed as 'min_number_of_sentences'.
MIN_NUMBER_OF_SENTENCES_TO_CONSIDER_VALID_DATA = 10
# Passed as 'rounding_precision'. Presumably a number of decimal places - TODO confirm.
ROUNDING_PRECISION = 2
class PlotData:
    """Plot the data files found under a directory tree and collect their statistics.

    On construction the sub-directories of ``path`` are listed. ``plotter_all_data``
    then loads the files of the configured format from each sub-directory, splits
    every sufficiently long file into chunks, plots each chunk through ``Plotter``,
    and writes the flattened ``StatisticGenerator`` output to ``Statistics.csv``.
    """

    def __init__(self, percentage_of_data_to_plot: float, use_different_path_to_save_data: bool, mean_length: int,
                 column_name_for_plotting, plot_title, min_length_of_data_to_plot,
                 sentence_length: int, future_length: int, use_sentence_length_for_data_splitting: bool,
                 data_splits_for_plotting=3,
                 save_path=None, path="Yahoo_Stock", file_formats_to_load='csv', save_plots=True, verbose=False,
                 clip=True, percentage_for_normalization=0.75, clip_max=1.5, clip_min=-0.5, bin_size=0.1):
        """
        Configure the plotter, the file loader and the statistic generator.

        :param percentage_of_data_to_plot: fraction of the files of every sub-directory to plot. Maximum is 1.
        :param use_different_path_to_save_data: if True, plots are saved under ``save_path`` instead of ``path``.
        :param mean_length: forwarded to ``Plotter``.
        :param column_name_for_plotting: column of the loaded data that is handed to the plotter.
        :param plot_title: forwarded to ``Plotter``.
        :param min_length_of_data_to_plot: files with this many rows or fewer are skipped.
        :param sentence_length: The actual size of input (number of bars) we want to give to our learning system
        :param future_length: The number of bars to look into future for generating results
        :param use_sentence_length_for_data_splitting: if True, every chunk has
            ``sentence_length + future_length`` rows and ``percentage_for_normalization`` is replaced by
            ``sentence_length / (sentence_length + future_length)``.
        :param data_splits_for_plotting: number of equal parts a file is split into when
            ``use_sentence_length_for_data_splitting`` is False. A good number should be 2 or 3.
        :param save_path: directory for the plots; required (and created if missing) when
            ``use_different_path_to_save_data`` is True. A trailing separator is appended if missing.
        :param path: root directory of the data. Must end with "/".
            NOTE: the default value does not end with "/", so ``path`` has to be passed explicitly.
        :param file_formats_to_load: file extension (without the dot) of the files to load.
        :param save_plots, verbose, clip, percentage_for_normalization, clip_max, clip_min, bin_size:
            forwarded to ``Plotter``.
        :raises Exception: if ``path`` does not end with "/" or a required ``save_path`` is missing.
        :raises ValueError: if ``percentage_of_data_to_plot`` is greater than 1.
        """
        if use_sentence_length_for_data_splitting:
            percentage_for_normalization = sentence_length / (sentence_length + future_length)
        # endswith() also rejects an empty path with the intended error instead of an IndexError.
        if not path.endswith("/"):
            raise Exception("Path name must end with /")
        if percentage_of_data_to_plot > 1:
            raise ValueError("percentage_of_data_to_plot must not be greater than 1")

        if use_different_path_to_save_data:
            if not save_path:
                raise Exception("Have to provide a save path if use_different_path_to_save_data is True")
            os.makedirs(save_path, exist_ok=True)
            # Sub-directory names are appended by plain concatenation later on, so the
            # save path must end with a separator or plots land in a sibling directory.
            if not save_path.endswith(("/", os.sep)):
                save_path += "/"
            self.save_path = save_path
        else:
            self.save_path = path

        self.path = path
        self.file_formats_to_load = file_formats_to_load
        self.percentage_of_data_to_plot = percentage_of_data_to_plot
        self.folder_list = self.load_all_sub_directories()
        self.plotter = Plotter(mean_length, plot_title, save_plots=save_plots, verbose=verbose, clip=clip,
                               percentage_for_normalization=percentage_for_normalization, clip_max=clip_max,
                               clip_min=clip_min, bin_size=bin_size)
        self.file_loader = LoadYahooSymbol()
        self.data_splits_for_plotting = data_splits_for_plotting
        self.line_printer = LinePrinter("-")
        self.column_name_for_plotting = column_name_for_plotting
        self.min_length_of_data_to_plot = min_length_of_data_to_plot
        self.statistic_generator = StatisticGenerator(
            open_column_name='Open',
            high_column_name='High',
            low_column_name='Low',
            close_column_name='Close',
            volume_column_name='Volume',
            min_data_volatility=MIN_DATA_VOLATILITY,
            sentence_length=sentence_length,
            future_data_length=future_length,
            max_date_gap=MAX_DATE_GAP,
            default_column_name='Close',
            min_number_of_sentences=MIN_NUMBER_OF_SENTENCES_TO_CONSIDER_VALID_DATA,
            rounding_precision=ROUNDING_PRECISION,
        )
        self.sentence_length = sentence_length
        self.future_length = future_length
        self.use_sentence_length_for_data_splitting = use_sentence_length_for_data_splitting

    def load_all_sub_directories(self):
        """Return the names of the directories located directly under ``self.path``."""
        return [entry for entry in os.listdir(self.path) if os.path.isdir(self.path + entry)]

    def get_file_names_in_directory(self, dir_):
        """Return the names of the files in ``self.path + dir_`` that carry the configured extension.

        The directory path is printed as a progress trace. Only regular files whose
        name ends with ``"." + self.file_formats_to_load`` are returned, so an
        extension-less file that is merely named like the format is ignored.
        """
        directory = self.path + dir_
        entries = os.listdir(directory)
        print(directory)
        suffix = "." + self.file_formats_to_load
        return [name for name in entries
                if name.endswith(suffix) and os.path.isfile(directory + "/" + name)]

    def _symbol_from_file_name(self, file_name):
        """Strip the configured file extension from ``file_name`` to obtain the symbol."""
        suffix = "." + self.file_formats_to_load
        if file_name.endswith(suffix):
            return file_name[:-len(suffix)]
        return file_name

    def _process_file(self, file_name, file_data, save_file_path):
        """Plot every chunk of one file and return its flattened statistics."""
        file_data_length = len(file_data)
        single_file_statistics = {'file_name': file_name,
                                  'symbol': self._symbol_from_file_name(file_name)}
        if self.use_sentence_length_for_data_splitting:
            date_step_size = self.sentence_length + self.future_length
            number_of_chunks = file_data_length // date_step_size
        else:
            date_step_size = file_data_length // self.data_splits_for_plotting
            number_of_chunks = self.data_splits_for_plotting
        print("Each part of plot has ", date_step_size, ' Bars')

        for chunk_id in range(number_of_chunks):
            start_range = chunk_id * date_step_size
            chunk_data = file_data.iloc[start_range:start_range + date_step_size]
            part_key = 'part_' + str(chunk_id + 1)

            print('Loading Statistics for ', file_name, ' part ', chunk_id)
            chunk_statistics = self.statistic_generator.generate_chunk_statistics(chunk_data)
            single_file_statistics[part_key] = chunk_statistics

            print("Plotting Part ", chunk_id, " of ", file_name)
            usability_text = 'Usable_' if chunk_statistics['good_for_trading'] else 'NOT_Usable_'
            plot_title = "Part_" + str(chunk_id + 1) + "_" + usability_text + str(date_step_size) + \
                         "_bars_" + file_name
            self.plotter.plot_values(chunk_data[self.column_name_for_plotting], save_file_path, plot_title,
                                     x_step_size=len(chunk_data) // 10)

        print('Loading Statistics for ', file_name, ' Total')
        single_file_statistics['total'] = self.statistic_generator.generate_chunk_statistics(file_data)
        return self.statistic_generator.flatten(single_file_statistics)

    def _save_statistics(self, statistic_results):
        """Write the collected statistics to ``self.path + 'Statistics.csv'``, indexed by symbol.

        Nothing is written when no file produced statistics; an empty frame has no
        'symbol' column to index on.
        """
        if not statistic_results:
            print("No file was long enough to plot; Statistics.csv was not written")
            return
        result_df = pd.DataFrame(statistic_results).set_index('symbol')
        result_df.to_csv(self.path + 'Statistics.csv')

    def plotter_all_data(self):
        """Plot the configured share of files of every sub-directory and save their statistics.

        Files are taken in the order returned by ``get_file_names_in_directory``;
        files with ``min_length_of_data_to_plot`` rows or fewer are skipped.
        """
        statistic_results = []
        for folder in tqdm(self.folder_list):
            files_in_directory = self.get_file_names_in_directory(folder)
            load_file_path = self.path + folder + "/"
            save_file_path = self.save_path + folder + "/"
            # Clamp at zero so a negative percentage selects no files (as range() did before).
            files_to_plot = max(int(len(files_in_directory) * self.percentage_of_data_to_plot), 0)
            for file_name in tqdm(files_in_directory[:files_to_plot]):
                file_data = self.file_loader.load_file(load_file_path, file_name)
                if len(file_data) <= self.min_length_of_data_to_plot:
                    continue
                statistic_results.append(self._process_file(file_name, file_data, save_file_path))
        self._save_statistics(statistic_results)
# Driver script: plot all files (percentage 1) found under test_data/ into charts/,
# using the 'Close' column, chunks of 64 + 16 bars, and report the elapsed time.
my_plotter = PlotData(1, True, 20, 'Close', 'Comparing Price and Normalized Price', 100, 64, 16, True,
                      save_path='charts/', path="test_data/")
folders = my_plotter.load_all_sub_directories()
print(folders)
if folders:
    # Preview the files of the first sub-directory only.
    print(my_plotter.get_file_names_in_directory(folders[0]))
    starting_time = timeit.default_timer()
    # default_timer() is a performance-counter reading, not a wall-clock timestamp;
    # only the difference printed below is meaningful.
    print("Start time :", starting_time)
    my_plotter.plotter_all_data()
    print("Time difference :", timeit.default_timer() - starting_time)
else:
    # Indexing folders[0] on an empty list would raise IndexError.
    print("No sub-directories found in test_data/ - nothing to plot")