-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Apache Arrow stream writers #147
Changes from all commits
f40c692
63000b8
142c80d
0bb41eb
a5b3d2b
3d121a1
1e69ff4
1f1ff92
13a397a
83efd66
64870fc
b80dc70
8dead40
5fcbfad
9a6bd2e
bbb808a
40d972c
6c6bd97
28e6370
870dbfb
355df10
f882c60
4f3547b
2d17221
012921e
16db4c1
6ae187a
19434dc
4d100df
d67b72c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#ifdef USE_ARROW | ||
|
||
#include "arrow_output_stream.h" | ||
|
||
std::shared_ptr<ApacheArrowWriter> ArrowOutputStream::create_arrow_file(const std::string& arrow_file_type, | ||
const std::string& arrow_file_path, | ||
const std::vector<std::string>& header) { | ||
|
||
std::string arrow_file_type_upper = Nyxus::toupper(arrow_file_type); | ||
|
||
if(arrow_file_path != "" && !fs::is_directory(arrow_file_path) && !(Nyxus::ends_with_substr(arrow_file_path, ".arrow") || Nyxus::ends_with_substr(arrow_file_path, ".feather") || Nyxus::ends_with_substr(arrow_file_path, ".parquet"))) { | ||
throw std::invalid_argument("The arrow file path must end in \".arrow\""); | ||
} | ||
|
||
if (!(arrow_file_type_upper == "ARROW" || arrow_file_type_upper == "ARROWIPC" || arrow_file_type_upper == "PARQUET")) { | ||
throw std::invalid_argument("The valid file types are ARROW, ARROWIPC, or PARQUET"); | ||
} | ||
|
||
std::string extension = (arrow_file_type_upper == "PARQUET") ? ".parquet" : ".arrow"; | ||
|
||
if (arrow_file_path == "") { | ||
arrow_file_path_ = "NyxusFeatures" + extension; | ||
} else { | ||
arrow_file_path_ = arrow_file_path; | ||
} | ||
|
||
if (fs::is_directory(arrow_file_path)) { | ||
arrow_file_path_ += "/NyxusFeatures" + extension; | ||
} | ||
|
||
writer_ = WriterFactory::create_writer(arrow_file_path_, header); | ||
|
||
return writer_; | ||
} | ||
|
||
|
||
std::shared_ptr<arrow::Table> ArrowOutputStream::get_arrow_table(const std::string& file_path, arrow::Status& table_status) { | ||
|
||
if (this->arrow_table_ != nullptr) return this->arrow_table_; | ||
|
||
this->arrow_table_ = writer_->get_arrow_table(file_path, table_status); | ||
|
||
return this->arrow_table_; | ||
} | ||
|
||
std::string ArrowOutputStream::get_arrow_path() { | ||
return arrow_file_path_; | ||
} | ||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#pragma once | ||
|
||
#ifdef USE_ARROW | ||
|
||
#include <string> | ||
#include <memory> | ||
|
||
#include "output_writers.h" | ||
#include "helpers/helpers.h" | ||
|
||
#include <arrow/table.h> | ||
|
||
#if __has_include(<filesystem>) | ||
#include <filesystem> | ||
namespace fs = std::filesystem; | ||
#elif __has_include(<experimental/filesystem>) | ||
#include <experimental/filesystem> | ||
namespace fs = std::experimental::filesystem; | ||
#else | ||
error "Missing the <filesystem> header." | ||
#endif | ||
|
||
/** | ||
* @brief Class to write to Apache Arrow formats | ||
* | ||
* This class provides methods for writing to the Arrow IPC and Parquet formats. | ||
* | ||
*/ | ||
class ArrowOutputStream { | ||
|
||
private: | ||
|
||
std::string arrow_file_path_ = ""; | ||
std::shared_ptr<ApacheArrowWriter> writer_ = nullptr; | ||
std::string arrow_output_type_ = ""; | ||
std::shared_ptr<arrow::Table> arrow_table_ = nullptr; | ||
|
||
public: | ||
std::shared_ptr<ApacheArrowWriter> create_arrow_file(const std::string& arrow_file_type, | ||
const std::string& arrow_file_path, | ||
const std::vector<std::string>& header); | ||
std::shared_ptr<arrow::Table> get_arrow_table(const std::string& file_path, arrow::Status& table_status); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function call should change in a subsequent PR not to pass a ref to retrieve an error/status code. Instead the caller code will check for |
||
std::string get_arrow_path(); | ||
}; | ||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,7 +32,7 @@ namespace Nyxus | |
|
||
bool scanFilePairParallel(const std::string& intens_fpath, const std::string& label_fpath, int num_fastloader_threads, int num_sensemaker_threads, int filepair_index, int tot_num_filepairs); | ||
std::string getPureFname(const std::string& fpath); | ||
int processDataset(const std::vector<std::string>& intensFiles, const std::vector<std::string>& labelFiles, int numFastloaderThreads, int numSensemakerThreads, int numReduceThreads, int min_online_roi_size, bool save2csv, const std::string& csvOutputDir); | ||
int processDataset(const std::vector<std::string>& intensFiles, const std::vector<std::string>& labelFiles, int numFastloaderThreads, int numSensemakerThreads, int numReduceThreads, int min_online_roi_size, bool save2csv, bool arrow_output, const std::string& csvOutputDir); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wouldn't overloading processDataset() for the separate CSV and Apache Arrow output cases be less confusing than having a single over-parameterized processDataset()? |
||
bool gatherRoisMetrics(const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads); | ||
bool processTrivialRois (const std::vector<int>& trivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads, size_t memory_limit); | ||
bool processNontrivialRois (const std::vector<int>& nontrivRoiLabels, const std::string& intens_fpath, const std::string& label_fpath, int num_FL_threads); | ||
|
@@ -46,7 +46,7 @@ namespace Nyxus | |
bool gatherRoisMetricsInMemory (const py::array_t<unsigned int, py::array::c_style | py::array::forcecast>& intens_image, const py::array_t<unsigned int, py::array::c_style | py::array::forcecast>& label_image, int start_idx); | ||
bool processIntSegImagePairInMemory (const std::string& intens_fpath, const std::string& label_fpath, int filepair_index, const std::string& intens_name, const std::string& seg_name); | ||
int processMontage(const py::array_t<unsigned int, py::array::c_style | py::array::forcecast>& intensFiles, const py::array_t<unsigned int, py::array::c_style | py::array::forcecast>& labelFiles, int numReduceThreads, const std::vector<std::string>& intensity_names, | ||
const std::vector<std::string>& seg_names, std::string& error_message); | ||
const std::vector<std::string>& seg_names, std::string& error_message, bool arrow_output=false, const std::string& outputDir=""); | ||
bool scanTrivialRois (const std::vector<int>& batch_labels, const py::array_t<unsigned int, py::array::c_style | py::array::forcecast>& intens_images, const py::array_t<unsigned int, py::array::c_style | py::array::forcecast>& label_images, int start_idx); | ||
bool processTrivialRoisInMemory (const std::vector<int>& trivRoiLabels, const py::array_t<unsigned int, py::array::c_style | py::array::forcecast>& intens_fpath, const py::array_t<unsigned int, py::array::c_style | py::array::forcecast>& label_fpath, int start_idx, size_t memory_limit); | ||
#endif | ||
|
@@ -55,7 +55,12 @@ namespace Nyxus | |
std::string get_feature_output_fname(const std::string& intFpath, const std::string& segFpath); | ||
extern const std::vector<std::string> mandatory_output_columns; | ||
bool save_features_2_csv (const std::string & intFpath, const std::string & segFpath, const std::string & outputDir); | ||
bool save_features_2_buffer (ResultsCache& results_cache); | ||
bool save_features_2_buffer (ResultsCache& results_cache); | ||
|
||
std::vector<std::tuple<std::vector<std::string>, int, std::vector<double>>> get_feature_values(); | ||
std::vector<std::string> get_header(const std::vector<std::tuple<std::string, AvailableFeatures>>& F ); | ||
|
||
|
||
|
||
void init_feature_buffers(); | ||
void clear_feature_buffers(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
#include "dirs_and_files.h" | ||
#include "environment.h" | ||
#include "globals.h" | ||
|
||
#include "arrow_output_stream.h" | ||
#ifdef USE_GPU | ||
bool gpu_initialize(int dev_id); | ||
#endif | ||
|
@@ -59,6 +59,13 @@ int main (int argc, char** argv) | |
auto startTS = getTimeStr(); | ||
VERBOSLVL1(std::cout << "\n>>> STARTING >>> " << startTS << "\n";) | ||
|
||
|
||
bool use_arrow = false; | ||
|
||
#ifdef USE_ARROW | ||
use_arrow = theEnvironment.arrow_output_type == "ARROW" || theEnvironment.arrow_output_type == "PARQUET"; | ||
#endif | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This piece of logic is OK, but it is somewhat inelegant to put such low-level logic in main(). Again, if we overload processDataset() as mentioned earlier, we would be able to call it separately with Apache-related and CSV parameters. |
||
// Process the image data | ||
int min_online_roi_size = 0; | ||
errorCode = processDataset ( | ||
|
@@ -68,7 +75,8 @@ int main (int argc, char** argv) | |
theEnvironment.n_pixel_scan_threads, | ||
theEnvironment.n_reduce_threads, | ||
min_online_roi_size, | ||
theEnvironment.useCsv, // 'true' to save to csv | ||
use_arrow, | ||
theEnvironment.useCsv, | ||
theEnvironment.output_dir); | ||
|
||
// Report feature extraction error, if any | ||
|
@@ -90,25 +98,6 @@ int main (int argc, char** argv) | |
break; | ||
} | ||
|
||
// Save features in Apache formats, if enabled | ||
#ifdef USE_ARROW | ||
|
||
if (theEnvironment.arrow_output_type == "ARROW" || theEnvironment.arrow_output_type == "ARROWIPC") | ||
theEnvironment.arrow_output.create_arrow_file(theResultsCache.get_headerBuf(), | ||
theResultsCache.get_stringColBuf(), | ||
theResultsCache.get_calcResultBuf(), | ||
theResultsCache.get_num_rows(), | ||
theEnvironment.output_dir); | ||
|
||
else | ||
if (theEnvironment.arrow_output_type == "PARQUET") | ||
theEnvironment.arrow_output.create_parquet_file(theResultsCache.get_headerBuf(), | ||
theResultsCache.get_stringColBuf(), | ||
theResultsCache.get_calcResultBuf(), | ||
theResultsCache.get_num_rows(), | ||
theEnvironment.output_dir); | ||
#endif | ||
|
||
// Process nested ROIs | ||
if (theEnvironment.nestedOptions.defined()) | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How can a user specify, via arrow_file_type_upper, that the desired format is ".feather"?