Skip to content

Commit

Permalink
Refactoring of File Reader classes to accommodate for AWS SDK S3 inte…
Browse files Browse the repository at this point in the history
…gration (NVIDIA#5434)

Signed-off-by: Joaquin Anton <[email protected]>
  • Loading branch information
jantonguirao authored Apr 19, 2024
1 parent 22864d9 commit 35fdbd9
Show file tree
Hide file tree
Showing 46 changed files with 997 additions and 481 deletions.
4 changes: 2 additions & 2 deletions dali/kernels/slice/slice_flip_normalize_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -79,7 +79,7 @@ class SliceFlipNormalizeGPUTest : public ::testing::Test {
}

void LoadTensor(Tensor<CPUBackend> &tensor, const std::string& path_npy) {
auto stream = FileStream::Open(path_npy, false, false);
auto stream = FileStream::Open(path_npy);
tensor = ::dali::numpy::ReadTensor(stream.get(), true);
}

Expand Down
2 changes: 1 addition & 1 deletion dali/operators/imgcodec/decoder_test_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ inline Tensor<CPUBackend> ReadReference(InputStream *src, TensorLayout layout =
*/
inline Tensor<CPUBackend> ReadReferenceFrom(const std::string &reference_path,
TensorLayout layout = "HWC") {
auto src = FileStream::Open(reference_path, false, false);
auto src = FileStream::Open(reference_path);
return ReadReference(src.get(), layout);
}

Expand Down
6 changes: 5 additions & 1 deletion dali/operators/reader/file_reader_op.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -114,6 +114,10 @@ list of files in the sub-directories of the ``file_root``.
This argument is ignored when file paths are taken from ``file_list`` or ``files``.)",
kKnownExtensionsGlob)
.AddOptionalArg<std::vector<string>>("dir_filters", R"(A list of glob strings to filter the
list of sub-directories under ``file_root``.
This argument is ignored when file paths are taken from ``file_list`` or ``files``.)", nullptr)
.AddOptionalArg<bool>("case_sensitive_filter", R"(If set to True, the filter will be matched
case-sensitively, otherwise case-insensitively.)", false)
.AddParent("LoaderBase");
Expand Down
6 changes: 4 additions & 2 deletions dali/operators/reader/loader/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,7 @@ collect_headers(DALI_INST_HDRS PARENT_SCOPE)

set(DALI_OPERATOR_SRCS ${DALI_OPERATOR_SRCS}
"${CMAKE_CURRENT_SOURCE_DIR}/filesystem.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/discover_files.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/file_label_loader.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/coco_loader.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/loader.cc"
Expand Down Expand Up @@ -57,7 +58,8 @@ endif()
set(DALI_OPERATOR_TEST_SRCS ${DALI_OPERATOR_TEST_SRCS}
"${CMAKE_CURRENT_SOURCE_DIR}/loader_test.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/sequence_loader_test.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/filesystem_test.cc")
"${CMAKE_CURRENT_SOURCE_DIR}/filesystem_test.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/discover_files_test.cc")

if (BUILD_LIBSND)
set(DALI_OPERATOR_TEST_SRCS ${DALI_OPERATOR_TEST_SRCS}
Expand Down
28 changes: 14 additions & 14 deletions dali/operators/reader/loader/coco_loader.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -141,13 +141,13 @@ void SaveToFile(const std::vector<std::vector<T> > &input, const std::string pat
}

template <>
void SaveToFile(const ImageIdPairs &image_id_pairs, const std::string path) {
if (image_id_pairs.empty())
void SaveToFile(const std::vector<FileLabelEntry> &entries, const std::string path) {
if (entries.empty())
return;
std::ofstream file(path);
DALI_ENFORCE(file, "CocoReader meta file error while saving: " + path);
for (const auto &p : image_id_pairs) {
file << p.first << std::endl;
for (const auto &p : entries) {
file << p.filename << std::endl;
}
DALI_ENFORCE(file.good(), make_string("Error writing to path: ", path));
}
Expand Down Expand Up @@ -203,16 +203,16 @@ void LoadFromFile(std::vector<std::vector<T> > &output, const std::string path)
}

template <>
void LoadFromFile(ImageIdPairs &image_id_pairs, const std::string path) {
void LoadFromFile(std::vector<FileLabelEntry> &entries, const std::string path) {
std::ifstream file(path);
image_id_pairs.clear();
entries.clear();
if (!file.good())
return;

int id = 0;
std::string filename;
while (file >> filename) {
image_id_pairs.emplace_back(std::move(filename), int{id});
entries.push_back({std::move(filename), id});
++id;
}
}
Expand Down Expand Up @@ -417,14 +417,14 @@ void ParseJsonFile(const OpSpec &spec, std::vector<detail::ImageInfo> &image_inf

} // namespace detail

void CocoLoader::SavePreprocessedAnnotations(const std::string &path,
const ImageIdPairs &image_id_pairs) {
void CocoLoader::SavePreprocessedAnnotations(
const std::string &path, const std::vector<FileLabelEntry> &entries) {
using detail::SaveToFile;
SaveToFile(offsets_, path + "/offsets.dat");
SaveToFile(boxes_, path + "/boxes.dat");
SaveToFile(labels_, path + "/labels.dat");
SaveToFile(counts_, path + "/counts.dat");
SaveToFile(image_id_pairs, path + "/filenames.dat");
SaveToFile(entries, path + "/filenames.dat");

if (output_polygon_masks_ || output_pixelwise_masks_) {
SaveToFile(polygon_data_, path + "/polygon_data.dat");
Expand Down Expand Up @@ -459,7 +459,7 @@ void CocoLoader::ParsePreprocessedAnnotations() {
LoadFromFile(boxes_, path + "/boxes.dat");
LoadFromFile(labels_, path + "/labels.dat");
LoadFromFile(counts_, path + "/counts.dat");
LoadFromFile(image_label_pairs_, path + "/filenames.dat");
LoadFromFile(file_label_entries_, path + "/filenames.dat");

if (output_polygon_masks_ || output_pixelwise_masks_) {
LoadFromFile(polygon_data_, path + "/polygon_data.dat");
Expand Down Expand Up @@ -628,7 +628,7 @@ void CocoLoader::ParseJsonAnnotations() {
}
}

image_label_pairs_.emplace_back(std::move(image_info.filename_), new_image_id);
file_label_entries_.push_back({std::move(image_info.filename_), new_image_id});
new_image_id++;
}
}
Expand All @@ -639,7 +639,7 @@ void CocoLoader::ParseJsonAnnotations() {
if (spec_.GetArgument<bool>("save_preprocessed_annotations")) {
SavePreprocessedAnnotations(
spec_.GetArgument<std::string>("save_preprocessed_annotations_dir"),
image_label_pairs_);
file_label_entries_);
}
}

Expand Down
11 changes: 5 additions & 6 deletions dali/operators/reader/loader/coco_loader.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -34,8 +34,6 @@ extern "C" {

namespace dali {

using ImageIdPairs = std::vector<std::pair<std::string, int>>;

inline bool OutPolygonMasksEnabled(const OpSpec &spec) {
return spec.GetArgument<bool>("polygon_masks") ||
(spec.HasArgument("masks") && spec.GetArgument<bool>("masks"));
Expand Down Expand Up @@ -189,12 +187,12 @@ class DLL_PUBLIC CocoLoader : public FileLabelLoaderBase<true> {
// seeded with hardcoded value to get
// the same sequence on every shard
std::mt19937 g(kDaliDataloaderSeed);
std::shuffle(image_label_pairs_.begin(), image_label_pairs_.end(), g);
std::shuffle(file_label_entries_.begin(), file_label_entries_.end(), g);
}

if (IsCheckpointingEnabled() && shuffle_after_epoch_) {
// save initial order
backup_image_label_pairs_ = image_label_pairs_;
backup_file_label_entries_ = file_label_entries_;
}
Reset(true);
}
Expand All @@ -203,7 +201,8 @@ class DLL_PUBLIC CocoLoader : public FileLabelLoaderBase<true> {

void ParseJsonAnnotations();

void SavePreprocessedAnnotations(const std::string &path, const ImageIdPairs &image_id_pairs);
void SavePreprocessedAnnotations(
const std::string &path, const std::vector<FileLabelEntry> &image_id_pairs);

private:
const OpSpec spec_;
Expand Down
5 changes: 2 additions & 3 deletions dali/operators/reader/loader/cufile_loader.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,8 +38,7 @@ namespace dali {
template <typename Target>
class CUFileLoader : public FileLoader<GPUBackend, Target, CUFileStream> {
public:
explicit CUFileLoader(const OpSpec& spec, vector<std::string> images = {},
bool shuffle_after_epoch = false)
CUFileLoader(const OpSpec& spec, bool shuffle_after_epoch)
: FileLoader<GPUBackend, Target, CUFileStream>(spec, shuffle_after_epoch) {
}

Expand Down
142 changes: 142 additions & 0 deletions dali/operators/reader/loader/discover_files.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/operators/reader/loader/discover_files.h"
#include <dirent.h>
#include <errno.h>
#include <fnmatch.h>
#include <glob.h>
#include <sys/stat.h>
#include <algorithm>
#include <cstring>
#include <filesystem>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "dali/core/call_at_exit.h"
#include "dali/core/error_handling.h"
#include "dali/operators/reader/loader/filesystem.h"
#include "dali/operators/reader/loader/utils.h"

namespace dali {

/**
 * @brief Lists the names of the immediate subdirectories of `parent_dir`, sorted alphabetically.
 *
 * The scan is not recursive; the "." and ".." entries are never reported.
 *
 * @param parent_dir            Directory to scan.
 * @param dir_filters           Glob patterns (fnmatch syntax). When empty, every subdirectory is
 *                              returned; otherwise only those whose name matches at least one
 *                              pattern.
 * @param case_sensitive_filter When false, patterns are matched with FNM_CASEFOLD.
 * @return Subdirectory names relative to `parent_dir`, each listed exactly once, sorted.
 *
 * DALI_ENFORCE fires when `parent_dir` cannot be opened or when an entry cannot be stat'ed.
 */
std::vector<std::string> list_subdirectories(const std::string &parent_dir,
                                             const std::vector<std::string> dir_filters = {},
                                             bool case_sensitive_filter = true) {
  // open the root
  DIR *dir = opendir(parent_dir.c_str());
  DALI_ENFORCE(dir != nullptr, make_string("Failed to open ", parent_dir));
  auto cleanup = AtScopeExit([&dir] {
    closedir(dir);
  });

  const int fnmatch_flags = case_sensitive_filter ? 0 : FNM_CASEFOLD;
  std::vector<std::string> subdirs;

  struct dirent *entry;
  while ((entry = readdir(dir))) {
    // skip the self/parent entries up front, there is no point in stat-ing them
    if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
      continue;

    std::string entry_name(entry->d_name);
    std::string full_path = filesystem::join_path(parent_dir, entry_name);
    struct stat s;
    int ret = stat(full_path.c_str(), &s);
    DALI_ENFORCE(ret == 0, "Could not access " + full_path + " during directory traversal.");
    if (!S_ISDIR(s.st_mode))
      continue;

    // A directory is accepted at most once, even if several filters match it. Adding it once
    // per matching filter would produce duplicated entries in the returned list.
    bool accepted = dir_filters.empty() ||
                    std::any_of(dir_filters.begin(), dir_filters.end(),
                                [&](const std::string &filter) {
                                  return fnmatch(filter.c_str(), entry_name.c_str(),
                                                 fnmatch_flags) == 0;
                                });
    if (accepted)
      subdirs.push_back(std::move(entry_name));
  }
  // sort directories to preserve class alphabetic order, as readdir could
  // return unordered dir list. Otherwise file reader for training and validation
  // could return directories with the same names in completely different order
  std::sort(subdirs.begin(), subdirs.end());
  return subdirs;
}

/**
 * @brief Lists the names of the entries of `parent_dir` that match any of the given globs.
 *
 * The scan is not recursive. When the platform reports entry types (`_DIRENT_HAVE_D_TYPE`),
 * entries that are known to be something other than a regular file or a symlink are skipped.
 * An entry is kept only if its name matches at least one pattern, so an empty `filters`
 * list yields an empty result.
 *
 * @param parent_dir            Directory to scan.
 * @param filters               Glob patterns (fnmatch syntax) applied to the entry name.
 * @param case_sensitive_filter When false, patterns are matched with FNM_CASEFOLD.
 * @return Matching entry names relative to `parent_dir`, sorted.
 */
std::vector<std::string> list_files(const std::string &parent_dir,
                                    const std::vector<std::string> filters = {},
                                    bool case_sensitive_filter = true) {
  DIR *dir = opendir(parent_dir.c_str());
  DALI_ENFORCE(dir != nullptr, make_string("Failed to open ", parent_dir));
  auto close_dir_guard = AtScopeExit([&dir] {
    closedir(dir);
  });

  const int match_flags = case_sensitive_filter ? 0 : FNM_CASEFOLD;
  std::vector<std::string> matched;

  for (dirent *item = readdir(dir); item != nullptr; item = readdir(dir)) {
#ifdef _DIRENT_HAVE_D_TYPE
    // Only regular files and symlinks are supported. DT_UNKNOWN carries no information
    // about the entry, so such entries are judged by their name alone.
    const bool maybe_file = item->d_type == DT_REG || item->d_type == DT_LNK ||
                            item->d_type == DT_UNKNOWN;
    if (!maybe_file)
      continue;
#endif
    std::string name(item->d_name);
    const bool matches = std::any_of(filters.begin(), filters.end(),
                                     [&](const std::string &pattern) {
                                       return fnmatch(pattern.c_str(), name.c_str(),
                                                      match_flags) == 0;
                                     });
    if (matches)
      matched.push_back(name);
  }

  std::sort(matched.begin(), matched.end());
  return matched;
}

/**
 * @brief Collects the files located under `file_root` and, optionally, in its subdirectories.
 *
 * Subdirectories are selected with `opts.dir_filters` and files with `opts.file_filters`;
 * `opts.case_sensitive_filter` applies to both.
 *
 * - When `opts.label_from_subdir` is set, only the subdirectories are visited and each file
 *   gets the index of its subdirectory (in sorted order) as the label.
 * - Otherwise `file_root` itself is visited first, followed by the subdirectories, and the
 *   entries carry no label.
 *
 * @param file_root Root directory of the dataset. `s3://` locations are rejected with DALI_FAIL.
 * @param opts      File discovery options.
 * @return One entry per discovered file, with the path expressed relative to `file_root`.
 */
std::vector<FileLabelEntry> discover_files(const std::string &file_root,
                                           const FileDiscoveryOptions &opts) {
  bool is_s3 = starts_with(file_root, "s3://");
  if (is_s3) {
    DALI_FAIL("This version of DALI was not built with AWS S3 storage support.");
  }

  std::vector<std::string> subdirs =
      list_subdirectories(file_root, opts.dir_filters, opts.case_sensitive_filter);
  std::vector<FileLabelEntry> entries;
  auto process_dir = [&](const std::string &rel_dirpath, std::optional<int> label = {}) {
    auto full_dirpath = filesystem::join_path(file_root, rel_dirpath);
    auto tmp_files = list_files(full_dirpath, opts.file_filters, opts.case_sensitive_filter);
    for (const auto &f : tmp_files) {
      entries.push_back({filesystem::join_path(rel_dirpath, f), label});
    }
  };

  // if we are in "label_from_subdir" mode, we need a subdir to infer the label, therefore we don't
  // visit the current directory
  if (!opts.label_from_subdir) {
    process_dir(".");
  }
  for (size_t dir_idx = 0; dir_idx < subdirs.size(); ++dir_idx) {
    process_dir(subdirs[dir_idx],
                opts.label_from_subdir ? std::optional<int>{static_cast<int>(dir_idx)}
                                       : std::nullopt);
  }
  size_t total_dir_count = opts.label_from_subdir ? subdirs.size() : subdirs.size() + 1;
  // note the leading space in " directories" - without it the count and the word are glued
  LOG_LINE << "read " << entries.size() << " files from " << total_dir_count << " directories\n";
  return entries;
}

} // namespace dali
Loading

0 comments on commit 35fdbd9

Please sign in to comment.