From a594b0904724d3eadd1b988c5ee32acae1a0091e Mon Sep 17 00:00:00 2001 From: "Wang, Fuyuan (NIH/NCI) [C]" Date: Mon, 25 Sep 2023 10:15:46 -0400 Subject: [PATCH] add functionality to copy the static files to the transformed data folder and upload them to the S3 bucket with the transformed data --- config/gmb_config.yaml.j2 | 9 +++++---- config/gmb_config_example.yaml | 2 +- gmb_config.yaml.j2 | 1 + gmb_transformation.py | 15 +++++++++++++++ 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/config/gmb_config.yaml.j2 b/config/gmb_config.yaml.j2 index 69823d1..179aee7 100644 --- a/config/gmb_config.yaml.j2 +++ b/config/gmb_config.yaml.j2 @@ -1,8 +1,9 @@ USERNAME: {{rave_user}} PASSWORD: {{rave_password}} API: https://ccredc.mdsol.com/RaveWebServices/studies/000048({{rave_env}})/datasets/regular -OUTPUT_FOLDER: ./raw_data/ -OUTPUT_NODE_FOLDER: ./transformed_data/ -NODE_FILE: ./node_file/000048_Model.yml +OUTPUT_FOLDER_RAW: ./gmb_raw_data_files/ +OUTPUT_FOLDER_TRANSFORMED: ./gmb_transformed_data_files/ +DATA_MODEL_NODE_FILE: ./node_file/000048_Model.yml +S3_BUCKET: cloudone-gmb-nonprod-metadata RAVE_DATA_VERSION: {{rave_data_version}} -S3_BUCKET: {{bucket_name}} +STATIC_FILES: static-files/ \ No newline at end of file diff --git a/config/gmb_config_example.yaml b/config/gmb_config_example.yaml index ec62b2e..ea49d4a 100644 --- a/config/gmb_config_example.yaml +++ b/config/gmb_config_example.yaml @@ -8,4 +8,4 @@ S3_BUCKET: S3_BUCKET_NAME RAVE_DATA_VERSION: 1147 DATA_LOADER: /Documents/icdc-dataloader-master/loader.py DATA_LOADER_CONFIG: gmb-local.yml -STATIC_FILES: ./static-files/ \ No newline at end of file +STATIC_FILES: static-files/ \ No newline at end of file diff --git a/gmb_config.yaml.j2 b/gmb_config.yaml.j2 index b8860a0..179aee7 100644 --- a/gmb_config.yaml.j2 +++ b/gmb_config.yaml.j2 @@ -6,3 +6,4 @@ OUTPUT_FOLDER_TRANSFORMED: ./gmb_transformed_data_files/ DATA_MODEL_NODE_FILE: ./node_file/000048_Model.yml S3_BUCKET: cloudone-gmb-nonprod-metadata 
RAVE_DATA_VERSION: {{rave_data_version}} +STATIC_FILES: static-files/ \ No newline at end of file diff --git a/gmb_transformation.py b/gmb_transformation.py index f76838b..a4d6e77 100644 --- a/gmb_transformation.py +++ b/gmb_transformation.py @@ -43,6 +43,11 @@ def upload_files(self, s3): file_directory = self.config['OUTPUT_FOLDER_TRANSFORMED'] + file_name s3_file_directory = 'Transformed' + '/' + timestamp + '/' + file_name s3.upload_file(file_directory ,self.config['S3_BUCKET'], s3_file_directory) + for file_name in os.listdir(self.config['STATIC_FILES']): + if file_name.endswith('.tsv'): + file_directory = self.config['STATIC_FILES'] + file_name + s3_file_directory = 'Transformed' + '/' + timestamp + '/' + file_name + s3.upload_file(file_directory ,self.config['S3_BUCKET'], s3_file_directory) subfolder = 's3://' + self.config['S3_BUCKET'] + '/' + 'Transformed' + '/' + timestamp self.log.info(f'Data files upload to {subfolder}') @@ -183,6 +188,16 @@ def transform(self): else: self.log.info(f'{file_name[0]} is not in the node file') + #download the static files before upload + for key in s3.list_objects(Bucket = self.config['S3_BUCKET'], Prefix = self.config['STATIC_FILES'])['Contents']: + if key['Key'].endswith(".tsv"): + if not os.path.exists(self.config['STATIC_FILES']): + # If the path does not exist, then create the folder + os.mkdir(self.config['STATIC_FILES']) + static_file_name = key['Key'].split('/') + static_file_key = self.config['STATIC_FILES'] + static_file_name[1] + s3.download_file(self.config['S3_BUCKET'], key['Key'], static_file_key) + self.upload_files(s3)