[ETL-659] Snowflake git integration #123

Merged
merged 8 commits into from
Jul 15, 2024
50 changes: 50 additions & 0 deletions .github/workflows/upload-and-deploy-to-prod-main.yaml
@@ -88,6 +88,56 @@ jobs:
Payload: '{"RequestType": "Create"}'
LogType: Tail

deploy-snowflake-main:
name: Deploy Snowflake resources
needs: sceptre-deploy-main
runs-on: ubuntu-latest
env:
PRIVATE_KEY_PASSPHRASE: ${{ secrets.SNOWFLAKE_PRIVATE_KEY_PASSPHRASE }}
steps:
- uses: actions/checkout@v3

- name: Configure Snowflake connection
run: |
# Create temporary files for config.toml and our private key
config_file=$(mktemp)
private_key_file=$(mktemp)

# Write to the private key file
echo "${{ secrets.SNOWFLAKE_PRIVATE_KEY }}" > $private_key_file

# Write to config.toml file
echo 'default_connection_name = "recover"' >> $config_file
echo '[connections.recover]' >> $config_file
echo "account = \"${{ vars.SNOWFLAKE_ACCOUNT }}\"" >> $config_file
echo "user = \"${{ vars.SNOWFLAKE_USER }}\"" >> $config_file
echo "role = \"${{ vars.SNOWFLAKE_ROLE }}\"" >> $config_file
echo 'authenticator = "SNOWFLAKE_JWT"' >> $config_file
echo "private_key_path = \"$private_key_file\"" >> $config_file

# Write config.toml path to global environment
echo "SNOWFLAKE_CONFIG_PATH=$config_file" >> $GITHUB_ENV

- name: Configuration file information
run: |
echo "Snowflake configuration is located at $SNOWFLAKE_CONFIG_PATH"
cat $SNOWFLAKE_CONFIG_PATH

- name: Install Snowflake CLI
uses: Snowflake-Labs/snowflake-cli-action@v1
with:
default-config-file-path: ${{ env.SNOWFLAKE_CONFIG_PATH }}

- name: Test Snowflake connection
run: |
snow --version
snow connection test

- name: Deploy Snowflake objects
run: |
snow sql \
-D "environment=main" \
-f snowflake/objects/deploy.sql

sts-access-test:
name: Runs STS access tests on prod synapse folders
56 changes: 52 additions & 4 deletions .github/workflows/upload-and-deploy.yaml
@@ -26,6 +26,58 @@ jobs:
- uses: actions/setup-python@v4
- uses: pre-commit/[email protected]

deploy-snowflake:
name: Deploy Snowflake resources
needs: pre-commit
runs-on: ubuntu-latest
env:
PRIVATE_KEY_PASSPHRASE: ${{ secrets.SNOWFLAKE_PRIVATE_KEY_PASSPHRASE }}
SNOWFLAKE_ENVIRONMENT: ${{ github.ref_name == 'main' && 'staging' || github.ref_name }}
steps:
- uses: actions/checkout@v3

- name: Configure Snowflake connection
run: |
# Create temporary files for config.toml and our private key
config_file=$(mktemp)
private_key_file=$(mktemp)

# Write to the private key file
echo "${{ secrets.SNOWFLAKE_PRIVATE_KEY }}" > $private_key_file

# Write to config.toml file
echo 'default_connection_name = "recover"' >> $config_file
echo '[connections.recover]' >> $config_file
echo "account = \"${{ vars.SNOWFLAKE_ACCOUNT }}\"" >> $config_file
echo "user = \"${{ vars.SNOWFLAKE_USER }}\"" >> $config_file
echo "role = \"${{ vars.SNOWFLAKE_ROLE }}\"" >> $config_file
echo 'authenticator = "SNOWFLAKE_JWT"' >> $config_file
echo "private_key_path = \"$private_key_file\"" >> $config_file

# Write config.toml path to global environment
echo "SNOWFLAKE_CONFIG_PATH=$config_file" >> $GITHUB_ENV

- name: Configuration file information
run: |
echo "Snowflake configuration is located at $SNOWFLAKE_CONFIG_PATH"
cat $SNOWFLAKE_CONFIG_PATH

- name: Install Snowflake CLI
uses: Snowflake-Labs/snowflake-cli-action@v1
with:
default-config-file-path: ${{ env.SNOWFLAKE_CONFIG_PATH }}

- name: Test Snowflake connection
run: |
snow --version
snow connection test

- name: Deploy Snowflake objects
run: |
snow sql \
-D "environment=$SNOWFLAKE_ENVIRONMENT" \
-f snowflake/objects/deploy.sql

upload-files:
name: Upload files to S3 bucket in development
runs-on: ubuntu-latest
@@ -100,7 +152,6 @@ jobs:
--test-sts-permission read_write
-v


pytest-docker:
name: Build and push testing docker images to the pytest ECR repository.
needs: pre-commit
@@ -156,7 +207,6 @@ jobs:
ecr-username: ${{ steps.login-ecr.outputs[steps.ecr.outputs.username-key] }}
ecr-password: ${{ steps.login-ecr.outputs[steps.ecr.outputs.password-key] }}


glue-unit-tests:
name: Run Pytest unit tests for AWS glue
needs: pytest-docker
@@ -207,7 +257,6 @@ jobs:
su - glue_user --command "cd $GITHUB_WORKSPACE &&
python3 -m pytest tests/test_json_to_parquet.py --namespace $NAMESPACE -v"


sceptre-deploy-develop:
name: Deploys branch using sceptre
runs-on: ubuntu-latest
@@ -291,7 +340,6 @@ jobs:
--bucket $DEV_INTERMEDIATE_BUCKET
--bucket_prefix "${{ env.NAMESPACE }}/json/"


integration-test-develop:
name: Triggers ETL workflow with S3 test files
runs-on: ubuntu-latest
50 changes: 50 additions & 0 deletions snowflake/README.md
@@ -0,0 +1,50 @@
# Snowflake

We use Snowflake to interact with RECOVER data. Snowflake is a [DBMS](https://en.wikipedia.org/wiki/Database) with managed infrastructure that lets us work with the data more easily and quickly than with AWS Glue. For general information about Snowflake, see Snowflake's [Key Concepts and Architecture](https://docs.snowflake.com/en/user-guide/intro-key-concepts).

## Deployment
We deploy Snowflake objects as part of our CI/CD process. When a commit is pushed to a branch, all objects under the `objects` folder are automatically deployed to a database prefixed with the branch name, e.g., `RECOVER_MY_BRANCH_NAME`. Each deployment happens within its own isolated environment, so deployments do not interfere with one another and maintain their own independent configurations and resources. An environment is analogous to a database. That said, account-level objects are reused by each deployment. For more information, see [our deployment entrypoint](objects/deploy.sql).
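As a minimal sketch of the naming scheme above (assuming the `RECOVER_` prefix and the main-to-staging mapping used in the CI workflows; the branch name here is an example value):

```shell
#!/bin/sh
# Sketch: how a branch name maps to a deployment database name.
# Assumes the RECOVER_ prefix and the main -> staging mapping from the CI workflows.
branch="my-branch-name"

# On pushes to main, the upload-and-deploy workflow deploys to "staging" instead.
if [ "$branch" = "main" ]; then
  environment="staging"
else
  environment="$branch"
fi

# Uppercase the name and replace hyphens, which are not valid in
# unquoted Snowflake identifiers, with underscores.
db_name="RECOVER_$(printf '%s' "$environment" | tr 'a-z' 'A-Z' | tr '-' '_')"
echo "$db_name"
```

For example, a push to `my-branch-name` would deploy to `RECOVER_MY_BRANCH_NAME`, while a push to `main` would deploy to `RECOVER_STAGING`.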

### Deployment logic and object definitions
Deployment logic and object [DDL](https://en.wikipedia.org/wiki/Data_definition_language) is organized as a hierarchy:
```
snowflake/objects
└── database
└── recover
└── schema
└── parquet
├── file_format
├── stage
└── table
```

Every level in this hierarchy has a `deploy.sql` which will deploy all child objects with respect to the current directory.
```
snowflake/objects
├── database
│   ├── deploy.sql
│   └── recover
│   ├── deploy.sql
│   └── schema
│   ├── deploy.sql
│   └── parquet
│   ├── deploy.sql
│   ├── file_format
│   │   ├── deploy.sql
│   │   └── parquet_format.sql
│   ├── stage
│   │   ├── deploy.sql
│   │   └── parquet_s3.sql
│   └── table
│   ├── deploy.sql
│   ├── enrolledparticipants_customfields_symptoms_parquet.sql
│   ├── enrolledparticipants_customfields_treatments_parquet.sql
│   ├── enrolledparticipants_parquet.sql
│   ├── fitbitactivitylogs_parquet.sql
│   ├── fitbitdailydata_parquet.sql
│   ├── ...
└── deploy.sql
```
For example, the file located at `snowflake/objects/database/recover/deploy.sql` will deploy all objects under the `RECOVER` database, and `snowflake/objects/database/recover/schema/parquet/deploy.sql` will deploy the file formats, stages, and tables under the `PARQUET` schema.

The child objects, that is, the schemas, file formats, stages, and tables (anything that is not a database) are defined so as to be agnostic to their parent context. For example, `snowflake/objects/database/recover/schema/parquet/deploy.sql` will deploy the `PARQUET` schema and all its child objects to whichever database your Snowflake session is currently using. Nothing in the SQL restricts the `PARQUET` schema to the `RECOVER` database. Likewise, the tables in `snowflake/objects/database/recover/schema/parquet/table/` can be deployed to any schema, although their DDL is specific to the columns in our Parquet datasets.
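The same entrypoint the CI jobs use can in principle be invoked locally. The sketch below (assuming the Snowflake CLI is installed and a connection is configured as in the workflows) mirrors the "Deploy Snowflake objects" step, but only constructs and prints the command rather than executing it:

```shell
#!/bin/sh
# Sketch: invoking the deployment entrypoint locally, mirroring the
# "Deploy Snowflake objects" CI step. The environment name is an example value.
environment="main"

deploy_cmd="snow sql -D \"environment=$environment\" -f snowflake/objects/deploy.sql"

# Print the command instead of running it; uncomment the eval to deploy for
# real (requires the Snowflake CLI and a configured connection).
echo "Would run: $deploy_cmd"
# eval "$deploy_cmd"
```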
17 changes: 17 additions & 0 deletions snowflake/objects/database/deploy.sql
@@ -0,0 +1,17 @@
/*
The current maximum allowed execution depth of EXECUTE IMMEDIATE FROM
statements is 5. Unfortunately, that makes a call stack which looks like:

depth
0 deploy.sql =>
1 databases/deploy.sql =>
2 databases/recover/deploy.sql =>
3 databases/recover/schemas/deploy.sql =>
4 databases/recover/schemas/parquet/deploy.sql =>
5 databases/recover/schemas/parquet/tables/deploy.sql =>
6 databases/recover/schemas/parquet/tables/enrolled_participants.sql

not possible. To circumvent this issue, we omit the highest level of
abstraction (databases/deploy.sql) and instead EXECUTE IMMEDIATE FROM
each database deployment individually in the primary deployment script
(deploy.sql).
*/
12 changes: 12 additions & 0 deletions snowflake/objects/database/recover/deploy.sql
@@ -0,0 +1,12 @@
/*
Create a recover database (if it doesn't yet exist) for an environment and
deploy all child objects.
*/
CREATE DATABASE IF NOT EXISTS recover_{{ environment }};
USE DATABASE recover_{{ environment }};

EXECUTE IMMEDIATE
FROM './schema/deploy.sql'
USING (
git_branch => '{{ git_branch }}'
);
8 changes: 8 additions & 0 deletions snowflake/objects/database/recover/schema/deploy.sql
@@ -0,0 +1,8 @@
/*
Deploy schemas and their child objects.
*/
EXECUTE IMMEDIATE
FROM './parquet/deploy.sql'
USING (
git_branch => '{{ git_branch }}'
);
22 changes: 22 additions & 0 deletions snowflake/objects/database/recover/schema/parquet/deploy.sql
@@ -0,0 +1,22 @@
/*
Create a parquet schema (if it doesn't yet exist) and deploy all child objects.
*/
CREATE SCHEMA IF NOT EXISTS parquet;
USE SCHEMA parquet;

SET parquet_file_format_name = 'parquet_format';
SET parquet_stage_name = 'parquet_s3';

EXECUTE IMMEDIATE
FROM './file_format/deploy.sql'
USING (
parquet_file_format_name => $parquet_file_format_name
);
EXECUTE IMMEDIATE
FROM './stage/deploy.sql'
USING (
git_branch => '{{ git_branch }}',
parquet_stage_name => $parquet_stage_name
);
EXECUTE IMMEDIATE
FROM './table/deploy.sql';
@@ -0,0 +1,8 @@
/*
Deploy all file formats
*/
EXECUTE IMMEDIATE
FROM './parquet_format.sql'
USING (
parquet_file_format_name => '{{ parquet_file_format_name }}'
);
@@ -0,0 +1,7 @@
/*
Create the Parquet file format
*/
CREATE OR REPLACE FILE FORMAT {{ parquet_file_format_name }}
TYPE = PARQUET
COMPRESSION = AUTO
USE_VECTORIZED_SCANNER = TRUE;
@@ -0,0 +1,9 @@
/*
Deploy all stages under the `parquet` schema.
*/
EXECUTE IMMEDIATE
FROM './parquet_s3.sql'
USING (
git_branch => '{{ git_branch }}',
parquet_stage_name => '{{ parquet_stage_name }}'
);
@@ -0,0 +1,6 @@
/*
Create an external stage over the Parquet data in S3
*/
CREATE OR REPLACE STAGE {{ parquet_stage_name }}
URL = 's3://recover-processed-data/{{ git_branch }}/parquet/'
STORAGE_INTEGRATION = recover_prod_s3;
@@ -0,0 +1,54 @@
/*
CREATE OR ALTER all tables
*/

EXECUTE IMMEDIATE
FROM './enrolledparticipants_customfields_symptoms_parquet.sql';
EXECUTE IMMEDIATE
FROM './enrolledparticipants_customfields_treatments_parquet.sql';
EXECUTE IMMEDIATE
FROM './enrolledparticipants_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitactivitylogs_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitdailydata_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitdevices_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitecg_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitecg_waveformsamples_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitintradaycombined_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitrestingheartrates_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitsleeplogs_parquet.sql';
EXECUTE IMMEDIATE
FROM './fitbitsleeplogs_sleeplogdetails_parquet.sql';
EXECUTE IMMEDIATE
FROM './googlefitsamples_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2activitysummaries_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2electrocardiogram_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2electrocardiogram_subsamples_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2heartbeat_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2heartbeat_subsamples_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2samples_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2statistics_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2workouts_events_parquet.sql';
EXECUTE IMMEDIATE
FROM './healthkitv2workouts_parquet.sql';
EXECUTE IMMEDIATE
FROM './symptomlog_parquet.sql';
EXECUTE IMMEDIATE
FROM './symptomlog_value_symptoms_parquet.sql';
EXECUTE IMMEDIATE
FROM './symptomlog_value_treatments_parquet.sql';
@@ -0,0 +1,11 @@
CREATE OR ALTER TABLE ENROLLEDPARTICIPANTS_CUSTOMFIELDS_SYMPTOMS (
"id" NUMBER(38,0),
"index" NUMBER(38,0),
"CustomFields_Symptoms_id" VARCHAR(16777216),
"name" VARCHAR(16777216),
"color" VARCHAR(16777216),
"severityTracking" VARCHAR(16777216),
"inactive" BOOLEAN,
"ParticipantIdentifier" VARCHAR(16777216),
"ParticipantID" VARCHAR(16777216)
);
@@ -0,0 +1,10 @@
CREATE OR ALTER TABLE ENROLLEDPARTICIPANTS_CUSTOMFIELDS_TREATMENTS (
"id" NUMBER(38,0),
"CustomFields_Treatments_id" VARCHAR(16777216),
"name" VARCHAR(16777216),
"color" VARCHAR(16777216),
"index" NUMBER(38,0),
"inactive" BOOLEAN,
"ParticipantIdentifier" VARCHAR(16777216),
"ParticipantID" VARCHAR(16777216)
);